/*
 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/macroAssembler.hpp"
|
|
#include "asm/macroAssembler.inline.hpp"
|
|
#include "asm/register.hpp"
|
|
#include "atomic_aarch64.hpp"
|
|
#include "compiler/oopMap.hpp"
|
|
#include "gc/shared/barrierSet.hpp"
|
|
#include "gc/shared/barrierSetAssembler.hpp"
|
|
#include "gc/shared/gc_globals.hpp"
|
|
#include "gc/shared/tlab_globals.hpp"
|
|
#include "interpreter/interpreter.hpp"
|
|
#include "memory/universe.hpp"
|
|
#include "nativeInst_aarch64.hpp"
|
|
#include "oops/instanceOop.hpp"
|
|
#include "oops/method.hpp"
|
|
#include "oops/objArrayKlass.hpp"
|
|
#include "oops/oop.inline.hpp"
|
|
#include "prims/methodHandles.hpp"
|
|
#include "prims/upcallLinker.hpp"
|
|
#include "runtime/arguments.hpp"
|
|
#include "runtime/atomicAccess.hpp"
|
|
#include "runtime/continuation.hpp"
|
|
#include "runtime/continuationEntry.inline.hpp"
|
|
#include "runtime/frame.inline.hpp"
|
|
#include "runtime/handles.inline.hpp"
|
|
#include "runtime/javaThread.hpp"
|
|
#include "runtime/sharedRuntime.hpp"
|
|
#include "runtime/stubCodeGenerator.hpp"
|
|
#include "runtime/stubRoutines.hpp"
|
|
#include "utilities/align.hpp"
|
|
#include "utilities/checkedCast.hpp"
|
|
#include "utilities/debug.hpp"
|
|
#include "utilities/globalDefinitions.hpp"
|
|
#include "utilities/intpow.hpp"
|
|
#include "utilities/powerOfTwo.hpp"
|
|
#ifdef COMPILER2
|
|
#include "opto/runtime.hpp"
|
|
#endif
|
|
#if INCLUDE_ZGC
|
|
#include "gc/z/zThreadLocalData.hpp"
|
|
#endif
|
|
|
|
// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

// Call stubs are used to call Java from C
//
// Arguments:
//    c_rarg0:   call wrapper address                   address
//    c_rarg1:   result                                 address
//    c_rarg2:   result type                            BasicType
//    c_rarg3:   method                                 Method*
//    c_rarg4:   (interpreter) entry point              address
//    c_rarg5:   parameters                             intptr_t*
//    c_rarg6:   parameter size (in words)              int
//    c_rarg7:   thread                                 Thread*
//
// There is no return from the stub itself as any Java result
// is written to result
//
// we save r30 (lr) as the return PC at the base of the frame and
// link r29 (fp) below it as the frame pointer installing sp (r31)
// into fp.
//
// we save r0-r7, which accounts for all the c arguments.
//
// TODO: strictly do we need to save them all? they are treated as
// volatile by C so could we omit saving the ones we are going to
// place in global registers (thread? method?) or those we only use
// during setup of the Java call?
//
// we don't need to save r8 which C uses as an indirect result location
// return register.
//
// we don't need to save r9-r15 which both C and Java treat as
// volatile
//
// we don't need to save r16-18 because Java does not use them
//
// we save r19-r28 which Java uses as scratch registers and C
// expects to be callee-save
//
// we save the bottom 64 bits of each value stored in v8-v15; it is
// the responsibility of the caller to preserve larger values.
//
// so the stub frame looks like this when we enter Java code
//
//     [ return_from_Java     ] <--- sp
//     [ argument word n      ]
//      ...
// -29 [ argument word 1      ]
// -28 [ saved Floating-point Control Register ]
// -26 [ saved v15            ] <--- sp_after_call
// -25 [ saved v14            ]
// -24 [ saved v13            ]
// -23 [ saved v12            ]
// -22 [ saved v11            ]
// -21 [ saved v10            ]
// -20 [ saved v9             ]
// -19 [ saved v8             ]
// -18 [ saved r28            ]
// -17 [ saved r27            ]
// -16 [ saved r26            ]
// -15 [ saved r25            ]
// -14 [ saved r24            ]
// -13 [ saved r23            ]
// -12 [ saved r22            ]
// -11 [ saved r21            ]
// -10 [ saved r20            ]
//  -9 [ saved r19            ]
//  -8 [ call wrapper   (r0)  ]
//  -7 [ result         (r1)  ]
//  -6 [ result type    (r2)  ]
//  -5 [ method         (r3)  ]
//  -4 [ entry point    (r4)  ]
//  -3 [ parameters     (r5)  ]
//  -2 [ parameter size (r6)  ]
//  -1 [ thread         (r7)  ]
//   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
//   1 [ saved lr       (r30) ]

// Call stub stack layout word offsets from fp
enum call_stub_layout {
  sp_after_call_off  = -28,

  fpcr_off           = sp_after_call_off,
  d15_off            = -26,
  d13_off            = -24,
  d11_off            = -22,
  d9_off             = -20,

  r28_off            = -18,
  r26_off            = -16,
  r24_off            = -14,
  r22_off            = -12,
  r20_off            = -10,
  call_wrapper_off   =  -8,
  result_off         =  -7,
  result_type_off    =  -6,
  method_off         =  -5,
  entry_point_off    =  -4,
  parameter_size_off =  -2,
  thread_off         =  -1,
  fp_f               =   0,
  retaddr_off        =   1,
};
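
// n.b. only every other save slot needs a named offset above because the
// prologue stores registers in pairs (stp/stpd); r19, for example, lives at
// r20_off + 1 and v14 at d15_off + 1.
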
address generate_call_stub(address& return_address) {
|
|
assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
|
|
(int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
|
|
"adjust this code");
|
|
|
|
StubId stub_id = StubId::stubgen_call_stub_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
const Address sp_after_call (rfp, sp_after_call_off * wordSize);
|
|
|
|
const Address fpcr_save (rfp, fpcr_off * wordSize);
|
|
const Address call_wrapper (rfp, call_wrapper_off * wordSize);
|
|
const Address result (rfp, result_off * wordSize);
|
|
const Address result_type (rfp, result_type_off * wordSize);
|
|
const Address method (rfp, method_off * wordSize);
|
|
const Address entry_point (rfp, entry_point_off * wordSize);
|
|
const Address parameter_size(rfp, parameter_size_off * wordSize);
|
|
|
|
const Address thread (rfp, thread_off * wordSize);
|
|
|
|
const Address d15_save (rfp, d15_off * wordSize);
|
|
const Address d13_save (rfp, d13_off * wordSize);
|
|
const Address d11_save (rfp, d11_off * wordSize);
|
|
const Address d9_save (rfp, d9_off * wordSize);
|
|
|
|
const Address r28_save (rfp, r28_off * wordSize);
|
|
const Address r26_save (rfp, r26_off * wordSize);
|
|
const Address r24_save (rfp, r24_off * wordSize);
|
|
const Address r22_save (rfp, r22_off * wordSize);
|
|
const Address r20_save (rfp, r20_off * wordSize);
|
|
|
|
// stub code
|
|
|
|
address aarch64_entry = __ pc();
|
|
|
|
// set up frame and move sp to end of save area
|
|
__ enter();
|
|
__ sub(sp, rfp, -sp_after_call_off * wordSize);
|
|
|
|
// save register parameters and Java scratch/global registers
|
|
// n.b. we save thread even though it gets installed in
|
|
// rthread because we want to sanity check rthread later
|
|
__ str(c_rarg7, thread);
|
|
__ strw(c_rarg6, parameter_size);
|
|
__ stp(c_rarg4, c_rarg5, entry_point);
|
|
__ stp(c_rarg2, c_rarg3, result_type);
|
|
__ stp(c_rarg0, c_rarg1, call_wrapper);
|
|
|
|
__ stp(r20, r19, r20_save);
|
|
__ stp(r22, r21, r22_save);
|
|
__ stp(r24, r23, r24_save);
|
|
__ stp(r26, r25, r26_save);
|
|
__ stp(r28, r27, r28_save);
|
|
|
|
__ stpd(v9, v8, d9_save);
|
|
__ stpd(v11, v10, d11_save);
|
|
__ stpd(v13, v12, d13_save);
|
|
__ stpd(v15, v14, d15_save);
|
|
|
|
__ get_fpcr(rscratch1);
|
|
__ str(rscratch1, fpcr_save);
|
|
// Set FPCR to the state we need. We do want Round to Nearest. We
|
|
// don't want non-IEEE rounding modes or floating-point traps.
|
|
__ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
|
|
__ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12)
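// n.b. in FPCR the rounding mode (RMode) is bits 22-23 and FZ/DN are bits
// 24-25, so the first bfi zeroes all four (round-to-nearest, no
// flush-to-zero, no default NaN); bits 8-12 are the IOE/DZE/OFE/UFE/IXE
// trap-enable bits cleared by the second bfi.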
|
|
__ set_fpcr(rscratch1);
|
|
|
|
// install Java thread in global register now we have saved
|
|
// whatever value it held
|
|
__ mov(rthread, c_rarg7);
|
|
// And method
|
|
__ mov(rmethod, c_rarg3);
|
|
|
|
// set up the heapbase register
|
|
__ reinit_heapbase();
|
|
|
|
#ifdef ASSERT
|
|
// make sure we have no pending exceptions
|
|
{
|
|
Label L;
|
|
__ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
|
|
__ cmp(rscratch1, (u1)NULL_WORD);
|
|
__ br(Assembler::EQ, L);
|
|
__ stop("StubRoutines::call_stub: entered with pending exception");
|
|
__ BIND(L);
|
|
}
|
|
#endif
|
|
// pass parameters if any
|
|
__ mov(esp, sp);
|
|
__ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
|
|
__ andr(sp, rscratch1, -2 * wordSize);
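// n.b. rounding the new sp down to a 2-word (16-byte) boundary preserves
// the 16-byte sp alignment that AArch64 requires for stack accesses.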
|
|
|
|
BLOCK_COMMENT("pass parameters if any");
|
|
Label parameters_done;
|
|
// parameter count is still in c_rarg6
|
|
// and parameter pointer identifying param 1 is in c_rarg5
|
|
__ cbzw(c_rarg6, parameters_done);
|
|
|
|
address loop = __ pc();
|
|
__ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
|
|
__ subsw(c_rarg6, c_rarg6, 1);
|
|
__ push(rscratch1);
|
|
__ br(Assembler::GT, loop);
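// n.b. the loop above walks the parameter array in order and pushes each
// word, so the last parameter ends up at the lowest stack address.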
|
|
|
|
__ BIND(parameters_done);
|
|
|
|
// call Java entry -- passing Method* and current sp
|
|
// rmethod: Method*
|
|
// r19_sender_sp: sender sp
|
|
BLOCK_COMMENT("call Java function");
|
|
__ mov(r19_sender_sp, sp);
|
|
__ blr(c_rarg4);
|
|
|
|
// we do this here because the notify will already have been done
|
|
// if we get to the next instruction via an exception
|
|
//
|
|
// n.b. adding this instruction here affects the calculation of
|
|
// whether or not a routine returns to the call stub (used when
|
|
// doing stack walks) since the normal test is to check the return
|
|
// pc against the address saved below. so we may need to allow for
|
|
// this extra instruction in the check.
|
|
|
|
// save current address for use by exception handling code
|
|
|
|
return_address = __ pc();
|
|
|
|
// store result depending on type (everything that is not
|
|
// T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
|
|
// n.b. this assumes Java returns an integral result in r0
|
|
// and a floating result in j_farg0
|
|
__ ldr(j_rarg2, result);
|
|
Label is_long, is_float, is_double, exit;
|
|
__ ldr(j_rarg1, result_type);
|
|
__ cmp(j_rarg1, (u1)T_OBJECT);
|
|
__ br(Assembler::EQ, is_long);
|
|
__ cmp(j_rarg1, (u1)T_LONG);
|
|
__ br(Assembler::EQ, is_long);
|
|
__ cmp(j_rarg1, (u1)T_FLOAT);
|
|
__ br(Assembler::EQ, is_float);
|
|
__ cmp(j_rarg1, (u1)T_DOUBLE);
|
|
__ br(Assembler::EQ, is_double);
|
|
|
|
// handle T_INT case
|
|
__ strw(r0, Address(j_rarg2));
|
|
|
|
__ BIND(exit);
|
|
|
|
// pop parameters
|
|
__ sub(esp, rfp, -sp_after_call_off * wordSize);
|
|
|
|
#ifdef ASSERT
|
|
// verify that threads correspond
|
|
{
|
|
Label L, S;
|
|
__ ldr(rscratch1, thread);
|
|
__ cmp(rthread, rscratch1);
|
|
__ br(Assembler::NE, S);
|
|
__ get_thread(rscratch1);
|
|
__ cmp(rthread, rscratch1);
|
|
__ br(Assembler::EQ, L);
|
|
__ BIND(S);
|
|
__ stop("StubRoutines::call_stub: threads must correspond");
|
|
__ BIND(L);
|
|
}
|
|
#endif
|
|
|
|
__ pop_cont_fastpath(rthread);
|
|
|
|
// restore callee-save registers
|
|
__ ldpd(v15, v14, d15_save);
|
|
__ ldpd(v13, v12, d13_save);
|
|
__ ldpd(v11, v10, d11_save);
|
|
__ ldpd(v9, v8, d9_save);
|
|
|
|
__ ldp(r28, r27, r28_save);
|
|
__ ldp(r26, r25, r26_save);
|
|
__ ldp(r24, r23, r24_save);
|
|
__ ldp(r22, r21, r22_save);
|
|
__ ldp(r20, r19, r20_save);
|
|
|
|
// restore fpcr
|
|
__ ldr(rscratch1, fpcr_save);
|
|
__ set_fpcr(rscratch1);
|
|
|
|
__ ldp(c_rarg0, c_rarg1, call_wrapper);
|
|
__ ldrw(c_rarg2, result_type);
|
|
__ ldr(c_rarg3, method);
|
|
__ ldp(c_rarg4, c_rarg5, entry_point);
|
|
__ ldp(c_rarg6, c_rarg7, parameter_size);
|
|
|
|
// leave frame and return to caller
|
|
__ leave();
|
|
__ ret(lr);
|
|
|
|
// handle return types different from T_INT
|
|
|
|
__ BIND(is_long);
|
|
__ str(r0, Address(j_rarg2, 0));
|
|
__ br(Assembler::AL, exit);
|
|
|
|
__ BIND(is_float);
|
|
__ strs(j_farg0, Address(j_rarg2, 0));
|
|
__ br(Assembler::AL, exit);
|
|
|
|
__ BIND(is_double);
|
|
__ strd(j_farg0, Address(j_rarg2, 0));
|
|
__ br(Assembler::AL, exit);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Return point for a Java call if there's an exception thrown in
|
|
// Java code. The exception is caught and transformed into a
|
|
// pending exception stored in JavaThread that can be tested from
|
|
// within the VM.
|
|
//
|
|
// Note: Usually the parameters are removed by the callee. In case
|
|
// of an exception crossing an activation frame boundary, that is
|
|
// not the case if the callee is compiled code => need to setup the
|
|
// rsp.
|
|
//
|
|
// r0: exception oop
|
|
|
|
address generate_catch_exception() {
|
|
StubId stub_id = StubId::stubgen_catch_exception_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
// same as in generate_call_stub():
|
|
const Address sp_after_call(rfp, sp_after_call_off * wordSize);
|
|
const Address thread (rfp, thread_off * wordSize);
|
|
|
|
#ifdef ASSERT
|
|
// verify that threads correspond
|
|
{
|
|
Label L, S;
|
|
__ ldr(rscratch1, thread);
|
|
__ cmp(rthread, rscratch1);
|
|
__ br(Assembler::NE, S);
|
|
__ get_thread(rscratch1);
|
|
__ cmp(rthread, rscratch1);
|
|
__ br(Assembler::EQ, L);
|
|
__ bind(S);
|
|
__ stop("StubRoutines::catch_exception: threads must correspond");
|
|
__ bind(L);
|
|
}
|
|
#endif
|
|
|
|
// set pending exception
|
|
__ verify_oop(r0);
|
|
|
|
__ str(r0, Address(rthread, Thread::pending_exception_offset()));
|
|
__ mov(rscratch1, (address)__FILE__);
|
|
__ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
|
|
__ movw(rscratch1, (int)__LINE__);
|
|
__ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
|
|
|
|
// complete return to VM
|
|
assert(StubRoutines::_call_stub_return_address != nullptr,
|
|
"_call_stub_return_address must have been generated before");
|
|
__ b(StubRoutines::_call_stub_return_address);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Continuation point for runtime calls returning with a pending
|
|
// exception. The pending exception check happened in the runtime
|
|
// or native call stub. The pending exception in Thread is
|
|
// converted into a Java-level exception.
|
|
//
|
|
// Contract with Java-level exception handlers:
|
|
// r0: exception
|
|
// r3: throwing pc
|
|
//
|
|
// NOTE: At entry of this stub, exception-pc must be in LR !!
|
|
|
|
// NOTE: this is always used as a jump target within generated code
|
|
// so it just needs to be generated code with no prolog
|
|
|
|
address generate_forward_exception() {
|
|
StubId stub_id = StubId::stubgen_forward_exception_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
// Upon entry, LR points to the return address returning into
|
|
// Java (interpreted or compiled) code; i.e., the return address
|
|
// becomes the throwing pc.
|
|
//
|
|
// Arguments pushed before the runtime call are still on the stack
|
|
// but the exception handler will reset the stack pointer ->
|
|
// ignore them. A potential result in registers can be ignored as
|
|
// well.
|
|
|
|
#ifdef ASSERT
|
|
// make sure this code is only executed if there is a pending exception
|
|
{
|
|
Label L;
|
|
__ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
|
|
__ cbnz(rscratch1, L);
|
|
__ stop("StubRoutines::forward exception: no pending exception (1)");
|
|
__ bind(L);
|
|
}
|
|
#endif
|
|
|
|
// compute exception handler into r19
|
|
|
|
// call the VM to find the handler address associated with the
|
|
// caller address. pass thread in r0 and caller pc (ret address)
|
|
// in r1. n.b. the caller pc is in lr, unlike x86 where it is on
|
|
// the stack.
|
|
__ mov(c_rarg1, lr);
|
|
// lr will be trashed by the VM call so we move it to R19
|
|
// (callee-saved) because we also need to pass it to the handler
|
|
// returned by this call.
|
|
__ mov(r19, lr);
|
|
BLOCK_COMMENT("call exception_handler_for_return_address");
|
|
__ call_VM_leaf(CAST_FROM_FN_PTR(address,
|
|
SharedRuntime::exception_handler_for_return_address),
|
|
rthread, c_rarg1);
|
|
// Reinitialize the ptrue predicate register, in case the external runtime
|
|
// call clobbers ptrue reg, as we may return to SVE compiled code.
|
|
__ reinitialize_ptrue();
|
|
|
|
// we should not really care that lr is no longer the callee
|
|
// address. we saved the value the handler needs in r19 so we can
|
|
// just copy it to r3. however, the C2 handler will push its own
|
|
// frame and then calls into the VM and the VM code asserts that
|
|
// the PC for the frame above the handler belongs to a compiled
|
|
// Java method. So, we restore lr here to satisfy that assert.
|
|
__ mov(lr, r19);
|
|
// setup r0 & r3 & clear pending exception
|
|
__ mov(r3, r19);
|
|
__ mov(r19, r0);
|
|
__ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
|
|
__ str(zr, Address(rthread, Thread::pending_exception_offset()));
|
|
|
|
#ifdef ASSERT
|
|
// make sure exception is set
|
|
{
|
|
Label L;
|
|
__ cbnz(r0, L);
|
|
__ stop("StubRoutines::forward exception: no pending exception (2)");
|
|
__ bind(L);
|
|
}
|
|
#endif
|
|
|
|
// continue at exception handler
|
|
// r0: exception
|
|
// r3: throwing pc
|
|
// r19: exception handler
|
|
__ verify_oop(r0);
|
|
__ br(r19);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Non-destructive plausibility checks for oops
|
|
//
|
|
// Arguments:
|
|
// r0: oop to verify
|
|
// rscratch1: error message
|
|
//
|
|
// Stack after saving c_rarg3:
|
|
// [tos + 0]: saved c_rarg3
|
|
// [tos + 1]: saved c_rarg2
|
|
// [tos + 2]: saved lr
|
|
// [tos + 3]: saved rscratch2
|
|
// [tos + 4]: saved r0
|
|
// [tos + 5]: saved rscratch1
|
|
address generate_verify_oop() {
|
|
StubId stub_id = StubId::stubgen_verify_oop_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
Label exit, error;
|
|
|
|
// save c_rarg2 and c_rarg3
|
|
__ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
|
|
|
|
// __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
|
|
__ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
|
|
__ ldr(c_rarg3, Address(c_rarg2));
|
|
__ add(c_rarg3, c_rarg3, 1);
|
|
__ str(c_rarg3, Address(c_rarg2));
|
|
|
|
// object is in r0
|
|
// make sure object is 'reasonable'
|
|
__ cbz(r0, exit); // if obj is null it is OK
|
|
|
|
BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
|
|
bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
|
|
|
|
// return if everything seems ok
|
|
__ bind(exit);
|
|
|
|
__ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
|
|
__ ret(lr);
|
|
|
|
// handle errors
|
|
__ bind(error);
|
|
__ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
|
|
|
|
__ push(RegSet::range(r0, r29), sp);
|
|
// debug(char* msg, int64_t pc, int64_t regs[])
|
|
__ mov(c_rarg0, rscratch1); // pass address of error message
|
|
__ mov(c_rarg1, lr); // pass return address
|
|
__ mov(c_rarg2, sp); // pass address of regs on stack
|
|
#ifndef PRODUCT
|
|
assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
|
|
#endif
|
|
BLOCK_COMMENT("call MacroAssembler::debug");
|
|
__ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
|
|
__ blr(rscratch1);
|
|
__ hlt(0);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Generate indices for iota vector.
|
|
address generate_iota_indices(StubId stub_id) {
|
|
__ align(CodeEntryAlignment);
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
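// The table below provides one 16-byte row of ascending lane indices for
// each element size (B/H/S/D), followed by float and double variants;
// vector code loads the row matching its element type to seed an
// iota (0, 1, 2, ...) vector.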
|
|
// B
|
|
__ emit_data64(0x0706050403020100, relocInfo::none);
|
|
__ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
|
|
// H
|
|
__ emit_data64(0x0003000200010000, relocInfo::none);
|
|
__ emit_data64(0x0007000600050004, relocInfo::none);
|
|
// S
|
|
__ emit_data64(0x0000000100000000, relocInfo::none);
|
|
__ emit_data64(0x0000000300000002, relocInfo::none);
|
|
// D
|
|
__ emit_data64(0x0000000000000000, relocInfo::none);
|
|
__ emit_data64(0x0000000000000001, relocInfo::none);
|
|
// S - FP
|
|
__ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
|
|
__ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
|
|
// D - FP
|
|
__ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
|
|
__ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
|
|
return start;
|
|
}
|
|
|
|
// The inner part of zero_words(). This is the bulk operation,
|
|
// zeroing words in blocks, possibly using DC ZVA to do it. The
|
|
// caller is responsible for zeroing the last few words.
|
|
//
|
|
// Inputs:
|
|
// r10: the HeapWord-aligned base address of an array to zero.
|
|
// r11: the count in HeapWords, r11 > 0.
|
|
//
|
|
// Returns r10 and r11, adjusted for the caller to clear.
|
|
// r10: the base address of the tail of words left to clear.
|
|
// r11: the number of words in the tail.
|
|
// r11 < MacroAssembler::zero_words_block_size.
|
|
|
|
address generate_zero_blocks() {
|
|
Label done;
|
|
Label base_aligned;
|
|
|
|
Register base = r10, cnt = r11;
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_zero_blocks_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
if (UseBlockZeroing) {
|
|
int zva_length = VM_Version::zva_length();
|
|
|
|
// Ensure ZVA length can be divided by 16. This is required by
|
|
// the subsequent operations.
|
|
assert (zva_length % 16 == 0, "Unexpected ZVA Length");
|
|
|
|
__ tbz(base, 3, base_aligned);
|
|
__ str(zr, Address(__ post(base, 8)));
|
|
__ sub(cnt, cnt, 1);
|
|
__ bind(base_aligned);
|
|
|
|
// Ensure count >= zva_length * 2 so that it still deserves a zva after
|
|
// alignment.
|
|
Label small;
|
|
int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
|
|
__ subs(rscratch1, cnt, low_limit >> 3);
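// n.b. low_limit is in bytes while cnt is in words, hence the >> 3 in the
// comparison above; counts below the limit skip the DC ZVA path and are
// left to the unrolled stp loop below.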
|
|
__ br(Assembler::LT, small);
|
|
__ zero_dcache_blocks(base, cnt);
|
|
__ bind(small);
|
|
}
|
|
|
|
{
|
|
// Number of stp instructions we'll unroll
|
|
const int unroll =
|
|
MacroAssembler::zero_words_block_size / 2;
|
|
// Clear the remaining blocks.
|
|
Label loop;
|
|
__ subs(cnt, cnt, unroll * 2);
|
|
__ br(Assembler::LT, done);
|
|
__ bind(loop);
|
|
for (int i = 0; i < unroll; i++)
|
|
__ stp(zr, zr, __ post(base, 16));
|
|
__ subs(cnt, cnt, unroll * 2);
|
|
__ br(Assembler::GE, loop);
|
|
__ bind(done);
|
|
__ add(cnt, cnt, unroll * 2);
|
|
}
|
|
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
|
|
typedef enum {
|
|
copy_forwards = 1,
|
|
copy_backwards = -1
|
|
} copy_direction;
|
|
|
|
// Helper object to reduce noise when telling the GC barriers how to perform loads and stores
|
|
// for arraycopy stubs.
|
|
class ArrayCopyBarrierSetHelper : StackObj {
|
|
BarrierSetAssembler* _bs_asm;
|
|
MacroAssembler* _masm;
|
|
DecoratorSet _decorators;
|
|
BasicType _type;
|
|
Register _gct1;
|
|
Register _gct2;
|
|
Register _gct3;
|
|
FloatRegister _gcvt1;
|
|
FloatRegister _gcvt2;
|
|
FloatRegister _gcvt3;
|
|
|
|
public:
|
|
ArrayCopyBarrierSetHelper(MacroAssembler* masm,
|
|
DecoratorSet decorators,
|
|
BasicType type,
|
|
Register gct1,
|
|
Register gct2,
|
|
Register gct3,
|
|
FloatRegister gcvt1,
|
|
FloatRegister gcvt2,
|
|
FloatRegister gcvt3)
|
|
: _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
|
|
_masm(masm),
|
|
_decorators(decorators),
|
|
_type(type),
|
|
_gct1(gct1),
|
|
_gct2(gct2),
|
|
_gct3(gct3),
|
|
_gcvt1(gcvt1),
|
|
_gcvt2(gcvt2),
|
|
_gcvt3(gcvt3) {
|
|
}
|
|
|
|
void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
|
|
_bs_asm->copy_load_at(_masm, _decorators, _type, 32,
|
|
dst1, dst2, src,
|
|
_gct1, _gct2, _gcvt1);
|
|
}
|
|
|
|
void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
|
|
_bs_asm->copy_store_at(_masm, _decorators, _type, 32,
|
|
dst, src1, src2,
|
|
_gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
|
|
}
|
|
|
|
void copy_load_at_16(Register dst1, Register dst2, Address src) {
|
|
_bs_asm->copy_load_at(_masm, _decorators, _type, 16,
|
|
dst1, dst2, src,
|
|
_gct1);
|
|
}
|
|
|
|
void copy_store_at_16(Address dst, Register src1, Register src2) {
|
|
_bs_asm->copy_store_at(_masm, _decorators, _type, 16,
|
|
dst, src1, src2,
|
|
_gct1, _gct2, _gct3);
|
|
}
|
|
|
|
void copy_load_at_8(Register dst, Address src) {
|
|
_bs_asm->copy_load_at(_masm, _decorators, _type, 8,
|
|
dst, noreg, src,
|
|
_gct1);
|
|
}
|
|
|
|
void copy_store_at_8(Address dst, Register src) {
|
|
_bs_asm->copy_store_at(_masm, _decorators, _type, 8,
|
|
dst, src, noreg,
|
|
_gct1, _gct2, _gct3);
|
|
}
|
|
};
|
|
|
|
// Bulk copy of blocks of 8 words.
|
|
//
|
|
// count is a count of words.
|
|
//
|
|
// Precondition: count >= 8
|
|
//
|
|
// Postconditions:
|
|
//
|
|
// The least significant bit of count contains the remaining count
|
|
// of words to copy. The rest of count is trash.
|
|
//
|
|
// s and d are adjusted to point to the remaining words to copy
|
|
//
|
|
address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
|
|
BasicType type;
|
|
copy_direction direction;
|
|
|
|
switch (stub_id) {
|
|
case StubId::stubgen_copy_byte_f_id:
|
|
direction = copy_forwards;
|
|
type = T_BYTE;
|
|
break;
|
|
case StubId::stubgen_copy_byte_b_id:
|
|
direction = copy_backwards;
|
|
type = T_BYTE;
|
|
break;
|
|
case StubId::stubgen_copy_oop_f_id:
|
|
direction = copy_forwards;
|
|
type = T_OBJECT;
|
|
break;
|
|
case StubId::stubgen_copy_oop_b_id:
|
|
direction = copy_backwards;
|
|
type = T_OBJECT;
|
|
break;
|
|
case StubId::stubgen_copy_oop_uninit_f_id:
|
|
direction = copy_forwards;
|
|
type = T_OBJECT;
|
|
break;
|
|
case StubId::stubgen_copy_oop_uninit_b_id:
|
|
direction = copy_backwards;
|
|
type = T_OBJECT;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
int unit = wordSize * direction;
|
|
int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
|
|
|
|
const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
|
|
t4 = r7, t5 = r11, t6 = r12, t7 = r13;
|
|
const Register stride = r14;
|
|
const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
|
|
const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
|
|
ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
|
|
|
|
assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
|
|
assert_different_registers(s, d, count, rscratch1, rscratch2);
|
|
|
|
Label again, drain;
|
|
|
|
__ align(CodeEntryAlignment);
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
address start = __ pc();
|
|
|
|
Label unaligned_copy_long;
|
|
if (AvoidUnalignedAccesses) {
|
|
__ tbnz(d, 3, unaligned_copy_long);
|
|
}
|
|
|
|
if (direction == copy_forwards) {
|
|
__ sub(s, s, bias);
|
|
__ sub(d, d, bias);
|
|
}
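// n.b. pre-biasing s and d for the forward copy means the loads and stores
// at offsets 2 * unit .. 8 * unit below cover the block starting at the
// original addresses, mirroring the negative offsets used when copying
// backwards; the bias is undone once the bulk copy is finished.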
|
|
|
|
#ifdef ASSERT
|
|
// Make sure we are never given < 8 words
|
|
{
|
|
Label L;
|
|
__ cmp(count, (u1)8);
|
|
__ br(Assembler::GE, L);
|
|
__ stop("genrate_copy_longs called with < 8 words");
|
|
__ bind(L);
|
|
}
|
|
#endif
|
|
|
|
// Fill 8 registers
|
|
if (UseSIMDForMemoryOps) {
|
|
bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
|
|
bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
|
|
} else {
|
|
bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
|
|
bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
|
|
bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
|
|
bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
|
|
}
|
|
|
|
__ subs(count, count, 16);
|
|
__ br(Assembler::LO, drain);
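// n.b. we subtract 16 here because 8 words are already loaded into
// registers and each loop iteration stores those 8 while loading the next
// 8; with fewer than 16 words left we go straight to the drain code.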
|
|
|
|
int prefetch = PrefetchCopyIntervalInBytes;
|
|
bool use_stride = false;
|
|
if (direction == copy_backwards) {
|
|
use_stride = prefetch > 256;
|
|
prefetch = -prefetch;
|
|
if (use_stride) __ mov(stride, prefetch);
|
|
}
|
|
|
|
__ bind(again);
|
|
|
|
if (PrefetchCopyIntervalInBytes > 0)
|
|
__ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
|
|
|
|
if (UseSIMDForMemoryOps) {
|
|
bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
|
|
bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
|
|
bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
|
|
bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
|
|
} else {
|
|
bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
|
|
bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
|
|
bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
|
|
bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
|
|
bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
|
|
bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
|
|
bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
|
|
bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
|
|
}
|
|
|
|
__ subs(count, count, 8);
|
|
__ br(Assembler::HS, again);
|
|
|
|
// Drain
|
|
__ bind(drain);
|
|
if (UseSIMDForMemoryOps) {
|
|
bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
|
|
bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
|
|
} else {
|
|
bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
|
|
bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
|
|
bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
|
|
bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
|
|
}
|
|
|
|
{
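// copy any remaining 4-word and/or 2-word sub-block; as in the unaligned
// path below, bits 2 and 1 of count tell us which are present.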
|
|
Label L1, L2;
|
|
__ tbz(count, exact_log2(4), L1);
|
|
if (UseSIMDForMemoryOps) {
|
|
bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
|
|
bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
|
|
} else {
|
|
bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
|
|
bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
|
|
bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
|
|
bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
|
|
}
|
|
__ bind(L1);
|
|
|
|
if (direction == copy_forwards) {
|
|
__ add(s, s, bias);
|
|
__ add(d, d, bias);
|
|
}
|
|
|
|
__ tbz(count, 1, L2);
|
|
bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
|
|
bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
|
|
__ bind(L2);
|
|
}
|
|
|
|
__ ret(lr);
|
|
|
|
if (AvoidUnalignedAccesses) {
|
|
Label drain, again;
|
|
// Register order for storing. Order is different for backward copy.
|
|
|
|
__ bind(unaligned_copy_long);
|
|
|
|
// source address is even aligned, target odd aligned
//
// When forward copying word pairs we read long pairs at offsets
// {0, 2, 4, 6} (in long words). When backwards copying we read
// long pairs at offsets {-2, -4, -6, -8}. We adjust the source
// address by -2 in the forwards case so we can compute the
// source offsets for both as {2, 4, 6, 8} * unit where unit = 1
// or -1.
//
// When forward copying we need to store 1 word, 3 pairs and
// then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
// zero offset we adjust the destination by -1 which means we
// have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
//
// When backwards copying we need to store 1 word, 3 pairs and
// then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
// offsets {1, 3, 5, 7, 8} * unit.
|
|
|
|
if (direction == copy_forwards) {
|
|
__ sub(s, s, 16);
|
|
__ sub(d, d, 8);
|
|
}
|
|
|
|
// Fill 8 registers
|
|
//
|
|
// for forwards copy s was offset by -16 from the original input
|
|
// value of s so the register contents are at these offsets
|
|
// relative to the 64 bit block addressed by that original input
|
|
// and so on for each successive 64 byte block when s is updated
|
|
//
|
|
// t0 at offset 0, t1 at offset 8
|
|
// t2 at offset 16, t3 at offset 24
|
|
// t4 at offset 32, t5 at offset 40
|
|
// t6 at offset 48, t7 at offset 56
|
|
|
|
// for backwards copy s was not offset so the register contents
|
|
// are at these offsets into the preceding 64 byte block
|
|
// relative to that original input and so on for each successive
|
|
// preceding 64 byte block when s is updated. this explains the
|
|
// slightly counter-intuitive looking pattern of register usage
|
|
// in the stp instructions for backwards copy.
|
|
//
|
|
// t0 at offset -16, t1 at offset -8
|
|
// t2 at offset -32, t3 at offset -24
|
|
// t4 at offset -48, t5 at offset -40
|
|
// t6 at offset -64, t7 at offset -56
|
|
|
|
bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
|
|
bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
|
|
bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
|
|
bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
|
|
|
|
__ subs(count, count, 16);
|
|
__ br(Assembler::LO, drain);
|
|
|
|
int prefetch = PrefetchCopyIntervalInBytes;
|
|
bool use_stride = false;
|
|
if (direction == copy_backwards) {
|
|
use_stride = prefetch > 256;
|
|
prefetch = -prefetch;
|
|
if (use_stride) __ mov(stride, prefetch);
|
|
}
|
|
|
|
__ bind(again);
|
|
|
|
if (PrefetchCopyIntervalInBytes > 0)
|
|
__ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
|
|
|
|
if (direction == copy_forwards) {
|
|
// allowing for the offset of -8 the store instructions place
|
|
// registers into the target 64 bit block at the following
|
|
// offsets
|
|
//
|
|
// t0 at offset 0
|
|
// t1 at offset 8, t2 at offset 16
|
|
// t3 at offset 24, t4 at offset 32
|
|
// t5 at offset 40, t6 at offset 48
|
|
// t7 at offset 56
|
|
|
|
bs.copy_store_at_8(Address(d, 1 * unit), t0);
|
|
bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
|
|
bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
|
|
bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
|
|
bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
|
|
bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
|
|
bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
|
|
bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
|
|
bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
|
|
} else {
|
|
// d was not offset when we started so the registers are
|
|
// written into the 64 bit block preceding d with the following
|
|
// offsets
|
|
//
|
|
// t1 at offset -8
|
|
// t3 at offset -24, t0 at offset -16
|
|
// t5 at offset -40, t2 at offset -32
|
|
// t7 at offset -56, t4 at offset -48
|
|
// t6 at offset -64
|
|
//
|
|
// note that this matches the offsets previously noted for the
|
|
// loads
|
|
|
|
bs.copy_store_at_8(Address(d, 1 * unit), t1);
|
|
bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
|
|
bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
|
|
bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
|
|
bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
|
|
bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
|
|
bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
|
|
bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
|
|
bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
|
|
}
|
|
|
|
__ subs(count, count, 8);
|
|
__ br(Assembler::HS, again);
|
|
|
|
// Drain
|
|
//
|
|
// this uses the same pattern of offsets and register arguments
|
|
// as above
|
|
__ bind(drain);
|
|
if (direction == copy_forwards) {
|
|
bs.copy_store_at_8(Address(d, 1 * unit), t0);
|
|
bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
|
|
bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
|
|
bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
|
|
bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
|
|
} else {
|
|
bs.copy_store_at_8(Address(d, 1 * unit), t1);
|
|
bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
|
|
bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
|
|
bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
|
|
bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
|
|
}
|
|
// now we need to copy any remaining part block which may
|
|
// include a 4 word block subblock and/or a 2 word subblock.
|
|
// bits 2 and 1 in the count are the tell-tale for whether we
|
|
// have each such subblock
|
|
{
|
|
Label L1, L2;
|
|
__ tbz(count, exact_log2(4), L1);
|
|
// this is the same as above but copying only 4 longs hence
|
|
// with only one intervening stp between the str instructions
|
|
// but note that the offsets and registers still follow the
|
|
// same pattern
|
|
bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
|
|
bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
|
|
if (direction == copy_forwards) {
|
|
bs.copy_store_at_8(Address(d, 1 * unit), t0);
|
|
bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
|
|
bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
|
|
} else {
|
|
bs.copy_store_at_8(Address(d, 1 * unit), t1);
|
|
bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
|
|
bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
|
|
}
|
|
__ bind(L1);
|
|
|
|
__ tbz(count, 1, L2);
|
|
// this is the same as above but copying only 2 longs hence
|
|
// there is no intervening stp between the str instructions
|
|
// but note that the offset and register patterns are still
|
|
// the same
|
|
bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
|
|
if (direction == copy_forwards) {
|
|
bs.copy_store_at_8(Address(d, 1 * unit), t0);
|
|
bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
|
|
} else {
|
|
bs.copy_store_at_8(Address(d, 1 * unit), t1);
|
|
bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
|
|
}
|
|
__ bind(L2);
|
|
|
|
// for forwards copy we need to re-adjust the offsets we
|
|
// applied so that s and d follow the last words written
|
|
|
|
if (direction == copy_forwards) {
|
|
__ add(s, s, 16);
|
|
__ add(d, d, 8);
|
|
}
|
|
|
|
}
|
|
|
|
__ ret(lr);
|
|
}
|
|
|
|
return start;
|
|
}
|
|
|
|
// Small copy: less than 16 bytes.
|
|
//
|
|
// NB: Ignores all of the bits of count which represent more than 15
|
|
// bytes, so a caller doesn't have to mask them.
|
|
|
|
void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
|
|
bool is_backwards = step < 0;
|
|
size_t granularity = g_uabs(step);
|
|
int direction = is_backwards ? -1 : 1;
|
|
|
|
Label Lword, Lint, Lshort, Lbyte;
|
|
|
|
assert(granularity
|
|
&& granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
|
|
|
|
const Register t0 = r3;
|
|
const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
|
|
ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
|
|
|
|
// ??? I don't know if this bit-test-and-branch is the right thing
|
|
// to do. It does a lot of jumping, resulting in several
|
|
// mispredicted branches. It might make more sense to do this
|
|
// with something like Duff's device with a single computed branch.
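// n.b. count is in units of granularity, so testing bit
// (3 - exact_log2(granularity)) asks whether a full 8-byte word remains,
// bit (2 - exact_log2(granularity)) whether a 4-byte int remains (when the
// granularity permits), and so on down to single bytes.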
|
|
|
|
__ tbz(count, 3 - exact_log2(granularity), Lword);
|
|
bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
|
|
bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
|
|
__ bind(Lword);
|
|
|
|
if (granularity <= sizeof (jint)) {
|
|
__ tbz(count, 2 - exact_log2(granularity), Lint);
|
|
__ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
|
|
__ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
|
|
__ bind(Lint);
|
|
}
|
|
|
|
if (granularity <= sizeof (jshort)) {
|
|
__ tbz(count, 1 - exact_log2(granularity), Lshort);
|
|
__ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
|
|
__ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
|
|
__ bind(Lshort);
|
|
}
|
|
|
|
if (granularity <= sizeof (jbyte)) {
|
|
__ tbz(count, 0, Lbyte);
|
|
__ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
|
|
__ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
|
|
__ bind(Lbyte);
|
|
}
|
|
}
|
|
|
|
// All-singing all-dancing memory copy.
|
|
//
|
|
// Copy count units of memory from s to d. The size of a unit is
|
|
// step, which can be positive or negative depending on the direction
|
|
// of copy. If is_aligned is false, we align the source address.
|
|
//
|
|
|
|
void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
|
|
Register s, Register d, Register count, int step) {
|
|
copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
|
|
bool is_backwards = step < 0;
|
|
unsigned int granularity = g_uabs(step);
|
|
const Register t0 = r3, t1 = r4;
|
|
|
|
// <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
|
|
// load all the data before writing anything
|
|
Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
|
|
const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
|
|
const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
|
|
const Register send = r17, dend = r16;
|
|
const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
|
|
const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
|
|
ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
|
|
|
|
if (PrefetchCopyIntervalInBytes > 0)
|
|
__ prfm(Address(s, 0), PLDL1KEEP);
|
|
__ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
|
|
__ br(Assembler::HI, copy_big);
|
|
|
|
__ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
|
|
__ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
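// n.b. send and dend are one-past-the-end pointers; the small cases below
// read the head via s/d and the tail via negative offsets from send/dend,
// which is safe because every load happens before any store.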
|
|
|
|
__ cmp(count, u1(16/granularity));
|
|
__ br(Assembler::LS, copy16);
|
|
|
|
__ cmp(count, u1(64/granularity));
|
|
__ br(Assembler::HI, copy80);
|
|
|
|
__ cmp(count, u1(32/granularity));
|
|
__ br(Assembler::LS, copy32);
|
|
|
|
// 33..64 bytes
|
|
if (UseSIMDForMemoryOps) {
|
|
bs.copy_load_at_32(v0, v1, Address(s, 0));
|
|
bs.copy_load_at_32(v2, v3, Address(send, -32));
|
|
bs.copy_store_at_32(Address(d, 0), v0, v1);
|
|
bs.copy_store_at_32(Address(dend, -32), v2, v3);
|
|
} else {
|
|
bs.copy_load_at_16(t0, t1, Address(s, 0));
|
|
bs.copy_load_at_16(t2, t3, Address(s, 16));
|
|
bs.copy_load_at_16(t4, t5, Address(send, -32));
|
|
bs.copy_load_at_16(t6, t7, Address(send, -16));
|
|
|
|
bs.copy_store_at_16(Address(d, 0), t0, t1);
|
|
bs.copy_store_at_16(Address(d, 16), t2, t3);
|
|
bs.copy_store_at_16(Address(dend, -32), t4, t5);
|
|
bs.copy_store_at_16(Address(dend, -16), t6, t7);
|
|
}
|
|
__ b(finish);
|
|
|
|
// 17..32 bytes
|
|
__ bind(copy32);
|
|
bs.copy_load_at_16(t0, t1, Address(s, 0));
|
|
bs.copy_load_at_16(t6, t7, Address(send, -16));
|
|
|
|
bs.copy_store_at_16(Address(d, 0), t0, t1);
|
|
bs.copy_store_at_16(Address(dend, -16), t6, t7);
|
|
__ b(finish);
|
|
|
|
// 65..80/96 bytes
|
|
// (96 bytes if SIMD because we do 32 bytes per instruction)
|
|
__ bind(copy80);
|
|
if (UseSIMDForMemoryOps) {
|
|
bs.copy_load_at_32(v0, v1, Address(s, 0));
|
|
bs.copy_load_at_32(v2, v3, Address(s, 32));
|
|
// Unaligned pointers can be an issue for copying.
|
|
// The issue has more chances to happen when granularity of data is
|
|
// less than 4(sizeof(jint)). Pointers for arrays of jint are at least
|
|
// 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
|
|
// The most performance drop has been seen for the range 65-80 bytes.
|
|
// For such cases using the pair of ldp/stp instead of the third pair of
|
|
// ldpq/stpq fixes the performance issue.
|
|
if (granularity < sizeof (jint)) {
|
|
Label copy96;
|
|
__ cmp(count, u1(80/granularity));
|
|
__ br(Assembler::HI, copy96);
|
|
bs.copy_load_at_16(t0, t1, Address(send, -16));
|
|
|
|
bs.copy_store_at_32(Address(d, 0), v0, v1);
|
|
bs.copy_store_at_32(Address(d, 32), v2, v3);
|
|
|
|
bs.copy_store_at_16(Address(dend, -16), t0, t1);
|
|
__ b(finish);
|
|
|
|
__ bind(copy96);
|
|
}
|
|
bs.copy_load_at_32(v4, v5, Address(send, -32));
|
|
|
|
bs.copy_store_at_32(Address(d, 0), v0, v1);
|
|
bs.copy_store_at_32(Address(d, 32), v2, v3);
|
|
|
|
bs.copy_store_at_32(Address(dend, -32), v4, v5);
|
|
} else {
|
|
bs.copy_load_at_16(t0, t1, Address(s, 0));
|
|
bs.copy_load_at_16(t2, t3, Address(s, 16));
|
|
bs.copy_load_at_16(t4, t5, Address(s, 32));
|
|
bs.copy_load_at_16(t6, t7, Address(s, 48));
|
|
bs.copy_load_at_16(t8, t9, Address(send, -16));
|
|
|
|
bs.copy_store_at_16(Address(d, 0), t0, t1);
|
|
bs.copy_store_at_16(Address(d, 16), t2, t3);
|
|
bs.copy_store_at_16(Address(d, 32), t4, t5);
|
|
bs.copy_store_at_16(Address(d, 48), t6, t7);
|
|
bs.copy_store_at_16(Address(dend, -16), t8, t9);
|
|
}
|
|
__ b(finish);
|
|
|
|
// 0..16 bytes
|
|
__ bind(copy16);
|
|
__ cmp(count, u1(8/granularity));
|
|
__ br(Assembler::LO, copy8);
|
|
|
|
// 8..16 bytes
|
|
bs.copy_load_at_8(t0, Address(s, 0));
|
|
bs.copy_load_at_8(t1, Address(send, -8));
|
|
bs.copy_store_at_8(Address(d, 0), t0);
|
|
bs.copy_store_at_8(Address(dend, -8), t1);
|
|
__ b(finish);
|
|
|
|
if (granularity < 8) {
|
|
// 4..7 bytes
|
|
__ bind(copy8);
|
|
__ tbz(count, 2 - exact_log2(granularity), copy4);
|
|
__ ldrw(t0, Address(s, 0));
|
|
__ ldrw(t1, Address(send, -4));
|
|
__ strw(t0, Address(d, 0));
|
|
__ strw(t1, Address(dend, -4));
|
|
__ b(finish);
|
|
if (granularity < 4) {
|
|
// 0..3 bytes
|
|
__ bind(copy4);
|
|
__ cbz(count, finish); // get rid of 0 case
|
|
if (granularity == 2) {
|
|
__ ldrh(t0, Address(s, 0));
|
|
__ strh(t0, Address(d, 0));
|
|
} else { // granularity == 1
|
|
// Now 1..3 bytes. Handle the 1 and 2 byte case by copying
|
|
// the first and last byte.
|
|
// Handle the 3 byte case by loading and storing base + count/2
|
|
// (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
|
|
// This does mean in the 1 byte case we load/store the same
|
|
// byte 3 times.
|
|
__ lsr(count, count, 1);
|
|
__ ldrb(t0, Address(s, 0));
|
|
__ ldrb(t1, Address(send, -1));
|
|
__ ldrb(t2, Address(s, count));
|
|
__ strb(t0, Address(d, 0));
|
|
__ strb(t1, Address(dend, -1));
|
|
__ strb(t2, Address(d, count));
|
|
}
|
|
__ b(finish);
|
|
}
|
|
}
|
|
|
|
__ bind(copy_big);
|
|
if (is_backwards) {
|
|
__ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
|
|
__ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
|
|
}
|
|
|
|
// Now we've got the small case out of the way, we can align the
|
|
// source address on a 2-word boundary.
|
|
|
|
// Here we will materialize a count in r15, which is used by copy_memory_small
|
|
// and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
|
|
// Up until here, we have used t9, which aliases r15, but from here on, that register
|
|
// cannot be used as a temp register, as it contains the count.
|
|
|
|
Label aligned;
|
|
|
|
if (is_aligned) {
|
|
// We may have to adjust by 1 word to get s 2-word-aligned.
|
|
__ tbz(s, exact_log2(wordSize), aligned);
|
|
bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
|
|
bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
|
|
__ sub(count, count, wordSize/granularity);
|
|
} else {
|
|
if (is_backwards) {
|
|
__ andr(r15, s, 2 * wordSize - 1);
|
|
} else {
|
|
__ neg(r15, s);
|
|
__ andr(r15, r15, 2 * wordSize - 1);
|
|
}
|
|
// r15 is the byte adjustment needed to align s.
|
|
__ cbz(r15, aligned);
|
|
int shift = exact_log2(granularity);
|
|
if (shift > 0) {
|
|
__ lsr(r15, r15, shift);
|
|
}
|
|
__ sub(count, count, r15);
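// r15 now holds the number of leading elements needed to bring s up to a
// 2-word boundary; they are copied by copy_memory_small below and have
// just been deducted from the main count.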
|
|
|
|
#if 0
|
|
// ?? This code is only correct for a disjoint copy. It may or
|
|
// may not make sense to use it in that case.
|
|
|
|
// Copy the first pair; s and d may not be aligned.
|
|
__ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
|
|
__ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
|
|
|
|
// Align s and d, adjust count
|
|
if (is_backwards) {
|
|
__ sub(s, s, r15);
|
|
__ sub(d, d, r15);
|
|
} else {
|
|
__ add(s, s, r15);
|
|
__ add(d, d, r15);
|
|
}
|
|
#else
|
|
copy_memory_small(decorators, type, s, d, r15, step);
|
|
#endif
|
|
}
|
|
|
|
__ bind(aligned);
|
|
|
|
// s is now 2-word-aligned.
|
|
|
|
// We have a count of units and some trailing bytes. Adjust the
|
|
// count and do a bulk copy of words. If the shift is zero
|
|
// perform a move instead to benefit from zero latency moves.
|
|
int shift = exact_log2(wordSize/granularity);
|
|
if (shift > 0) {
|
|
__ lsr(r15, count, shift);
|
|
} else {
|
|
__ mov(r15, count);
|
|
}
|
|
if (direction == copy_forwards) {
|
|
if (type != T_OBJECT) {
|
|
__ bl(StubRoutines::aarch64::copy_byte_f());
|
|
} else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
|
|
__ bl(StubRoutines::aarch64::copy_oop_uninit_f());
|
|
} else {
|
|
__ bl(StubRoutines::aarch64::copy_oop_f());
|
|
}
|
|
} else {
|
|
if (type != T_OBJECT) {
|
|
__ bl(StubRoutines::aarch64::copy_byte_b());
|
|
} else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
|
|
__ bl(StubRoutines::aarch64::copy_oop_uninit_b());
|
|
} else {
|
|
__ bl(StubRoutines::aarch64::copy_oop_b());
|
|
}
|
|
}
|
|
|
|
// And the tail.
|
|
copy_memory_small(decorators, type, s, d, count, step);
|
|
|
|
if (granularity >= 8) __ bind(copy8);
|
|
if (granularity >= 4) __ bind(copy4);
|
|
__ bind(finish);
|
|
}
|
|
|
|
|
|
void clobber_registers() {
|
|
#ifdef ASSERT
|
|
RegSet clobbered
|
|
= MacroAssembler::call_clobbered_gp_registers() - rscratch1;
|
|
__ mov(rscratch1, (uint64_t)0xdeadbeef);
|
|
__ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
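// the orr with a 32-bit-shifted copy of 0xdeadbeef yields the 64-bit
// poison pattern 0xdeadbeefdeadbeef written to each clobbered register.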
|
|
for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
|
|
__ mov(*it, rscratch1);
|
|
}
|
|
#endif
|
|
|
|
}
|
|
|
|
// Scan over array at a for count oops, verifying each one.
|
|
// Preserves a and count, clobbers rscratch1 and rscratch2.
|
|
void verify_oop_array (int size, Register a, Register count, Register temp) {
|
|
Label loop, end;
|
|
__ mov(rscratch1, a);
|
|
__ mov(rscratch2, zr);
|
|
__ bind(loop);
|
|
__ cmp(rscratch2, count);
|
|
__ br(Assembler::HS, end);
|
|
if (size == wordSize) {
|
|
__ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
|
|
__ verify_oop(temp);
|
|
} else {
|
|
__ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
|
|
__ decode_heap_oop(temp); // calls verify_oop
|
|
}
|
|
__ add(rscratch2, rscratch2, 1);
|
|
__ b(loop);
|
|
__ bind(end);
|
|
}
|
|
|
|
// Arguments:
|
|
// stub_id - is used to name the stub and identify all details of
|
|
// how to perform the copy.
|
|
//
|
|
// entry - is assigned to the stub's post push entry point unless
|
|
// it is null
|
|
//
|
|
// Inputs:
|
|
// c_rarg0 - source array address
|
|
// c_rarg1 - destination array address
|
|
// c_rarg2 - element count, treated as ssize_t, can be zero
|
|
//
|
|
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
|
|
// the hardware handle it. The two dwords within qwords that span
|
|
// cache line boundaries will still be loaded and stored atomically.
|
|
//
|
|
// Side Effects: nopush_entry is set to the (post push) entry point
|
|
// so it can be used by the corresponding conjoint
|
|
// copy method
|
|
//
|
|
address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
|
|
Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
|
|
RegSet saved_reg = RegSet::of(s, d, count);
|
|
int size;
|
|
bool aligned;
|
|
bool is_oop;
|
|
bool dest_uninitialized;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_jbyte_disjoint_arraycopy_id:
|
|
size = sizeof(jbyte);
|
|
aligned = false;
|
|
is_oop = false;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
|
|
size = sizeof(jbyte);
|
|
aligned = true;
|
|
is_oop = false;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_jshort_disjoint_arraycopy_id:
|
|
size = sizeof(jshort);
|
|
aligned = false;
|
|
is_oop = false;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
|
|
size = sizeof(jshort);
|
|
aligned = true;
|
|
is_oop = false;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_jint_disjoint_arraycopy_id:
|
|
size = sizeof(jint);
|
|
aligned = false;
|
|
is_oop = false;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
|
|
size = sizeof(jint);
|
|
aligned = true;
|
|
is_oop = false;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_jlong_disjoint_arraycopy_id:
|
|
// since this is always aligned we can (should!) use the same
|
|
// stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
|
|
ShouldNotReachHere();
|
|
break;
|
|
case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
|
|
size = sizeof(jlong);
|
|
aligned = true;
|
|
is_oop = false;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_oop_disjoint_arraycopy_id:
|
|
size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
|
|
aligned = !UseCompressedOops;
|
|
is_oop = true;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
|
|
size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
|
|
aligned = !UseCompressedOops;
|
|
is_oop = true;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
|
|
size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
|
|
aligned = !UseCompressedOops;
|
|
is_oop = true;
|
|
dest_uninitialized = true;
|
|
break;
|
|
case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
|
|
size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
|
|
aligned = !UseCompressedOops;
|
|
is_oop = true;
|
|
dest_uninitialized = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
break;
|
|
}
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
__ enter();
|
|
|
|
if (nopush_entry != nullptr) {
|
|
*nopush_entry = __ pc();
|
|
// caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
|
|
BLOCK_COMMENT("Entry:");
|
|
}
|
|
|
|
DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
|
|
if (dest_uninitialized) {
|
|
decorators |= IS_DEST_UNINITIALIZED;
|
|
}
|
|
if (aligned) {
|
|
decorators |= ARRAYCOPY_ALIGNED;
|
|
}
|
|
|
|
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
|
|
bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
|
|
|
|
if (is_oop) {
|
|
// save regs before copy_memory
|
|
__ push(RegSet::of(d, count), sp);
|
|
}
|
|
{
|
|
// UnsafeMemoryAccess page error: continue after unsafe access
|
|
bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
|
|
UnsafeMemoryAccessMark umam(this, add_entry, true);
|
|
copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
|
|
}
|
|
|
|
if (is_oop) {
|
|
__ pop(RegSet::of(d, count), sp);
|
|
if (VerifyOops)
|
|
verify_oop_array(size, d, count, r16);
|
|
}
|
|
|
|
bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
|
|
|
|
__ leave();
|
|
__ mov(r0, zr); // return 0
|
|
__ ret(lr);
|
|
return start;
|
|
}
|
|
|
|
// Arguments:
|
|
// stub_id - is used to name the stub and identify all details of
|
|
// how to perform the copy.
|
|
//
|
|
// nooverlap_target - identifies the (post push) entry for the
|
|
// corresponding disjoint copy routine which can be
|
|
// jumped to if the ranges do not actually overlap
|
|
//
|
|
// entry - is assigned to the stub's post push entry point unless
|
|
// it is null
|
|
//
|
|
//
|
|
// Inputs:
|
|
// c_rarg0 - source array address
|
|
// c_rarg1 - destination array address
|
|
// c_rarg2 - element count, treated as ssize_t, can be zero
|
|
//
|
|
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
|
|
// the hardware handle it. The two dwords within qwords that span
|
|
// cache line boundaries will still be loaded and stored atomically.
|
|
//
|
|
// Side Effects:
|
|
// nopush_entry is set to the no-overlap entry point so it can be
|
|
// used by some other conjoint copy method
|
|
//
|
|
address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
|
|
Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
|
|
RegSet saved_regs = RegSet::of(s, d, count);
|
|
int size;
|
|
bool aligned;
|
|
bool is_oop;
|
|
bool dest_uninitialized;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_jbyte_arraycopy_id:
|
|
size = sizeof(jbyte);
|
|
aligned = false;
|
|
is_oop = false;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_jbyte_arraycopy_id:
|
|
size = sizeof(jbyte);
|
|
aligned = true;
|
|
is_oop = false;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_jshort_arraycopy_id:
|
|
size = sizeof(jshort);
|
|
aligned = false;
|
|
is_oop = false;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_jshort_arraycopy_id:
|
|
size = sizeof(jshort);
|
|
aligned = true;
|
|
is_oop = false;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_jint_arraycopy_id:
|
|
size = sizeof(jint);
|
|
aligned = false;
|
|
is_oop = false;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_jint_arraycopy_id:
|
|
size = sizeof(jint);
|
|
aligned = true;
|
|
is_oop = false;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_jlong_arraycopy_id:
|
|
// since this is always aligned we can (should!) use the same
|
|
// stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
|
|
ShouldNotReachHere();
|
|
break;
|
|
case StubId::stubgen_arrayof_jlong_arraycopy_id:
|
|
size = sizeof(jlong);
|
|
aligned = true;
|
|
is_oop = false;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_oop_arraycopy_id:
|
|
size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
|
|
aligned = !UseCompressedOops;
|
|
is_oop = true;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_oop_arraycopy_id:
|
|
size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
|
|
aligned = !UseCompressedOops;
|
|
is_oop = true;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_oop_arraycopy_uninit_id:
|
|
size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
|
|
aligned = !UseCompressedOops;
|
|
is_oop = true;
|
|
dest_uninitialized = true;
|
|
break;
|
|
case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
|
|
size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
|
|
aligned = !UseCompressedOops;
|
|
is_oop = true;
|
|
dest_uninitialized = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
__ enter();
|
|
|
|
if (nopush_entry != nullptr) {
|
|
*nopush_entry = __ pc();
|
|
// caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
|
|
BLOCK_COMMENT("Entry:");
|
|
}
|
|
|
|
// use fwd copy when (d-s) above_equal (count*size)
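    // Example (jint copy, size == 4): s == 0x1000, d == 0x1008, count == 4.
    // d - s == 8 is below count << 2 == 16, so the ranges overlap and we
    // copy backwards here. If d - s had been >= 16 (or had wrapped to a
    // huge unsigned value because d is below s) the branch to the disjoint
    // forward stub would be taken instead.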
|
|
Label L_overlapping;
|
|
__ sub(rscratch1, d, s);
|
|
__ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
|
|
__ br(Assembler::LO, L_overlapping);
|
|
__ b(RuntimeAddress(nooverlap_target));
|
|
__ bind(L_overlapping);
|
|
|
|
DecoratorSet decorators = IN_HEAP | IS_ARRAY;
|
|
if (dest_uninitialized) {
|
|
decorators |= IS_DEST_UNINITIALIZED;
|
|
}
|
|
if (aligned) {
|
|
decorators |= ARRAYCOPY_ALIGNED;
|
|
}
|
|
|
|
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
|
|
bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
|
|
|
|
if (is_oop) {
|
|
// save regs before copy_memory
|
|
__ push(RegSet::of(d, count), sp);
|
|
}
|
|
{
|
|
// UnsafeMemoryAccess page error: continue after unsafe access
|
|
bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
|
|
UnsafeMemoryAccessMark umam(this, add_entry, true);
|
|
copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
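      // The negated element size passed above makes copy_memory copy from
      // high to low addresses, which is what keeps the overlapping case
      // correct.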
|
|
}
|
|
if (is_oop) {
|
|
__ pop(RegSet::of(d, count), sp);
|
|
if (VerifyOops)
|
|
verify_oop_array(size, d, count, r16);
|
|
}
|
|
bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
|
|
__ leave();
|
|
__ mov(r0, zr); // return 0
|
|
__ ret(lr);
|
|
return start;
|
|
}
|
|
|
|
// Helper for generating a dynamic type check.
|
|
// Smashes rscratch1, rscratch2.
|
|
void generate_type_check(Register sub_klass,
|
|
Register super_check_offset,
|
|
Register super_klass,
|
|
Register temp1,
|
|
Register temp2,
|
|
Register result,
|
|
Label& L_success) {
|
|
assert_different_registers(sub_klass, super_check_offset, super_klass);
|
|
|
|
BLOCK_COMMENT("type_check:");
|
|
|
|
Label L_miss;
|
|
|
|
__ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
|
|
super_check_offset);
|
|
__ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
|
|
|
|
// Fall through on failure!
|
|
__ BIND(L_miss);
|
|
}
|
|
|
|
//
|
|
// Generate checkcasting array copy stub
|
|
//
|
|
// Input:
|
|
// c_rarg0 - source array address
|
|
// c_rarg1 - destination array address
|
|
// c_rarg2 - element count, treated as ssize_t, can be zero
|
|
// c_rarg3 - size_t ckoff (super_check_offset)
|
|
// c_rarg4 - oop ckval (super_klass)
|
|
//
|
|
// Output:
|
|
// r0 == 0 - success
|
|
// r0 == -1^K - failure, where K is partial transfer count
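  //   e.g. if the 4th element fails the subtype check, the first three were
  //   already stored, so K == 3 and the stub returns r0 == ~3 == -4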
|
|
//
|
|
address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
|
|
bool dest_uninitialized;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_checkcast_arraycopy_id:
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_checkcast_arraycopy_uninit_id:
|
|
dest_uninitialized = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
|
|
|
|
// Input registers (after setup_arg_regs)
|
|
const Register from = c_rarg0; // source array address
|
|
const Register to = c_rarg1; // destination array address
|
|
    const Register count = c_rarg2; // elements count
const Register ckoff = c_rarg3; // super_check_offset
|
|
const Register ckval = c_rarg4; // super_klass
|
|
|
|
RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
|
|
RegSet wb_post_saved_regs = RegSet::of(count);
|
|
|
|
// Registers used as temps (r19, r20, r21, r22 are save-on-entry)
|
|
const Register copied_oop = r22; // actual oop copied
|
|
    const Register count_save = r21; // original elements count
const Register start_to = r20; // destination array start address
|
|
const Register r19_klass = r19; // oop._klass
|
|
|
|
// Registers used as gc temps (r5, r6, r7 are save-on-call)
|
|
const Register gct1 = r5, gct2 = r6, gct3 = r7;
|
|
|
|
//---------------------------------------------------------------
|
|
// Assembler stub will be used for this call to arraycopy
|
|
// if the two arrays are subtypes of Object[] but the
|
|
// destination array type is not equal to or a supertype
|
|
// of the source type. Each element must be separately
|
|
// checked.
|
|
|
|
assert_different_registers(from, to, count, ckoff, ckval, start_to,
|
|
copied_oop, r19_klass, count_save);
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
|
|
|
#ifdef ASSERT
|
|
// caller guarantees that the arrays really are different
|
|
// otherwise, we would have to make conjoint checks
|
|
{ Label L;
|
|
__ b(L); // conjoint check not yet implemented
|
|
__ stop("checkcast_copy within a single array");
|
|
__ bind(L);
|
|
}
|
|
#endif //ASSERT
|
|
|
|
// Caller of this entry point must set up the argument registers.
|
|
if (nopush_entry != nullptr) {
|
|
*nopush_entry = __ pc();
|
|
BLOCK_COMMENT("Entry:");
|
|
}
|
|
|
|
// Empty array: Nothing to do.
|
|
__ cbz(count, L_done);
|
|
__ push(RegSet::of(r19, r20, r21, r22), sp);
|
|
|
|
#ifdef ASSERT
|
|
BLOCK_COMMENT("assert consistent ckoff/ckval");
|
|
// The ckoff and ckval must be mutually consistent,
|
|
// even though caller generates both.
|
|
{ Label L;
|
|
int sco_offset = in_bytes(Klass::super_check_offset_offset());
|
|
__ ldrw(start_to, Address(ckval, sco_offset));
|
|
__ cmpw(ckoff, start_to);
|
|
__ br(Assembler::EQ, L);
|
|
__ stop("super_check_offset inconsistent");
|
|
__ bind(L);
|
|
}
|
|
#endif //ASSERT
|
|
|
|
DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
|
|
bool is_oop = true;
|
|
int element_size = UseCompressedOops ? 4 : 8;
|
|
if (dest_uninitialized) {
|
|
decorators |= IS_DEST_UNINITIALIZED;
|
|
}
|
|
|
|
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
|
|
bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
|
|
|
|
// save the original count
|
|
__ mov(count_save, count);
|
|
|
|
// Copy from low to high addresses
|
|
__ mov(start_to, to); // Save destination array start address
|
|
__ b(L_load_element);
|
|
|
|
// ======== begin loop ========
|
|
// (Loop is rotated; its entry is L_load_element.)
|
|
// Loop control:
|
|
// for (; count != 0; count--) {
|
|
// copied_oop = load_heap_oop(from++);
|
|
// ... generate_type_check ...;
|
|
// store_heap_oop(to++, copied_oop);
|
|
// }
|
|
__ align(OptoLoopAlignment);
|
|
|
|
__ BIND(L_store_element);
|
|
bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
|
|
__ post(to, element_size), copied_oop, noreg,
|
|
gct1, gct2, gct3);
|
|
__ sub(count, count, 1);
|
|
__ cbz(count, L_do_card_marks);
|
|
|
|
// ======== loop entry is here ========
|
|
__ BIND(L_load_element);
|
|
bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
|
|
copied_oop, noreg, __ post(from, element_size),
|
|
gct1);
|
|
__ cbz(copied_oop, L_store_element);
|
|
|
|
__ load_klass(r19_klass, copied_oop);// query the object klass
|
|
|
|
BLOCK_COMMENT("type_check:");
|
|
generate_type_check(/*sub_klass*/r19_klass,
|
|
/*super_check_offset*/ckoff,
|
|
/*super_klass*/ckval,
|
|
/*r_array_base*/gct1,
|
|
/*temp2*/gct2,
|
|
/*result*/r10, L_store_element);
|
|
|
|
// Fall through on failure!
|
|
|
|
// ======== end loop ========
|
|
|
|
// It was a real error; we must depend on the caller to finish the job.
|
|
    // Register count = remaining oops, count_save = total oops.
|
|
// Emit GC store barriers for the oops we have copied and report
|
|
// their number to the caller.
|
|
|
|
__ subs(count, count_save, count); // K = partially copied oop count
|
|
__ eon(count, count, zr); // report (-1^K) to caller
|
|
__ br(Assembler::EQ, L_done_pop);
|
|
|
|
__ BIND(L_do_card_marks);
|
|
bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
|
|
|
|
__ bind(L_done_pop);
|
|
__ pop(RegSet::of(r19, r20, r21, r22), sp);
|
|
inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
|
|
|
|
__ bind(L_done);
|
|
__ mov(r0, count);
|
|
__ leave();
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Perform range checks on the proposed arraycopy.
|
|
// Kills temp, but nothing else.
|
|
// Also, clean the sign bits of src_pos and dst_pos.
|
|
void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
|
|
Register src_pos, // source position (c_rarg1)
|
|
                              Register dst,     // destination array oop (c_rarg2)
Register dst_pos, // destination position (c_rarg3)
|
|
Register length,
|
|
Register temp,
|
|
Label& L_failed) {
|
|
BLOCK_COMMENT("arraycopy_range_checks:");
|
|
|
|
assert_different_registers(rscratch1, temp);
|
|
|
|
// if (src_pos + length > arrayOop(src)->length()) FAIL;
|
|
__ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
|
|
__ addw(temp, length, src_pos);
|
|
__ cmpw(temp, rscratch1);
|
|
__ br(Assembler::HI, L_failed);
|
|
|
|
// if (dst_pos + length > arrayOop(dst)->length()) FAIL;
|
|
__ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
|
|
__ addw(temp, length, dst_pos);
|
|
__ cmpw(temp, rscratch1);
|
|
__ br(Assembler::HI, L_failed);
|
|
|
|
// Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
|
|
__ movw(src_pos, src_pos);
|
|
__ movw(dst_pos, dst_pos);
|
|
|
|
BLOCK_COMMENT("arraycopy_range_checks done");
|
|
}
|
|
|
|
  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }
|
|
|
|
|
|
//
|
|
// Generate 'unsafe' array copy stub
|
|
// Though just as safe as the other stubs, it takes an unscaled
|
|
// size_t argument instead of an element count.
|
|
//
|
|
// Input:
|
|
// c_rarg0 - source array address
|
|
// c_rarg1 - destination array address
|
|
// c_rarg2 - byte count, treated as ssize_t, can be zero
|
|
//
|
|
// Examines the alignment of the operands and dispatches
|
|
// to a long, int, short, or byte copy loop.
|
|
//
|
|
address generate_unsafe_copy(address byte_copy_entry,
|
|
address short_copy_entry,
|
|
address int_copy_entry,
|
|
address long_copy_entry) {
|
|
StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
|
|
|
|
Label L_long_aligned, L_int_aligned, L_short_aligned;
|
|
Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
|
|
|
// bump this on entry, not on exit:
|
|
inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
|
|
|
|
__ orr(rscratch1, s, d);
|
|
__ orr(rscratch1, rscratch1, count);
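    // rscratch1 now holds (s | d | count); a low-order bit is clear in the
    // OR only if it is clear in all three values, so testing its low bits
    // below checks the common alignment. E.g. s == 0x1000, d == 0x2008,
    // count == 24 gives 0x3018, whose low three bits are zero, so the
    // 8-byte (long) copy path is taken.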
|
|
|
|
__ andr(rscratch1, rscratch1, BytesPerLong-1);
|
|
__ cbz(rscratch1, L_long_aligned);
|
|
__ andr(rscratch1, rscratch1, BytesPerInt-1);
|
|
__ cbz(rscratch1, L_int_aligned);
|
|
__ tbz(rscratch1, 0, L_short_aligned);
|
|
__ b(RuntimeAddress(byte_copy_entry));
|
|
|
|
__ BIND(L_short_aligned);
|
|
__ lsr(count, count, LogBytesPerShort); // size => short_count
|
|
__ b(RuntimeAddress(short_copy_entry));
|
|
__ BIND(L_int_aligned);
|
|
__ lsr(count, count, LogBytesPerInt); // size => int_count
|
|
__ b(RuntimeAddress(int_copy_entry));
|
|
__ BIND(L_long_aligned);
|
|
__ lsr(count, count, LogBytesPerLong); // size => long_count
|
|
__ b(RuntimeAddress(long_copy_entry));
|
|
|
|
return start;
|
|
}
|
|
|
|
//
|
|
// Generate generic array copy stubs
|
|
//
|
|
// Input:
|
|
// c_rarg0 - src oop
|
|
// c_rarg1 - src_pos (32-bits)
|
|
// c_rarg2 - dst oop
|
|
// c_rarg3 - dst_pos (32-bits)
|
|
// c_rarg4 - element count (32-bits)
|
|
//
|
|
// Output:
|
|
// r0 == 0 - success
|
|
// r0 == -1^K - failure, where K is partial transfer count
|
|
//
|
|
address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
|
|
address int_copy_entry, address oop_copy_entry,
|
|
address long_copy_entry, address checkcast_copy_entry) {
|
|
StubId stub_id = StubId::stubgen_generic_arraycopy_id;
|
|
|
|
Label L_failed, L_objArray;
|
|
Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
|
|
|
|
// Input registers
|
|
const Register src = c_rarg0; // source array oop
|
|
const Register src_pos = c_rarg1; // source position
|
|
const Register dst = c_rarg2; // destination array oop
|
|
const Register dst_pos = c_rarg3; // destination position
|
|
const Register length = c_rarg4;
|
|
|
|
|
|
// Registers used as temps
|
|
const Register dst_klass = c_rarg5;
|
|
|
|
__ align(CodeEntryAlignment);
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
address start = __ pc();
|
|
|
|
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
|
|
|
// bump this on entry, not on exit:
|
|
inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
|
|
|
|
//-----------------------------------------------------------------------
|
|
// Assembler stub will be used for this call to arraycopy
|
|
// if the following conditions are met:
|
|
//
|
|
// (1) src and dst must not be null.
|
|
// (2) src_pos must not be negative.
|
|
// (3) dst_pos must not be negative.
|
|
// (4) length must not be negative.
|
|
// (5) src klass and dst klass should be the same and not null.
|
|
// (6) src and dst should be arrays.
|
|
// (7) src_pos + length must not exceed length of src.
|
|
// (8) dst_pos + length must not exceed length of dst.
|
|
//
|
|
|
|
// if (src == nullptr) return -1;
|
|
__ cbz(src, L_failed);
|
|
|
|
// if (src_pos < 0) return -1;
|
|
__ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
|
|
|
|
// if (dst == nullptr) return -1;
|
|
__ cbz(dst, L_failed);
|
|
|
|
// if (dst_pos < 0) return -1;
|
|
__ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
|
|
|
|
// registers used as temp
|
|
const Register scratch_length = r16; // elements count to copy
|
|
const Register scratch_src_klass = r17; // array klass
|
|
const Register lh = r15; // layout helper
|
|
|
|
// if (length < 0) return -1;
|
|
__ movw(scratch_length, length); // length (elements count, 32-bits value)
|
|
__ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
|
|
|
|
__ load_klass(scratch_src_klass, src);
|
|
#ifdef ASSERT
|
|
// assert(src->klass() != nullptr);
|
|
{
|
|
BLOCK_COMMENT("assert klasses not null {");
|
|
Label L1, L2;
|
|
__ cbnz(scratch_src_klass, L2); // it is broken if klass is null
|
|
__ bind(L1);
|
|
__ stop("broken null klass");
|
|
__ bind(L2);
|
|
__ load_klass(rscratch1, dst);
|
|
__ cbz(rscratch1, L1); // this would be broken also
|
|
BLOCK_COMMENT("} assert klasses not null done");
|
|
}
|
|
#endif
|
|
|
|
// Load layout helper (32-bits)
|
|
//
|
|
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
|
|
//
|
|
// array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
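    //   e.g. for a jint[] the tag field holds the typeArray tag (0x3), the
    //   element_type field holds T_INT and log2_element_size is 2, so the
    //   code below scales the positions by 4 and jumps to the int copy loop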
|
|
//
|
|
|
|
const int lh_offset = in_bytes(Klass::layout_helper_offset());
|
|
|
|
// Handle objArrays completely differently...
|
|
const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
|
|
__ ldrw(lh, Address(scratch_src_klass, lh_offset));
|
|
__ movw(rscratch1, objArray_lh);
|
|
__ eorw(rscratch2, lh, rscratch1);
|
|
__ cbzw(rscratch2, L_objArray);
|
|
|
|
// if (src->klass() != dst->klass()) return -1;
|
|
__ load_klass(rscratch2, dst);
|
|
__ eor(rscratch2, rscratch2, scratch_src_klass);
|
|
__ cbnz(rscratch2, L_failed);
|
|
|
|
// if (!src->is_Array()) return -1;
|
|
__ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
|
|
|
|
// At this point, it is known to be a typeArray (array_tag 0x3).
|
|
#ifdef ASSERT
|
|
{
|
|
BLOCK_COMMENT("assert primitive array {");
|
|
Label L;
|
|
__ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
|
|
__ cmpw(lh, rscratch2);
|
|
__ br(Assembler::GE, L);
|
|
__ stop("must be a primitive array");
|
|
__ bind(L);
|
|
BLOCK_COMMENT("} assert primitive array done");
|
|
}
|
|
#endif
|
|
|
|
arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
|
|
rscratch2, L_failed);
|
|
|
|
// TypeArrayKlass
|
|
//
|
|
// src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
|
|
// dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
|
|
//
|
|
|
|
const Register rscratch1_offset = rscratch1; // array offset
|
|
const Register r15_elsize = lh; // element size
|
|
|
|
__ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
|
|
exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
|
|
__ add(src, src, rscratch1_offset); // src array offset
|
|
__ add(dst, dst, rscratch1_offset); // dst array offset
|
|
BLOCK_COMMENT("choose copy loop based on element size");
|
|
|
|
// next registers should be set before the jump to corresponding stub
|
|
const Register from = c_rarg0; // source array address
|
|
const Register to = c_rarg1; // destination array address
|
|
const Register count = c_rarg2; // elements count
|
|
|
|
// 'from', 'to', 'count' registers should be set in such order
|
|
// since they are the same as 'src', 'src_pos', 'dst'.
|
|
|
|
assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
|
|
|
|
// The possible values of elsize are 0-3, i.e. exact_log2(element
|
|
// size in bytes). We do a simple bitwise binary search.
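    // E.g. jint elements have elsize == 2 (0b10): the first tbnz (bit 1)
    // branches to L_copy_ints and the second tbnz (bit 0) falls through to
    // the 4-byte scaled lea; jlong (elsize == 3) has both bits set and ends
    // up at L_copy_longs.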
|
|
__ BIND(L_copy_bytes);
|
|
__ tbnz(r15_elsize, 1, L_copy_ints);
|
|
__ tbnz(r15_elsize, 0, L_copy_shorts);
|
|
__ lea(from, Address(src, src_pos));// src_addr
|
|
__ lea(to, Address(dst, dst_pos));// dst_addr
|
|
__ movw(count, scratch_length); // length
|
|
__ b(RuntimeAddress(byte_copy_entry));
|
|
|
|
__ BIND(L_copy_shorts);
|
|
__ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
|
|
__ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr
|
|
__ movw(count, scratch_length); // length
|
|
__ b(RuntimeAddress(short_copy_entry));
|
|
|
|
__ BIND(L_copy_ints);
|
|
__ tbnz(r15_elsize, 0, L_copy_longs);
|
|
__ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
|
|
__ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr
|
|
__ movw(count, scratch_length); // length
|
|
__ b(RuntimeAddress(int_copy_entry));
|
|
|
|
__ BIND(L_copy_longs);
|
|
#ifdef ASSERT
|
|
{
|
|
BLOCK_COMMENT("assert long copy {");
|
|
Label L;
|
|
__ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
|
|
__ cmpw(r15_elsize, LogBytesPerLong);
|
|
__ br(Assembler::EQ, L);
|
|
__ stop("must be long copy, but elsize is wrong");
|
|
__ bind(L);
|
|
BLOCK_COMMENT("} assert long copy done");
|
|
}
|
|
#endif
|
|
__ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
|
|
__ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr
|
|
__ movw(count, scratch_length); // length
|
|
__ b(RuntimeAddress(long_copy_entry));
|
|
|
|
// ObjArrayKlass
|
|
__ BIND(L_objArray);
|
|
// live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
|
|
|
|
Label L_plain_copy, L_checkcast_copy;
|
|
// test array classes for subtyping
|
|
__ load_klass(r15, dst);
|
|
__ cmp(scratch_src_klass, r15); // usual case is exact equality
|
|
__ br(Assembler::NE, L_checkcast_copy);
|
|
|
|
// Identically typed arrays can be copied without element-wise checks.
|
|
arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
|
|
rscratch2, L_failed);
|
|
|
|
__ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
|
|
__ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
|
|
__ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
|
|
__ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
|
|
__ movw(count, scratch_length); // length
|
|
__ BIND(L_plain_copy);
|
|
__ b(RuntimeAddress(oop_copy_entry));
|
|
|
|
__ BIND(L_checkcast_copy);
|
|
// live at this point: scratch_src_klass, scratch_length, r15 (dst_klass)
|
|
{
|
|
// Before looking at dst.length, make sure dst is also an objArray.
|
|
__ ldrw(rscratch1, Address(r15, lh_offset));
|
|
__ movw(rscratch2, objArray_lh);
|
|
__ eorw(rscratch1, rscratch1, rscratch2);
|
|
__ cbnzw(rscratch1, L_failed);
|
|
|
|
// It is safe to examine both src.length and dst.length.
|
|
arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
|
|
r15, L_failed);
|
|
|
|
__ load_klass(dst_klass, dst); // reload
|
|
|
|
// Marshal the base address arguments now, freeing registers.
|
|
__ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
|
|
__ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
|
|
__ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
|
|
__ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
|
|
__ movw(count, length); // length (reloaded)
|
|
Register sco_temp = c_rarg3; // this register is free now
|
|
assert_different_registers(from, to, count, sco_temp,
|
|
dst_klass, scratch_src_klass);
|
|
// assert_clean_int(count, sco_temp);
|
|
|
|
// Generate the type check.
|
|
const int sco_offset = in_bytes(Klass::super_check_offset_offset());
|
|
__ ldrw(sco_temp, Address(dst_klass, sco_offset));
|
|
|
|
// Smashes rscratch1, rscratch2
|
|
generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
|
|
L_plain_copy);
|
|
|
|
// Fetch destination element klass from the ObjArrayKlass header.
|
|
int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
|
|
__ ldr(dst_klass, Address(dst_klass, ek_offset));
|
|
__ ldrw(sco_temp, Address(dst_klass, sco_offset));
|
|
|
|
// the checkcast_copy loop needs two extra arguments:
|
|
assert(c_rarg3 == sco_temp, "#3 already in place");
|
|
// Set up arguments for checkcast_copy_entry.
|
|
__ mov(c_rarg4, dst_klass); // dst.klass.element_klass
|
|
__ b(RuntimeAddress(checkcast_copy_entry));
|
|
}
|
|
|
|
__ BIND(L_failed);
|
|
__ mov(r0, -1);
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
//
|
|
// Generate stub for array fill. If "aligned" is true, the
|
|
// "to" address is assumed to be heapword aligned.
|
|
//
|
|
// Arguments for generated stub:
|
|
// to: c_rarg0
|
|
// value: c_rarg1
|
|
// count: c_rarg2 treated as signed
|
|
//
|
|
address generate_fill(StubId stub_id) {
|
|
BasicType t;
|
|
bool aligned;
|
|
|
|
switch (stub_id) {
|
|
case StubId::stubgen_jbyte_fill_id:
|
|
t = T_BYTE;
|
|
aligned = false;
|
|
break;
|
|
case StubId::stubgen_jshort_fill_id:
|
|
t = T_SHORT;
|
|
aligned = false;
|
|
break;
|
|
case StubId::stubgen_jint_fill_id:
|
|
t = T_INT;
|
|
aligned = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_jbyte_fill_id:
|
|
t = T_BYTE;
|
|
aligned = true;
|
|
break;
|
|
case StubId::stubgen_arrayof_jshort_fill_id:
|
|
t = T_SHORT;
|
|
aligned = true;
|
|
break;
|
|
case StubId::stubgen_arrayof_jint_fill_id:
|
|
t = T_INT;
|
|
aligned = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
};
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
BLOCK_COMMENT("Entry:");
|
|
|
|
    const Register to = c_rarg0; // destination array address
|
|
const Register value = c_rarg1; // value
|
|
const Register count = c_rarg2; // elements count
|
|
|
|
const Register bz_base = r10; // base for block_zero routine
|
|
const Register cnt_words = r11; // temp register
|
|
|
|
__ enter();
|
|
|
|
Label L_fill_elements, L_exit1;
|
|
|
|
int shift = -1;
|
|
switch (t) {
|
|
case T_BYTE:
|
|
shift = 0;
|
|
__ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
|
|
__ bfi(value, value, 8, 8); // 8 bit -> 16 bit
|
|
__ bfi(value, value, 16, 16); // 16 bit -> 32 bit
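      // value now holds the fill byte replicated into the low four byte
      // lanes, e.g. 0x5A becomes 0x5A5A5A5A; the later bfi(value, value, 32, 32)
      // widens this to all eight bytes so whole 64-bit words can be stored.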
|
|
__ br(Assembler::LO, L_fill_elements);
|
|
break;
|
|
case T_SHORT:
|
|
shift = 1;
|
|
__ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
|
|
__ bfi(value, value, 16, 16); // 16 bit -> 32 bit
|
|
__ br(Assembler::LO, L_fill_elements);
|
|
break;
|
|
case T_INT:
|
|
shift = 2;
|
|
__ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
|
|
__ br(Assembler::LO, L_fill_elements);
|
|
break;
|
|
default: ShouldNotReachHere();
|
|
}
|
|
|
|
// Align source address at 8 bytes address boundary.
|
|
Label L_skip_align1, L_skip_align2, L_skip_align4;
|
|
if (!aligned) {
|
|
switch (t) {
|
|
case T_BYTE:
|
|
// One byte misalignment happens only for byte arrays.
|
|
__ tbz(to, 0, L_skip_align1);
|
|
__ strb(value, Address(__ post(to, 1)));
|
|
__ subw(count, count, 1);
|
|
__ bind(L_skip_align1);
|
|
// Fallthrough
|
|
case T_SHORT:
|
|
// Two bytes misalignment happens only for byte and short (char) arrays.
|
|
__ tbz(to, 1, L_skip_align2);
|
|
__ strh(value, Address(__ post(to, 2)));
|
|
__ subw(count, count, 2 >> shift);
|
|
__ bind(L_skip_align2);
|
|
// Fallthrough
|
|
case T_INT:
|
|
// Align to 8 bytes, we know we are 4 byte aligned to start.
|
|
__ tbz(to, 2, L_skip_align4);
|
|
__ strw(value, Address(__ post(to, 4)));
|
|
__ subw(count, count, 4 >> shift);
|
|
__ bind(L_skip_align4);
|
|
break;
|
|
default: ShouldNotReachHere();
|
|
}
|
|
}
|
|
|
|
//
|
|
// Fill large chunks
|
|
//
|
|
__ lsrw(cnt_words, count, 3 - shift); // number of words
|
|
__ bfi(value, value, 32, 32); // 32 bit -> 64 bit
|
|
__ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
|
|
if (UseBlockZeroing) {
|
|
Label non_block_zeroing, rest;
|
|
// If the fill value is zero we can use the fast zero_words().
|
|
__ cbnz(value, non_block_zeroing);
|
|
__ mov(bz_base, to);
|
|
__ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
|
|
address tpc = __ zero_words(bz_base, cnt_words);
|
|
if (tpc == nullptr) {
|
|
fatal("CodeCache is full at generate_fill");
|
|
}
|
|
__ b(rest);
|
|
__ bind(non_block_zeroing);
|
|
__ fill_words(to, cnt_words, value);
|
|
__ bind(rest);
|
|
} else {
|
|
__ fill_words(to, cnt_words, value);
|
|
}
|
|
|
|
// Remaining count is less than 8 bytes. Fill it by a single store.
|
|
// Note that the total length is no less than 8 bytes.
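    // E.g. a 13 byte aligned T_BYTE fill stores one 8-byte word above and
    // leaves count == 5 here; 'to' is advanced to the end of the region and
    // the str below rewrites bytes 5..12, harmlessly re-storing bytes 5..7
    // with the same value.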
|
|
if (t == T_BYTE || t == T_SHORT) {
|
|
Label L_exit1;
|
|
__ cbzw(count, L_exit1);
|
|
__ add(to, to, count, Assembler::LSL, shift); // points to the end
|
|
__ str(value, Address(to, -8)); // overwrite some elements
|
|
__ bind(L_exit1);
|
|
__ leave();
|
|
__ ret(lr);
|
|
}
|
|
|
|
// Handle copies less than 8 bytes.
|
|
Label L_fill_2, L_fill_4, L_exit2;
|
|
__ bind(L_fill_elements);
|
|
switch (t) {
|
|
case T_BYTE:
|
|
__ tbz(count, 0, L_fill_2);
|
|
__ strb(value, Address(__ post(to, 1)));
|
|
__ bind(L_fill_2);
|
|
__ tbz(count, 1, L_fill_4);
|
|
__ strh(value, Address(__ post(to, 2)));
|
|
__ bind(L_fill_4);
|
|
__ tbz(count, 2, L_exit2);
|
|
__ strw(value, Address(to));
|
|
break;
|
|
case T_SHORT:
|
|
__ tbz(count, 0, L_fill_4);
|
|
__ strh(value, Address(__ post(to, 2)));
|
|
__ bind(L_fill_4);
|
|
__ tbz(count, 1, L_exit2);
|
|
__ strw(value, Address(to));
|
|
break;
|
|
case T_INT:
|
|
__ cbzw(count, L_exit2);
|
|
__ strw(value, Address(to));
|
|
break;
|
|
default: ShouldNotReachHere();
|
|
}
|
|
__ bind(L_exit2);
|
|
__ leave();
|
|
__ ret(lr);
|
|
return start;
|
|
}
|
|
|
|
  address generate_unsafecopy_common_error_exit() {
    address start_pc = __ pc();
    __ leave();
    __ mov(r0, 0);
    __ ret(lr);
    return start_pc;
  }
|
|
|
|
//
|
|
// Generate 'unsafe' set memory stub
|
|
// Though just as safe as the other stubs, it takes an unscaled
|
|
// size_t (# bytes) argument instead of an element count.
|
|
//
|
|
// This fill operation is atomicity preserving: as long as the
|
|
// address supplied is sufficiently aligned, all writes of up to 64
|
|
// bits in size are single-copy atomic.
|
|
//
|
|
// Input:
|
|
// c_rarg0 - destination array address
|
|
// c_rarg1 - byte count (size_t)
|
|
// c_rarg2 - byte value
|
|
//
|
|
address generate_unsafe_setmemory() {
|
|
__ align(CodeEntryAlignment);
|
|
StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
|
|
address start = __ pc();
|
|
|
|
Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
|
|
Label tail;
|
|
|
|
UnsafeMemoryAccessMark umam(this, true, false);
|
|
|
|
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
|
|
|
__ dup(v0, __ T16B, value);
|
|
|
|
if (AvoidUnalignedAccesses) {
|
|
__ cmp(count, (u1)16);
|
|
__ br(__ LO, tail);
|
|
|
|
__ mov(rscratch1, 16);
|
|
__ andr(rscratch2, dest, 15);
|
|
__ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest
|
|
__ strq(v0, Address(dest));
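      // The unaligned strq above covers the first 16 bytes of the region;
      // advancing dest by rscratch1 (16 minus the misalignment) lands on the
      // next 16-byte boundary, and the aligned stores below simply re-store
      // any overlapping bytes with the same value.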
|
|
__ sub(count, count, rscratch1);
|
|
__ add(dest, dest, rscratch1);
|
|
}
|
|
|
|
__ subs(count, count, (u1)64);
|
|
__ br(__ LO, tail);
|
|
{
|
|
Label again;
|
|
__ bind(again);
|
|
__ stpq(v0, v0, Address(dest));
|
|
__ stpq(v0, v0, Address(dest, 32));
|
|
|
|
__ subs(count, count, 64);
|
|
__ add(dest, dest, 64);
|
|
__ br(__ HS, again);
|
|
}
|
|
|
|
__ bind(tail);
|
|
// The count of bytes is off by 64, but we don't need to correct
|
|
// it because we're only going to use the least-significant few
|
|
// count bits from here on.
|
|
// __ add(count, count, 64);
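    // E.g. with 36 bytes remaining, count == 36 - 64 == -28, whose low six
    // bits are 0b100100, so the tbz tests below store 32 + 4 bytes and skip
    // the other sizes.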
|
|
|
|
{
|
|
Label dont;
|
|
__ tbz(count, exact_log2(32), dont);
|
|
__ stpq(v0, v0, __ post(dest, 32));
|
|
__ bind(dont);
|
|
}
|
|
{
|
|
Label dont;
|
|
__ tbz(count, exact_log2(16), dont);
|
|
__ strq(v0, __ post(dest, 16));
|
|
__ bind(dont);
|
|
}
|
|
{
|
|
Label dont;
|
|
__ tbz(count, exact_log2(8), dont);
|
|
__ strd(v0, __ post(dest, 8));
|
|
__ bind(dont);
|
|
}
|
|
|
|
Label finished;
|
|
__ tst(count, 7);
|
|
__ br(__ EQ, finished);
|
|
|
|
{
|
|
Label dont;
|
|
__ tbz(count, exact_log2(4), dont);
|
|
__ strs(v0, __ post(dest, 4));
|
|
__ bind(dont);
|
|
}
|
|
{
|
|
Label dont;
|
|
__ tbz(count, exact_log2(2), dont);
|
|
__ bfi(value, value, 8, 8);
|
|
__ strh(value, __ post(dest, 2));
|
|
__ bind(dont);
|
|
}
|
|
{
|
|
Label dont;
|
|
__ tbz(count, exact_log2(1), dont);
|
|
__ strb(value, Address(dest));
|
|
__ bind(dont);
|
|
}
|
|
|
|
__ bind(finished);
|
|
__ leave();
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
address generate_data_cache_writeback() {
|
|
const Register line = c_rarg0; // address of line to write back
|
|
|
|
__ align(CodeEntryAlignment);
|
|
|
|
StubId stub_id = StubId::stubgen_data_cache_writeback_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
address start = __ pc();
|
|
__ enter();
|
|
__ cache_wb(Address(line, 0));
|
|
__ leave();
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
address generate_data_cache_writeback_sync() {
|
|
const Register is_pre = c_rarg0; // pre or post sync
|
|
|
|
__ align(CodeEntryAlignment);
|
|
|
|
StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
// pre wbsync is a no-op
|
|
// post wbsync translates to an sfence
|
|
|
|
Label skip;
|
|
address start = __ pc();
|
|
__ enter();
|
|
__ cbnz(is_pre, skip);
|
|
__ cache_wbsync(false);
|
|
__ bind(skip);
|
|
__ leave();
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
void generate_arraycopy_stubs() {
|
|
    // Some copy stubs publish a normal entry and then a 2nd 'fallback'
    // entry immediately following their stack push. This can be used
    // as a post-push branch target for compatible stubs when they
    // identify a special case that can be handled by the fallback
    // stub, e.g. a disjoint copy stub may be used as a special case
    // fallback for its compatible conjoint copy stub.
    //
    // A nopush entry is always returned in the following local and
    // then published by assigning to the appropriate entry field in
    // class StubRoutines. The entry value is then passed to the
    // generator for the compatible stub. That means the entry must be
    // listed when saving to/restoring from the AOT cache, ensuring
    // that the inter-stub jumps are noted at AOT-cache save and
    // relocated at AOT cache load.
|
|
address nopush_entry;
|
|
|
|
// generate the common exit first so later stubs can rely on it if
|
|
// they want an UnsafeMemoryAccess exit non-local to the stub
|
|
StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
|
|
// register the stub as the default exit with class UnsafeMemoryAccess
|
|
UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
|
|
|
|
    // generate and publish aarch64-specific bulk copy routines first
    // so we can call them from other copy stubs
|
|
StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
|
|
StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
|
|
|
|
StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
|
|
StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
|
|
|
|
StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
|
|
StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
|
|
|
|
StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
|
|
|
|
//*** jbyte
|
|
// Always need aligned and unaligned versions
|
|
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
|
|
// disjoint nopush entry is needed by conjoint copy
|
|
StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
|
|
StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
|
|
// conjoint nopush entry is needed by generic/unsafe copy
|
|
StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
|
|
StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
|
|
// disjoint arrayof nopush entry is needed by conjoint copy
|
|
StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
|
|
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
|
|
|
|
//*** jshort
|
|
// Always need aligned and unaligned versions
|
|
StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
|
|
// disjoint nopush entry is needed by conjoint copy
|
|
StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
|
|
StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
|
|
// conjoint nopush entry is used by generic/unsafe copy
|
|
StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
|
|
StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
|
|
// disjoint arrayof nopush entry is needed by conjoint copy
|
|
StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
|
|
StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
|
|
|
|
//*** jint
|
|
// Aligned versions
|
|
StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
|
|
// disjoint arrayof nopush entry is needed by conjoint copy
|
|
StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
|
|
StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
|
|
// In 64 bit we need both aligned and unaligned versions of jint arraycopy.
|
|
// jint_arraycopy_nopush always points to the unaligned version
|
|
StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
|
|
// disjoint nopush entry is needed by conjoint copy
|
|
StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
|
|
StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
|
|
// conjoint nopush entry is needed by generic/unsafe copy
|
|
StubRoutines::_jint_arraycopy_nopush = nopush_entry;
|
|
|
|
//*** jlong
|
|
// It is always aligned
|
|
StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
|
|
// disjoint arrayof nopush entry is needed by conjoint copy
|
|
StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
|
|
StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
|
|
// conjoint nopush entry is needed by generic/unsafe copy
|
|
StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
|
|
// disjoint normal/nopush and conjoint normal entries are not
|
|
// generated since the arrayof versions are the same
|
|
StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
|
|
StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
|
|
StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
|
|
|
|
//*** oops
|
|
{
|
|
StubRoutines::_arrayof_oop_disjoint_arraycopy
|
|
= generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
|
|
// disjoint arrayof nopush entry is needed by conjoint copy
|
|
StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
|
|
StubRoutines::_arrayof_oop_arraycopy
|
|
= generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
|
|
// conjoint arrayof nopush entry is needed by generic/unsafe copy
|
|
StubRoutines::_oop_arraycopy_nopush = nopush_entry;
|
|
// Aligned versions without pre-barriers
|
|
StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
|
|
= generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
|
|
// disjoint arrayof+uninit nopush entry is needed by conjoint copy
|
|
StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
|
|
// note that we don't need a returned nopush entry because the
|
|
// generic/unsafe copy does not cater for uninit arrays.
|
|
StubRoutines::_arrayof_oop_arraycopy_uninit
|
|
= generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
|
|
}
|
|
|
|
// for oop copies reuse arrayof entries for non-arrayof cases
|
|
StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
|
|
StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
|
|
StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
|
|
StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
|
|
StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
|
|
StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
|
|
|
|
StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
|
|
// checkcast nopush entry is needed by generic copy
|
|
StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
|
|
// note that we don't need a returned nopush entry because the
|
|
// generic copy does not cater for uninit arrays.
|
|
StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
|
|
|
|
// unsafe arraycopy may fallback on conjoint stubs
|
|
StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
|
|
StubRoutines::_jshort_arraycopy_nopush,
|
|
StubRoutines::_jint_arraycopy_nopush,
|
|
StubRoutines::_jlong_arraycopy_nopush);
|
|
|
|
// generic arraycopy may fallback on conjoint stubs
|
|
StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
|
|
StubRoutines::_jshort_arraycopy_nopush,
|
|
StubRoutines::_jint_arraycopy_nopush,
|
|
StubRoutines::_oop_arraycopy_nopush,
|
|
StubRoutines::_jlong_arraycopy_nopush,
|
|
StubRoutines::_checkcast_arraycopy_nopush);
|
|
|
|
StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
|
|
StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
|
|
StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
|
|
StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
|
|
StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
|
|
StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
|
|
}
|
|
|
|
void generate_math_stubs() { Unimplemented(); }
|
|
|
|
// Arguments:
|
|
//
|
|
// Inputs:
|
|
// c_rarg0 - source byte array address
|
|
// c_rarg1 - destination byte array address
|
|
// c_rarg2 - K (key) in little endian int array
|
|
//
|
|
address generate_aescrypt_encryptBlock() {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
const Register from = c_rarg0; // source array address
|
|
const Register to = c_rarg1; // destination array address
|
|
const Register key = c_rarg2; // key array address
|
|
const Register keylen = rscratch1;
|
|
|
|
address start = __ pc();
|
|
__ enter();
|
|
|
|
__ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
|
|
|
|
__ aesenc_loadkeys(key, keylen);
|
|
__ aesecb_encrypt(from, to, keylen);
|
|
|
|
__ mov(r0, 0);
|
|
|
|
__ leave();
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Arguments:
|
|
//
|
|
// Inputs:
|
|
// c_rarg0 - source byte array address
|
|
// c_rarg1 - destination byte array address
|
|
// c_rarg2 - K (key) in little endian int array
|
|
//
|
|
address generate_aescrypt_decryptBlock() {
|
|
assert(UseAES, "need AES cryptographic extension support");
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
Label L_doLast;
|
|
|
|
const Register from = c_rarg0; // source array address
|
|
const Register to = c_rarg1; // destination array address
|
|
const Register key = c_rarg2; // key array address
|
|
const Register keylen = rscratch1;
|
|
|
|
address start = __ pc();
|
|
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
|
|
|
__ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
|
|
|
|
__ aesecb_decrypt(from, to, key, keylen);
|
|
|
|
__ mov(r0, 0);
|
|
|
|
__ leave();
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Arguments:
|
|
//
|
|
// Inputs:
|
|
// c_rarg0 - source byte array address
|
|
// c_rarg1 - destination byte array address
|
|
// c_rarg2 - K (key) in little endian int array
|
|
// c_rarg3 - r vector byte array address
|
|
// c_rarg4 - input length
|
|
//
|
|
// Output:
|
|
// x0 - input length
|
|
//
|
|
address generate_cipherBlockChaining_encryptAESCrypt() {
|
|
assert(UseAES, "need AES cryptographic extension support");
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
|
|
|
|
const Register from = c_rarg0; // source array address
|
|
const Register to = c_rarg1; // destination array address
|
|
const Register key = c_rarg2; // key array address
|
|
const Register rvec = c_rarg3; // r byte array initialized from initvector array address
|
|
// and left with the results of the last encryption block
|
|
const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
|
|
const Register keylen = rscratch1;
|
|
|
|
address start = __ pc();
|
|
|
|
__ enter();
|
|
|
|
__ movw(rscratch2, len_reg);
|
|
|
|
__ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
|
|
|
|
__ ld1(v0, __ T16B, rvec);
|
|
|
|
__ cmpw(keylen, 52);
|
|
__ br(Assembler::CC, L_loadkeys_44);
|
|
__ br(Assembler::EQ, L_loadkeys_52);
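    // keylen is the expanded key length in ints: 44 for AES-128, 52 for
    // AES-192 and 60 for AES-256. The two branches above skip loading the
    // round keys that shorter keys do not have; matching branches inside
    // L_aes_loop skip the corresponding rounds.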
|
|
|
|
__ ld1(v17, v18, __ T16B, __ post(key, 32));
|
|
__ rev32(v17, __ T16B, v17);
|
|
__ rev32(v18, __ T16B, v18);
|
|
__ BIND(L_loadkeys_52);
|
|
__ ld1(v19, v20, __ T16B, __ post(key, 32));
|
|
__ rev32(v19, __ T16B, v19);
|
|
__ rev32(v20, __ T16B, v20);
|
|
__ BIND(L_loadkeys_44);
|
|
__ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
|
|
__ rev32(v21, __ T16B, v21);
|
|
__ rev32(v22, __ T16B, v22);
|
|
__ rev32(v23, __ T16B, v23);
|
|
__ rev32(v24, __ T16B, v24);
|
|
__ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
|
|
__ rev32(v25, __ T16B, v25);
|
|
__ rev32(v26, __ T16B, v26);
|
|
__ rev32(v27, __ T16B, v27);
|
|
__ rev32(v28, __ T16B, v28);
|
|
__ ld1(v29, v30, v31, __ T16B, key);
|
|
__ rev32(v29, __ T16B, v29);
|
|
__ rev32(v30, __ T16B, v30);
|
|
__ rev32(v31, __ T16B, v31);
|
|
|
|
__ BIND(L_aes_loop);
|
|
__ ld1(v1, __ T16B, __ post(from, 16));
|
|
__ eor(v0, __ T16B, v0, v1);
|
|
|
|
__ br(Assembler::CC, L_rounds_44);
|
|
__ br(Assembler::EQ, L_rounds_52);
|
|
|
|
__ aese(v0, v17); __ aesmc(v0, v0);
|
|
__ aese(v0, v18); __ aesmc(v0, v0);
|
|
__ BIND(L_rounds_52);
|
|
__ aese(v0, v19); __ aesmc(v0, v0);
|
|
__ aese(v0, v20); __ aesmc(v0, v0);
|
|
__ BIND(L_rounds_44);
|
|
__ aese(v0, v21); __ aesmc(v0, v0);
|
|
__ aese(v0, v22); __ aesmc(v0, v0);
|
|
__ aese(v0, v23); __ aesmc(v0, v0);
|
|
__ aese(v0, v24); __ aesmc(v0, v0);
|
|
__ aese(v0, v25); __ aesmc(v0, v0);
|
|
__ aese(v0, v26); __ aesmc(v0, v0);
|
|
__ aese(v0, v27); __ aesmc(v0, v0);
|
|
__ aese(v0, v28); __ aesmc(v0, v0);
|
|
__ aese(v0, v29); __ aesmc(v0, v0);
|
|
__ aese(v0, v30);
|
|
__ eor(v0, __ T16B, v0, v31);
|
|
|
|
__ st1(v0, __ T16B, __ post(to, 16));
|
|
|
|
__ subw(len_reg, len_reg, 16);
|
|
__ cbnzw(len_reg, L_aes_loop);
|
|
|
|
__ st1(v0, __ T16B, rvec);
|
|
|
|
__ mov(r0, rscratch2);
|
|
|
|
__ leave();
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Arguments:
|
|
//
|
|
// Inputs:
|
|
// c_rarg0 - source byte array address
|
|
// c_rarg1 - destination byte array address
|
|
// c_rarg2 - K (key) in little endian int array
|
|
// c_rarg3 - r vector byte array address
|
|
// c_rarg4 - input length
|
|
//
|
|
// Output:
|
|
// r0 - input length
|
|
//
|
|
address generate_cipherBlockChaining_decryptAESCrypt() {
|
|
assert(UseAES, "need AES cryptographic extension support");
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
|
|
|
|
const Register from = c_rarg0; // source array address
|
|
const Register to = c_rarg1; // destination array address
|
|
const Register key = c_rarg2; // key array address
|
|
const Register rvec = c_rarg3; // r byte array initialized from initvector array address
|
|
// and left with the results of the last encryption block
|
|
const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
|
|
const Register keylen = rscratch1;
|
|
|
|
address start = __ pc();
|
|
|
|
__ enter();
|
|
|
|
__ movw(rscratch2, len_reg);
|
|
|
|
__ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
|
|
|
|
__ ld1(v2, __ T16B, rvec);
|
|
|
|
__ ld1(v31, __ T16B, __ post(key, 16));
|
|
__ rev32(v31, __ T16B, v31);
|
|
|
|
__ cmpw(keylen, 52);
|
|
__ br(Assembler::CC, L_loadkeys_44);
|
|
__ br(Assembler::EQ, L_loadkeys_52);
|
|
|
|
__ ld1(v17, v18, __ T16B, __ post(key, 32));
|
|
__ rev32(v17, __ T16B, v17);
|
|
__ rev32(v18, __ T16B, v18);
|
|
__ BIND(L_loadkeys_52);
|
|
__ ld1(v19, v20, __ T16B, __ post(key, 32));
|
|
__ rev32(v19, __ T16B, v19);
|
|
__ rev32(v20, __ T16B, v20);
|
|
__ BIND(L_loadkeys_44);
|
|
__ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
|
|
__ rev32(v21, __ T16B, v21);
|
|
__ rev32(v22, __ T16B, v22);
|
|
__ rev32(v23, __ T16B, v23);
|
|
__ rev32(v24, __ T16B, v24);
|
|
__ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
|
|
__ rev32(v25, __ T16B, v25);
|
|
__ rev32(v26, __ T16B, v26);
|
|
__ rev32(v27, __ T16B, v27);
|
|
__ rev32(v28, __ T16B, v28);
|
|
__ ld1(v29, v30, __ T16B, key);
|
|
__ rev32(v29, __ T16B, v29);
|
|
__ rev32(v30, __ T16B, v30);
|
|
|
|
__ BIND(L_aes_loop);
|
|
__ ld1(v0, __ T16B, __ post(from, 16));
|
|
__ orr(v1, __ T16B, v0, v0);
|
|
|
|
__ br(Assembler::CC, L_rounds_44);
|
|
__ br(Assembler::EQ, L_rounds_52);
|
|
|
|
__ aesd(v0, v17); __ aesimc(v0, v0);
|
|
__ aesd(v0, v18); __ aesimc(v0, v0);
|
|
__ BIND(L_rounds_52);
|
|
__ aesd(v0, v19); __ aesimc(v0, v0);
|
|
__ aesd(v0, v20); __ aesimc(v0, v0);
|
|
__ BIND(L_rounds_44);
|
|
__ aesd(v0, v21); __ aesimc(v0, v0);
|
|
__ aesd(v0, v22); __ aesimc(v0, v0);
|
|
__ aesd(v0, v23); __ aesimc(v0, v0);
|
|
__ aesd(v0, v24); __ aesimc(v0, v0);
|
|
__ aesd(v0, v25); __ aesimc(v0, v0);
|
|
__ aesd(v0, v26); __ aesimc(v0, v0);
|
|
__ aesd(v0, v27); __ aesimc(v0, v0);
|
|
__ aesd(v0, v28); __ aesimc(v0, v0);
|
|
__ aesd(v0, v29); __ aesimc(v0, v0);
|
|
__ aesd(v0, v30);
|
|
__ eor(v0, __ T16B, v0, v31);
|
|
__ eor(v0, __ T16B, v0, v2);
|
|
|
|
__ st1(v0, __ T16B, __ post(to, 16));
|
|
__ orr(v2, __ T16B, v1, v1);
|
|
|
|
__ subw(len_reg, len_reg, 16);
|
|
__ cbnzw(len_reg, L_aes_loop);
|
|
|
|
__ st1(v2, __ T16B, rvec);
|
|
|
|
__ mov(r0, rscratch2);
|
|
|
|
__ leave();
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Big-endian 128-bit + 64-bit -> 128-bit addition.
|
|
// Inputs: 128-bits. in is preserved.
|
|
// The least-significant 64-bit word is in the upper dword of each vector.
|
|
// inc (the 64-bit increment) is preserved. Its lower dword must be zero.
|
|
// Output: result
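  //   e.g. if the least-significant word (kept in the upper dword) is all
  //   ones and inc holds 1, the addv wraps that word to zero, the cm(HI)
  //   compare yields an all-ones mask in that lane, the ext moves the mask
  //   across to the most-significant word's lane and the subv subtracts -1
  //   there, propagating the carry.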
|
|
void be_add_128_64(FloatRegister result, FloatRegister in,
|
|
FloatRegister inc, FloatRegister tmp) {
|
|
assert_different_registers(result, tmp, inc);
|
|
|
|
__ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of
|
|
// input
|
|
__ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
|
|
__ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and
|
|
// MSD == 0 (must be!) to LSD
|
|
__ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow
|
|
}
|
|
|
|
// CTR AES crypt.
|
|
// Arguments:
|
|
//
|
|
// Inputs:
|
|
// c_rarg0 - source byte array address
|
|
// c_rarg1 - destination byte array address
|
|
// c_rarg2 - K (key) in little endian int array
|
|
// c_rarg3 - counter vector byte array address
|
|
// c_rarg4 - input length
|
|
// c_rarg5 - saved encryptedCounter start
|
|
// c_rarg6 - saved used length
|
|
//
|
|
// Output:
|
|
// r0 - input length
|
|
//
|
|
address generate_counterMode_AESCrypt() {
|
|
const Register in = c_rarg0;
|
|
const Register out = c_rarg1;
|
|
const Register key = c_rarg2;
|
|
const Register counter = c_rarg3;
|
|
const Register saved_len = c_rarg4, len = r10;
|
|
const Register saved_encrypted_ctr = c_rarg5;
|
|
const Register used_ptr = c_rarg6, used = r12;
|
|
|
|
const Register offset = r7;
|
|
const Register keylen = r11;
|
|
|
|
const unsigned char block_size = 16;
|
|
const int bulk_width = 4;
|
|
// NB: bulk_width can be 4 or 8. 8 gives slightly faster
|
|
// performance with larger data sizes, but it also means that the
|
|
// fast path isn't used until you have at least 8 blocks, and up
|
|
// to 127 bytes of data will be executed on the slow path. For
|
|
// that reason, and also so as not to blow away too much icache, 4
|
|
// blocks seems like a sensible compromise.
|
|
|
|
// Algorithm:
|
|
//
|
|
// if (len == 0) {
|
|
// goto DONE;
|
|
// }
|
|
// int result = len;
|
|
// do {
|
|
// if (used >= blockSize) {
|
|
// if (len >= bulk_width * blockSize) {
|
|
// CTR_large_block();
|
|
// if (len == 0)
|
|
// goto DONE;
|
|
// }
|
|
// for (;;) {
|
|
// 16ByteVector v0 = counter;
|
|
// embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
|
|
// used = 0;
|
|
// if (len < blockSize)
|
|
// break; /* goto NEXT */
|
|
// 16ByteVector v1 = load16Bytes(in, offset);
|
|
// v1 = v1 ^ encryptedCounter;
|
|
// store16Bytes(out, offset);
|
|
// used = blockSize;
|
|
// offset += blockSize;
|
|
// len -= blockSize;
|
|
// if (len == 0)
|
|
// goto DONE;
|
|
// }
|
|
// }
|
|
// NEXT:
|
|
// out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
|
|
// len--;
|
|
// } while (len != 0);
|
|
// DONE:
|
|
// return result;
|
|
//
|
|
// CTR_large_block()
|
|
// Wide bulk encryption of whole blocks.
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
const address start = __ pc();
|
|
__ enter();
|
|
|
|
Label DONE, CTR_large_block, large_block_return;
|
|
__ ldrw(used, Address(used_ptr));
|
|
__ cbzw(saved_len, DONE);
|
|
|
|
__ mov(len, saved_len);
|
|
__ mov(offset, 0);
|
|
|
|
// Compute #rounds for AES based on the length of the key array
|
|
__ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
|
|
|
|
__ aesenc_loadkeys(key, keylen);
|
|
|
|
{
|
|
Label L_CTR_loop, NEXT;
|
|
|
|
__ bind(L_CTR_loop);
|
|
|
|
__ cmp(used, block_size);
|
|
__ br(__ LO, NEXT);
|
|
|
|
// Maybe we have a lot of data
|
|
__ subsw(rscratch1, len, bulk_width * block_size);
|
|
__ br(__ HS, CTR_large_block);
|
|
__ BIND(large_block_return);
|
|
__ cbzw(len, DONE);
|
|
|
|
// Setup the counter
|
|
__ movi(v4, __ T4S, 0);
|
|
__ movi(v5, __ T4S, 1);
|
|
__ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
|
|
|
|
// 128-bit big-endian increment
|
|
__ ld1(v0, __ T16B, counter);
|
|
__ rev64(v16, __ T16B, v0);
|
|
be_add_128_64(v16, v16, v4, /*tmp*/v5);
|
|
__ rev64(v16, __ T16B, v16);
|
|
__ st1(v16, __ T16B, counter);
|
|
// Previous counter value is in v0
|
|
// v4 contains { 0, 1 }
|
|
|
|
{
|
|
// We have fewer than bulk_width blocks of data left. Encrypt
|
|
// them one by one until there is less than a full block
|
|
// remaining, being careful to save both the encrypted counter
|
|
// and the counter.
|
|
|
|
Label inner_loop;
|
|
__ bind(inner_loop);
|
|
// Counter to encrypt is in v0
|
|
__ aesecb_encrypt(noreg, noreg, keylen);
|
|
__ st1(v0, __ T16B, saved_encrypted_ctr);
|
|
|
|
// Do we have a remaining full block?
|
|
|
|
__ mov(used, 0);
|
|
__ cmp(len, block_size);
|
|
__ br(__ LO, NEXT);
|
|
|
|
// Yes, we have a full block
|
|
__ ldrq(v1, Address(in, offset));
|
|
__ eor(v1, __ T16B, v1, v0);
|
|
__ strq(v1, Address(out, offset));
|
|
__ mov(used, block_size);
|
|
__ add(offset, offset, block_size);
|
|
|
|
__ subw(len, len, block_size);
|
|
__ cbzw(len, DONE);
|
|
|
|
// Increment the counter, store it back
|
|
__ orr(v0, __ T16B, v16, v16);
|
|
__ rev64(v16, __ T16B, v16);
|
|
be_add_128_64(v16, v16, v4, /*tmp*/v5);
|
|
__ rev64(v16, __ T16B, v16);
|
|
__ st1(v16, __ T16B, counter); // Save the incremented counter back
|
|
|
|
__ b(inner_loop);
|
|
}
|
|
|
|
__ BIND(NEXT);
|
|
|
|
// Encrypt a single byte, and loop.
|
|
// We expect this to be a rare event.
|
|
__ ldrb(rscratch1, Address(in, offset));
|
|
__ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
|
|
__ eor(rscratch1, rscratch1, rscratch2);
|
|
__ strb(rscratch1, Address(out, offset));
|
|
__ add(offset, offset, 1);
|
|
__ add(used, used, 1);
|
|
      __ subw(len, len, 1);
|
|
__ cbnzw(len, L_CTR_loop);
|
|
}
|
|
|
|
__ bind(DONE);
|
|
__ strw(used, Address(used_ptr));
|
|
__ mov(r0, saved_len);
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ ret(lr);
|
|
|
|
// Bulk encryption
|
|
|
|
__ BIND (CTR_large_block);
|
|
assert(bulk_width == 4 || bulk_width == 8, "must be");
|
|
|
|
if (bulk_width == 8) {
|
|
__ sub(sp, sp, 4 * 16);
|
|
__ st1(v12, v13, v14, v15, __ T16B, Address(sp));
|
|
}
|
|
__ sub(sp, sp, 4 * 16);
|
|
__ st1(v8, v9, v10, v11, __ T16B, Address(sp));
|
|
RegSet saved_regs = (RegSet::of(in, out, offset)
|
|
+ RegSet::of(saved_encrypted_ctr, used_ptr, len));
|
|
__ push(saved_regs, sp);
|
|
__ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption
|
|
__ add(in, in, offset);
|
|
__ add(out, out, offset);
|
|
|
|
// Keys should already be loaded into the correct registers
|
|
|
|
__ ld1(v0, __ T16B, counter); // v0 contains the first counter
|
|
__ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
|
|
|
|
// AES/CTR loop
|
|
{
|
|
Label L_CTR_loop;
|
|
__ BIND(L_CTR_loop);
|
|
|
|
// Setup the counters
|
|
__ movi(v8, __ T4S, 0);
|
|
__ movi(v9, __ T4S, 1);
|
|
__ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
|
|
|
|
for (int i = 0; i < bulk_width; i++) {
|
|
FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
|
|
__ rev64(v0_ofs, __ T16B, v16);
|
|
be_add_128_64(v16, v16, v8, /*tmp*/v9);
|
|
}
|
|
|
|
__ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
|
|
|
|
// Encrypt the counters
|
|
__ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
|
|
|
|
if (bulk_width == 8) {
|
|
__ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
|
|
}
|
|
|
|
// XOR the encrypted counters with the inputs
|
|
for (int i = 0; i < bulk_width; i++) {
|
|
FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
|
|
FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
|
|
__ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
|
|
}
|
|
|
|
// Write the encrypted data
|
|
__ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
|
|
if (bulk_width == 8) {
|
|
__ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
|
|
}
|
|
|
|
__ subw(len, len, 16 * bulk_width);
|
|
__ cbnzw(len, L_CTR_loop);
|
|
}
|
|
|
|
// Save the counter back where it goes
|
|
__ rev64(v16, __ T16B, v16);
|
|
__ st1(v16, __ T16B, counter);
|
|
|
|
__ pop(saved_regs, sp);
|
|
|
|
__ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
|
|
if (bulk_width == 8) {
|
|
__ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
|
|
}
|
|
|
|
__ andr(rscratch1, len, -16 * bulk_width);
|
|
__ sub(len, len, rscratch1);
|
|
__ add(offset, offset, rscratch1);
|
|
__ mov(used, 16);
|
|
__ strw(used, Address(used_ptr));
|
|
__ b(large_block_return);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Vector AES Galois Counter Mode implementation. Parameters:
|
|
//
|
|
// in = c_rarg0
|
|
// len = c_rarg1
|
|
// ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
|
|
// out = c_rarg3
|
|
// key = c_rarg4
|
|
// state = c_rarg5 - GHASH.state
|
|
// subkeyHtbl = c_rarg6 - powers of H
|
|
// counter = c_rarg7 - 16 bytes of CTR
|
|
// return - number of processed bytes
|
|
address generate_galoisCounterMode_AESCrypt() {
|
|
Label ghash_polynomial; // local data generated after code
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
__ enter();
|
|
|
|
const Register in = c_rarg0;
|
|
const Register len = c_rarg1;
|
|
const Register ct = c_rarg2;
|
|
const Register out = c_rarg3;
|
|
// and updated with the incremented counter in the end
|
|
|
|
const Register key = c_rarg4;
|
|
const Register state = c_rarg5;
|
|
|
|
const Register subkeyHtbl = c_rarg6;
|
|
|
|
const Register counter = c_rarg7;
|
|
|
|
const Register keylen = r10;
|
|
// Save state before entering routine
|
|
__ sub(sp, sp, 4 * 16);
|
|
__ st1(v12, v13, v14, v15, __ T16B, Address(sp));
|
|
__ sub(sp, sp, 4 * 16);
|
|
__ st1(v8, v9, v10, v11, __ T16B, Address(sp));
|
|
|
|
// __ andr(len, len, -512);
|
|
__ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption
|
|
__ str(len, __ pre(sp, -2 * wordSize));
|
|
|
|
Label DONE;
|
|
__ cbz(len, DONE);
|
|
|
|
// Compute #rounds for AES based on the length of the key array
|
|
__ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
|
|
|
|
__ aesenc_loadkeys(key, keylen);
|
|
__ ld1(v0, __ T16B, counter); // v0 contains the first counter
|
|
__ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
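    // Note: GCM (NIST SP 800-38D) increments only the low-order 32 bits of the
    // counter block (inc32), unlike the plain CTR stub above which propagates
    // a carry across the full 128 bits. That is why the counter is kept
    // rev32-reversed here and bumped with a T4S add of { 0, 0, 0, 1 } in the
    // loop below.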
|
|
|
|
// AES/CTR loop
|
|
{
|
|
Label L_CTR_loop;
|
|
__ BIND(L_CTR_loop);
|
|
|
|
// Setup the counters
|
|
__ movi(v8, __ T4S, 0);
|
|
__ movi(v9, __ T4S, 1);
|
|
__ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
|
|
|
|
assert(v0->encoding() < v8->encoding(), "");
|
|
for (int i = v0->encoding(); i < v8->encoding(); i++) {
|
|
FloatRegister f = as_FloatRegister(i);
|
|
__ rev32(f, __ T16B, v16);
|
|
__ addv(v16, __ T4S, v16, v8);
|
|
}
|
|
|
|
__ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
|
|
|
|
// Encrypt the counters
|
|
__ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
|
|
|
|
__ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
|
|
|
|
// XOR the encrypted counters with the inputs
|
|
for (int i = 0; i < 8; i++) {
|
|
FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
|
|
FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
|
|
__ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
|
|
}
|
|
__ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
|
|
__ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
|
|
|
|
__ subw(len, len, 16 * 8);
|
|
__ cbnzw(len, L_CTR_loop);
|
|
}
|
|
|
|
__ rev32(v16, __ T16B, v16);
|
|
__ st1(v16, __ T16B, counter);
|
|
|
|
__ ldr(len, Address(sp));
|
|
__ lsr(len, len, exact_log2(16)); // We want the count of blocks
|
|
|
|
// GHASH/CTR loop
|
|
__ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
|
|
len, /*unrolls*/4);
|
|
|
|
#ifdef ASSERT
|
|
{ Label L;
|
|
__ cmp(len, (unsigned char)0);
|
|
__ br(Assembler::EQ, L);
|
|
__ stop("stubGenerator: abort");
|
|
__ bind(L);
|
|
}
|
|
#endif
|
|
|
|
__ bind(DONE);
|
|
// Return the number of bytes processed
|
|
__ ldr(r0, __ post(sp, 2 * wordSize));
|
|
|
|
__ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
|
|
__ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ ret(lr);
|
|
|
|
// bind label and generate polynomial data
|
|
__ align(wordSize * 2);
|
|
__ bind(ghash_polynomial);
|
|
__ emit_int64(0x87); // The low-order bits of the field
|
|
// polynomial (i.e. p = z^7+z^2+z+1)
|
|
// repeated in the low and high parts of a
|
|
// 128-bit vector
|
|
__ emit_int64(0x87);
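    // For reference: GHASH works in GF(2^128) with the reduction polynomial
    // x^128 + x^7 + x^2 + x + 1, so only the low byte 0x87 (= x^7 + x^2 + x + 1)
    // is needed when folding the carry-less multiply result back into 128 bits.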
|
|
|
|
return start;
|
|
}
|
|
|
|
class Cached64Bytes {
|
|
private:
|
|
MacroAssembler *_masm;
|
|
Register _regs[8];
|
|
|
|
public:
|
|
Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
|
|
      assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
|
|
auto it = rs.begin();
|
|
for (auto &r: _regs) {
|
|
r = *it;
|
|
++it;
|
|
}
|
|
}
|
|
|
|
void gen_loads(Register base) {
|
|
for (int i = 0; i < 8; i += 2) {
|
|
__ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
|
|
}
|
|
}
|
|
|
|
// Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
|
|
void extract_u32(Register dest, int i) {
|
|
__ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
|
|
}
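    // For example, extract_u32(dest, 5) emits ubfx(dest, _regs[2], 32, 32),
    // i.e. it picks the upper word of the third cached 64-bit register.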
|
|
};
|
|
|
|
// Utility routines for md5.
|
|
// Clobbers r10 and r11.
|
|
void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
|
|
int k, int s, int t) {
|
|
Register rscratch3 = r10;
|
|
Register rscratch4 = r11;
|
|
|
|
__ eorw(rscratch3, r3, r4);
|
|
__ movw(rscratch2, t);
|
|
__ andw(rscratch3, rscratch3, r2);
|
|
__ addw(rscratch4, r1, rscratch2);
|
|
reg_cache.extract_u32(rscratch1, k);
|
|
__ eorw(rscratch3, rscratch3, r4);
|
|
__ addw(rscratch4, rscratch4, rscratch1);
|
|
__ addw(rscratch3, rscratch3, rscratch4);
|
|
__ rorw(rscratch2, rscratch3, 32 - s);
|
|
__ addw(r1, rscratch2, r2);
|
|
}
|
|
|
|
void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
|
|
int k, int s, int t) {
|
|
Register rscratch3 = r10;
|
|
Register rscratch4 = r11;
|
|
|
|
reg_cache.extract_u32(rscratch1, k);
|
|
__ movw(rscratch2, t);
|
|
__ addw(rscratch4, r1, rscratch2);
|
|
__ addw(rscratch4, rscratch4, rscratch1);
|
|
__ bicw(rscratch2, r3, r4);
|
|
__ andw(rscratch3, r2, r4);
|
|
__ addw(rscratch2, rscratch2, rscratch4);
|
|
__ addw(rscratch2, rscratch2, rscratch3);
|
|
__ rorw(rscratch2, rscratch2, 32 - s);
|
|
__ addw(r1, rscratch2, r2);
|
|
}
|
|
|
|
void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
|
|
int k, int s, int t) {
|
|
Register rscratch3 = r10;
|
|
Register rscratch4 = r11;
|
|
|
|
__ eorw(rscratch3, r3, r4);
|
|
__ movw(rscratch2, t);
|
|
__ addw(rscratch4, r1, rscratch2);
|
|
reg_cache.extract_u32(rscratch1, k);
|
|
__ eorw(rscratch3, rscratch3, r2);
|
|
__ addw(rscratch4, rscratch4, rscratch1);
|
|
__ addw(rscratch3, rscratch3, rscratch4);
|
|
__ rorw(rscratch2, rscratch3, 32 - s);
|
|
__ addw(r1, rscratch2, r2);
|
|
}
|
|
|
|
void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
|
|
int k, int s, int t) {
|
|
Register rscratch3 = r10;
|
|
Register rscratch4 = r11;
|
|
|
|
__ movw(rscratch3, t);
|
|
__ ornw(rscratch2, r2, r4);
|
|
__ addw(rscratch4, r1, rscratch3);
|
|
reg_cache.extract_u32(rscratch1, k);
|
|
__ eorw(rscratch3, rscratch2, r3);
|
|
__ addw(rscratch4, rscratch4, rscratch1);
|
|
__ addw(rscratch3, rscratch3, rscratch4);
|
|
__ rorw(rscratch2, rscratch3, 32 - s);
|
|
__ addw(r1, rscratch2, r2);
|
|
}
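  // For reference, the four MD5 auxiliary functions from RFC 1321 that the
  // helpers above implement, shown as an illustrative C sketch (not generated
  // code), with (x, y, z) standing for the r2/r3/r4 arguments:
  //
  //   F(x, y, z) = (x & y) | (~x & z)   // md5_FF computes this as ((y ^ z) & x) ^ z
  //   G(x, y, z) = (x & z) | (y & ~z)   // md5_GG computes (x & z) + (y & ~z); the two
  //                                     // terms share no bits, so + acts as |
  //   H(x, y, z) = x ^ y ^ z
  //   I(x, y, z) = y ^ (x | ~z)
  //
  // Each step then computes r1 = r2 + rotl32(r1 + f(r2, r3, r4) + x[k] + t, s),
  // which is what the final rorw/addw pair realizes.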
|
|
|
|
// Arguments:
|
|
//
|
|
// Inputs:
|
|
// c_rarg0 - byte[] source+offset
|
|
// c_rarg1 - int[] SHA.state
|
|
// c_rarg2 - int offset
|
|
// c_rarg3 - int limit
|
|
//
|
|
address generate_md5_implCompress(StubId stub_id) {
|
|
bool multi_block;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_md5_implCompress_id:
|
|
multi_block = false;
|
|
break;
|
|
case StubId::stubgen_md5_implCompressMB_id:
|
|
multi_block = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
__ align(CodeEntryAlignment);
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
Register buf = c_rarg0;
|
|
Register state = c_rarg1;
|
|
Register ofs = c_rarg2;
|
|
Register limit = c_rarg3;
|
|
Register a = r4;
|
|
Register b = r5;
|
|
Register c = r6;
|
|
Register d = r7;
|
|
Register rscratch3 = r10;
|
|
Register rscratch4 = r11;
|
|
|
|
Register state_regs[2] = { r12, r13 };
|
|
RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
|
|
Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers
|
|
|
|
__ push(saved_regs, sp);
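    // The MD5 state is int[4] { a, b, c, d }; load it as two 64-bit words and
    // unpack the four 32-bit values with ubfx.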
|
|
|
|
__ ldp(state_regs[0], state_regs[1], Address(state));
|
|
__ ubfx(a, state_regs[0], 0, 32);
|
|
__ ubfx(b, state_regs[0], 32, 32);
|
|
__ ubfx(c, state_regs[1], 0, 32);
|
|
__ ubfx(d, state_regs[1], 32, 32);
|
|
|
|
Label md5_loop;
|
|
__ BIND(md5_loop);
|
|
|
|
reg_cache.gen_loads(buf);
|
|
|
|
// Round 1
|
|
md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478);
|
|
md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756);
|
|
md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db);
|
|
md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee);
|
|
md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf);
|
|
md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a);
|
|
md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613);
|
|
md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501);
|
|
md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8);
|
|
md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af);
|
|
md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
|
|
md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
|
|
md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122);
|
|
md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
|
|
md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
|
|
md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
|
|
|
|
// Round 2
|
|
md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562);
|
|
md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340);
|
|
md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
|
|
md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa);
|
|
md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d);
|
|
md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453);
|
|
md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
|
|
md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8);
|
|
md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6);
|
|
md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6);
|
|
md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87);
|
|
md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed);
|
|
md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905);
|
|
md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8);
|
|
md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9);
|
|
md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
|
|
|
|
// Round 3
|
|
md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942);
|
|
md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681);
|
|
md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
|
|
md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
|
|
md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44);
|
|
md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9);
|
|
md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60);
|
|
md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
|
|
md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6);
|
|
md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa);
|
|
md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085);
|
|
md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05);
|
|
md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039);
|
|
md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
|
|
md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
|
|
md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665);
|
|
|
|
// Round 4
|
|
md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244);
|
|
md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97);
|
|
md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
|
|
md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039);
|
|
md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3);
|
|
md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92);
|
|
md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
|
|
md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1);
|
|
md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f);
|
|
md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
|
|
md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314);
|
|
md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
|
|
md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82);
|
|
md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
|
|
md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb);
|
|
md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391);
|
|
|
|
__ addw(a, state_regs[0], a);
|
|
__ ubfx(rscratch2, state_regs[0], 32, 32);
|
|
__ addw(b, rscratch2, b);
|
|
__ addw(c, state_regs[1], c);
|
|
__ ubfx(rscratch4, state_regs[1], 32, 32);
|
|
__ addw(d, rscratch4, d);
|
|
|
|
__ orr(state_regs[0], a, b, Assembler::LSL, 32);
|
|
__ orr(state_regs[1], c, d, Assembler::LSL, 32);
|
|
|
|
if (multi_block) {
|
|
__ add(buf, buf, 64);
|
|
__ add(ofs, ofs, 64);
|
|
__ cmp(ofs, limit);
|
|
__ br(Assembler::LE, md5_loop);
|
|
__ mov(c_rarg0, ofs); // return ofs
|
|
}
|
|
|
|
// write hash values back in the correct order
|
|
__ stp(state_regs[0], state_regs[1], Address(state));
|
|
|
|
__ pop(saved_regs, sp);
|
|
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Arguments:
|
|
//
|
|
// Inputs:
|
|
// c_rarg0 - byte[] source+offset
|
|
// c_rarg1 - int[] SHA.state
|
|
// c_rarg2 - int offset
|
|
// c_rarg3 - int limit
|
|
//
|
|
address generate_sha1_implCompress(StubId stub_id) {
|
|
bool multi_block;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_sha1_implCompress_id:
|
|
multi_block = false;
|
|
break;
|
|
case StubId::stubgen_sha1_implCompressMB_id:
|
|
multi_block = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
__ align(CodeEntryAlignment);
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
Register buf = c_rarg0;
|
|
Register state = c_rarg1;
|
|
Register ofs = c_rarg2;
|
|
Register limit = c_rarg3;
|
|
|
|
Label keys;
|
|
Label sha1_loop;
|
|
|
|
// load the keys into v0..v3
|
|
__ adr(rscratch1, keys);
|
|
__ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
|
|
    // load 5 words of state into v6, v7
|
|
__ ldrq(v6, Address(state, 0));
|
|
__ ldrs(v7, Address(state, 16));
|
|
|
|
|
|
__ BIND(sha1_loop);
|
|
// load 64 bytes of data into v16..v19
|
|
__ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
|
|
__ rev32(v16, __ T16B, v16);
|
|
__ rev32(v17, __ T16B, v17);
|
|
__ rev32(v18, __ T16B, v18);
|
|
__ rev32(v19, __ T16B, v19);
|
|
|
|
// do the sha1
|
|
__ addv(v4, __ T4S, v16, v0);
|
|
__ orr(v20, __ T16B, v6, v6);
|
|
|
|
FloatRegister d0 = v16;
|
|
FloatRegister d1 = v17;
|
|
FloatRegister d2 = v18;
|
|
FloatRegister d3 = v19;
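    // Each iteration of this loop covers 4 of the 80 SHA-1 rounds: sha1c,
    // sha1p and sha1m apply the Ch, Parity and Maj round functions
    // respectively, so loop rounds 0-4 map to SHA-1 rounds 0-19 (Ch), 5-9 to
    // 20-39 (Parity), 10-14 to 40-59 (Maj) and 15-19 to 60-79 (Parity).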
|
|
|
|
for (int round = 0; round < 20; round++) {
|
|
FloatRegister tmp1 = (round & 1) ? v4 : v5;
|
|
FloatRegister tmp2 = (round & 1) ? v21 : v22;
|
|
FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
|
|
FloatRegister tmp4 = (round & 1) ? v5 : v4;
|
|
FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
|
|
|
|
if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
|
|
if (round < 19) __ addv(tmp1, __ T4S, d1, key);
|
|
__ sha1h(tmp2, __ T4S, v20);
|
|
if (round < 5)
|
|
__ sha1c(v20, __ T4S, tmp3, tmp4);
|
|
else if (round < 10 || round >= 15)
|
|
__ sha1p(v20, __ T4S, tmp3, tmp4);
|
|
else
|
|
__ sha1m(v20, __ T4S, tmp3, tmp4);
|
|
if (round < 16) __ sha1su1(d0, __ T4S, d3);
|
|
|
|
tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
|
|
}
|
|
|
|
__ addv(v7, __ T2S, v7, v21);
|
|
__ addv(v6, __ T4S, v6, v20);
|
|
|
|
if (multi_block) {
|
|
__ add(ofs, ofs, 64);
|
|
__ cmp(ofs, limit);
|
|
__ br(Assembler::LE, sha1_loop);
|
|
__ mov(c_rarg0, ofs); // return ofs
|
|
}
|
|
|
|
__ strq(v6, Address(state, 0));
|
|
__ strs(v7, Address(state, 16));
|
|
|
|
__ ret(lr);
|
|
|
|
__ bind(keys);
|
|
__ emit_int32(0x5a827999);
|
|
__ emit_int32(0x6ed9eba1);
|
|
__ emit_int32(0x8f1bbcdc);
|
|
__ emit_int32(0xca62c1d6);
|
|
|
|
return start;
|
|
}
|
|
|
|
|
|
// Arguments:
|
|
//
|
|
// Inputs:
|
|
// c_rarg0 - byte[] source+offset
|
|
// c_rarg1 - int[] SHA.state
|
|
// c_rarg2 - int offset
|
|
// c_rarg3 - int limit
|
|
//
|
|
address generate_sha256_implCompress(StubId stub_id) {
|
|
bool multi_block;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_sha256_implCompress_id:
|
|
multi_block = false;
|
|
break;
|
|
case StubId::stubgen_sha256_implCompressMB_id:
|
|
multi_block = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
static const uint32_t round_consts[64] = {
|
|
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
|
|
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
|
|
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
|
|
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
|
|
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
|
|
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
|
|
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
|
|
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
|
|
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
|
|
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
|
|
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
|
|
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
|
|
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
|
|
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
|
|
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
|
|
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
|
|
};
|
|
|
|
__ align(CodeEntryAlignment);
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
Register buf = c_rarg0;
|
|
Register state = c_rarg1;
|
|
Register ofs = c_rarg2;
|
|
Register limit = c_rarg3;
|
|
|
|
Label sha1_loop;
|
|
|
|
__ stpd(v8, v9, __ pre(sp, -32));
|
|
__ stpd(v10, v11, Address(sp, 16));
|
|
|
|
// dga == v0
|
|
// dgb == v1
|
|
// dg0 == v2
|
|
// dg1 == v3
|
|
// dg2 == v4
|
|
// t0 == v6
|
|
// t1 == v7
|
|
|
|
// load 16 keys to v16..v31
|
|
__ lea(rscratch1, ExternalAddress((address)round_consts));
|
|
__ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
|
|
__ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
|
|
__ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
|
|
__ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
|
|
|
|
// load 8 words (256 bits) state
|
|
__ ldpq(v0, v1, state);
|
|
|
|
__ BIND(sha1_loop);
|
|
// load 64 bytes of data into v8..v11
|
|
__ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
|
|
__ rev32(v8, __ T16B, v8);
|
|
__ rev32(v9, __ T16B, v9);
|
|
__ rev32(v10, __ T16B, v10);
|
|
__ rev32(v11, __ T16B, v11);
|
|
|
|
__ addv(v6, __ T4S, v8, v16);
|
|
__ orr(v2, __ T16B, v0, v0);
|
|
__ orr(v3, __ T16B, v1, v1);
|
|
|
|
FloatRegister d0 = v8;
|
|
FloatRegister d1 = v9;
|
|
FloatRegister d2 = v10;
|
|
FloatRegister d3 = v11;
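    // Each iteration covers 4 of the 64 SHA-256 rounds: sha256h/sha256h2
    // advance the two halves of the working state (v2/v3) by four rounds using
    // the schedule word plus round constant held in tmp2, while
    // sha256su0/sha256su1 extend the message schedule for later iterations.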
|
|
|
|
|
|
for (int round = 0; round < 16; round++) {
|
|
FloatRegister tmp1 = (round & 1) ? v6 : v7;
|
|
FloatRegister tmp2 = (round & 1) ? v7 : v6;
|
|
FloatRegister tmp3 = (round & 1) ? v2 : v4;
|
|
FloatRegister tmp4 = (round & 1) ? v4 : v2;
|
|
|
|
if (round < 12) __ sha256su0(d0, __ T4S, d1);
|
|
__ orr(v4, __ T16B, v2, v2);
|
|
if (round < 15)
|
|
__ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
|
|
__ sha256h(v2, __ T4S, v3, tmp2);
|
|
__ sha256h2(v3, __ T4S, v4, tmp2);
|
|
if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
|
|
|
|
tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
|
|
}
|
|
|
|
__ addv(v0, __ T4S, v0, v2);
|
|
__ addv(v1, __ T4S, v1, v3);
|
|
|
|
if (multi_block) {
|
|
__ add(ofs, ofs, 64);
|
|
__ cmp(ofs, limit);
|
|
__ br(Assembler::LE, sha1_loop);
|
|
__ mov(c_rarg0, ofs); // return ofs
|
|
}
|
|
|
|
__ ldpd(v10, v11, Address(sp, 16));
|
|
__ ldpd(v8, v9, __ post(sp, 32));
|
|
|
|
__ stpq(v0, v1, state);
|
|
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Double rounds for sha512.
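  // Each call performs two SHA-512 rounds via sha512h/sha512h2, updates the
  // message schedule with sha512su0/sha512su1 while new schedule words are
  // still required (dr < 32), and prefetches the next round-constant pair
  // while constants remain to be loaded (dr < 36).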
|
|
void sha512_dround(int dr,
|
|
FloatRegister vi0, FloatRegister vi1,
|
|
FloatRegister vi2, FloatRegister vi3,
|
|
FloatRegister vi4, FloatRegister vrc0,
|
|
FloatRegister vrc1, FloatRegister vin0,
|
|
FloatRegister vin1, FloatRegister vin2,
|
|
FloatRegister vin3, FloatRegister vin4) {
|
|
if (dr < 36) {
|
|
__ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
|
|
}
|
|
__ addv(v5, __ T2D, vrc0, vin0);
|
|
__ ext(v6, __ T16B, vi2, vi3, 8);
|
|
__ ext(v5, __ T16B, v5, v5, 8);
|
|
__ ext(v7, __ T16B, vi1, vi2, 8);
|
|
__ addv(vi3, __ T2D, vi3, v5);
|
|
if (dr < 32) {
|
|
__ ext(v5, __ T16B, vin3, vin4, 8);
|
|
__ sha512su0(vin0, __ T2D, vin1);
|
|
}
|
|
__ sha512h(vi3, __ T2D, v6, v7);
|
|
if (dr < 32) {
|
|
__ sha512su1(vin0, __ T2D, vin2, v5);
|
|
}
|
|
__ addv(vi4, __ T2D, vi1, vi3);
|
|
__ sha512h2(vi3, __ T2D, vi1, vi0);
|
|
}
|
|
|
|
// Arguments:
|
|
//
|
|
// Inputs:
|
|
// c_rarg0 - byte[] source+offset
|
|
// c_rarg1 - int[] SHA.state
|
|
// c_rarg2 - int offset
|
|
// c_rarg3 - int limit
|
|
//
|
|
address generate_sha512_implCompress(StubId stub_id) {
|
|
bool multi_block;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_sha512_implCompress_id:
|
|
multi_block = false;
|
|
break;
|
|
case StubId::stubgen_sha512_implCompressMB_id:
|
|
multi_block = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
static const uint64_t round_consts[80] = {
|
|
0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
|
|
0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
|
|
0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
|
|
0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
|
|
0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
|
|
0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
|
|
0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
|
|
0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
|
|
0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
|
|
0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
|
|
0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
|
|
0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
|
|
0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
|
|
0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
|
|
0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
|
|
0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
|
|
0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
|
|
0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
|
|
0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
|
|
0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
|
|
0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
|
|
0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
|
|
0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
|
|
0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
|
|
0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
|
|
0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
|
|
0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
|
|
};
|
|
|
|
__ align(CodeEntryAlignment);
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
Register buf = c_rarg0;
|
|
Register state = c_rarg1;
|
|
Register ofs = c_rarg2;
|
|
Register limit = c_rarg3;
|
|
|
|
__ stpd(v8, v9, __ pre(sp, -64));
|
|
__ stpd(v10, v11, Address(sp, 16));
|
|
__ stpd(v12, v13, Address(sp, 32));
|
|
__ stpd(v14, v15, Address(sp, 48));
|
|
|
|
Label sha512_loop;
|
|
|
|
// load state
|
|
__ ld1(v8, v9, v10, v11, __ T2D, state);
|
|
|
|
// load first 4 round constants
|
|
__ lea(rscratch1, ExternalAddress((address)round_consts));
|
|
__ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
|
|
|
|
__ BIND(sha512_loop);
|
|
// load 128B of data into v12..v19
|
|
__ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
|
|
__ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
|
|
__ rev64(v12, __ T16B, v12);
|
|
__ rev64(v13, __ T16B, v13);
|
|
__ rev64(v14, __ T16B, v14);
|
|
__ rev64(v15, __ T16B, v15);
|
|
__ rev64(v16, __ T16B, v16);
|
|
__ rev64(v17, __ T16B, v17);
|
|
__ rev64(v18, __ T16B, v18);
|
|
__ rev64(v19, __ T16B, v19);
|
|
|
|
__ mov(rscratch2, rscratch1);
|
|
|
|
__ mov(v0, __ T16B, v8);
|
|
__ mov(v1, __ T16B, v9);
|
|
__ mov(v2, __ T16B, v10);
|
|
__ mov(v3, __ T16B, v11);
|
|
|
|
sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
|
|
sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
|
|
sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
|
|
sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
|
|
sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
|
|
sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
|
|
sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
|
|
sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
|
|
sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
|
|
sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
|
|
sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
|
|
sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
|
|
sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
|
|
sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
|
|
sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
|
|
sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
|
|
sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
|
|
sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
|
|
sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
|
|
sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
|
|
sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
|
|
sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
|
|
sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
|
|
sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
|
|
sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
|
|
sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
|
|
sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
|
|
sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
|
|
sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
|
|
sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
|
|
sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
|
|
sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
|
|
sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0);
|
|
sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
|
|
sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
|
|
sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
|
|
sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
|
|
sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
|
|
sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
|
|
sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);
|
|
|
|
__ addv(v8, __ T2D, v8, v0);
|
|
__ addv(v9, __ T2D, v9, v1);
|
|
__ addv(v10, __ T2D, v10, v2);
|
|
__ addv(v11, __ T2D, v11, v3);
|
|
|
|
if (multi_block) {
|
|
__ add(ofs, ofs, 128);
|
|
__ cmp(ofs, limit);
|
|
__ br(Assembler::LE, sha512_loop);
|
|
__ mov(c_rarg0, ofs); // return ofs
|
|
}
|
|
|
|
__ st1(v8, v9, v10, v11, __ T2D, state);
|
|
|
|
__ ldpd(v14, v15, Address(sp, 48));
|
|
__ ldpd(v12, v13, Address(sp, 32));
|
|
__ ldpd(v10, v11, Address(sp, 16));
|
|
__ ldpd(v8, v9, __ post(sp, 64));
|
|
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Execute one round of keccak of two computations in parallel.
|
|
// One of the states should be loaded into the lower halves of
|
|
// the vector registers v0-v24, the other should be loaded into
|
|
// the upper halves of those registers. The ld1r instruction loads
|
|
// the round constant into both halves of register v31.
|
|
// Intermediate results c0...c5 and d0...d5 are computed
|
|
// in registers v25...v30.
|
|
// All vector instructions that are used operate on both register
|
|
// halves in parallel.
|
|
  // If only a single computation is needed, one can load just the lower halves.
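  //
  // For reference, the theta step computed by the eor3/rax1 group below, as an
  // illustrative C sketch over a state indexed a[x + 5*y] (not generated code):
  //
  //   for (int x = 0; x < 5; x++)
  //     c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
  //   for (int x = 0; x < 5; x++)
  //     d[x] = c[(x + 4) % 5] ^ rotl64(c[(x + 1) % 5], 1);
  //   for (int i = 0; i < 25; i++)
  //     a[i] ^= d[i % 5];
  //
  // The xar instructions then fold the rho rotations and pi permutation into
  // the same pass, and the bcax group implements the chi step.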
|
|
void keccak_round(Register rscratch1) {
|
|
__ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14
|
|
    __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a6 ^ a11
|
|
    __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^ a13
|
|
__ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10
|
|
__ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12
|
|
__ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24
|
|
__ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21
|
|
__ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23
|
|
__ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20
|
|
__ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22
|
|
|
|
__ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1)
|
|
__ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1)
|
|
__ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1)
|
|
__ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1)
|
|
__ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1)
|
|
|
|
__ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0
|
|
__ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1)
|
|
    __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol((a6^d1), 44)
|
|
__ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20)
|
|
__ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
|
|
__ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
|
|
__ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
|
|
__ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62)
|
|
__ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
|
|
__ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
|
|
__ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8)
|
|
__ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
|
|
__ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
|
|
__ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27)
|
|
__ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
|
|
__ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2)
|
|
__ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55)
|
|
__ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
|
|
__ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36)
|
|
__ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28)
|
|
__ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
|
|
__ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
|
|
__ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
|
|
__ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6)
|
|
__ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3)
|
|
|
|
__ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22')
|
|
__ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23)
|
|
__ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24)
|
|
__ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20')
|
|
__ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21')
|
|
|
|
__ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
|
|
|
|
__ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19)
|
|
__ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15')
|
|
__ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16)
|
|
__ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17')
|
|
__ bcax(v16, __ T16B, v16, v3, v25); // a16 = a16 ^ (~a17' & a18')
|
|
|
|
__ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12)
|
|
__ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13)
|
|
__ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14)
|
|
__ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10')
|
|
__ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11')
|
|
|
|
__ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9)
|
|
__ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5)
|
|
__ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6)
|
|
__ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7)
|
|
__ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8')
|
|
|
|
__ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0)
|
|
__ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1)
|
|
__ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2)
|
|
__ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3)
|
|
__ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4')
|
|
|
|
__ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc
|
|
}
|
|
|
|
// Arguments:
|
|
//
|
|
// Inputs:
|
|
// c_rarg0 - byte[] source+offset
|
|
// c_rarg1 - byte[] SHA.state
|
|
// c_rarg2 - int block_size
|
|
// c_rarg3 - int offset
|
|
// c_rarg4 - int limit
|
|
//
|
|
address generate_sha3_implCompress(StubId stub_id) {
|
|
bool multi_block;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_sha3_implCompress_id:
|
|
multi_block = false;
|
|
break;
|
|
case StubId::stubgen_sha3_implCompressMB_id:
|
|
multi_block = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
static const uint64_t round_consts[24] = {
|
|
0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
|
|
0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
|
|
0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
|
|
0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
|
|
0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
|
|
0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
|
|
0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
|
|
0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
|
|
};
|
|
|
|
__ align(CodeEntryAlignment);
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
Register buf = c_rarg0;
|
|
Register state = c_rarg1;
|
|
Register block_size = c_rarg2;
|
|
Register ofs = c_rarg3;
|
|
Register limit = c_rarg4;
|
|
|
|
Label sha3_loop, rounds24_loop;
|
|
Label sha3_512_or_sha3_384, shake128;
|
|
|
|
__ stpd(v8, v9, __ pre(sp, -64));
|
|
__ stpd(v10, v11, Address(sp, 16));
|
|
__ stpd(v12, v13, Address(sp, 32));
|
|
__ stpd(v14, v15, Address(sp, 48));
|
|
|
|
// load state
|
|
__ add(rscratch1, state, 32);
|
|
__ ld1(v0, v1, v2, v3, __ T1D, state);
|
|
__ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
|
|
__ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
|
|
__ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
|
|
__ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
|
|
__ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
|
|
__ ld1(v24, __ T1D, rscratch1);
|
|
|
|
__ BIND(sha3_loop);
|
|
|
|
// 24 keccak rounds
|
|
__ movw(rscratch2, 24);
|
|
|
|
// load round_constants base
|
|
__ lea(rscratch1, ExternalAddress((address) round_consts));
|
|
|
|
// load input
|
|
__ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
|
|
__ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
|
|
__ eor(v0, __ T8B, v0, v25);
|
|
__ eor(v1, __ T8B, v1, v26);
|
|
__ eor(v2, __ T8B, v2, v27);
|
|
__ eor(v3, __ T8B, v3, v28);
|
|
__ eor(v4, __ T8B, v4, v29);
|
|
__ eor(v5, __ T8B, v5, v30);
|
|
__ eor(v6, __ T8B, v6, v31);
|
|
|
|
// block_size == 72, SHA3-512; block_size == 104, SHA3-384
|
|
__ tbz(block_size, 7, sha3_512_or_sha3_384);
|
|
|
|
__ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
|
|
__ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
|
|
__ eor(v7, __ T8B, v7, v25);
|
|
__ eor(v8, __ T8B, v8, v26);
|
|
__ eor(v9, __ T8B, v9, v27);
|
|
__ eor(v10, __ T8B, v10, v28);
|
|
__ eor(v11, __ T8B, v11, v29);
|
|
__ eor(v12, __ T8B, v12, v30);
|
|
__ eor(v13, __ T8B, v13, v31);
|
|
|
|
__ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
|
|
__ eor(v14, __ T8B, v14, v25);
|
|
__ eor(v15, __ T8B, v15, v26);
|
|
__ eor(v16, __ T8B, v16, v27);
|
|
|
|
// block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
|
|
__ andw(c_rarg5, block_size, 48);
|
|
__ cbzw(c_rarg5, rounds24_loop);
|
|
|
|
__ tbnz(block_size, 5, shake128);
|
|
// block_size == 144, bit5 == 0, SHA3-224
|
|
__ ldrd(v28, __ post(buf, 8));
|
|
__ eor(v17, __ T8B, v17, v28);
|
|
__ b(rounds24_loop);
|
|
|
|
__ BIND(shake128);
|
|
__ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
|
|
__ eor(v17, __ T8B, v17, v28);
|
|
__ eor(v18, __ T8B, v18, v29);
|
|
__ eor(v19, __ T8B, v19, v30);
|
|
__ eor(v20, __ T8B, v20, v31);
|
|
__ b(rounds24_loop); // block_size == 168, SHAKE128
|
|
|
|
__ BIND(sha3_512_or_sha3_384);
|
|
__ ld1(v25, v26, __ T8B, __ post(buf, 16));
|
|
__ eor(v7, __ T8B, v7, v25);
|
|
__ eor(v8, __ T8B, v8, v26);
|
|
__ tbz(block_size, 5, rounds24_loop); // SHA3-512
|
|
|
|
// SHA3-384
|
|
__ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
|
|
__ eor(v9, __ T8B, v9, v27);
|
|
__ eor(v10, __ T8B, v10, v28);
|
|
__ eor(v11, __ T8B, v11, v29);
|
|
__ eor(v12, __ T8B, v12, v30);
|
|
|
|
__ BIND(rounds24_loop);
|
|
__ subw(rscratch2, rscratch2, 1);
|
|
|
|
keccak_round(rscratch1);
|
|
|
|
__ cbnzw(rscratch2, rounds24_loop);
|
|
|
|
if (multi_block) {
|
|
__ add(ofs, ofs, block_size);
|
|
__ cmp(ofs, limit);
|
|
__ br(Assembler::LE, sha3_loop);
|
|
__ mov(c_rarg0, ofs); // return ofs
|
|
}
|
|
|
|
__ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
|
|
__ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
|
|
__ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
|
|
__ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
|
|
__ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
|
|
__ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
|
|
__ st1(v24, __ T1D, state);
|
|
|
|
// restore callee-saved registers
|
|
__ ldpd(v14, v15, Address(sp, 48));
|
|
__ ldpd(v12, v13, Address(sp, 32));
|
|
__ ldpd(v10, v11, Address(sp, 16));
|
|
__ ldpd(v8, v9, __ post(sp, 64));
|
|
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Inputs:
|
|
// c_rarg0 - long[] state0
|
|
// c_rarg1 - long[] state1
|
|
address generate_double_keccak() {
|
|
static const uint64_t round_consts[24] = {
|
|
0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
|
|
0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
|
|
0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
|
|
0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
|
|
0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
|
|
0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
|
|
0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
|
|
0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
|
|
};
|
|
|
|
// Implements the double_keccak() method of the
|
|
    // sun.security.provider.SHA3Parallel class
|
|
__ align(CodeEntryAlignment);
|
|
StubCodeMark mark(this, "StubRoutines", "double_keccak");
|
|
address start = __ pc();
|
|
__ enter();
|
|
|
|
Register state0 = c_rarg0;
|
|
Register state1 = c_rarg1;
|
|
|
|
Label rounds24_loop;
|
|
|
|
// save callee-saved registers
|
|
__ stpd(v8, v9, __ pre(sp, -64));
|
|
__ stpd(v10, v11, Address(sp, 16));
|
|
__ stpd(v12, v13, Address(sp, 32));
|
|
__ stpd(v14, v15, Address(sp, 48));
|
|
|
|
// load states
|
|
__ add(rscratch1, state0, 32);
|
|
__ ld4(v0, v1, v2, v3, __ D, 0, state0);
|
|
__ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
|
|
__ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
|
|
__ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
|
|
__ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
|
|
__ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
|
|
__ ld1(v24, __ D, 0, rscratch1);
|
|
__ add(rscratch1, state1, 32);
|
|
__ ld4(v0, v1, v2, v3, __ D, 1, state1);
|
|
__ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
|
|
__ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
|
|
__ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
|
|
__ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
|
|
__ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
|
|
__ ld1(v24, __ D, 1, rscratch1);
|
|
|
|
// 24 keccak rounds
|
|
__ movw(rscratch2, 24);
|
|
|
|
// load round_constants base
|
|
__ lea(rscratch1, ExternalAddress((address) round_consts));
|
|
|
|
__ BIND(rounds24_loop);
|
|
__ subw(rscratch2, rscratch2, 1);
|
|
keccak_round(rscratch1);
|
|
__ cbnzw(rscratch2, rounds24_loop);
|
|
|
|
__ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32));
|
|
__ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
|
|
__ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
|
|
__ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
|
|
__ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
|
|
__ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
|
|
__ st1(v24, __ D, 0, state0);
|
|
__ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32));
|
|
__ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
|
|
__ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
|
|
__ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
|
|
__ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
|
|
__ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
|
|
__ st1(v24, __ D, 1, state1);
|
|
|
|
// restore callee-saved vector registers
|
|
__ ldpd(v14, v15, Address(sp, 48));
|
|
__ ldpd(v12, v13, Address(sp, 32));
|
|
__ ldpd(v10, v11, Address(sp, 16));
|
|
__ ldpd(v8, v9, __ post(sp, 64));
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ mov(r0, zr); // return 0
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// ChaCha20 block function. This version parallelizes the 32-bit
|
|
// state elements on each of 16 vectors, producing 4 blocks of
|
|
// keystream at a time.
|
|
//
|
|
// state (int[16]) = c_rarg0
|
|
// keystream (byte[256]) = c_rarg1
|
|
// return - number of bytes of produced keystream (always 256)
|
|
//
|
|
// This implementation takes each 32-bit integer from the state
|
|
// array and broadcasts it across all 4 32-bit lanes of a vector register
|
|
// (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
|
|
// of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors,
|
|
// the quarter round schedule is implemented as outlined in RFC 7539 section
|
|
// 2.3. However, instead of sequentially processing the 3 quarter round
|
|
// operations represented by one QUARTERROUND function, we instead stack all
|
|
// the adds, xors and left-rotations from the first 4 quarter rounds together
|
|
// and then do the same for the second set of 4 quarter rounds. This removes
|
|
// some latency that would otherwise be incurred by waiting for an add to
|
|
// complete before performing an xor (which depends on the result of the
|
|
// add), etc. An adjustment happens between the first and second groups of 4
|
|
// quarter rounds, but this is done only in the inputs to the macro functions
|
|
// that generate the assembly instructions - these adjustments themselves are
|
|
// not part of the resulting assembly.
|
|
// The 4 registers v0-v3 are used during the quarter round operations as
|
|
// scratch registers. Once the 20 rounds are complete, these 4 scratch
|
|
// registers become the vectors involved in adding the start state back onto
|
|
// the post-QR working state. After the adds are complete, each of the 16
|
|
// vectors write their first lane back to the keystream buffer, followed
|
|
// by the second lane from all vectors and so on.
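  //
  // For reference, one scalar quarter round (RFC 7539 section 2.1), which the
  // cc20_qr_* bundles below apply to four columns or diagonals at once
  // (illustrative C sketch, not generated code):
  //
  //   a += b;  d ^= a;  d = rotl32(d, 16);
  //   c += d;  b ^= c;  b = rotl32(b, 12);
  //   a += b;  d ^= a;  d = rotl32(d, 8);
  //   c += d;  b ^= c;  b = rotl32(b, 7);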
|
|
address generate_chacha20Block_blockpar() {
|
|
Label L_twoRounds, L_cc20_const;
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_chacha20Block_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
__ enter();
|
|
|
|
int i, j;
|
|
const Register state = c_rarg0;
|
|
const Register keystream = c_rarg1;
|
|
const Register loopCtr = r10;
|
|
const Register tmpAddr = r11;
|
|
const FloatRegister ctrAddOverlay = v28;
|
|
const FloatRegister lrot8Tbl = v29;
|
|
|
|
// Organize SIMD registers in an array that facilitates
|
|
// putting repetitive opcodes into loop structures. It is
|
|
// important that each grouping of 4 registers is monotonically
|
|
// increasing to support the requirements of multi-register
|
|
// instructions (e.g. ld4r, st4, etc.)
|
|
const FloatRegister workSt[16] = {
|
|
v4, v5, v6, v7, v16, v17, v18, v19,
|
|
v20, v21, v22, v23, v24, v25, v26, v27
|
|
};
|
|
|
|
// Pull in constant data. The first 16 bytes are the add overlay
|
|
// which is applied to the vector holding the counter (state[12]).
|
|
// The second 16 bytes is the index register for the 8-bit left
|
|
// rotation tbl instruction.
|
|
__ adr(tmpAddr, L_cc20_const);
|
|
__ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
|
|
|
|
// Load from memory and interlace across 16 SIMD registers,
|
|
// With each word from memory being broadcast to all lanes of
|
|
// each successive SIMD register.
|
|
// Addr(0) -> All lanes in workSt[i]
|
|
    // Addr(4) -> All lanes in workSt[i + 1], etc.
|
|
__ mov(tmpAddr, state);
|
|
for (i = 0; i < 16; i += 4) {
|
|
__ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
|
|
__ post(tmpAddr, 16));
|
|
}
|
|
__ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
|
|
|
|
// Before entering the loop, create 5 4-register arrays. These
|
|
// will hold the 4 registers that represent the a/b/c/d fields
|
|
// in the quarter round operation. For instance the "b" field
|
|
// for the first 4 quarter round operations is the set of v16/v17/v18/v19,
|
|
// but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
|
|
// since it is part of a diagonal organization. The aSet and scratch
|
|
// register sets are defined at declaration time because they do not change
|
|
// organization at any point during the 20-round processing.
|
|
FloatRegister aSet[4] = { v4, v5, v6, v7 };
|
|
FloatRegister bSet[4];
|
|
FloatRegister cSet[4];
|
|
FloatRegister dSet[4];
|
|
FloatRegister scratch[4] = { v0, v1, v2, v3 };
|
|
|
|
// Set up the 10 iteration loop and perform all 8 quarter round ops
|
|
__ mov(loopCtr, 10);
|
|
__ BIND(L_twoRounds);
|
|
|
|
// Set to columnar organization and do the following 4 quarter-rounds:
|
|
// QUARTERROUND(0, 4, 8, 12)
|
|
// QUARTERROUND(1, 5, 9, 13)
|
|
// QUARTERROUND(2, 6, 10, 14)
|
|
// QUARTERROUND(3, 7, 11, 15)
|
|
__ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
|
|
__ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
|
|
__ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
|
|
|
|
__ cc20_qr_add4(aSet, bSet); // a += b
|
|
__ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
|
|
__ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
|
|
|
|
__ cc20_qr_add4(cSet, dSet); // c += d
|
|
__ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
|
|
__ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
|
|
|
|
__ cc20_qr_add4(aSet, bSet); // a += b
|
|
__ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
|
|
__ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
|
|
|
|
__ cc20_qr_add4(cSet, dSet); // c += d
|
|
__ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
|
|
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
|
|
|
|
// Set to diagonal organization and do the next 4 quarter-rounds:
|
|
// QUARTERROUND(0, 5, 10, 15)
|
|
// QUARTERROUND(1, 6, 11, 12)
|
|
// QUARTERROUND(2, 7, 8, 13)
|
|
// QUARTERROUND(3, 4, 9, 14)
|
|
__ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
|
|
__ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
|
|
__ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
|
|
|
|
__ cc20_qr_add4(aSet, bSet); // a += b
|
|
__ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
|
|
__ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
|
|
|
|
__ cc20_qr_add4(cSet, dSet); // c += d
|
|
__ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
|
|
__ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
|
|
|
|
__ cc20_qr_add4(aSet, bSet); // a += b
|
|
__ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
|
|
__ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
|
|
|
|
__ cc20_qr_add4(cSet, dSet); // c += d
|
|
__ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
|
|
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
|
|
|
|
// Decrement and iterate
|
|
__ sub(loopCtr, loopCtr, 1);
|
|
__ cbnz(loopCtr, L_twoRounds);
|
|
|
|
__ mov(tmpAddr, state);
|
|
|
|
// Add the starting state back to the post-loop keystream
|
|
// state. We read/interlace the state array from memory into
|
|
// 4 registers similar to what we did in the beginning. Then
|
|
// add the counter overlay onto workSt[12] at the end.
|
|
for (i = 0; i < 16; i += 4) {
|
|
__ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
|
|
__ addv(workSt[i], __ T4S, workSt[i], v0);
|
|
__ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
|
|
__ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
|
|
__ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
|
|
}
|
|
__ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
|
|
|
|
// Write working state into the keystream buffer. This is accomplished
|
|
// by taking the lane "i" from each of the four vectors and writing
|
|
// it to consecutive 4-byte offsets, then post-incrementing by 16 and
|
|
// repeating with the next 4 vectors until all 16 vectors have been used.
|
|
// Then move to the next lane and repeat the process until all lanes have
|
|
// been written.
|
|
for (i = 0; i < 4; i++) {
|
|
for (j = 0; j < 16; j += 4) {
|
|
__ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
|
|
__ post(keystream, 16));
|
|
}
|
|
}
|
|
|
|
__ mov(r0, 256); // Return length of output keystream
|
|
__ leave();
|
|
__ ret(lr);
|
|
|
|
// bind label and generate local constant data used by this stub
|
|
// The constant data is broken into two 128-bit segments to be loaded
|
|
// onto FloatRegisters. The first 128 bits are a counter add overlay
|
|
// that adds +0/+1/+2/+3 to the vector holding replicated state[12].
|
|
// The second 128-bits is a table constant used for 8-bit left rotations.
|
|
__ BIND(L_cc20_const);
|
|
__ emit_int64(0x0000000100000000UL);
|
|
__ emit_int64(0x0000000300000002UL);
|
|
__ emit_int64(0x0605040702010003UL);
|
|
__ emit_int64(0x0E0D0C0F0A09080BUL);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Helpers to schedule parallel operation bundles across vector
|
|
// register sequences of size 2, 4 or 8.
|
|
|
|
// Implement various primitive computations across vector sequences
|
|
|
|
template<int N>
|
|
void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
|
|
const VSeq<N>& v1, const VSeq<N>& v2) {
|
|
// output must not be constant
|
|
assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
|
|
// output cannot overwrite pending inputs
|
|
assert(!vs_write_before_read(v, v1), "output overwrites input");
|
|
assert(!vs_write_before_read(v, v2), "output overwrites input");
|
|
for (int i = 0; i < N; i++) {
|
|
__ addv(v[i], T, v1[i], v2[i]);
|
|
}
|
|
}
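  // For example, if va covers v0..v3, vb covers v4..v7 and vd covers v16..v19,
  // then vs_addv(vd, __ T4S, va, vb) expands to the four instructions
  // addv(v16, T4S, v0, v4) ... addv(v19, T4S, v3, v7).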
|
|
|
|
template<int N>
|
|
void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
|
|
const VSeq<N>& v1, const VSeq<N>& v2) {
|
|
// output must not be constant
|
|
assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
|
|
// output cannot overwrite pending inputs
|
|
assert(!vs_write_before_read(v, v1), "output overwrites input");
|
|
assert(!vs_write_before_read(v, v2), "output overwrites input");
|
|
for (int i = 0; i < N; i++) {
|
|
__ subv(v[i], T, v1[i], v2[i]);
|
|
}
|
|
}
|
|
|
|
template<int N>
|
|
void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
|
|
const VSeq<N>& v1, const VSeq<N>& v2) {
|
|
// output must not be constant
|
|
assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
|
|
// output cannot overwrite pending inputs
|
|
assert(!vs_write_before_read(v, v1), "output overwrites input");
|
|
assert(!vs_write_before_read(v, v2), "output overwrites input");
|
|
for (int i = 0; i < N; i++) {
|
|
__ mulv(v[i], T, v1[i], v2[i]);
|
|
}
|
|
}
|
|
|
|
template<int N>
|
|
void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
|
|
// output must not be constant
|
|
assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
|
|
// output cannot overwrite pending inputs
|
|
assert(!vs_write_before_read(v, v1), "output overwrites input");
|
|
for (int i = 0; i < N; i++) {
|
|
__ negr(v[i], T, v1[i]);
|
|
}
|
|
}
|
|
|
|
template<int N>
|
|
void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
|
|
const VSeq<N>& v1, int shift) {
|
|
// output must not be constant
|
|
assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
|
|
// output cannot overwrite pending inputs
|
|
assert(!vs_write_before_read(v, v1), "output overwrites input");
|
|
for (int i = 0; i < N; i++) {
|
|
__ sshr(v[i], T, v1[i], shift);
|
|
}
|
|
}
|
|
|
|
template<int N>
|
|
void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
|
|
// output must not be constant
|
|
assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
|
|
// output cannot overwrite pending inputs
|
|
assert(!vs_write_before_read(v, v1), "output overwrites input");
|
|
assert(!vs_write_before_read(v, v2), "output overwrites input");
|
|
for (int i = 0; i < N; i++) {
|
|
__ andr(v[i], __ T16B, v1[i], v2[i]);
|
|
}
|
|
}
|
|
|
|
template<int N>
|
|
void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
|
|
// output must not be constant
|
|
assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
|
|
// output cannot overwrite pending inputs
|
|
assert(!vs_write_before_read(v, v1), "output overwrites input");
|
|
assert(!vs_write_before_read(v, v2), "output overwrites input");
|
|
for (int i = 0; i < N; i++) {
|
|
__ orr(v[i], __ T16B, v1[i], v2[i]);
|
|
}
|
|
}
|
|
|
|
template<int N>
|
|
void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
|
|
// output must not be constant
|
|
assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
|
|
// output cannot overwrite pending inputs
|
|
assert(!vs_write_before_read(v, v1), "output overwrites input");
|
|
for (int i = 0; i < N; i++) {
|
|
__ notr(v[i], __ T16B, v1[i]);
|
|
}
|
|
}
|
|
|
|
template<int N>
|
|
void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
|
|
// output must not be constant
|
|
assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
|
|
// output cannot overwrite pending inputs
|
|
assert(!vs_write_before_read(v, v1), "output overwrites input");
|
|
assert(!vs_write_before_read(v, v2), "output overwrites input");
|
|
for (int i = 0; i < N; i++) {
|
|
__ sqdmulh(v[i], T, v1[i], v2[i]);
|
|
}
|
|
}
|
|
|
|
template<int N>
|
|
void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
|
|
// output must not be constant
|
|
assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
|
|
// output cannot overwrite pending inputs
|
|
assert(!vs_write_before_read(v, v1), "output overwrites input");
|
|
assert(!vs_write_before_read(v, v2), "output overwrites input");
|
|
for (int i = 0; i < N; i++) {
|
|
__ mlsv(v[i], T, v1[i], v2[i]);
|
|
}
|
|
}
|
|
|
|
// load N/2 successive pairs of quadword values from memory in order
|
|
// into N successive vector registers of the sequence via the
|
|
// address supplied in base.
|
|
template<int N>
|
|
void vs_ldpq(const VSeq<N>& v, Register base) {
|
|
for (int i = 0; i < N; i += 2) {
|
|
__ ldpq(v[i], v[i+1], Address(base, 32 * i));
|
|
}
|
|
}
|
|
|
|
// load N/2 successive pairs of quadword values from memory in order
|
|
// into N vector registers of the sequence via the address supplied
|
|
// in base using post-increment addressing
|
|
template<int N>
|
|
void vs_ldpq_post(const VSeq<N>& v, Register base) {
|
|
    static_assert((N & 1) == 0, "sequence length must be even");
|
|
for (int i = 0; i < N; i += 2) {
|
|
__ ldpq(v[i], v[i+1], __ post(base, 32));
|
|
}
|
|
}
|
|
|
|
// store N successive vector registers of the sequence into N/2
|
|
// successive pairs of quadword memory locations via the address
|
|
// supplied in base using post-increment addressing
|
|
template<int N>
|
|
void vs_stpq_post(const VSeq<N>& v, Register base) {
|
|
    static_assert((N & 1) == 0, "sequence length must be even");
|
|
for (int i = 0; i < N; i += 2) {
|
|
__ stpq(v[i], v[i+1], __ post(base, 32));
|
|
}
|
|
}
|
|
|
|
// load N/2 pairs of quadword values from memory de-interleaved into
|
|
// N vector registers 2 at a time via the address supplied in base
|
|
// using post-increment addressing.
|
|
template<int N>
|
|
void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
|
|
    static_assert((N & 1) == 0, "sequence length must be even");
|
|
for (int i = 0; i < N; i += 2) {
|
|
__ ld2(v[i], v[i+1], T, __ post(base, 32));
|
|
}
|
|
}
|
|
|
|
// store N vector registers interleaved into N/2 pairs of quadword
|
|
// memory locations via the address supplied in base using
|
|
// post-increment addressing.
|
|
template<int N>
|
|
void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
|
|
    static_assert((N & 1) == 0, "sequence length must be even");
|
|
for (int i = 0; i < N; i += 2) {
|
|
__ st2(v[i], v[i+1], T, __ post(base, 32));
|
|
}
|
|
}
|
|
|
|
// load N quadword values from memory de-interleaved into N vector
|
|
// registers 3 elements at a time via the address supplied in base.
|
|
template<int N>
|
|
void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
|
|
static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
|
|
for (int i = 0; i < N; i += 3) {
|
|
__ ld3(v[i], v[i+1], v[i+2], T, base);
|
|
}
|
|
}
|
|
|
|
// load N quadword values from memory de-interleaved into N vector
|
|
// registers 3 elements at a time via the address supplied in base
|
|
// using post-increment addressing.
|
|
template<int N>
|
|
void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
|
|
static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
|
|
for (int i = 0; i < N; i += 3) {
|
|
__ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
|
|
}
|
|
}
|
|
|
|
// load N/2 pairs of quadword values from memory into N vector
|
|
// registers via the address supplied in base with each pair indexed
|
|
  // using the start offset plus the corresponding entry in the
|
|
// offsets array
|
|
template<int N>
|
|
void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
|
|
for (int i = 0; i < N/2; i++) {
|
|
__ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
|
|
}
|
|
}
|
|
|
|
// store N vector registers into N/2 pairs of quadword memory
|
|
// locations via the address supplied in base with each pair indexed
|
|
  // using the start offset plus the corresponding entry in the
|
|
// offsets array
|
|
template<int N>
|
|
  void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
|
|
for (int i = 0; i < N/2; i++) {
|
|
__ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
|
|
}
|
|
}
|
|
|
|
// load N single quadword values from memory into N vector registers
|
|
// via the address supplied in base with each value indexed using
|
|
  // the start offset plus the corresponding entry in the offsets
|
|
// array
|
|
template<int N>
|
|
void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
|
|
int start, int (&offsets)[N]) {
|
|
for (int i = 0; i < N; i++) {
|
|
__ ldr(v[i], T, Address(base, start + offsets[i]));
|
|
}
|
|
}
|
|
|
|
// store N vector registers into N single quadword memory locations
|
|
// via the address supplied in base with each value indexed using
|
|
  // the start offset plus the corresponding entry in the offsets
|
|
// array
|
|
template<int N>
|
|
void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
|
|
int start, int (&offsets)[N]) {
|
|
for (int i = 0; i < N; i++) {
|
|
__ str(v[i], T, Address(base, start + offsets[i]));
|
|
}
|
|
}
|
|
|
|
// load N/2 pairs of quadword values from memory de-interleaved into
|
|
// N vector registers 2 at a time via the address supplied in base
|
|
  // with each pair indexed using the start offset plus the
|
|
// corresponding entry in the offsets array
|
|
template<int N>
|
|
void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
|
|
Register tmp, int start, int (&offsets)[N/2]) {
|
|
for (int i = 0; i < N/2; i++) {
|
|
__ add(tmp, base, start + offsets[i]);
|
|
__ ld2(v[2*i], v[2*i+1], T, tmp);
|
|
}
|
|
}
|
|
|
|
// store N vector registers 2 at a time interleaved into N/2 pairs
|
|
// of quadword memory locations via the address supplied in base
|
|
  // with each pair indexed using the start offset plus the
|
|
// corresponding entry in the offsets array
|
|
template<int N>
|
|
void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
|
|
Register tmp, int start, int (&offsets)[N/2]) {
|
|
for (int i = 0; i < N/2; i++) {
|
|
__ add(tmp, base, start + offsets[i]);
|
|
__ st2(v[2*i], v[2*i+1], T, tmp);
|
|
}
|
|
}
|
|
|
|
// Helper routines for various flavours of Montgomery multiply
|
|
|
|
// Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
|
|
// multiplications in parallel
|
|
//
|
|
|
|
// See the montMul() method of the sun.security.provider.ML_DSA
|
|
// class.
|
|
//
|
|
  // Computes 4x4S results or 4x8H results
|
|
// a = b * c * 2^MONT_R_BITS mod MONT_Q
|
|
// Inputs: vb, vc - 4x4S or 4x8H vector register sequences
|
|
// vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
|
|
// Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
|
|
// Outputs: va - 4x4S or 4x8H vector register sequences
|
|
// vb, vc, vtmp and vq must all be disjoint
|
|
// va must be disjoint from all other inputs/temps or must equal vc
|
|
// va must have a non-zero delta i.e. it must not be a constant vseq.
|
|
// n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
|
|
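  //
  // As a scalar model (illustrative only, not used by the stubs), the
  // instruction sequence below computes, per 16-bit lane for the 8H case,
  // with q = MONT_Q and qinv = MONT_Q_INV_MOD_R such that q * qinv == 1
  // (mod 2^16):
  //
  //   int16_t montmul(int16_t b, int16_t c, int16_t q, int16_t qinv) {
  //     int16_t aHigh = (int16_t)(((int32_t)b * c * 2) >> 16);  // sqdmulh
  //     int16_t aLow  = (int16_t)(b * c);                       // mulv
  //     int16_t m     = (int16_t)(aLow * qinv);                 // mulv
  //     int16_t n     = (int16_t)(((int32_t)m * q * 2) >> 16);  // sqdmulh
  //     return (int16_t)((aHigh - n) >> 1);                     // shsubv
  //   }
  //
  // which yields b * c * 2^-16 mod q up to a correction by a multiple of q.
  // The 4S case is the same with 32-bit lanes and R = 2^32.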
void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
|
|
Assembler::SIMD_Arrangement T,
|
|
const VSeq<4>& vtmp, const VSeq<2>& vq) {
|
|
assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
|
|
assert(vs_disjoint(vb, vc), "vb and vc overlap");
|
|
assert(vs_disjoint(vb, vq), "vb and vq overlap");
|
|
assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
|
|
|
|
assert(vs_disjoint(vc, vq), "vc and vq overlap");
|
|
assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
|
|
|
|
assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
|
|
|
|
assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
|
|
assert(vs_disjoint(va, vb), "va and vb overlap");
|
|
assert(vs_disjoint(va, vq), "va and vq overlap");
|
|
assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
|
|
assert(!va.is_constant(), "output vector must identify 4 different registers");
|
|
|
|
// schedule 4 streams of instructions across the vector sequences
|
|
for (int i = 0; i < 4; i++) {
|
|
__ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
|
|
__ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
|
|
}
|
|
|
|
for (int i = 0; i < 4; i++) {
|
|
__ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
|
|
}
|
|
|
|
for (int i = 0; i < 4; i++) {
|
|
__ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
|
|
}
|
|
|
|
for (int i = 0; i < 4; i++) {
|
|
__ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
|
|
}
|
|
}
|
|
|
|
  // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
  // multiplications in parallel
  //
  // See the montMul() method of the sun.security.provider.ML_DSA
  // class.
  //
  // Computes 2x4S results or 2x8H results
  //    a = b * c * 2^MONT_R_BITS mod MONT_Q
  // Inputs: vb, vc - 2x4S or 2x8H vector register sequences
  //         vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps: vtmp - 2x4S or 2x8H vector sequence trashed after call
  // Outputs: va - 2x4S or 2x8H vector register sequences
|
|
// vb, vc, vtmp and vq must all be disjoint
|
|
// va must be disjoint from all other inputs/temps or must equal vc
|
|
// va must have a non-zero delta i.e. it must not be a constant vseq.
|
|
// n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
|
|
void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
|
|
Assembler::SIMD_Arrangement T,
|
|
const VSeq<2>& vtmp, const VSeq<2>& vq) {
|
|
assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
|
|
assert(vs_disjoint(vb, vc), "vb and vc overlap");
|
|
assert(vs_disjoint(vb, vq), "vb and vq overlap");
|
|
assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
|
|
|
|
assert(vs_disjoint(vc, vq), "vc and vq overlap");
|
|
assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
|
|
|
|
assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
|
|
|
|
assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
|
|
assert(vs_disjoint(va, vb), "va and vb overlap");
|
|
assert(vs_disjoint(va, vq), "va and vq overlap");
|
|
assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
|
|
assert(!va.is_constant(), "output vector must identify 2 different registers");
|
|
|
|
// schedule 2 streams of instructions across the vector sequences
|
|
for (int i = 0; i < 2; i++) {
|
|
__ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
|
|
__ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
|
|
}
|
|
|
|
for (int i = 0; i < 2; i++) {
|
|
__ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
|
|
}
|
|
|
|
for (int i = 0; i < 2; i++) {
|
|
__ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
|
|
}
|
|
|
|
for (int i = 0; i < 2; i++) {
|
|
__ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
|
|
}
|
|
}
|
|
|
|
// Perform 16 16-bit Montgomery multiplications in parallel.
|
|
void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
|
|
const VSeq<2>& vtmp, const VSeq<2>& vq) {
|
|
// Use the helper routine to schedule a 2x8H Montgomery multiply.
|
|
// It will assert that the register use is valid
|
|
vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
|
|
}
|
|
|
|
// Perform 32 16-bit Montgomery multiplications in parallel.
|
|
void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
|
|
const VSeq<4>& vtmp, const VSeq<2>& vq) {
|
|
// Use the helper routine to schedule a 4x8H Montgomery multiply.
|
|
// It will assert that the register use is valid
|
|
vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
|
|
}
|
|
|
|
// Perform 64 16-bit Montgomery multiplications in parallel.
|
|
void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
|
|
const VSeq<4>& vtmp, const VSeq<2>& vq) {
|
|
// Schedule two successive 4x8H multiplies via the montmul helper
|
|
// on the front and back halves of va, vb and vc. The helper will
|
|
// assert that the register use has no overlap conflicts on each
|
|
// individual call but we also need to ensure that the necessary
|
|
// disjoint/equality constraints are met across both calls.
|
|
|
|
// vb, vc, vtmp and vq must be disjoint. va must either be
|
|
// disjoint from all other registers or equal vc
|
|
|
|
assert(vs_disjoint(vb, vc), "vb and vc overlap");
|
|
assert(vs_disjoint(vb, vq), "vb and vq overlap");
|
|
assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
|
|
|
|
assert(vs_disjoint(vc, vq), "vc and vq overlap");
|
|
assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
|
|
|
|
assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
|
|
|
|
assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
|
|
assert(vs_disjoint(va, vb), "va and vb overlap");
|
|
assert(vs_disjoint(va, vq), "va and vq overlap");
|
|
assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
|
|
|
|
// we multiply the front and back halves of each sequence 4 at a
|
|
// time because
|
|
//
|
|
// 1) we are currently only able to get 4-way instruction
|
|
// parallelism at best
|
|
//
|
|
// 2) we need registers for the constants in vq and temporary
|
|
// scratch registers to hold intermediate results so vtmp can only
|
|
// be a VSeq<4> which means we only have 4 scratch slots
|
|
|
|
vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
|
|
vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
|
|
}
|
|
|
|
void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
|
|
const VSeq<4>& vc,
|
|
const VSeq<4>& vtmp,
|
|
const VSeq<2>& vq) {
|
|
// compute a = montmul(a1, c)
|
|
kyber_montmul32(vc, va1, vc, vtmp, vq);
|
|
    // output a1 = a0 - a
|
|
vs_subv(va1, __ T8H, va0, vc);
|
|
// and a0 = a0 + a
|
|
vs_addv(va0, __ T8H, va0, vc);
|
|
}
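  // In equation form this is a sketch of the Cooley-Tukey style butterfly
  // used by the forward NTT levels, with c holding a zeta in Montgomery
  // form:
  //
  //   t  = a1 * c * 2^-16 mod q
  //   a1 = a0 - t
  //   a0 = a0 + t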
|
|
|
|
void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
|
|
const VSeq<4>& vb,
|
|
const VSeq<4>& vtmp1,
|
|
const VSeq<4>& vtmp2,
|
|
const VSeq<2>& vq) {
|
|
// compute c = a0 - a1
|
|
vs_subv(vtmp1, __ T8H, va0, va1);
|
|
// output a0 = a0 + a1
|
|
vs_addv(va0, __ T8H, va0, va1);
|
|
// output a1 = b montmul c
|
|
kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
|
|
}
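  // In equation form this is a sketch of the Gentleman-Sande style
  // butterfly used by the inverse NTT levels, with b holding a zeta in
  // Montgomery form:
  //
  //   t  = a0 - a1
  //   a0 = a0 + a1
  //   a1 = t * b * 2^-16 mod q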
|
|
|
|
void load64shorts(const VSeq<8>& v, Register shorts) {
|
|
vs_ldpq_post(v, shorts);
|
|
}
|
|
|
|
void load32shorts(const VSeq<4>& v, Register shorts) {
|
|
vs_ldpq_post(v, shorts);
|
|
}
|
|
|
|
void store64shorts(VSeq<8> v, Register tmpAddr) {
|
|
vs_stpq_post(v, tmpAddr);
|
|
}
|
|
|
|
// Kyber NTT function.
|
|
// Implements
|
|
// static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
|
|
//
|
|
// coeffs (short[256]) = c_rarg0
|
|
// ntt_zetas (short[256]) = c_rarg1
|
|
address generate_kyberNtt() {
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_kyberNtt_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
__ enter();
|
|
|
|
const Register coeffs = c_rarg0;
|
|
const Register zetas = c_rarg1;
|
|
|
|
const Register kyberConsts = r10;
|
|
const Register tmpAddr = r11;
|
|
|
|
VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
|
|
VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
|
|
VSeq<2> vq(30); // n.b. constants overlap vs3
|
|
|
|
__ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
|
|
// load the montmul constants
|
|
vs_ldpq(vq, kyberConsts);
|
|
|
|
// Each level corresponds to an iteration of the outermost loop of the
|
|
// Java method seilerNTT(int[] coeffs). There are some differences
|
|
// from what is done in the seilerNTT() method, though:
|
|
    // 1. The computation uses 16-bit signed values; we do not convert them
    //    to ints here.
    // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
    //    this array for each level, which makes it easier to fill up the
    //    vector registers.
    // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
    //    multiplications (that way there should not be any overflow during
    //    the inverse NTT computation); here we use R = 2^16 so that we can
    //    use the 16-bit arithmetic in the vector unit.
|
|
//
|
|
// On each level, we fill up the vector registers in such a way that the
|
|
// array elements that need to be multiplied by the zetas go into one
|
|
// set of vector registers while the corresponding ones that don't need to
|
|
// be multiplied, go into another set.
|
|
// We can do 32 Montgomery multiplications in parallel, using 12 vector
|
|
// registers interleaving the steps of 4 identical computations,
|
|
// each done on 8 16-bit values per register.
|
|
|
|
    // At levels 0-3 the coefficients that are multiplied by the zetas, and
    // those that the resulting products are added to or subtracted from,
    // occur in discrete blocks whose size is some multiple of 32.
|
|
|
|
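    // At level 0, for example, each coefficient j in the first half of the
    // array is paired with coefficient j + 128 in the second half (a byte
    // offset of 256), the latter being multiplied by a zeta before the
    // add/subtract.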
// level 0
|
|
__ add(tmpAddr, coeffs, 256);
|
|
load64shorts(vs1, tmpAddr);
|
|
load64shorts(vs2, zetas);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
__ add(tmpAddr, coeffs, 0);
|
|
load64shorts(vs1, tmpAddr);
|
|
vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_addv(vs1, __ T8H, vs1, vs2);
|
|
__ add(tmpAddr, coeffs, 0);
|
|
vs_stpq_post(vs1, tmpAddr);
|
|
__ add(tmpAddr, coeffs, 256);
|
|
vs_stpq_post(vs3, tmpAddr);
|
|
// restore montmul constants
|
|
vs_ldpq(vq, kyberConsts);
|
|
load64shorts(vs1, tmpAddr);
|
|
load64shorts(vs2, zetas);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
__ add(tmpAddr, coeffs, 128);
|
|
load64shorts(vs1, tmpAddr);
|
|
vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_addv(vs1, __ T8H, vs1, vs2);
|
|
__ add(tmpAddr, coeffs, 128);
|
|
store64shorts(vs1, tmpAddr);
|
|
__ add(tmpAddr, coeffs, 384);
|
|
store64shorts(vs3, tmpAddr);
|
|
|
|
// level 1
|
|
// restore montmul constants
|
|
vs_ldpq(vq, kyberConsts);
|
|
__ add(tmpAddr, coeffs, 128);
|
|
load64shorts(vs1, tmpAddr);
|
|
load64shorts(vs2, zetas);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
__ add(tmpAddr, coeffs, 0);
|
|
load64shorts(vs1, tmpAddr);
|
|
vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_addv(vs1, __ T8H, vs1, vs2);
|
|
__ add(tmpAddr, coeffs, 0);
|
|
store64shorts(vs1, tmpAddr);
|
|
store64shorts(vs3, tmpAddr);
|
|
vs_ldpq(vq, kyberConsts);
|
|
__ add(tmpAddr, coeffs, 384);
|
|
load64shorts(vs1, tmpAddr);
|
|
load64shorts(vs2, zetas);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
__ add(tmpAddr, coeffs, 256);
|
|
load64shorts(vs1, tmpAddr);
|
|
vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_addv(vs1, __ T8H, vs1, vs2);
|
|
__ add(tmpAddr, coeffs, 256);
|
|
store64shorts(vs1, tmpAddr);
|
|
store64shorts(vs3, tmpAddr);
|
|
|
|
// level 2
|
|
vs_ldpq(vq, kyberConsts);
|
|
int offsets1[4] = { 0, 32, 128, 160 };
|
|
vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
|
|
load64shorts(vs2, zetas);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
|
|
// kyber_subv_addv64();
|
|
vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_addv(vs1, __ T8H, vs1, vs2);
|
|
__ add(tmpAddr, coeffs, 0);
|
|
vs_stpq_post(vs_front(vs1), tmpAddr);
|
|
vs_stpq_post(vs_front(vs3), tmpAddr);
|
|
vs_stpq_post(vs_back(vs1), tmpAddr);
|
|
vs_stpq_post(vs_back(vs3), tmpAddr);
|
|
vs_ldpq(vq, kyberConsts);
|
|
vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
|
|
load64shorts(vs2, zetas);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
|
|
// kyber_subv_addv64();
|
|
vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_addv(vs1, __ T8H, vs1, vs2);
|
|
__ add(tmpAddr, coeffs, 256);
|
|
vs_stpq_post(vs_front(vs1), tmpAddr);
|
|
vs_stpq_post(vs_front(vs3), tmpAddr);
|
|
vs_stpq_post(vs_back(vs1), tmpAddr);
|
|
vs_stpq_post(vs_back(vs3), tmpAddr);
|
|
|
|
// level 3
|
|
vs_ldpq(vq, kyberConsts);
|
|
int offsets2[4] = { 0, 64, 128, 192 };
|
|
vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
|
|
load64shorts(vs2, zetas);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
|
|
vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_addv(vs1, __ T8H, vs1, vs2);
|
|
vs_stpq_indexed(vs1, coeffs, 0, offsets2);
|
|
vs_stpq_indexed(vs3, coeffs, 32, offsets2);
|
|
|
|
vs_ldpq(vq, kyberConsts);
|
|
vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
|
|
load64shorts(vs2, zetas);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
|
|
vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_addv(vs1, __ T8H, vs1, vs2);
|
|
vs_stpq_indexed(vs1, coeffs, 256, offsets2);
|
|
vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
|
|
|
|
// level 4
|
|
// At level 4 coefficients occur in 8 discrete blocks of size 16
|
|
    // so they are loaded using an ldr at 8 distinct offsets.
|
|
|
|
vs_ldpq(vq, kyberConsts);
|
|
int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
|
|
vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
|
|
load64shorts(vs2, zetas);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
|
|
vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_addv(vs1, __ T8H, vs1, vs2);
|
|
vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
|
|
vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
|
|
|
|
vs_ldpq(vq, kyberConsts);
|
|
vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
|
|
load64shorts(vs2, zetas);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
|
|
vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_addv(vs1, __ T8H, vs1, vs2);
|
|
vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
|
|
vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
|
|
|
|
// level 5
|
|
// At level 5 related coefficients occur in discrete blocks of size 8 so
|
|
    // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
|
|
|
|
vs_ldpq(vq, kyberConsts);
|
|
int offsets4[4] = { 0, 32, 64, 96 };
|
|
vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
|
|
load32shorts(vs_front(vs2), zetas);
|
|
kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
|
|
vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
|
|
vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
|
|
load32shorts(vs_front(vs2), zetas);
|
|
kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
|
|
vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
|
|
vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
|
|
load32shorts(vs_front(vs2), zetas);
|
|
kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
|
|
vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
|
|
|
|
vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
|
|
load32shorts(vs_front(vs2), zetas);
|
|
kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
|
|
vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
|
|
|
|
// level 6
|
|
// At level 6 related coefficients occur in discrete blocks of size 4 so
|
|
    // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
|
|
|
|
vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
|
|
load32shorts(vs_front(vs2), zetas);
|
|
kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
|
|
vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
|
|
vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
|
|
// __ ldpq(v18, v19, __ post(zetas, 32));
|
|
load32shorts(vs_front(vs2), zetas);
|
|
kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
|
|
vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
|
|
|
|
vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
|
|
load32shorts(vs_front(vs2), zetas);
|
|
kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
|
|
vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
|
|
|
|
vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
|
|
load32shorts(vs_front(vs2), zetas);
|
|
kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
|
|
vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ mov(r0, zr); // return 0
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Kyber Inverse NTT function
|
|
// Implements
|
|
// static int implKyberInverseNtt(short[] poly, short[] zetas) {}
|
|
//
|
|
// coeffs (short[256]) = c_rarg0
|
|
// ntt_zetas (short[256]) = c_rarg1
|
|
address generate_kyberInverseNtt() {
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
__ enter();
|
|
|
|
const Register coeffs = c_rarg0;
|
|
const Register zetas = c_rarg1;
|
|
|
|
const Register kyberConsts = r10;
|
|
const Register tmpAddr = r11;
|
|
const Register tmpAddr2 = c_rarg2;
|
|
|
|
VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
|
|
VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
|
|
VSeq<2> vq(30); // n.b. constants overlap vs3
|
|
|
|
__ lea(kyberConsts,
|
|
ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
|
|
|
|
// level 0
|
|
// At level 0 related coefficients occur in discrete blocks of size 4 so
|
|
    // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
|
|
|
|
vs_ldpq(vq, kyberConsts);
|
|
int offsets4[4] = { 0, 32, 64, 96 };
|
|
vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
|
|
load32shorts(vs_front(vs2), zetas);
|
|
kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
|
|
vs_front(vs2), vs_back(vs2), vtmp, vq);
|
|
vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
|
|
vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
|
|
load32shorts(vs_front(vs2), zetas);
|
|
kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
|
|
vs_front(vs2), vs_back(vs2), vtmp, vq);
|
|
vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
|
|
vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
|
|
load32shorts(vs_front(vs2), zetas);
|
|
kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
|
|
vs_front(vs2), vs_back(vs2), vtmp, vq);
|
|
vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
|
|
vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
|
|
load32shorts(vs_front(vs2), zetas);
|
|
kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
|
|
vs_front(vs2), vs_back(vs2), vtmp, vq);
|
|
vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
|
|
|
|
// level 1
|
|
// At level 1 related coefficients occur in discrete blocks of size 8 so
|
|
    // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
|
|
|
|
vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
|
|
load32shorts(vs_front(vs2), zetas);
|
|
kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
|
|
vs_front(vs2), vs_back(vs2), vtmp, vq);
|
|
vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
|
|
vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
|
|
load32shorts(vs_front(vs2), zetas);
|
|
kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
|
|
vs_front(vs2), vs_back(vs2), vtmp, vq);
|
|
vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
|
|
|
|
vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
|
|
load32shorts(vs_front(vs2), zetas);
|
|
kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
|
|
vs_front(vs2), vs_back(vs2), vtmp, vq);
|
|
vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
|
|
vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
|
|
load32shorts(vs_front(vs2), zetas);
|
|
kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
|
|
vs_front(vs2), vs_back(vs2), vtmp, vq);
|
|
vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
|
|
|
|
// level 2
|
|
// At level 2 coefficients occur in 8 discrete blocks of size 16
|
|
    // so they are loaded using an ldr at 8 distinct offsets.
|
|
|
|
int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
|
|
vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
|
|
vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
|
|
vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_subv(vs1, __ T8H, vs1, vs2);
|
|
vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
|
|
load64shorts(vs2, zetas);
|
|
vs_ldpq(vq, kyberConsts);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
|
|
|
|
vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
|
|
vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
|
|
vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_subv(vs1, __ T8H, vs1, vs2);
|
|
vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
|
|
load64shorts(vs2, zetas);
|
|
vs_ldpq(vq, kyberConsts);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
|
|
|
|
// Barrett reduction at indexes where overflow may happen
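    // As a scalar model (illustrative only), with q being kyber_q and m the
    // Barrett multiplier loaded below (roughly 2^26 / q), the sqdmulh/sshr/
    // mlsv sequence computes per 16-bit lane
    //
    //   int16_t barrett_reduce(int16_t a, int16_t q, int16_t m) {
    //     int16_t t = (int16_t)(((int32_t)a * m * 2) >> 16);  // sqdmulh
    //     t = (int16_t)(t >> 11);                             // sshr
    //     return (int16_t)(a - t * q);                        // mlsv
    //   }
    //
    // i.e. t ~= a * m / 2^26 ~= a / q, so the result is congruent to a
    // mod q and lies in a small range around zero.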
|
|
|
|
// load q and the multiplier for the Barrett reduction
|
|
__ add(tmpAddr, kyberConsts, 16);
|
|
vs_ldpq(vq, tmpAddr);
|
|
|
|
VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
|
|
VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
|
|
VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul
|
|
vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
|
|
vs_sqdmulh(vs2, __ T8H, vs1, vq2);
|
|
vs_sshr(vs2, __ T8H, vs2, 11);
|
|
vs_mlsv(vs1, __ T8H, vs2, vq1);
|
|
vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
|
|
vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
|
|
vs_sqdmulh(vs2, __ T8H, vs1, vq2);
|
|
vs_sshr(vs2, __ T8H, vs2, 11);
|
|
vs_mlsv(vs1, __ T8H, vs2, vq1);
|
|
vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
|
|
|
|
// level 3
|
|
// From level 3 upwards coefficients occur in discrete blocks whose size is
|
|
    // some multiple of 32 so they can be loaded using ldpq and suitable indexes.
|
|
|
|
int offsets2[4] = { 0, 64, 128, 192 };
|
|
vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
|
|
vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
|
|
vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_subv(vs1, __ T8H, vs1, vs2);
|
|
vs_stpq_indexed(vs3, coeffs, 0, offsets2);
|
|
load64shorts(vs2, zetas);
|
|
vs_ldpq(vq, kyberConsts);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
vs_stpq_indexed(vs2, coeffs, 32, offsets2);
|
|
|
|
vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
|
|
vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
|
|
vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_subv(vs1, __ T8H, vs1, vs2);
|
|
vs_stpq_indexed(vs3, coeffs, 256, offsets2);
|
|
load64shorts(vs2, zetas);
|
|
vs_ldpq(vq, kyberConsts);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
|
|
|
|
// level 4
|
|
|
|
int offsets1[4] = { 0, 32, 128, 160 };
|
|
vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
|
|
vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
|
|
vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_subv(vs1, __ T8H, vs1, vs2);
|
|
vs_stpq_indexed(vs3, coeffs, 0, offsets1);
|
|
load64shorts(vs2, zetas);
|
|
vs_ldpq(vq, kyberConsts);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
vs_stpq_indexed(vs2, coeffs, 64, offsets1);
|
|
|
|
vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
|
|
vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
|
|
vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_subv(vs1, __ T8H, vs1, vs2);
|
|
vs_stpq_indexed(vs3, coeffs, 256, offsets1);
|
|
load64shorts(vs2, zetas);
|
|
vs_ldpq(vq, kyberConsts);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
|
|
|
|
// level 5
|
|
|
|
__ add(tmpAddr, coeffs, 0);
|
|
load64shorts(vs1, tmpAddr);
|
|
__ add(tmpAddr, coeffs, 128);
|
|
load64shorts(vs2, tmpAddr);
|
|
vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_subv(vs1, __ T8H, vs1, vs2);
|
|
__ add(tmpAddr, coeffs, 0);
|
|
store64shorts(vs3, tmpAddr);
|
|
load64shorts(vs2, zetas);
|
|
vs_ldpq(vq, kyberConsts);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
__ add(tmpAddr, coeffs, 128);
|
|
store64shorts(vs2, tmpAddr);
|
|
|
|
load64shorts(vs1, tmpAddr);
|
|
__ add(tmpAddr, coeffs, 384);
|
|
load64shorts(vs2, tmpAddr);
|
|
vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_subv(vs1, __ T8H, vs1, vs2);
|
|
__ add(tmpAddr, coeffs, 256);
|
|
store64shorts(vs3, tmpAddr);
|
|
load64shorts(vs2, zetas);
|
|
vs_ldpq(vq, kyberConsts);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
__ add(tmpAddr, coeffs, 384);
|
|
store64shorts(vs2, tmpAddr);
|
|
|
|
// Barrett reduction at indexes where overflow may happen
|
|
|
|
// load q and the multiplier for the Barrett reduction
|
|
__ add(tmpAddr, kyberConsts, 16);
|
|
vs_ldpq(vq, tmpAddr);
|
|
|
|
int offsets0[2] = { 0, 256 };
|
|
vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
|
|
vs_sqdmulh(vs2, __ T8H, vs1, vq2);
|
|
vs_sshr(vs2, __ T8H, vs2, 11);
|
|
vs_mlsv(vs1, __ T8H, vs2, vq1);
|
|
vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
|
|
|
|
// level 6
|
|
|
|
__ add(tmpAddr, coeffs, 0);
|
|
load64shorts(vs1, tmpAddr);
|
|
__ add(tmpAddr, coeffs, 256);
|
|
load64shorts(vs2, tmpAddr);
|
|
vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_subv(vs1, __ T8H, vs1, vs2);
|
|
__ add(tmpAddr, coeffs, 0);
|
|
store64shorts(vs3, tmpAddr);
|
|
load64shorts(vs2, zetas);
|
|
vs_ldpq(vq, kyberConsts);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
__ add(tmpAddr, coeffs, 256);
|
|
store64shorts(vs2, tmpAddr);
|
|
|
|
__ add(tmpAddr, coeffs, 128);
|
|
load64shorts(vs1, tmpAddr);
|
|
__ add(tmpAddr, coeffs, 384);
|
|
load64shorts(vs2, tmpAddr);
|
|
vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
|
|
vs_subv(vs1, __ T8H, vs1, vs2);
|
|
__ add(tmpAddr, coeffs, 128);
|
|
store64shorts(vs3, tmpAddr);
|
|
load64shorts(vs2, zetas);
|
|
vs_ldpq(vq, kyberConsts);
|
|
kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
|
|
__ add(tmpAddr, coeffs, 384);
|
|
store64shorts(vs2, tmpAddr);
|
|
|
|
// multiply by 2^-n
|
|
|
|
// load toMont(2^-n mod q)
|
|
__ add(tmpAddr, kyberConsts, 48);
|
|
__ ldr(v29, __ Q, tmpAddr);
|
|
|
|
vs_ldpq(vq, kyberConsts);
|
|
__ add(tmpAddr, coeffs, 0);
|
|
load64shorts(vs1, tmpAddr);
|
|
kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
|
|
__ add(tmpAddr, coeffs, 0);
|
|
store64shorts(vs2, tmpAddr);
|
|
|
|
    // now tmpAddr contains coeffs + 128 because store64shorts post-incremented it
|
|
load64shorts(vs1, tmpAddr);
|
|
kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
|
|
__ add(tmpAddr, coeffs, 128);
|
|
store64shorts(vs2, tmpAddr);
|
|
|
|
// now tmpAddr contains coeffs + 256
|
|
load64shorts(vs1, tmpAddr);
|
|
kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
|
|
__ add(tmpAddr, coeffs, 256);
|
|
store64shorts(vs2, tmpAddr);
|
|
|
|
// now tmpAddr contains coeffs + 384
|
|
load64shorts(vs1, tmpAddr);
|
|
kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
|
|
__ add(tmpAddr, coeffs, 384);
|
|
store64shorts(vs2, tmpAddr);
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ mov(r0, zr); // return 0
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Kyber multiply polynomials in the NTT domain.
|
|
// Implements
|
|
// static int implKyberNttMult(
|
|
// short[] result, short[] ntta, short[] nttb, short[] zetas) {}
|
|
//
|
|
// result (short[256]) = c_rarg0
|
|
// ntta (short[256]) = c_rarg1
|
|
// nttb (short[256]) = c_rarg2
|
|
// zetas (short[128]) = c_rarg3
|
|
address generate_kyberNttMult() {
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_kyberNttMult_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
__ enter();
|
|
|
|
const Register result = c_rarg0;
|
|
const Register ntta = c_rarg1;
|
|
const Register nttb = c_rarg2;
|
|
const Register zetas = c_rarg3;
|
|
|
|
const Register kyberConsts = r10;
|
|
const Register limit = r11;
|
|
|
|
VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps
|
|
VSeq<4> vs3(16), vs4(20);
|
|
VSeq<2> vq(30); // pair of constants for montmul: q, qinv
|
|
VSeq<2> vz(28); // pair of zetas
|
|
VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ
|
|
|
|
__ lea(kyberConsts,
|
|
ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
|
|
|
|
Label kyberNttMult_loop;
|
|
|
|
__ add(limit, result, 512);
|
|
|
|
// load q and qinv
|
|
vs_ldpq(vq, kyberConsts);
|
|
|
|
// load R^2 mod q (to convert back from Montgomery representation)
|
|
__ add(kyberConsts, kyberConsts, 64);
|
|
__ ldr(v27, __ Q, kyberConsts);
|
|
|
|
__ BIND(kyberNttMult_loop);
|
|
|
|
// load 16 zetas
|
|
vs_ldpq_post(vz, zetas);
|
|
|
|
// load 2 sets of 32 coefficients from the two input arrays
|
|
// interleaved as shorts. i.e. pairs of shorts adjacent in memory
|
|
// are striped across pairs of vector registers
|
|
vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
|
|
vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H
|
|
vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
|
|
vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H
|
|
|
|
// compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
|
|
// i.e. montmul the first and second halves of vs1 in order and
|
|
// then with one sequence reversed storing the two results in vs3
|
|
//
|
|
// vs3[0] <- montmul(a0, b0)
|
|
// vs3[1] <- montmul(a1, b1)
|
|
// vs3[2] <- montmul(a0, b1)
|
|
// vs3[3] <- montmul(a1, b0)
|
|
kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
|
|
kyber_montmul16(vs_back(vs3),
|
|
vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
|
|
|
|
// compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
|
|
// i.e. montmul the first and second halves of vs4 in order and
|
|
// then with one sequence reversed storing the two results in vs1
|
|
//
|
|
// vs1[0] <- montmul(a2, b2)
|
|
// vs1[1] <- montmul(a3, b3)
|
|
// vs1[2] <- montmul(a2, b3)
|
|
// vs1[3] <- montmul(a3, b2)
|
|
kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
|
|
kyber_montmul16(vs_back(vs1),
|
|
vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
|
|
|
|
// montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
|
|
// We can schedule two montmuls at a time if we use a suitable vector
|
|
// sequence <vs3[1], vs1[1]>.
|
|
int delta = vs1[1]->encoding() - vs3[1]->encoding();
|
|
VSeq<2> vs5(vs3[1], delta);
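    // n.b. a VSeq is just a base register plus a fixed encoding stride, so
    // vs5 here names exactly the pair <vs3[1], vs1[1]> whatever registers
    // those happen to be.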
|
|
|
|
// vs3[1] <- montmul(montmul(a1, b1), z0)
|
|
// vs1[1] <- montmul(montmul(a3, b3), z1)
|
|
kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
|
|
|
|
// add results in pairs storing in vs3
|
|
// vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
|
|
// vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
|
|
vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
|
|
|
|
// vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
|
|
// vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
|
|
vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
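    // At this point each pair of input polynomials (a0 + a1*X, b0 + b1*X)
    // has been combined into the Kyber basemul result (a sketch, with all
    // products Montgomery-reduced and z the zeta for the pair):
    //
    //   c0 = a0*b0 + a1*b1*z
    //   c1 = a0*b1 + a1*b0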
|
|
|
|
// vs1 <- montmul(vs3, montRSquareModQ)
|
|
kyber_montmul32(vs1, vs3, vc, vs2, vq);
|
|
|
|
// store back the two pairs of result vectors de-interleaved as 8H elements
|
|
    // i.e. storing each pair of shorts striped across a register pair adjacent
|
|
// in memory
|
|
vs_st2_post(vs1, __ T8H, result);
|
|
|
|
__ cmp(result, limit);
|
|
__ br(Assembler::NE, kyberNttMult_loop);
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ mov(r0, zr); // return 0
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Kyber add 2 polynomials.
|
|
// Implements
|
|
// static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
|
|
//
|
|
// result (short[256]) = c_rarg0
|
|
// a (short[256]) = c_rarg1
|
|
// b (short[256]) = c_rarg2
|
|
address generate_kyberAddPoly_2() {
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
__ enter();
|
|
|
|
const Register result = c_rarg0;
|
|
const Register a = c_rarg1;
|
|
const Register b = c_rarg2;
|
|
|
|
const Register kyberConsts = r11;
|
|
|
|
// We sum 256 sets of values in total i.e. 32 x 8H quadwords.
|
|
// So, we can load, add and store the data in 3 groups of 11,
|
|
// 11 and 10 at a time i.e. we need to map sets of 10 or 11
|
|
// registers. A further constraint is that the mapping needs
|
|
// to skip callee saves. So, we allocate the register
|
|
// sequences using two 8 sequences, two 2 sequences and two
|
|
// single registers.
|
|
VSeq<8> vs1_1(0);
|
|
VSeq<2> vs1_2(16);
|
|
FloatRegister vs1_3 = v28;
|
|
VSeq<8> vs2_1(18);
|
|
VSeq<2> vs2_2(26);
|
|
FloatRegister vs2_3 = v29;
|
|
|
|
// two constant vector sequences
|
|
VSeq<8> vc_1(31, 0);
|
|
VSeq<2> vc_2(31, 0);
|
|
|
|
FloatRegister vc_3 = v31;
|
|
__ lea(kyberConsts,
|
|
ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
|
|
|
|
__ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
|
|
for (int i = 0; i < 3; i++) {
|
|
// load 80 or 88 values from a into vs1_1/2/3
|
|
vs_ldpq_post(vs1_1, a);
|
|
vs_ldpq_post(vs1_2, a);
|
|
if (i < 2) {
|
|
__ ldr(vs1_3, __ Q, __ post(a, 16));
|
|
}
|
|
// load 80 or 88 values from b into vs2_1/2/3
|
|
vs_ldpq_post(vs2_1, b);
|
|
vs_ldpq_post(vs2_2, b);
|
|
if (i < 2) {
|
|
__ ldr(vs2_3, __ Q, __ post(b, 16));
|
|
}
|
|
// sum 80 or 88 values across vs1 and vs2 into vs1
|
|
vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
|
|
vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
|
|
if (i < 2) {
|
|
__ addv(vs1_3, __ T8H, vs1_3, vs2_3);
|
|
}
|
|
// add constant to all 80 or 88 results
|
|
vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
|
|
vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
|
|
if (i < 2) {
|
|
__ addv(vs1_3, __ T8H, vs1_3, vc_3);
|
|
}
|
|
// store 80 or 88 values
|
|
vs_stpq_post(vs1_1, result);
|
|
vs_stpq_post(vs1_2, result);
|
|
if (i < 2) {
|
|
__ str(vs1_3, __ Q, __ post(result, 16));
|
|
}
|
|
}
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ mov(r0, zr); // return 0
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Kyber add 3 polynomials.
|
|
// Implements
|
|
// static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
|
|
//
|
|
// result (short[256]) = c_rarg0
|
|
// a (short[256]) = c_rarg1
|
|
// b (short[256]) = c_rarg2
|
|
// c (short[256]) = c_rarg3
|
|
address generate_kyberAddPoly_3() {
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
__ enter();
|
|
|
|
const Register result = c_rarg0;
|
|
const Register a = c_rarg1;
|
|
const Register b = c_rarg2;
|
|
const Register c = c_rarg3;
|
|
|
|
const Register kyberConsts = r11;
|
|
|
|
// As above we sum 256 sets of values in total i.e. 32 x 8H
|
|
// quadwords. So, we can load, add and store the data in 3
|
|
// groups of 11, 11 and 10 at a time i.e. we need to map sets
|
|
// of 10 or 11 registers. A further constraint is that the
|
|
// mapping needs to skip callee saves. So, we allocate the
|
|
// register sequences using two 8 sequences, two 2 sequences
|
|
// and two single registers.
|
|
VSeq<8> vs1_1(0);
|
|
VSeq<2> vs1_2(16);
|
|
FloatRegister vs1_3 = v28;
|
|
VSeq<8> vs2_1(18);
|
|
VSeq<2> vs2_2(26);
|
|
FloatRegister vs2_3 = v29;
|
|
|
|
// two constant vector sequences
|
|
VSeq<8> vc_1(31, 0);
|
|
VSeq<2> vc_2(31, 0);
|
|
|
|
FloatRegister vc_3 = v31;
|
|
|
|
__ lea(kyberConsts,
|
|
ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
|
|
|
|
__ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
|
|
for (int i = 0; i < 3; i++) {
|
|
// load 80 or 88 values from a into vs1_1/2/3
|
|
vs_ldpq_post(vs1_1, a);
|
|
vs_ldpq_post(vs1_2, a);
|
|
if (i < 2) {
|
|
__ ldr(vs1_3, __ Q, __ post(a, 16));
|
|
}
|
|
// load 80 or 88 values from b into vs2_1/2/3
|
|
vs_ldpq_post(vs2_1, b);
|
|
vs_ldpq_post(vs2_2, b);
|
|
if (i < 2) {
|
|
__ ldr(vs2_3, __ Q, __ post(b, 16));
|
|
}
|
|
// sum 80 or 88 values across vs1 and vs2 into vs1
|
|
vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
|
|
vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
|
|
if (i < 2) {
|
|
__ addv(vs1_3, __ T8H, vs1_3, vs2_3);
|
|
}
|
|
// load 80 or 88 values from c into vs2_1/2/3
|
|
vs_ldpq_post(vs2_1, c);
|
|
vs_ldpq_post(vs2_2, c);
|
|
if (i < 2) {
|
|
__ ldr(vs2_3, __ Q, __ post(c, 16));
|
|
}
|
|
// sum 80 or 88 values across vs1 and vs2 into vs1
|
|
vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
|
|
vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
|
|
if (i < 2) {
|
|
__ addv(vs1_3, __ T8H, vs1_3, vs2_3);
|
|
}
|
|
// add constant to all 80 or 88 results
|
|
vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
|
|
vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
|
|
if (i < 2) {
|
|
__ addv(vs1_3, __ T8H, vs1_3, vc_3);
|
|
}
|
|
// store 80 or 88 values
|
|
vs_stpq_post(vs1_1, result);
|
|
vs_stpq_post(vs1_2, result);
|
|
if (i < 2) {
|
|
__ str(vs1_3, __ Q, __ post(result, 16));
|
|
}
|
|
}
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ mov(r0, zr); // return 0
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Kyber parse XOF output to polynomial coefficient candidates
|
|
// or decodePoly(12, ...).
|
|
// Implements
|
|
// static int implKyber12To16(
|
|
// byte[] condensed, int index, short[] parsed, int parsedLength) {}
|
|
//
|
|
// (parsedLength or (parsedLength - 48) must be divisible by 64.)
|
|
//
|
|
// condensed (byte[]) = c_rarg0
|
|
// condensedIndex = c_rarg1
|
|
// parsed (short[112 or 256]) = c_rarg2
|
|
// parsedLength (112 or 256) = c_rarg3
|
|
address generate_kyber12To16() {
|
|
Label L_F00, L_loop, L_end;
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_kyber12To16_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
__ enter();
|
|
|
|
const Register condensed = c_rarg0;
|
|
const Register condensedOffs = c_rarg1;
|
|
const Register parsed = c_rarg2;
|
|
const Register parsedLength = c_rarg3;
|
|
|
|
const Register tmpAddr = r11;
|
|
|
|
// Data is input 96 bytes at a time i.e. in groups of 6 x 16B
|
|
// quadwords so we need a 6 vector sequence for the inputs.
|
|
// Parsing produces 64 shorts, employing two 8 vector
|
|
// sequences to store and combine the intermediate data.
|
|
VSeq<6> vin(24);
|
|
VSeq<8> va(0), vb(16);
|
|
|
|
__ adr(tmpAddr, L_F00);
|
|
__ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
|
|
__ add(condensed, condensed, condensedOffs);
|
|
|
|
__ BIND(L_loop);
|
|
// load 96 (6 x 16B) byte values
|
|
vs_ld3_post(vin, __ T16B, condensed);
|
|
|
|
// The front half of sequence vin (vin[0], vin[1] and vin[2])
|
|
// holds 48 (16x3) contiguous bytes from memory striped
|
|
// horizontally across each of the 16 byte lanes. Equivalently,
|
|
// that is 16 pairs of 12-bit integers. Likewise the back half
|
|
// holds the next 48 bytes in the same arrangement.
|
|
|
|
// Each vector in the front half can also be viewed as a vertical
|
|
// strip across the 16 pairs of 12 bit integers. Each byte in
|
|
// vin[0] stores the low 8 bits of the first int in a pair. Each
|
|
// byte in vin[1] stores the high 4 bits of the first int and the
|
|
// low 4 bits of the second int. Each byte in vin[2] stores the
|
|
    // high 8 bits of the second int. Likewise for the vectors in the second
|
|
// half.
|
|
|
|
// Converting the data to 16-bit shorts requires first of all
|
|
// expanding each of the 6 x 16B vectors into 6 corresponding
|
|
// pairs of 8H vectors. Mask, shift and add operations on the
|
|
// resulting vector pairs can be used to combine 4 and 8 bit
|
|
// parts of related 8H vector elements.
|
|
//
|
|
// The middle vectors (vin[2] and vin[5]) are actually expanded
|
|
// twice, one copy manipulated to provide the lower 4 bits
|
|
// belonging to the first short in a pair and another copy
|
|
// manipulated to provide the higher 4 bits belonging to the
|
|
    // second short in a pair. This is why the vector sequences va
|
|
// and vb used to hold the expanded 8H elements are of length 8.
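    // As a scalar model (illustrative only), each group of three bytes
    // b0, b1, b2 decodes to two 12-bit values stored as shorts:
    //
    //   s0 = b0 | ((b1 & 0x0f) << 8);
    //   s1 = (b1 >> 4) | (b2 << 4);
    //
    // The 0x0f00 mask in v31 and the shifts below pick out exactly these
    // pieces once the bytes have been widened to 16 bits.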
|
|
|
|
// Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
|
|
// n.b. target elements 2 and 3 duplicate elements 4 and 5
|
|
__ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
|
|
__ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
|
|
__ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
|
|
__ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
|
|
__ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
|
|
__ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
|
|
|
|
// likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
|
|
// and vb[4:5]
|
|
__ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
|
|
__ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
|
|
__ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
|
|
__ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
|
|
__ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
|
|
__ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
|
|
|
|
// shift lo byte of copy 1 of the middle stripe into the high byte
|
|
__ shl(va[2], __ T8H, va[2], 8);
|
|
__ shl(va[3], __ T8H, va[3], 8);
|
|
__ shl(vb[2], __ T8H, vb[2], 8);
|
|
__ shl(vb[3], __ T8H, vb[3], 8);
|
|
|
|
// expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
|
|
// time pre-shifted by 4 to ensure top bits of input 12-bit int
|
|
// are in bit positions [4..11].
|
|
__ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
|
|
__ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
|
|
__ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
|
|
__ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
|
|
|
|
// mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
|
|
// shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
|
|
// copy2
|
|
__ andr(va[2], __ T16B, va[2], v31);
|
|
__ andr(va[3], __ T16B, va[3], v31);
|
|
__ ushr(va[4], __ T8H, va[4], 4);
|
|
__ ushr(va[5], __ T8H, va[5], 4);
|
|
__ andr(vb[2], __ T16B, vb[2], v31);
|
|
__ andr(vb[3], __ T16B, vb[3], v31);
|
|
__ ushr(vb[4], __ T8H, vb[4], 4);
|
|
__ ushr(vb[5], __ T8H, vb[5], 4);
|
|
|
|
// sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
|
|
// hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
|
|
// n.b. the ordering ensures: i) inputs are consumed before they
|
|
// are overwritten ii) the order of 16-bit results across successive
|
|
// pairs of vectors in va and then vb reflects the order of the
|
|
// corresponding 12-bit inputs
|
|
__ addv(va[0], __ T8H, va[0], va[2]);
|
|
__ addv(va[2], __ T8H, va[1], va[3]);
|
|
__ addv(va[1], __ T8H, va[4], va[6]);
|
|
__ addv(va[3], __ T8H, va[5], va[7]);
|
|
__ addv(vb[0], __ T8H, vb[0], vb[2]);
|
|
__ addv(vb[2], __ T8H, vb[1], vb[3]);
|
|
__ addv(vb[1], __ T8H, vb[4], vb[6]);
|
|
__ addv(vb[3], __ T8H, vb[5], vb[7]);
|
|
|
|
// store 64 results interleaved as shorts
|
|
vs_st2_post(vs_front(va), __ T8H, parsed);
|
|
vs_st2_post(vs_front(vb), __ T8H, parsed);
|
|
|
|
__ sub(parsedLength, parsedLength, 64);
|
|
__ cmp(parsedLength, (u1)64);
|
|
__ br(Assembler::GE, L_loop);
|
|
__ cbz(parsedLength, L_end);
|
|
|
|
// if anything is left it should be a final 72 bytes of input
|
|
// i.e. a final 48 12-bit values. so we handle this by loading
|
|
// 48 bytes into all 16B lanes of front(vin) and only 24
|
|
// bytes into the lower 8B lane of back(vin)
|
|
vs_ld3_post(vs_front(vin), __ T16B, condensed);
|
|
vs_ld3(vs_back(vin), __ T8B, condensed);
|
|
|
|
// Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
|
|
// n.b. target elements 2 and 3 of va duplicate elements 4 and
|
|
// 5 and target element 2 of vb duplicates element 4.
|
|
__ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
|
|
__ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
|
|
__ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
|
|
__ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
|
|
__ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
|
|
__ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
|
|
|
|
// This time expand just the lower 8 lanes
|
|
__ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
|
|
__ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
|
|
__ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
|
|
|
|
// shift lo byte of copy 1 of the middle stripe into the high byte
|
|
__ shl(va[2], __ T8H, va[2], 8);
|
|
__ shl(va[3], __ T8H, va[3], 8);
|
|
__ shl(vb[2], __ T8H, vb[2], 8);
|
|
|
|
// expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
|
|
// vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
|
|
// int are in bit positions [4..11].
|
|
__ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
|
|
__ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
|
|
__ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
|
|
|
|
// mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
|
|
// shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
|
|
// copy2
|
|
__ andr(va[2], __ T16B, va[2], v31);
|
|
__ andr(va[3], __ T16B, va[3], v31);
|
|
__ ushr(va[4], __ T8H, va[4], 4);
|
|
__ ushr(va[5], __ T8H, va[5], 4);
|
|
__ andr(vb[2], __ T16B, vb[2], v31);
|
|
__ ushr(vb[4], __ T8H, vb[4], 4);
|
|
|
|
|
|
|
|
// sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
|
|
// hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
|
|
|
|
// n.b. ordering ensures: i) inputs are consumed before they are
|
|
    // overwritten ii) order of 16-bit results across successive
|
|
// pairs of vectors in va and then lower half of vb reflects order
|
|
// of corresponding 12-bit inputs
|
|
__ addv(va[0], __ T8H, va[0], va[2]);
|
|
__ addv(va[2], __ T8H, va[1], va[3]);
|
|
__ addv(va[1], __ T8H, va[4], va[6]);
|
|
__ addv(va[3], __ T8H, va[5], va[7]);
|
|
__ addv(vb[0], __ T8H, vb[0], vb[2]);
|
|
__ addv(vb[1], __ T8H, vb[4], vb[6]);
|
|
|
|
// store 48 results interleaved as shorts
|
|
vs_st2_post(vs_front(va), __ T8H, parsed);
|
|
vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
|
|
|
|
__ BIND(L_end);
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ mov(r0, zr); // return 0
|
|
__ ret(lr);
|
|
|
|
// bind label and generate constant data used by this stub
|
|
__ BIND(L_F00);
|
|
__ emit_int64(0x0f000f000f000f00);
|
|
__ emit_int64(0x0f000f000f000f00);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Kyber Barrett reduce function.
|
|
// Implements
|
|
// static int implKyberBarrettReduce(short[] coeffs) {}
|
|
//
|
|
// coeffs (short[256]) = c_rarg0
|
|
address generate_kyberBarrettReduce() {
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
__ enter();
|
|
|
|
const Register coeffs = c_rarg0;
|
|
|
|
const Register kyberConsts = r10;
|
|
const Register result = r11;
|
|
|
|
// As above we process a total of 256 values i.e. 32 x
// 8H quadwords. So, we can load, reduce and store the data in 3
// groups of 11, 11 and 10 at a time i.e. we need to map sets
|
|
// of 10 or 11 registers. A further constraint is that the
|
|
// mapping needs to skip callee saves. So, we allocate the
|
|
// register sequences using two 8 sequences, two 2 sequences
|
|
// and two single registers.
|
|
VSeq<8> vs1_1(0);
|
|
VSeq<2> vs1_2(16);
|
|
FloatRegister vs1_3 = v28;
|
|
VSeq<8> vs2_1(18);
|
|
VSeq<2> vs2_2(26);
|
|
FloatRegister vs2_3 = v29;
|
|
|
|
// we also need a pair of corresponding constant sequences
|
|
|
|
VSeq<8> vc1_1(30, 0);
|
|
VSeq<2> vc1_2(30, 0);
|
|
FloatRegister vc1_3 = v30; // for kyber_q
|
|
|
|
VSeq<8> vc2_1(31, 0);
|
|
VSeq<2> vc2_2(31, 0);
|
|
FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
|
|
|
|
__ add(result, coeffs, 0);
|
|
__ lea(kyberConsts,
|
|
ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
|
|
|
|
// load q and the multiplier for the Barrett reduction
|
|
__ add(kyberConsts, kyberConsts, 16);
|
|
__ ldpq(vc1_3, vc2_3, kyberConsts);
|
|
|
|
for (int i = 0; i < 3; i++) {
|
|
// load 80 or 88 coefficients
|
|
vs_ldpq_post(vs1_1, coeffs);
|
|
vs_ldpq_post(vs1_2, coeffs);
|
|
if (i < 2) {
|
|
__ ldr(vs1_3, __ Q, __ post(coeffs, 16));
|
|
}
|
|
|
|
// vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
|
|
vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
|
|
vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
|
|
if (i < 2) {
|
|
__ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
|
|
}
|
|
|
|
// vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
|
|
vs_sshr(vs2_1, __ T8H, vs2_1, 11);
|
|
vs_sshr(vs2_2, __ T8H, vs2_2, 11);
|
|
if (i < 2) {
|
|
__ sshr(vs2_3, __ T8H, vs2_3, 11);
|
|
}
|
|
|
|
// vs1 <- vs1 - vs2 * kyber_q
|
|
vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
|
|
vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
|
|
if (i < 2) {
|
|
__ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
|
|
}
|
|
|
|
vs_stpq_post(vs1_1, result);
|
|
vs_stpq_post(vs1_2, result);
|
|
if (i < 2) {
|
|
__ str(vs1_3, __ Q, __ post(result, 16));
|
|
}
|
|
}
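// For reference, the scalar Barrett reduction applied above to each 16-bit
// lane (illustrative sketch only; kyber_q and barrettMultiplier stand for the
// constants loaded from _kyberConsts):
//
//   int t = ((int)c * barrettMultiplier) >> 26;  // sqdmulh (x2 >> 16) then sshr #11
//   c = (short)(c - t * kyber_q);                // mlsv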
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ mov(r0, zr); // return 0
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
|
|
// Dilithium-specific montmul helper routines that generate parallel
|
|
// code for, respectively, a single 4x4s vector sequence montmul or
|
|
// two such multiplies in a row.
|
|
|
|
// Perform 16 32-bit Montgomery multiplications in parallel
|
|
void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
|
|
const VSeq<4>& vtmp, const VSeq<2>& vq) {
|
|
// Use the helper routine to schedule a 4x4S Montgomery multiply.
|
|
// It will assert that the register use is valid
|
|
vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
|
|
}
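// For reference, the scalar Montgomery multiplication computed per 32-bit lane
// by the helper above (illustrative sketch only; DILITHIUM_Q and
// MONT_Q_INV_MOD_R are assumed to correspond to the q/qInv constants loaded
// from _dilithiumConsts, with R = 2^32):
//
//   static int montMul(int a, int b) {
//     long prod = (long)a * b;                              // full 64-bit product
//     int t = (int)prod * MONT_Q_INV_MOD_R;                 // low 32 bits only
//     return (int)((prod - (long)t * DILITHIUM_Q) >> 32);   // (a * b * R^-1) mod q
//   }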
|
|
|
|
// Perform 2x16 32-bit Montgomery multiplications in parallel
|
|
void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
|
|
const VSeq<4>& vtmp, const VSeq<2>& vq) {
|
|
// Schedule two successive 4x4S multiplies via the montmul helper
|
|
// on the front and back halves of va, vb and vc. The helper will
|
|
// assert that the register use has no overlap conflicts on each
|
|
// individual call but we also need to ensure that the necessary
|
|
// disjoint/equality constraints are met across both calls.
|
|
|
|
// vb, vc, vtmp and vq must be disjoint. va must either be
|
|
// disjoint from all other registers or equal vc
|
|
|
|
assert(vs_disjoint(vb, vc), "vb and vc overlap");
|
|
assert(vs_disjoint(vb, vq), "vb and vq overlap");
|
|
assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
|
|
|
|
assert(vs_disjoint(vc, vq), "vc and vq overlap");
|
|
assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
|
|
|
|
assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
|
|
|
|
assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
|
|
assert(vs_disjoint(va, vb), "va and vb overlap");
|
|
assert(vs_disjoint(va, vq), "va and vq overlap");
|
|
assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
|
|
|
|
// We multiply the front and back halves of each sequence 4 at a
|
|
// time because
|
|
//
|
|
// 1) we are currently only able to get 4-way instruction
|
|
// parallelism at best
|
|
//
|
|
// 2) we need registers for the constants in vq and temporary
|
|
// scratch registers to hold intermediate results so vtmp can only
|
|
// be a VSeq<4> which means we only have 4 scratch slots.
|
|
|
|
vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
|
|
vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
|
|
}
|
|
|
|
// Perform combined montmul then add/sub on 4x4S vectors.
|
|
void dilithium_montmul16_sub_add(
|
|
const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
|
|
const VSeq<4>& vtmp, const VSeq<2>& vq) {
|
|
// compute a = montmul(a1, c)
|
|
dilithium_montmul16(vc, va1, vc, vtmp, vq);
|
|
// output a1 = a0 - a
|
|
vs_subv(va1, __ T4S, va0, vc);
|
|
// and a0 = a0 + a
|
|
vs_addv(va0, __ T4S, va0, vc);
|
|
}
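// i.e. the scalar Cooley-Tukey style butterfly applied per lane
// (illustrative sketch; c holds the zeta):
//
//   t  = montMul(a1, c);
//   a1 = a0 - t;
//   a0 = a0 + t;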
|
|
|
|
// Perform combined add/sub then montmul on 4x4S vectors.
|
|
void dilithium_sub_add_montmul16(
|
|
const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
|
|
const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
|
|
// compute c = a0 - a1
|
|
vs_subv(vtmp1, __ T4S, va0, va1);
|
|
// output a0 = a0 + a1
|
|
vs_addv(va0, __ T4S, va0, va1);
|
|
// output a1 = b montmul c
|
|
dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
|
|
}
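// i.e. the scalar Gentleman-Sande style butterfly applied per lane
// (illustrative sketch; b holds the zeta):
//
//   t  = a0 - a1;
//   a0 = a0 + a1;
//   a1 = montMul(t, b);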
|
|
|
|
// At these levels, the indices that correspond to the 'j's (and 'j+l's)
|
|
// in the Java implementation come in sequences of at least 8, so we
|
|
// can use ldpq to collect the corresponding data into pairs of vector
|
|
// registers.
|
|
// We collect the coefficients corresponding to the 'j+l' indexes into
|
|
// the vector registers v0-v7, the zetas into the vector registers v16-v23
|
|
// then we do the (Montgomery) multiplications by the zetas in parallel
|
|
// into v16-v23, load the coeffs corresponding to the 'j' indexes into
|
|
// v0-v7, then do the additions into v24-v31 and the subtractions into
|
|
// v0-v7 and finally save the results back to the coeffs array.
|
|
void dilithiumNttLevel0_4(const Register dilithiumConsts,
|
|
const Register coeffs, const Register zetas) {
|
|
int c1 = 0;
|
|
int c2 = 512;
|
|
int startIncr;
|
|
// don't use callee save registers v8 - v15
|
|
VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
|
|
VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
|
|
VSeq<2> vq(30); // n.b. constants overlap vs3
|
|
int offsets[4] = { 0, 32, 64, 96 };
|
|
|
|
for (int level = 0; level < 5; level++) {
|
|
int c1Start = c1;
|
|
int c2Start = c2;
|
|
if (level == 3) {
|
|
offsets[1] = 32;
|
|
offsets[2] = 128;
|
|
offsets[3] = 160;
|
|
} else if (level == 4) {
|
|
offsets[1] = 64;
|
|
offsets[2] = 128;
|
|
offsets[3] = 192;
|
|
}
|
|
|
|
// For levels 1 - 4 we simply load 2 x 4 adjacent values at a
|
|
// time at 4 different offsets and multiply them in order by the
|
|
// next set of input values. So we employ indexed load and store
|
|
// pair instructions with arrangement 4S.
|
|
for (int i = 0; i < 4; i++) {
|
|
// reload q and qinv
|
|
vs_ldpq(vq, dilithiumConsts); // qInv, q
|
|
// load 8x4S coefficients via second start pos == c2
|
|
vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
|
|
// load next 8x4S inputs == b
|
|
vs_ldpq_post(vs2, zetas);
|
|
// compute a == c2 * b mod MONT_Q
|
|
dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
|
|
// load 8x4s coefficients via first start pos == c1
|
|
vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
|
|
// compute a1 = c1 + a
|
|
vs_addv(vs3, __ T4S, vs1, vs2);
|
|
// compute a2 = c1 - a
|
|
vs_subv(vs1, __ T4S, vs1, vs2);
|
|
// output a1 and a2
|
|
vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
|
|
vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
|
|
|
|
int k = 4 * level + i;
|
|
|
|
if (k > 7) {
|
|
startIncr = 256;
|
|
} else if (k == 5) {
|
|
startIncr = 384;
|
|
} else {
|
|
startIncr = 128;
|
|
}
|
|
|
|
c1Start += startIncr;
|
|
c2Start += startIncr;
|
|
}
|
|
|
|
c2 /= 2;
|
|
}
|
|
}
|
|
|
|
// Dilithium NTT function except for the final "normalization" to |coeff| < Q.
|
|
// Implements the method
|
|
// static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
|
|
// of the Java class sun.security.provider.ML_DSA
|
|
//
|
|
// coeffs (int[256]) = c_rarg0
|
|
// zetas (int[256]) = c_rarg1
|
|
address generate_dilithiumAlmostNtt() {
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
__ enter();
|
|
|
|
const Register coeffs = c_rarg0;
|
|
const Register zetas = c_rarg1;
|
|
|
|
const Register tmpAddr = r9;
|
|
const Register dilithiumConsts = r10;
|
|
const Register result = r11;
|
|
// don't use callee save registers v8 - v15
|
|
VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
|
|
VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
|
|
VSeq<2> vq(30); // n.b. constants overlap vs3
|
|
int offsets[4] = { 0, 32, 64, 96};
|
|
int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
|
|
int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
|
|
__ add(result, coeffs, 0);
|
|
__ lea(dilithiumConsts,
|
|
ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
|
|
|
|
// Each level represents one iteration of the outer for loop of the Java version.
|
|
|
|
// level 0-4
|
|
dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
|
|
|
|
// level 5
|
|
|
|
// At level 5 the coefficients we need to combine with the zetas
|
|
// are grouped in memory in blocks of size 4. So, for both sets of
|
|
// coefficients we load 4 adjacent values at 8 different offsets
|
|
// using an indexed ldr with register variant Q and multiply them
|
|
// in sequence order by the next set of inputs. Likewise we store
|
|
// the results using an indexed str with register variant Q.
|
|
for (int i = 0; i < 1024; i += 256) {
|
|
// reload constants q, qinv each iteration as they get clobbered later
|
|
vs_ldpq(vq, dilithiumConsts); // qInv, q
|
|
// load 32 (8x4S) coefficients via first offsets = c1
|
|
vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
|
|
// load next 32 (8x4S) inputs = b
|
|
vs_ldpq_post(vs2, zetas);
|
|
// a = b montmul c1
|
|
dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
|
|
// load 32 (8x4S) coefficients via second offsets = c2
|
|
vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
|
|
// add/sub with result of multiply
|
|
vs_addv(vs3, __ T4S, vs1, vs2); // a1 = c2 + a
|
|
vs_subv(vs1, __ T4S, vs1, vs2); // a2 = c2 - a
|
|
// write back new coefficients using same offsets
|
|
vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
|
|
vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
|
|
}
|
|
|
|
// level 6
|
|
// At level 6 the coefficients we need to combine with the zetas
|
|
// are grouped in memory in pairs, the first two being montmul
|
|
// inputs and the second add/sub inputs. We can still implement
|
|
// the montmul+sub+add using 4-way parallelism but only if we
|
|
// combine the coefficients with the zetas 16 at a time. We load 8
|
|
// adjacent values at 4 different offsets using an ld2 load with
|
|
// arrangement 2D. That interleaves the lower and upper halves of
|
|
// each pair of quadwords into successive vector registers. We
|
|
// then need to montmul the 4 even elements of the coefficients
|
|
// register sequence by the zetas in order and then add/sub the 4
|
|
// odd elements of the coefficients register sequence. We use an
|
|
// equivalent st2 operation to store the results back into memory
|
|
// de-interleaved.
|
|
for (int i = 0; i < 1024; i += 128) {
|
|
// reload constants q, qinv each iteration as they get clobbered later
|
|
vs_ldpq(vq, dilithiumConsts); // qInv, q
|
|
// load interleaved 16 (4x2D) coefficients via offsets
|
|
vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
|
|
// load next 16 (4x4S) inputs
|
|
vs_ldpq_post(vs_front(vs2), zetas);
|
|
// mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
|
|
dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
|
|
vs_front(vs2), vtmp, vq);
|
|
// store interleaved 16 (4x2D) coefficients via offsets
|
|
vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
|
|
}
|
|
|
|
// level 7
|
|
// At level 7 the coefficients we need to combine with the zetas
|
|
// occur singly with montmul inputs alternating with add/sub
|
|
// inputs. Once again we can use 4-way parallelism to combine 16
|
|
// zetas at a time. However, we have to load 8 adjacent values at
|
|
// 4 different offsets using an ld2 load with arrangement 4S. That
|
|
// interleaves the odd words of each pair into one
|
|
// coefficients vector register and the even words of the pair
|
|
// into the next register. We then need to montmul the 4 even
|
|
// elements of the coefficients register sequence by the zetas in
|
|
// order and then add/sub the 4 odd elements of the coefficients
|
|
// register sequence. We use an equivalent st2 operation to store
|
|
// the results back into memory de-interleaved.
|
|
|
|
for (int i = 0; i < 1024; i += 128) {
|
|
// reload constants q, qinv each iteration as they get clobbered later
|
|
vs_ldpq(vq, dilithiumConsts); // qInv, q
|
|
// load interleaved 16 (4x4S) coefficients via offsets
|
|
vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
|
|
// load next 16 (4x4S) inputs
|
|
vs_ldpq_post(vs_front(vs2), zetas);
|
|
// mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
|
|
dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
|
|
vs_front(vs2), vtmp, vq);
|
|
// store interleaved 16 (4x4S) coefficients via offsets
|
|
vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
|
|
}
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ mov(r0, zr); // return 0
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// At these levels, the indices that correspond to the 'j's (and 'j+l's)
|
|
// in the Java implementation come in sequences of at least 8, so we
|
|
// can use ldpq to collect the corresponding data into pairs of vector
|
|
// registers
|
|
// We collect the coefficients that correspond to the 'j's into vs1
|
|
// the coefficients that correspond to the 'j+l's into vs2 then
|
|
// do the additions into vs3 and the subtractions into vs1 then
|
|
// save the result of the additions, load the zetas into vs2
|
|
// do the (Montgomery) multiplications by zeta in parallel into vs2
|
|
// finally save the results back to the coeffs array
|
|
void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
|
|
const Register coeffs, const Register zetas) {
|
|
int c1 = 0;
|
|
int c2 = 32;
|
|
int startIncr;
|
|
int offsets[4];
|
|
VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
|
|
VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
|
|
VSeq<2> vq(30); // n.b. constants overlap vs3
|
|
|
|
offsets[0] = 0;
|
|
|
|
for (int level = 3; level < 8; level++) {
|
|
int c1Start = c1;
|
|
int c2Start = c2;
|
|
if (level == 3) {
|
|
offsets[1] = 64;
|
|
offsets[2] = 128;
|
|
offsets[3] = 192;
|
|
} else if (level == 4) {
|
|
offsets[1] = 32;
|
|
offsets[2] = 128;
|
|
offsets[3] = 160;
|
|
} else {
|
|
offsets[1] = 32;
|
|
offsets[2] = 64;
|
|
offsets[3] = 96;
|
|
}
|
|
|
|
// For levels 3 - 7 we simply load 2 x 4 adjacent values at a
|
|
// time at 4 different offsets and multiply them in order by the
|
|
// next set of input values. So we employ indexed load and store
|
|
// pair instructions with arrangement 4S.
|
|
for (int i = 0; i < 4; i++) {
|
|
// load v1 32 (8x4S) coefficients relative to first start index
|
|
vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
|
|
// load v2 32 (8x4S) coefficients relative to second start index
|
|
vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
|
|
// a0 = v1 + v2 -- n.b. clobbers vqs
|
|
vs_addv(vs3, __ T4S, vs1, vs2);
|
|
// a1 = v1 - v2
|
|
vs_subv(vs1, __ T4S, vs1, vs2);
|
|
// save a1 relative to first start index
|
|
vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
|
|
// load constants q, qinv each iteration as they get clobbered above
|
|
vs_ldpq(vq, dilithiumConsts); // qInv, q
|
|
// load b next 32 (8x4S) inputs
|
|
vs_ldpq_post(vs2, zetas);
|
|
// a = a1 montmul b
|
|
dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
|
|
// save a relative to second start index
|
|
vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
|
|
|
|
int k = 4 * level + i;
|
|
|
|
if (k < 24) {
|
|
startIncr = 256;
|
|
} else if (k == 25) {
|
|
startIncr = 384;
|
|
} else {
|
|
startIncr = 128;
|
|
}
|
|
|
|
c1Start += startIncr;
|
|
c2Start += startIncr;
|
|
}
|
|
|
|
c2 *= 2;
|
|
}
|
|
}
|
|
|
|
// Dilithium Inverse NTT function except the final mod Q division by 2^256.
|
|
// Implements the method
|
|
// static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
|
|
// the sun.security.provider.ML_DSA class.
|
|
//
|
|
// coeffs (int[256]) = c_rarg0
|
|
// zetas (int[256]) = c_rarg1
|
|
address generate_dilithiumAlmostInverseNtt() {
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
__ enter();
|
|
|
|
const Register coeffs = c_rarg0;
|
|
const Register zetas = c_rarg1;
|
|
|
|
const Register tmpAddr = r9;
|
|
const Register dilithiumConsts = r10;
|
|
const Register result = r11;
|
|
VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
|
|
VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
|
|
VSeq<2> vq(30); // n.b. constants overlap vs3
|
|
int offsets[4] = { 0, 32, 64, 96 };
|
|
int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
|
|
int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
|
|
|
|
__ add(result, coeffs, 0);
|
|
__ lea(dilithiumConsts,
|
|
ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
|
|
|
|
// Each level represents one iteration of the outer for loop of the Java version
|
|
|
|
// level 0
|
|
// At level 0 we need to interleave adjacent quartets of
|
|
// coefficients before we multiply and add/sub by the next 16
|
|
// zetas just as we did for level 7 in the multiply code. So we
|
|
// load and store the values using an ld2/st2 with arrangement 4S.
|
|
for (int i = 0; i < 1024; i += 128) {
|
|
// load constants q, qinv
|
|
// n.b. these loads could be moved out of the loop as q and qinv do not
// get clobbered by the first two loops
|
|
vs_ldpq(vq, dilithiumConsts); // qInv, q
|
|
// a0/a1 load interleaved 32 (8x4S) coefficients
|
|
vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
|
|
// b load next 32 (8x4S) inputs
|
|
vs_ldpq_post(vs_front(vs2), zetas);
|
|
// compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
|
|
// n.b. second half of vs2 provides temporary register storage
|
|
dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
|
|
vs_front(vs2), vs_back(vs2), vtmp, vq);
|
|
// a0/a1 store interleaved 32 (8x4S) coefficients
|
|
vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
|
|
}
|
|
|
|
// level 1
|
|
// At level 1 we need to interleave pairs of adjacent pairs of
|
|
// coefficients before we multiply by the next 16 zetas just as we
|
|
// did for level 6 in the multiply code. So we load and store the
|
|
// values using an ld2/st2 with arrangement 2D.
|
|
for (int i = 0; i < 1024; i += 128) {
|
|
// a0/a1 load interleaved 32 (8x2D) coefficients
|
|
vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
|
|
// b load next 16 (4x4S) inputs
|
|
vs_ldpq_post(vs_front(vs2), zetas);
|
|
// compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
|
|
// n.b. second half of vs2 provides temporary register storage
|
|
dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
|
|
vs_front(vs2), vs_back(vs2), vtmp, vq);
|
|
// a0/a1 store interleaved 32 (8x2D) coefficients
|
|
vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
|
|
}
|
|
|
|
// level 2
|
|
// At level 2 coefficients come in blocks of 4. So, we load 4
|
|
// adjacent coefficients at 8 distinct offsets for both the first
|
|
// and second coefficient sequences, using an ldr with register
|
|
// variant Q then combine them with next set of 32 zetas. Likewise
|
|
// we store the results using an str with register variant Q.
|
|
for (int i = 0; i < 1024; i += 256) {
|
|
// c0 load 32 (8x4S) coefficients via first offsets
|
|
vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
|
|
// c1 load 32 (8x4S) coefficients via second offsets
|
|
vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
|
|
// a0 = c0 + c1 n.b. clobbers vq which overlaps vs3
|
|
vs_addv(vs3, __ T4S, vs1, vs2);
|
|
// c = c0 - c1
|
|
vs_subv(vs1, __ T4S, vs1, vs2);
|
|
// store a0 32 (8x4S) coefficients via first offsets
|
|
vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
|
|
// b load 32 (8x4S) next inputs
|
|
vs_ldpq_post(vs2, zetas);
|
|
// reload constants q, qinv -- they were clobbered earlier
|
|
vs_ldpq(vq, dilithiumConsts); // qInv, q
|
|
// compute a1 = b montmul c
|
|
dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
|
|
// store a1 32 (8x4S) coefficients via second offsets
|
|
vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
|
|
}
|
|
|
|
// level 3-7
|
|
dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ mov(r0, zr); // return 0
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Dilithium multiply polynomials in the NTT domain.
|
|
// Straightforward implementation of the method
|
|
// static int implDilithiumNttMult(
|
|
// int[] result, int[] ntta, int[] nttb) {} of
|
|
// the sun.security.provider.ML_DSA class.
|
|
//
|
|
// result (int[256]) = c_rarg0
|
|
// poly1 (int[256]) = c_rarg1
|
|
// poly2 (int[256]) = c_rarg2
|
|
address generate_dilithiumNttMult() {
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
__ enter();
|
|
|
|
Label L_loop;
|
|
|
|
const Register result = c_rarg0;
|
|
const Register poly1 = c_rarg1;
|
|
const Register poly2 = c_rarg2;
|
|
|
|
const Register dilithiumConsts = r10;
|
|
const Register len = r11;
|
|
|
|
VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
|
|
VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
|
|
VSeq<2> vq(30); // n.b. constants overlap vs3
|
|
VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE
|
|
|
|
__ lea(dilithiumConsts,
|
|
ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
|
|
|
|
// load constants q, qinv
|
|
vs_ldpq(vq, dilithiumConsts); // qInv, q
|
|
// load constant rSquare into v29
|
|
__ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare
|
|
|
|
__ mov(len, zr);
|
|
__ add(len, len, 1024);
|
|
|
|
__ BIND(L_loop);
|
|
|
|
// b load 32 (8x4S) next inputs from poly1
|
|
vs_ldpq_post(vs1, poly1);
|
|
// c load 32 (8x4S) next inputs from poly2
|
|
vs_ldpq_post(vs2, poly2);
|
|
// compute a = b montmul c
|
|
dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
|
|
// compute a = rsquare montmul a
|
|
dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
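// n.b. with montMul(x, y) == x * y * R^-1 mod q, the extra multiply by
// rsquare (== R^2 mod q) cancels the two R^-1 factors introduced above:
//
//   montMul(montMul(b, c), R^2) == b * c * R^-1 * R^2 * R^-1 == b * c mod q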
|
|
// save a 32 (8x4S) results
|
|
vs_stpq_post(vs2, result);
|
|
|
|
__ sub(len, len, 128);
|
|
__ cmp(len, (u1)128);
|
|
__ br(Assembler::GE, L_loop);
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ mov(r0, zr); // return 0
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Dilithium Montgomery multiply an array by a constant.
|
|
// A straightforward implementation of the method
|
|
// static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
|
|
// of the sun.security.provider.ML_DSA class
|
|
//
|
|
// coeffs (int[256]) = c_rarg0
|
|
// constant (int) = c_rarg1
|
|
address generate_dilithiumMontMulByConstant() {
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
__ enter();
|
|
|
|
Label L_loop;
|
|
|
|
const Register coeffs = c_rarg0;
|
|
const Register constant = c_rarg1;
|
|
|
|
const Register dilithiumConsts = r10;
|
|
const Register result = r11;
|
|
const Register len = r12;
|
|
|
|
VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
|
|
VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
|
|
VSeq<2> vq(30); // n.b. constants overlap vs3
|
|
VSeq<8> vconst(29, 0); // for montmul by constant
|
|
|
|
// results track inputs
|
|
__ add(result, coeffs, 0);
|
|
__ lea(dilithiumConsts,
|
|
ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
|
|
|
|
// load constants q, qinv -- they are not clobbered by the loop below
|
|
vs_ldpq(vq, dilithiumConsts); // qInv, q
|
|
// copy caller supplied constant across vconst
|
|
__ dup(vconst[0], __ T4S, constant);
|
|
__ mov(len, zr);
|
|
__ add(len, len, 1024);
|
|
|
|
__ BIND(L_loop);
|
|
|
|
// load next 32 inputs
|
|
vs_ldpq_post(vs2, coeffs);
|
|
// mont mul by constant
|
|
dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
|
|
// write next 32 results
|
|
vs_stpq_post(vs2, result);
|
|
|
|
__ sub(len, len, 128);
|
|
__ cmp(len, (u1)128);
|
|
__ br(Assembler::GE, L_loop);
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ mov(r0, zr); // return 0
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Dilithium decompose poly.
|
|
// Implements the method
|
|
// static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart, int twoGamma2, int multiplier) {}
|
|
// of the sun.security.provider.ML_DSA class
|
|
//
|
|
// input (int[256]) = c_rarg0
|
|
// lowPart (int[256]) = c_rarg1
|
|
// highPart (int[256]) = c_rarg2
|
|
// twoGamma2 (int) = c_rarg3
|
|
// multiplier (int) = c_rarg4
|
|
address generate_dilithiumDecomposePoly() {
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
Label L_loop;
|
|
|
|
const Register input = c_rarg0;
|
|
const Register lowPart = c_rarg1;
|
|
const Register highPart = c_rarg2;
|
|
const Register twoGamma2 = c_rarg3;
|
|
const Register multiplier = c_rarg4;
|
|
|
|
const Register len = r9;
|
|
const Register dilithiumConsts = r10;
|
|
const Register tmp = r11;
|
|
|
|
// 6 independent sets of 4x4s values
|
|
VSeq<4> vs1(0), vs2(4), vs3(8);
|
|
VSeq<4> vs4(12), vs5(16), vtmp(20);
|
|
|
|
// 7 constants for cross-multiplying
|
|
VSeq<4> one(25, 0);
|
|
VSeq<4> qminus1(26, 0);
|
|
VSeq<4> g2(27, 0);
|
|
VSeq<4> twog2(28, 0);
|
|
VSeq<4> mult(29, 0);
|
|
VSeq<4> q(30, 0);
|
|
VSeq<4> qadd(31, 0);
|
|
|
|
__ enter();
|
|
|
|
__ lea(dilithiumConsts,
|
|
ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
|
|
|
|
// save callee-saved registers
|
|
__ stpd(v8, v9, __ pre(sp, -64));
|
|
__ stpd(v10, v11, Address(sp, 16));
|
|
__ stpd(v12, v13, Address(sp, 32));
|
|
__ stpd(v14, v15, Address(sp, 48));
|
|
|
|
// populate constant registers
|
|
__ mov(tmp, zr);
|
|
__ add(tmp, tmp, 1);
|
|
__ dup(one[0], __ T4S, tmp); // 1
|
|
__ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
|
|
__ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
|
|
__ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
|
|
__ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
|
|
__ subv(qminus1[0], __ T4S, v30, v25); // q - 1
|
|
__ sshr(g2[0], __ T4S, v28, 1); // gamma2
|
|
|
|
__ mov(len, zr);
|
|
__ add(len, len, 1024);
|
|
|
|
__ BIND(L_loop);
|
|
|
|
// load next 4x4S inputs interleaved: rplus --> vs1
|
|
__ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
|
|
|
|
// rplus = rplus - ((rplus + qadd) >> 23) * q
|
|
vs_addv(vtmp, __ T4S, vs1, qadd);
|
|
vs_sshr(vtmp, __ T4S, vtmp, 23);
|
|
vs_mulv(vtmp, __ T4S, vtmp, q);
|
|
vs_subv(vs1, __ T4S, vs1, vtmp);
|
|
|
|
// rplus = rplus + ((rplus >> 31) & dilithium_q);
|
|
vs_sshr(vtmp, __ T4S, vs1, 31);
|
|
vs_andr(vtmp, vtmp, q);
|
|
vs_addv(vs1, __ T4S, vs1, vtmp);
|
|
|
|
// quotient --> vs2
|
|
// int quotient = (rplus * multiplier) >> 22;
|
|
vs_mulv(vtmp, __ T4S, vs1, mult);
|
|
vs_sshr(vs2, __ T4S, vtmp, 22);
|
|
|
|
// r0 --> vs3
|
|
// int r0 = rplus - quotient * twoGamma2;
|
|
vs_mulv(vtmp, __ T4S, vs2, twog2);
|
|
vs_subv(vs3, __ T4S, vs1, vtmp);
|
|
|
|
// mask --> vs4
|
|
// int mask = (twoGamma2 - r0) >> 22;
|
|
vs_subv(vtmp, __ T4S, twog2, vs3);
|
|
vs_sshr(vs4, __ T4S, vtmp, 22);
|
|
|
|
// r0 -= (mask & twoGamma2);
|
|
vs_andr(vtmp, vs4, twog2);
|
|
vs_subv(vs3, __ T4S, vs3, vtmp);
|
|
|
|
// quotient += (mask & 1);
|
|
vs_andr(vtmp, vs4, one);
|
|
vs_addv(vs2, __ T4S, vs2, vtmp);
|
|
|
|
// mask = (twoGamma2 / 2 - r0) >> 31;
|
|
vs_subv(vtmp, __ T4S, g2, vs3);
|
|
vs_sshr(vs4, __ T4S, vtmp, 31);
|
|
|
|
// r0 -= (mask & twoGamma2);
|
|
vs_andr(vtmp, vs4, twog2);
|
|
vs_subv(vs3, __ T4S, vs3, vtmp);
|
|
|
|
// quotient += (mask & 1);
|
|
vs_andr(vtmp, vs4, one);
|
|
vs_addv(vs2, __ T4S, vs2, vtmp);
|
|
|
|
// r1 --> vs5
|
|
// int r1 = rplus - r0 - (dilithium_q - 1);
|
|
vs_subv(vtmp, __ T4S, vs1, vs3);
|
|
vs_subv(vs5, __ T4S, vtmp, qminus1);
|
|
|
|
// r1 --> vs1 (overwriting rplus)
|
|
// r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
|
|
vs_negr(vtmp, __ T4S, vs5);
|
|
vs_orr(vtmp, vs5, vtmp);
|
|
vs_sshr(vs1, __ T4S, vtmp, 31);
|
|
|
|
// r0 += ~r1;
|
|
vs_notr(vtmp, vs1);
|
|
vs_addv(vs3, __ T4S, vs3, vtmp);
|
|
|
|
// r1 = r1 & quotient;
|
|
vs_andr(vs1, vs2, vs1);
|
|
|
|
// store results interleaved
|
|
// lowPart[m] = r0;
|
|
// highPart[m] = r1;
|
|
__ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
|
|
__ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
|
|
|
|
__ sub(len, len, 64);
|
|
__ cmp(len, (u1)64);
|
|
__ br(Assembler::GE, L_loop);
|
|
|
|
// restore callee-saved vector registers
|
|
__ ldpd(v14, v15, Address(sp, 48));
|
|
__ ldpd(v12, v13, Address(sp, 32));
|
|
__ ldpd(v10, v11, Address(sp, 16));
|
|
__ ldpd(v8, v9, __ post(sp, 64));
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ mov(r0, zr); // return 0
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
|
|
Register tmp0, Register tmp1, Register tmp2) {
|
|
__ bic(tmp0, a2, a1); // for a0
|
|
__ bic(tmp1, a3, a2); // for a1
|
|
__ bic(tmp2, a4, a3); // for a2
|
|
__ eor(a2, a2, tmp2);
|
|
__ bic(tmp2, a0, a4); // for a3
|
|
__ eor(a3, a3, tmp2);
|
|
__ bic(tmp2, a1, a0); // for a4
|
|
__ eor(a0, a0, tmp0);
|
|
__ eor(a1, a1, tmp1);
|
|
__ eor(a4, a4, tmp2);
|
|
}
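// n.b. bcax5 above is the chi step of one Keccak plane:
//
//   a[x] ^= (~a[x+1]) & a[x+2]    // x = 0..4, second indices taken mod 5
//
// The bic (and-not) results are staged in temporaries so that each lane's
// original value is still available when the later lanes are combined.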
|
|
|
|
void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
|
|
Register a0, Register a1, Register a2, Register a3, Register a4,
|
|
Register a5, Register a6, Register a7, Register a8, Register a9,
|
|
Register a10, Register a11, Register a12, Register a13, Register a14,
|
|
Register a15, Register a16, Register a17, Register a18, Register a19,
|
|
Register a20, Register a21, Register a22, Register a23, Register a24,
|
|
Register tmp0, Register tmp1, Register tmp2) {
|
|
__ eor3(tmp1, a4, a9, a14);
|
|
__ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
|
|
__ eor3(tmp2, a1, a6, a11);
|
|
__ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
|
|
__ rax1(tmp2, tmp0, tmp1); // d0
|
|
{
|
|
|
|
Register tmp3, tmp4;
|
|
if (can_use_fp && can_use_r18) {
|
|
tmp3 = rfp;
|
|
tmp4 = r18_tls;
|
|
} else {
|
|
tmp3 = a4;
|
|
tmp4 = a9;
|
|
__ stp(tmp3, tmp4, __ pre(sp, -16));
|
|
}
|
|
|
|
__ eor3(tmp3, a0, a5, a10);
|
|
__ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
|
|
__ eor(a0, a0, tmp2);
|
|
__ eor(a5, a5, tmp2);
|
|
__ eor(a10, a10, tmp2);
|
|
__ eor(a15, a15, tmp2);
|
|
__ eor(a20, a20, tmp2); // d0(tmp2)
|
|
__ eor3(tmp3, a2, a7, a12);
|
|
__ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
|
|
__ rax1(tmp3, tmp4, tmp2); // d1
|
|
__ eor(a1, a1, tmp3);
|
|
__ eor(a6, a6, tmp3);
|
|
__ eor(a11, a11, tmp3);
|
|
__ eor(a16, a16, tmp3);
|
|
__ eor(a21, a21, tmp3); // d1(tmp3)
|
|
__ rax1(tmp3, tmp2, tmp0); // d3
|
|
__ eor3(tmp2, a3, a8, a13);
|
|
__ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3
|
|
__ eor(a3, a3, tmp3);
|
|
__ eor(a8, a8, tmp3);
|
|
__ eor(a13, a13, tmp3);
|
|
__ eor(a18, a18, tmp3);
|
|
__ eor(a23, a23, tmp3);
|
|
__ rax1(tmp2, tmp1, tmp0); // d2
|
|
__ eor(a2, a2, tmp2);
|
|
__ eor(a7, a7, tmp2);
|
|
__ eor(a12, a12, tmp2);
|
|
__ rax1(tmp0, tmp0, tmp4); // d4
|
|
if (!can_use_fp || !can_use_r18) {
|
|
__ ldp(tmp3, tmp4, __ post(sp, 16));
|
|
}
|
|
__ eor(a17, a17, tmp2);
|
|
__ eor(a22, a22, tmp2);
|
|
__ eor(a4, a4, tmp0);
|
|
__ eor(a9, a9, tmp0);
|
|
__ eor(a14, a14, tmp0);
|
|
__ eor(a19, a19, tmp0);
|
|
__ eor(a24, a24, tmp0);
|
|
}
|
|
|
|
__ rol(tmp0, a10, 3);
|
|
__ rol(a10, a1, 1);
|
|
__ rol(a1, a6, 44);
|
|
__ rol(a6, a9, 20);
|
|
__ rol(a9, a22, 61);
|
|
__ rol(a22, a14, 39);
|
|
__ rol(a14, a20, 18);
|
|
__ rol(a20, a2, 62);
|
|
__ rol(a2, a12, 43);
|
|
__ rol(a12, a13, 25);
|
|
__ rol(a13, a19, 8) ;
|
|
__ rol(a19, a23, 56);
|
|
__ rol(a23, a15, 41);
|
|
__ rol(a15, a4, 27);
|
|
__ rol(a4, a24, 14);
|
|
__ rol(a24, a21, 2);
|
|
__ rol(a21, a8, 55);
|
|
__ rol(a8, a16, 45);
|
|
__ rol(a16, a5, 36);
|
|
__ rol(a5, a3, 28);
|
|
__ rol(a3, a18, 21);
|
|
__ rol(a18, a17, 15);
|
|
__ rol(a17, a11, 10);
|
|
__ rol(a11, a7, 6);
|
|
__ mov(a7, tmp0);
|
|
|
|
bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
|
|
bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
|
|
bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
|
|
bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
|
|
bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
|
|
|
|
__ ldr(tmp1, __ post(rc, 8));
|
|
__ eor(a0, a0, tmp1);
|
|
|
|
}
|
|
|
|
// Arguments:
|
|
//
|
|
// Inputs:
|
|
// c_rarg0 - byte[] source+offset
|
|
// c_rarg1 - byte[] SHA.state
|
|
// c_rarg2 - int block_size
|
|
// c_rarg3 - int offset
|
|
// c_rarg4 - int limit
|
|
//
|
|
address generate_sha3_implCompress_gpr(StubId stub_id) {
|
|
bool multi_block;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_sha3_implCompress_id:
|
|
multi_block = false;
|
|
break;
|
|
case StubId::stubgen_sha3_implCompressMB_id:
|
|
multi_block = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
static const uint64_t round_consts[24] = {
|
|
0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
|
|
0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
|
|
0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
|
|
0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
|
|
0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
|
|
0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
|
|
0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
|
|
0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
|
|
};
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
Register buf = c_rarg0;
|
|
Register state = c_rarg1;
|
|
Register block_size = c_rarg2;
|
|
Register ofs = c_rarg3;
|
|
Register limit = c_rarg4;
|
|
|
|
// use r3..r17, r19..r28 to keep a0..a24.
|
|
// a0..a24 are respective locals from SHA3.java
|
|
Register a0 = r25,
|
|
a1 = r26,
|
|
a2 = r27,
|
|
a3 = r3,
|
|
a4 = r4,
|
|
a5 = r5,
|
|
a6 = r6,
|
|
a7 = r7,
|
|
a8 = rscratch1, // r8
|
|
a9 = rscratch2, // r9
|
|
a10 = r10,
|
|
a11 = r11,
|
|
a12 = r12,
|
|
a13 = r13,
|
|
a14 = r14,
|
|
a15 = r15,
|
|
a16 = r16,
|
|
a17 = r17,
|
|
a18 = r28,
|
|
a19 = r19,
|
|
a20 = r20,
|
|
a21 = r21,
|
|
a22 = r22,
|
|
a23 = r23,
|
|
a24 = r24;
|
|
|
|
Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
|
|
|
|
Label sha3_loop, rounds24_preloop, loop_body;
|
|
Label sha3_512_or_sha3_384, shake128;
|
|
|
|
bool can_use_r18 = false;
|
|
#ifndef R18_RESERVED
|
|
can_use_r18 = true;
|
|
#endif
|
|
bool can_use_fp = !PreserveFramePointer;
|
|
|
|
__ enter();
|
|
|
|
// save almost all yet unsaved gpr registers on stack
|
|
__ str(block_size, __ pre(sp, -128));
|
|
if (multi_block) {
|
|
__ stpw(ofs, limit, Address(sp, 8));
|
|
}
|
|
// 8 bytes at sp+16 will be used to keep buf
|
|
__ stp(r19, r20, Address(sp, 32));
|
|
__ stp(r21, r22, Address(sp, 48));
|
|
__ stp(r23, r24, Address(sp, 64));
|
|
__ stp(r25, r26, Address(sp, 80));
|
|
__ stp(r27, r28, Address(sp, 96));
|
|
if (can_use_r18 && can_use_fp) {
|
|
__ stp(r18_tls, state, Address(sp, 112));
|
|
} else {
|
|
__ str(state, Address(sp, 112));
|
|
}
|
|
|
|
// begin sha3 calculations: loading a0..a24 from state array
|
|
__ ldp(a0, a1, state);
|
|
__ ldp(a2, a3, Address(state, 16));
|
|
__ ldp(a4, a5, Address(state, 32));
|
|
__ ldp(a6, a7, Address(state, 48));
|
|
__ ldp(a8, a9, Address(state, 64));
|
|
__ ldp(a10, a11, Address(state, 80));
|
|
__ ldp(a12, a13, Address(state, 96));
|
|
__ ldp(a14, a15, Address(state, 112));
|
|
__ ldp(a16, a17, Address(state, 128));
|
|
__ ldp(a18, a19, Address(state, 144));
|
|
__ ldp(a20, a21, Address(state, 160));
|
|
__ ldp(a22, a23, Address(state, 176));
|
|
__ ldr(a24, Address(state, 192));
|
|
|
|
__ BIND(sha3_loop);
|
|
|
|
// load input
|
|
__ ldp(tmp3, tmp2, __ post(buf, 16));
|
|
__ eor(a0, a0, tmp3);
|
|
__ eor(a1, a1, tmp2);
|
|
__ ldp(tmp3, tmp2, __ post(buf, 16));
|
|
__ eor(a2, a2, tmp3);
|
|
__ eor(a3, a3, tmp2);
|
|
__ ldp(tmp3, tmp2, __ post(buf, 16));
|
|
__ eor(a4, a4, tmp3);
|
|
__ eor(a5, a5, tmp2);
|
|
__ ldr(tmp3, __ post(buf, 8));
|
|
__ eor(a6, a6, tmp3);
|
|
|
|
// block_size == 72, SHA3-512; block_size == 104, SHA3-384
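// For reference, all Keccak rates (block_size, in bytes) handled by this stub:
//   72 -> SHA3-512, 104 -> SHA3-384, 136 -> SHA3-256/SHAKE256,
//   144 -> SHA3-224, 168 -> SHAKE128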
|
|
__ tbz(block_size, 7, sha3_512_or_sha3_384);
|
|
|
|
__ ldp(tmp3, tmp2, __ post(buf, 16));
|
|
__ eor(a7, a7, tmp3);
|
|
__ eor(a8, a8, tmp2);
|
|
__ ldp(tmp3, tmp2, __ post(buf, 16));
|
|
__ eor(a9, a9, tmp3);
|
|
__ eor(a10, a10, tmp2);
|
|
__ ldp(tmp3, tmp2, __ post(buf, 16));
|
|
__ eor(a11, a11, tmp3);
|
|
__ eor(a12, a12, tmp2);
|
|
__ ldp(tmp3, tmp2, __ post(buf, 16));
|
|
__ eor(a13, a13, tmp3);
|
|
__ eor(a14, a14, tmp2);
|
|
__ ldp(tmp3, tmp2, __ post(buf, 16));
|
|
__ eor(a15, a15, tmp3);
|
|
__ eor(a16, a16, tmp2);
|
|
|
|
// block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
|
|
__ andw(tmp2, block_size, 48);
|
|
__ cbzw(tmp2, rounds24_preloop);
|
|
__ tbnz(block_size, 5, shake128);
|
|
// block_size == 144, bit5 == 0, SHA3-224
|
|
__ ldr(tmp3, __ post(buf, 8));
|
|
__ eor(a17, a17, tmp3);
|
|
__ b(rounds24_preloop);
|
|
|
|
__ BIND(shake128);
|
|
__ ldp(tmp3, tmp2, __ post(buf, 16));
|
|
__ eor(a17, a17, tmp3);
|
|
__ eor(a18, a18, tmp2);
|
|
__ ldp(tmp3, tmp2, __ post(buf, 16));
|
|
__ eor(a19, a19, tmp3);
|
|
__ eor(a20, a20, tmp2);
|
|
__ b(rounds24_preloop); // block_size == 168, SHAKE128
|
|
|
|
__ BIND(sha3_512_or_sha3_384);
|
|
__ ldp(tmp3, tmp2, __ post(buf, 16));
|
|
__ eor(a7, a7, tmp3);
|
|
__ eor(a8, a8, tmp2);
|
|
__ tbz(block_size, 5, rounds24_preloop); // SHA3-512
|
|
|
|
// SHA3-384
|
|
__ ldp(tmp3, tmp2, __ post(buf, 16));
|
|
__ eor(a9, a9, tmp3);
|
|
__ eor(a10, a10, tmp2);
|
|
__ ldp(tmp3, tmp2, __ post(buf, 16));
|
|
__ eor(a11, a11, tmp3);
|
|
__ eor(a12, a12, tmp2);
|
|
|
|
__ BIND(rounds24_preloop);
|
|
__ fmovs(v0, 24.0); // float loop counter,
|
|
__ fmovs(v1, 1.0); // exact representation
|
|
|
|
__ str(buf, Address(sp, 16));
|
|
__ lea(tmp3, ExternalAddress((address) round_consts));
|
|
|
|
__ BIND(loop_body);
|
|
keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
|
|
a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
|
|
a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
|
|
tmp0, tmp1, tmp2);
|
|
__ fsubs(v0, v0, v1);
|
|
__ fcmps(v0, 0.0);
|
|
__ br(__ NE, loop_body);
|
|
|
|
if (multi_block) {
|
|
__ ldrw(block_size, sp); // block_size
|
|
__ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
|
|
__ addw(tmp2, tmp2, block_size);
|
|
__ cmpw(tmp2, tmp1);
|
|
__ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
|
|
__ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
|
|
__ br(Assembler::LE, sha3_loop);
|
|
__ movw(c_rarg0, tmp2); // return offset
|
|
}
|
|
if (can_use_fp && can_use_r18) {
|
|
__ ldp(r18_tls, state, Address(sp, 112));
|
|
} else {
|
|
__ ldr(state, Address(sp, 112));
|
|
}
|
|
// save calculated sha3 state
|
|
__ stp(a0, a1, Address(state));
|
|
__ stp(a2, a3, Address(state, 16));
|
|
__ stp(a4, a5, Address(state, 32));
|
|
__ stp(a6, a7, Address(state, 48));
|
|
__ stp(a8, a9, Address(state, 64));
|
|
__ stp(a10, a11, Address(state, 80));
|
|
__ stp(a12, a13, Address(state, 96));
|
|
__ stp(a14, a15, Address(state, 112));
|
|
__ stp(a16, a17, Address(state, 128));
|
|
__ stp(a18, a19, Address(state, 144));
|
|
__ stp(a20, a21, Address(state, 160));
|
|
__ stp(a22, a23, Address(state, 176));
|
|
__ str(a24, Address(state, 192));
|
|
|
|
// restore required registers from stack
|
|
__ ldp(r19, r20, Address(sp, 32));
|
|
__ ldp(r21, r22, Address(sp, 48));
|
|
__ ldp(r23, r24, Address(sp, 64));
|
|
__ ldp(r25, r26, Address(sp, 80));
|
|
__ ldp(r27, r28, Address(sp, 96));
|
|
if (can_use_fp && can_use_r18) {
|
|
__ add(rfp, sp, 128); // leave() will copy rfp to sp below
|
|
} // else no need to recalculate rfp, since it wasn't changed
|
|
|
|
__ leave();
|
|
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
/**
|
|
* Arguments:
|
|
*
|
|
* Inputs:
|
|
* c_rarg0 - int crc
|
|
* c_rarg1 - byte* buf
|
|
* c_rarg2 - int length
|
|
*
|
|
* Output:
|
|
* r0 - int crc result
|
|
*/
|
|
address generate_updateBytesCRC32() {
|
|
assert(UseCRC32Intrinsics, "what are we doing here?");
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
address start = __ pc();
|
|
|
|
const Register crc = c_rarg0; // crc
|
|
const Register buf = c_rarg1; // source java byte array address
|
|
const Register len = c_rarg2; // length
|
|
const Register table0 = c_rarg3; // crc_table address
|
|
const Register table1 = c_rarg4;
|
|
const Register table2 = c_rarg5;
|
|
const Register table3 = c_rarg6;
|
|
const Register tmp3 = c_rarg7;
|
|
|
|
BLOCK_COMMENT("Entry:");
|
|
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
|
|
|
__ kernel_crc32(crc, buf, len,
|
|
table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
/**
|
|
* Arguments:
|
|
*
|
|
* Inputs:
|
|
* c_rarg0 - int crc
|
|
* c_rarg1 - byte* buf
|
|
* c_rarg2 - int length
|
|
* c_rarg3 - int* table
|
|
*
|
|
* Output:
|
|
* r0 - int crc result
|
|
*/
|
|
address generate_updateBytesCRC32C() {
|
|
assert(UseCRC32CIntrinsics, "what are we doing here?");
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
address start = __ pc();
|
|
|
|
const Register crc = c_rarg0; // crc
|
|
const Register buf = c_rarg1; // source java byte array address
|
|
const Register len = c_rarg2; // length
|
|
const Register table0 = c_rarg3; // crc_table address
|
|
const Register table1 = c_rarg4;
|
|
const Register table2 = c_rarg5;
|
|
const Register table3 = c_rarg6;
|
|
const Register tmp3 = c_rarg7;
|
|
|
|
BLOCK_COMMENT("Entry:");
|
|
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
|
|
|
__ kernel_crc32c(crc, buf, len,
|
|
table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
/***
|
|
* Arguments:
|
|
*
|
|
* Inputs:
|
|
* c_rarg0 - int adler
|
|
* c_rarg1 - byte* buff
|
|
* c_rarg2 - int len
|
|
*
|
|
* Output:
|
|
* c_rarg0 - int adler result
|
|
*/
|
|
address generate_updateBytesAdler32() {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
|
|
|
|
// Aliases
|
|
Register adler = c_rarg0;
|
|
Register s1 = c_rarg0;
|
|
Register s2 = c_rarg3;
|
|
Register buff = c_rarg1;
|
|
Register len = c_rarg2;
|
|
Register nmax = r4;
|
|
Register base = r5;
|
|
Register count = r6;
|
|
Register temp0 = rscratch1;
|
|
Register temp1 = rscratch2;
|
|
FloatRegister vbytes = v0;
|
|
FloatRegister vs1acc = v1;
|
|
FloatRegister vs2acc = v2;
|
|
FloatRegister vtable = v3;
|
|
|
|
// Max number of bytes we can process before having to take the mod
|
|
// 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
|
|
uint64_t BASE = 0xfff1;
|
|
uint64_t NMAX = 0x15B0;
|
|
|
|
__ mov(base, BASE);
|
|
__ mov(nmax, NMAX);
|
|
|
|
// Load accumulation coefficients for the upper 16 bits
|
|
__ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
|
|
__ ld1(vtable, __ T16B, Address(temp0));
|
|
|
|
// s1 is initialized to the lower 16 bits of adler
|
|
// s2 is initialized to the upper 16 bits of adler
|
|
__ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
|
|
__ uxth(s1, adler); // s1 = (adler & 0xffff)
|
|
|
|
// The pipelined loop needs at least 16 elements for 1 iteration
|
|
// It does check this, but it is more effective to skip to the cleanup loop
|
|
__ cmp(len, (u1)16);
|
|
__ br(Assembler::HS, L_nmax);
|
|
__ cbz(len, L_combine);
|
|
|
|
__ bind(L_simple_by1_loop);
|
|
__ ldrb(temp0, Address(__ post(buff, 1)));
|
|
__ add(s1, s1, temp0);
|
|
__ add(s2, s2, s1);
|
|
__ subs(len, len, 1);
|
|
__ br(Assembler::HI, L_simple_by1_loop);
|
|
|
|
// s1 = s1 % BASE
|
|
__ subs(temp0, s1, base);
|
|
__ csel(s1, temp0, s1, Assembler::HS);
|
|
|
|
// s2 = s2 % BASE
|
|
__ lsr(temp0, s2, 16);
|
|
__ lsl(temp1, temp0, 4);
|
|
__ sub(temp1, temp1, temp0);
|
|
__ add(s2, temp1, s2, ext::uxth);
|
|
|
|
__ subs(temp0, s2, base);
|
|
__ csel(s2, temp0, s2, Assembler::HS);
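// n.b. the reduction above (and those below) exploits 2^16 mod BASE == 15:
// for x < 2^32, x mod BASE can be computed as
//
//   x' = (x >> 16) * 15 + (x & 0xffff)
//
// applied once or twice, followed by a conditional subtract of BASE.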
|
|
|
|
__ b(L_combine);
|
|
|
|
__ bind(L_nmax);
|
|
__ subs(len, len, nmax);
|
|
__ sub(count, nmax, 16);
|
|
__ br(Assembler::LO, L_by16);
|
|
|
|
__ bind(L_nmax_loop);
|
|
|
|
generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
|
|
vbytes, vs1acc, vs2acc, vtable);
|
|
|
|
__ subs(count, count, 16);
|
|
__ br(Assembler::HS, L_nmax_loop);
|
|
|
|
// s1 = s1 % BASE
|
|
__ lsr(temp0, s1, 16);
|
|
__ lsl(temp1, temp0, 4);
|
|
__ sub(temp1, temp1, temp0);
|
|
__ add(temp1, temp1, s1, ext::uxth);
|
|
|
|
__ lsr(temp0, temp1, 16);
|
|
__ lsl(s1, temp0, 4);
|
|
__ sub(s1, s1, temp0);
|
|
__ add(s1, s1, temp1, ext:: uxth);
|
|
|
|
__ subs(temp0, s1, base);
|
|
__ csel(s1, temp0, s1, Assembler::HS);
|
|
|
|
// s2 = s2 % BASE
|
|
__ lsr(temp0, s2, 16);
|
|
__ lsl(temp1, temp0, 4);
|
|
__ sub(temp1, temp1, temp0);
|
|
__ add(temp1, temp1, s2, ext::uxth);
|
|
|
|
__ lsr(temp0, temp1, 16);
|
|
__ lsl(s2, temp0, 4);
|
|
__ sub(s2, s2, temp0);
|
|
__ add(s2, s2, temp1, ext:: uxth);
|
|
|
|
__ subs(temp0, s2, base);
|
|
__ csel(s2, temp0, s2, Assembler::HS);
|
|
|
|
__ subs(len, len, nmax);
|
|
__ sub(count, nmax, 16);
|
|
__ br(Assembler::HS, L_nmax_loop);
|
|
|
|
__ bind(L_by16);
|
|
__ adds(len, len, count);
|
|
__ br(Assembler::LO, L_by1);
|
|
|
|
__ bind(L_by16_loop);
|
|
|
|
generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
|
|
vbytes, vs1acc, vs2acc, vtable);
|
|
|
|
__ subs(len, len, 16);
|
|
__ br(Assembler::HS, L_by16_loop);
|
|
|
|
__ bind(L_by1);
|
|
__ adds(len, len, 15);
|
|
__ br(Assembler::LO, L_do_mod);
|
|
|
|
__ bind(L_by1_loop);
|
|
__ ldrb(temp0, Address(__ post(buff, 1)));
|
|
__ add(s1, temp0, s1);
|
|
__ add(s2, s2, s1);
|
|
__ subs(len, len, 1);
|
|
__ br(Assembler::HS, L_by1_loop);
|
|
|
|
__ bind(L_do_mod);
|
|
// s1 = s1 % BASE
|
|
__ lsr(temp0, s1, 16);
|
|
__ lsl(temp1, temp0, 4);
|
|
__ sub(temp1, temp1, temp0);
|
|
__ add(temp1, temp1, s1, ext::uxth);
|
|
|
|
__ lsr(temp0, temp1, 16);
|
|
__ lsl(s1, temp0, 4);
|
|
__ sub(s1, s1, temp0);
|
|
__ add(s1, s1, temp1, ext:: uxth);
|
|
|
|
__ subs(temp0, s1, base);
|
|
__ csel(s1, temp0, s1, Assembler::HS);
|
|
|
|
// s2 = s2 % BASE
|
|
__ lsr(temp0, s2, 16);
|
|
__ lsl(temp1, temp0, 4);
|
|
__ sub(temp1, temp1, temp0);
|
|
__ add(temp1, temp1, s2, ext::uxth);
|
|
|
|
__ lsr(temp0, temp1, 16);
|
|
__ lsl(s2, temp0, 4);
|
|
__ sub(s2, s2, temp0);
|
|
__ add(s2, s2, temp1, ext:: uxth);
|
|
|
|
__ subs(temp0, s2, base);
|
|
__ csel(s2, temp0, s2, Assembler::HS);
|
|
|
|
// Combine lower bits and higher bits
|
|
__ bind(L_combine);
|
|
__ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
|
|
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
|
|
Register temp0, Register temp1, FloatRegister vbytes,
|
|
FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
|
|
// Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
|
|
// We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
|
|
// In non-vectorized code, we update s1 and s2 as:
|
|
// s1 <- s1 + b1
|
|
// s2 <- s2 + s1
|
|
// s1 <- s1 + b2
|
|
// s2 <- s2 + s1
|
|
// ...
|
|
// s1 <- s1 + b16
|
|
// s2 <- s2 + s1
|
|
// Putting above assignments together, we have:
|
|
// s1_new = s1 + b1 + b2 + ... + b16
|
|
// s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
|
|
// = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
|
|
// = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
|
|
__ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
|
|
|
|
// s2 = s2 + s1 * 16
|
|
__ add(s2, s2, s1, Assembler::LSL, 4);
|
|
|
|
// vs1acc = b1 + b2 + b3 + ... + b16
|
|
// vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
|
|
__ umullv(vs2acc, __ T8B, vtable, vbytes);
|
|
__ umlalv(vs2acc, __ T16B, vtable, vbytes);
|
|
__ uaddlv(vs1acc, __ T16B, vbytes);
|
|
__ uaddlv(vs2acc, __ T8H, vs2acc);
|
|
|
|
// s1 = s1 + vs1acc, s2 = s2 + vs2acc
|
|
__ fmovd(temp0, vs1acc);
|
|
__ fmovd(temp1, vs2acc);
|
|
__ add(s1, s1, temp0);
|
|
__ add(s2, s2, temp1);
|
|
}
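// For reference, the scalar equivalent of one 16-byte accumulation step
// performed by the helper above (illustrative only):
//
//   for (int i = 0; i < 16; i++) { s1 += buff[i]; s2 += s1; }
//   buff += 16;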
|
|
|
|
/**
|
|
* Arguments:
|
|
*
|
|
* Input:
|
|
* c_rarg0 - x address
|
|
* c_rarg1 - x length
|
|
* c_rarg2 - y address
|
|
* c_rarg3 - y length
|
|
* c_rarg4 - z address
|
|
*/
|
|
address generate_multiplyToLen() {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_multiplyToLen_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
address start = __ pc();
|
|
const Register x = r0;
|
|
const Register xlen = r1;
|
|
const Register y = r2;
|
|
const Register ylen = r3;
|
|
const Register z = r4;
|
|
|
|
const Register tmp0 = r5;
|
|
const Register tmp1 = r10;
|
|
const Register tmp2 = r11;
|
|
const Register tmp3 = r12;
|
|
const Register tmp4 = r13;
|
|
const Register tmp5 = r14;
|
|
const Register tmp6 = r15;
|
|
const Register tmp7 = r16;
|
|
|
|
BLOCK_COMMENT("Entry:");
|
|
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
|
__ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
address generate_squareToLen() {
|
|
// The squareToLen algorithm for sizes 1..127 described in the Java code runs
// faster than multiply_to_len on some CPUs and slower on others, but
// multiply_to_len shows slightly better results overall.
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_squareToLen_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
const Register x = r0;
|
|
const Register xlen = r1;
|
|
const Register z = r2;
|
|
const Register y = r4; // == x
|
|
const Register ylen = r5; // == xlen
|
|
|
|
const Register tmp0 = r3;
|
|
const Register tmp1 = r10;
|
|
const Register tmp2 = r11;
|
|
const Register tmp3 = r12;
|
|
const Register tmp4 = r13;
|
|
const Register tmp5 = r14;
|
|
const Register tmp6 = r15;
|
|
const Register tmp7 = r16;
|
|
|
|
RegSet spilled_regs = RegSet::of(y, ylen);
|
|
BLOCK_COMMENT("Entry:");
|
|
__ enter();
|
|
__ push(spilled_regs, sp);
|
|
__ mov(y, x);
|
|
__ mov(ylen, xlen);
|
|
__ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
|
|
__ pop(spilled_regs, sp);
|
|
__ leave();
|
|
__ ret(lr);
|
|
return start;
|
|
}
|
|
|
|
address generate_mulAdd() {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_mulAdd_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
address start = __ pc();
|
|
|
|
const Register out = r0;
|
|
const Register in = r1;
|
|
const Register offset = r2;
|
|
const Register len = r3;
|
|
const Register k = r4;
|
|
|
|
BLOCK_COMMENT("Entry:");
|
|
__ enter();
|
|
__ mul_add(out, in, offset, len, k);
|
|
__ leave();
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Arguments:
|
|
//
|
|
// Input:
|
|
// c_rarg0 - newArr address
|
|
// c_rarg1 - oldArr address
|
|
// c_rarg2 - newIdx
|
|
// c_rarg3 - shiftCount
|
|
// c_rarg4 - numIter
|
|
//
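// For reference, the per-word operation implemented below (in Java terms,
// illustrative sketch only):
//
//   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
//                      | (oldArr[i] << (32 - shiftCount));
//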
|
|
address generate_bigIntegerRightShift() {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
|
|
|
|
Register newArr = c_rarg0;
|
|
Register oldArr = c_rarg1;
|
|
Register newIdx = c_rarg2;
|
|
Register shiftCount = c_rarg3;
|
|
Register numIter = c_rarg4;
|
|
Register idx = numIter;
|
|
|
|
Register newArrCur = rscratch1;
|
|
Register shiftRevCount = rscratch2;
|
|
Register oldArrCur = r13;
|
|
Register oldArrNext = r14;
|
|
|
|
FloatRegister oldElem0 = v0;
|
|
FloatRegister oldElem1 = v1;
|
|
FloatRegister newElem = v2;
|
|
FloatRegister shiftVCount = v3;
|
|
FloatRegister shiftVRevCount = v4;
|
|
|
|
__ cbz(idx, Exit);
|
|
|
|
__ add(newArr, newArr, newIdx, Assembler::LSL, 2);
|
|
|
|
// left shift count
|
|
__ movw(shiftRevCount, 32);
|
|
__ subw(shiftRevCount, shiftRevCount, shiftCount);
|
|
|
|
// numIter too small to allow a 4-word SIMD loop; fall back to the scalar tail
|
|
__ cmp(numIter, (u1)4);
|
|
__ br(Assembler::LT, ShiftThree);
|
|
|
|
__ dup(shiftVCount, __ T4S, shiftCount);
|
|
__ dup(shiftVRevCount, __ T4S, shiftRevCount);
|
|
__ negr(shiftVCount, __ T4S, shiftVCount);
|
|
|
|
__ BIND(ShiftSIMDLoop);
|
|
|
|
// Calculate the load addresses
|
|
__ sub(idx, idx, 4);
|
|
__ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
|
|
__ add(newArrCur, newArr, idx, Assembler::LSL, 2);
|
|
__ add(oldArrCur, oldArrNext, 4);
|
|
|
|
// Load 4 words and process
|
|
__ ld1(oldElem0, __ T4S, Address(oldArrCur));
|
|
__ ld1(oldElem1, __ T4S, Address(oldArrNext));
|
|
__ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
|
|
__ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
|
|
__ orr(newElem, __ T16B, oldElem0, oldElem1);
|
|
__ st1(newElem, __ T4S, Address(newArrCur));
|
|
|
|
__ cmp(idx, (u1)4);
|
|
__ br(Assembler::LT, ShiftTwoLoop);
|
|
__ b(ShiftSIMDLoop);
|
|
|
|
__ BIND(ShiftTwoLoop);
|
|
__ cbz(idx, Exit);
|
|
__ cmp(idx, (u1)1);
|
|
__ br(Assembler::EQ, ShiftOne);
|
|
|
|
// Calculate the load addresses
|
|
__ sub(idx, idx, 2);
|
|
__ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
|
|
__ add(newArrCur, newArr, idx, Assembler::LSL, 2);
|
|
__ add(oldArrCur, oldArrNext, 4);
|
|
|
|
// Load 2 words and process
|
|
__ ld1(oldElem0, __ T2S, Address(oldArrCur));
|
|
__ ld1(oldElem1, __ T2S, Address(oldArrNext));
|
|
__ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
|
|
__ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
|
|
__ orr(newElem, __ T8B, oldElem0, oldElem1);
|
|
__ st1(newElem, __ T2S, Address(newArrCur));
|
|
__ b(ShiftTwoLoop);
|
|
|
|
__ BIND(ShiftThree);
|
|
__ tbz(idx, 1, ShiftOne);
|
|
__ tbz(idx, 0, ShiftTwo);
|
|
__ ldrw(r10, Address(oldArr, 12));
|
|
__ ldrw(r11, Address(oldArr, 8));
|
|
__ lsrvw(r10, r10, shiftCount);
|
|
__ lslvw(r11, r11, shiftRevCount);
|
|
__ orrw(r12, r10, r11);
|
|
__ strw(r12, Address(newArr, 8));
|
|
|
|
__ BIND(ShiftTwo);
|
|
__ ldrw(r10, Address(oldArr, 8));
|
|
__ ldrw(r11, Address(oldArr, 4));
|
|
__ lsrvw(r10, r10, shiftCount);
|
|
__ lslvw(r11, r11, shiftRevCount);
|
|
__ orrw(r12, r10, r11);
|
|
__ strw(r12, Address(newArr, 4));
|
|
|
|
__ BIND(ShiftOne);
|
|
__ ldrw(r10, Address(oldArr, 4));
|
|
__ ldrw(r11, Address(oldArr));
|
|
__ lsrvw(r10, r10, shiftCount);
|
|
__ lslvw(r11, r11, shiftRevCount);
|
|
__ orrw(r12, r10, r11);
|
|
__ strw(r12, Address(newArr));
|
|
|
|
__ BIND(Exit);
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Arguments:
|
|
//
|
|
// Input:
|
|
// c_rarg0 - newArr address
|
|
// c_rarg1 - oldArr address
|
|
// c_rarg2 - newIdx
|
|
// c_rarg3 - shiftCount
|
|
// c_rarg4 - numIter
|
|
//
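  // A hedged sketch of the per-word computation performed below, derived from
  // the generated loads/shifts (>>> denotes an unsigned shift):
  //   newArr[newIdx + i] = (oldArr[i] << shiftCount)
  //                      | (oldArr[i + 1] >>> (32 - shiftCount))
  // processed four words at a time in the SIMD loop and 1-2 words at a time in
  // the tail.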
|
|
address generate_bigIntegerLeftShift() {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
|
|
|
|
Register newArr = c_rarg0;
|
|
Register oldArr = c_rarg1;
|
|
Register newIdx = c_rarg2;
|
|
Register shiftCount = c_rarg3;
|
|
Register numIter = c_rarg4;
|
|
|
|
Register shiftRevCount = rscratch1;
|
|
Register oldArrNext = rscratch2;
|
|
|
|
FloatRegister oldElem0 = v0;
|
|
FloatRegister oldElem1 = v1;
|
|
FloatRegister newElem = v2;
|
|
FloatRegister shiftVCount = v3;
|
|
FloatRegister shiftVRevCount = v4;
|
|
|
|
__ cbz(numIter, Exit);
|
|
|
|
__ add(oldArrNext, oldArr, 4);
|
|
__ add(newArr, newArr, newIdx, Assembler::LSL, 2);
|
|
|
|
// right shift count
|
|
__ movw(shiftRevCount, 32);
|
|
__ subw(shiftRevCount, shiftRevCount, shiftCount);
|
|
|
|
    // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
|
|
__ cmp(numIter, (u1)4);
|
|
__ br(Assembler::LT, ShiftThree);
|
|
|
|
__ dup(shiftVCount, __ T4S, shiftCount);
|
|
__ dup(shiftVRevCount, __ T4S, shiftRevCount);
|
|
__ negr(shiftVRevCount, __ T4S, shiftVRevCount);
|
|
|
|
__ BIND(ShiftSIMDLoop);
|
|
|
|
// load 4 words and process
|
|
__ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
|
|
__ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
|
|
__ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
|
|
__ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
|
|
__ orr(newElem, __ T16B, oldElem0, oldElem1);
|
|
__ st1(newElem, __ T4S, __ post(newArr, 16));
|
|
__ sub(numIter, numIter, 4);
|
|
|
|
__ cmp(numIter, (u1)4);
|
|
__ br(Assembler::LT, ShiftTwoLoop);
|
|
__ b(ShiftSIMDLoop);
|
|
|
|
__ BIND(ShiftTwoLoop);
|
|
__ cbz(numIter, Exit);
|
|
__ cmp(numIter, (u1)1);
|
|
__ br(Assembler::EQ, ShiftOne);
|
|
|
|
// load 2 words and process
|
|
__ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
|
|
__ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
|
|
__ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
|
|
__ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
|
|
__ orr(newElem, __ T8B, oldElem0, oldElem1);
|
|
__ st1(newElem, __ T2S, __ post(newArr, 8));
|
|
__ sub(numIter, numIter, 2);
|
|
__ b(ShiftTwoLoop);
|
|
|
|
__ BIND(ShiftThree);
|
|
__ ldrw(r10, __ post(oldArr, 4));
|
|
__ ldrw(r11, __ post(oldArrNext, 4));
|
|
__ lslvw(r10, r10, shiftCount);
|
|
__ lsrvw(r11, r11, shiftRevCount);
|
|
__ orrw(r12, r10, r11);
|
|
__ strw(r12, __ post(newArr, 4));
|
|
__ tbz(numIter, 1, Exit);
|
|
__ tbz(numIter, 0, ShiftOne);
|
|
|
|
__ BIND(ShiftTwo);
|
|
__ ldrw(r10, __ post(oldArr, 4));
|
|
__ ldrw(r11, __ post(oldArrNext, 4));
|
|
__ lslvw(r10, r10, shiftCount);
|
|
__ lsrvw(r11, r11, shiftRevCount);
|
|
__ orrw(r12, r10, r11);
|
|
__ strw(r12, __ post(newArr, 4));
|
|
|
|
__ BIND(ShiftOne);
|
|
__ ldrw(r10, Address(oldArr));
|
|
__ ldrw(r11, Address(oldArrNext));
|
|
__ lslvw(r10, r10, shiftCount);
|
|
__ lsrvw(r11, r11, shiftRevCount);
|
|
__ orrw(r12, r10, r11);
|
|
__ strw(r12, Address(newArr));
|
|
|
|
__ BIND(Exit);
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
address generate_count_positives(address &count_positives_long) {
|
|
const u1 large_loop_size = 64;
|
|
const uint64_t UPPER_BIT_MASK=0x8080808080808080;
|
|
int dcache_line = VM_Version::dcache_line_size();
|
|
|
|
Register ary1 = r1, len = r2, result = r0;
|
|
|
|
__ align(CodeEntryAlignment);
|
|
|
|
StubId stub_id = StubId::stubgen_count_positives_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
address entry = __ pc();
|
|
|
|
__ enter();
|
|
// precondition: a copy of len is already in result
|
|
// __ mov(result, len);
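    // Hedged scalar sketch of what the stub computes (assuming a conservative
    // result is acceptable, as the RET_ADJUST paths below suggest):
    //   for (i = 0; i < len; i++)
    //     if (ary1[i] < 0) return some value <= i;  // first negative byte seen
    //   return len;                                 // all bytes are positive
    // The fast paths may report a position slightly before the first negative
    // byte because a whole 16- or 64-byte block is discounted at once.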
|
|
|
|
Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
|
|
LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
|
|
|
|
__ cmp(len, (u1)15);
|
|
__ br(Assembler::GT, LEN_OVER_15);
|
|
    // The only case where execution falls into this code is when the pointer is
    // near the end of a memory page and we have to avoid reading past it into
    // the next page
|
|
__ add(ary1, ary1, len);
|
|
__ subs(len, len, 8);
|
|
__ br(Assembler::GT, LEN_OVER_8);
|
|
__ ldr(rscratch2, Address(ary1, -8));
|
|
__ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
|
|
__ lsrv(rscratch2, rscratch2, rscratch1);
|
|
__ tst(rscratch2, UPPER_BIT_MASK);
|
|
__ csel(result, zr, result, Assembler::NE);
|
|
__ leave();
|
|
__ ret(lr);
|
|
__ bind(LEN_OVER_8);
|
|
__ ldp(rscratch1, rscratch2, Address(ary1, -16));
|
|
__ sub(len, len, 8); // no data dep., then sub can be executed while loading
|
|
__ tst(rscratch2, UPPER_BIT_MASK);
|
|
__ br(Assembler::NE, RET_NO_POP);
|
|
__ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
|
|
__ lsrv(rscratch1, rscratch1, rscratch2);
|
|
__ tst(rscratch1, UPPER_BIT_MASK);
|
|
__ bind(RET_NO_POP);
|
|
__ csel(result, zr, result, Assembler::NE);
|
|
__ leave();
|
|
__ ret(lr);
|
|
|
|
Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
|
|
const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
|
|
|
|
count_positives_long = __ pc(); // 2nd entry point
|
|
|
|
__ enter();
|
|
|
|
__ bind(LEN_OVER_15);
|
|
__ push(spilled_regs, sp);
|
|
__ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
|
|
__ cbz(rscratch2, ALIGNED);
|
|
__ ldp(tmp6, tmp1, Address(ary1));
|
|
__ mov(tmp5, 16);
|
|
__ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
|
|
__ add(ary1, ary1, rscratch1);
|
|
__ orr(tmp6, tmp6, tmp1);
|
|
__ tst(tmp6, UPPER_BIT_MASK);
|
|
__ br(Assembler::NE, RET_ADJUST);
|
|
__ sub(len, len, rscratch1);
|
|
|
|
__ bind(ALIGNED);
|
|
__ cmp(len, large_loop_size);
|
|
__ br(Assembler::LT, CHECK_16);
|
|
    // Perform a 16-byte load as an early return in the pre-loop to handle the
    // case where an initially aligned large array has negative values in its
    // starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
    // worst case, which is slower. Cases with negative bytes further ahead are
    // not affected much; in fact they are faster due to the early loads, fewer
    // instructions and fewer branches in LARGE_LOOP.
|
|
__ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
|
|
__ sub(len, len, 16);
|
|
__ orr(tmp6, tmp6, tmp1);
|
|
__ tst(tmp6, UPPER_BIT_MASK);
|
|
__ br(Assembler::NE, RET_ADJUST_16);
|
|
__ cmp(len, large_loop_size);
|
|
__ br(Assembler::LT, CHECK_16);
|
|
|
|
if (SoftwarePrefetchHintDistance >= 0
|
|
&& SoftwarePrefetchHintDistance >= dcache_line) {
|
|
// initial prefetch
|
|
__ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
|
|
}
|
|
__ bind(LARGE_LOOP);
|
|
if (SoftwarePrefetchHintDistance >= 0) {
|
|
__ prfm(Address(ary1, SoftwarePrefetchHintDistance));
|
|
}
|
|
    // Issue the load instructions first, since that can save a few CPU/MEM
    // cycles. Also, instead of 4 triples of "orr(...); addr(...); cbnz(...)"
    // (one per ldp), generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which
    // saves 3 instructions per iteration and has fewer branches. The drawback
    // is that early return is disabled, so all 64 bytes are loaded and checked
    // every time.
|
|
__ ldp(tmp2, tmp3, Address(ary1));
|
|
__ ldp(tmp4, tmp5, Address(ary1, 16));
|
|
__ ldp(rscratch1, rscratch2, Address(ary1, 32));
|
|
__ ldp(tmp6, tmp1, Address(ary1, 48));
|
|
__ add(ary1, ary1, large_loop_size);
|
|
__ sub(len, len, large_loop_size);
|
|
__ orr(tmp2, tmp2, tmp3);
|
|
__ orr(tmp4, tmp4, tmp5);
|
|
__ orr(rscratch1, rscratch1, rscratch2);
|
|
__ orr(tmp6, tmp6, tmp1);
|
|
__ orr(tmp2, tmp2, tmp4);
|
|
__ orr(rscratch1, rscratch1, tmp6);
|
|
__ orr(tmp2, tmp2, rscratch1);
|
|
__ tst(tmp2, UPPER_BIT_MASK);
|
|
__ br(Assembler::NE, RET_ADJUST_LONG);
|
|
__ cmp(len, large_loop_size);
|
|
__ br(Assembler::GE, LARGE_LOOP);
|
|
|
|
__ bind(CHECK_16); // small 16-byte load pre-loop
|
|
__ cmp(len, (u1)16);
|
|
__ br(Assembler::LT, POST_LOOP16);
|
|
|
|
__ bind(LOOP16); // small 16-byte load loop
|
|
__ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
|
|
__ sub(len, len, 16);
|
|
__ orr(tmp2, tmp2, tmp3);
|
|
__ tst(tmp2, UPPER_BIT_MASK);
|
|
__ br(Assembler::NE, RET_ADJUST_16);
|
|
__ cmp(len, (u1)16);
|
|
__ br(Assembler::GE, LOOP16); // 16-byte load loop end
|
|
|
|
__ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
|
|
__ cmp(len, (u1)8);
|
|
__ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
|
|
__ ldr(tmp3, Address(__ post(ary1, 8)));
|
|
__ tst(tmp3, UPPER_BIT_MASK);
|
|
__ br(Assembler::NE, RET_ADJUST);
|
|
__ sub(len, len, 8);
|
|
|
|
__ bind(POST_LOOP16_LOAD_TAIL);
|
|
__ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
|
|
__ ldr(tmp1, Address(ary1));
|
|
__ mov(tmp2, 64);
|
|
__ sub(tmp4, tmp2, len, __ LSL, 3);
|
|
__ lslv(tmp1, tmp1, tmp4);
|
|
__ tst(tmp1, UPPER_BIT_MASK);
|
|
__ br(Assembler::NE, RET_ADJUST);
|
|
// Fallthrough
|
|
|
|
__ bind(RET_LEN);
|
|
__ pop(spilled_regs, sp);
|
|
__ leave();
|
|
__ ret(lr);
|
|
|
|
    // The difference result - len is the count of bytes guaranteed to be
    // positive.
|
|
|
|
__ bind(RET_ADJUST_LONG);
|
|
__ add(len, len, (u1)(large_loop_size - 16));
|
|
__ bind(RET_ADJUST_16);
|
|
__ add(len, len, 16);
|
|
__ bind(RET_ADJUST);
|
|
__ pop(spilled_regs, sp);
|
|
__ leave();
|
|
__ sub(result, result, len);
|
|
__ ret(lr);
|
|
|
|
return entry;
|
|
}
|
|
|
|
void generate_large_array_equals_loop_nonsimd(int loopThreshold,
|
|
bool usePrefetch, Label &NOT_EQUAL) {
|
|
Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
|
|
tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
|
|
tmp7 = r12, tmp8 = r13;
|
|
Label LOOP;
|
|
|
|
__ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
|
|
__ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
|
|
__ bind(LOOP);
|
|
if (usePrefetch) {
|
|
__ prfm(Address(a1, SoftwarePrefetchHintDistance));
|
|
__ prfm(Address(a2, SoftwarePrefetchHintDistance));
|
|
}
|
|
__ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
|
|
__ eor(tmp1, tmp1, tmp2);
|
|
__ eor(tmp3, tmp3, tmp4);
|
|
__ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
|
|
__ orr(tmp1, tmp1, tmp3);
|
|
__ cbnz(tmp1, NOT_EQUAL);
|
|
__ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
|
|
__ eor(tmp5, tmp5, tmp6);
|
|
__ eor(tmp7, tmp7, tmp8);
|
|
__ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
|
|
__ orr(tmp5, tmp5, tmp7);
|
|
__ cbnz(tmp5, NOT_EQUAL);
|
|
__ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
|
|
__ eor(tmp1, tmp1, tmp2);
|
|
__ eor(tmp3, tmp3, tmp4);
|
|
__ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
|
|
__ orr(tmp1, tmp1, tmp3);
|
|
__ cbnz(tmp1, NOT_EQUAL);
|
|
__ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
|
|
__ eor(tmp5, tmp5, tmp6);
|
|
__ sub(cnt1, cnt1, 8 * wordSize);
|
|
__ eor(tmp7, tmp7, tmp8);
|
|
__ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
|
|
// tmp6 is not used. MacroAssembler::subs is used here (rather than
|
|
// cmp) because subs allows an unlimited range of immediate operand.
|
|
__ subs(tmp6, cnt1, loopThreshold);
|
|
__ orr(tmp5, tmp5, tmp7);
|
|
__ cbnz(tmp5, NOT_EQUAL);
|
|
__ br(__ GE, LOOP);
|
|
// post-loop
|
|
__ eor(tmp1, tmp1, tmp2);
|
|
__ eor(tmp3, tmp3, tmp4);
|
|
__ orr(tmp1, tmp1, tmp3);
|
|
__ sub(cnt1, cnt1, 2 * wordSize);
|
|
__ cbnz(tmp1, NOT_EQUAL);
|
|
}
|
|
|
|
void generate_large_array_equals_loop_simd(int loopThreshold,
|
|
bool usePrefetch, Label &NOT_EQUAL) {
|
|
Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
|
|
tmp2 = rscratch2;
|
|
Label LOOP;
|
|
|
|
__ bind(LOOP);
|
|
if (usePrefetch) {
|
|
__ prfm(Address(a1, SoftwarePrefetchHintDistance));
|
|
__ prfm(Address(a2, SoftwarePrefetchHintDistance));
|
|
}
|
|
__ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
|
|
__ sub(cnt1, cnt1, 8 * wordSize);
|
|
__ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
|
|
__ subs(tmp1, cnt1, loopThreshold);
|
|
__ eor(v0, __ T16B, v0, v4);
|
|
__ eor(v1, __ T16B, v1, v5);
|
|
__ eor(v2, __ T16B, v2, v6);
|
|
__ eor(v3, __ T16B, v3, v7);
|
|
__ orr(v0, __ T16B, v0, v1);
|
|
__ orr(v1, __ T16B, v2, v3);
|
|
__ orr(v0, __ T16B, v0, v1);
|
|
__ umov(tmp1, v0, __ D, 0);
|
|
__ umov(tmp2, v0, __ D, 1);
|
|
__ orr(tmp1, tmp1, tmp2);
|
|
__ cbnz(tmp1, NOT_EQUAL);
|
|
__ br(__ GE, LOOP);
|
|
}
|
|
|
|
// a1 = r1 - array1 address
|
|
// a2 = r2 - array2 address
|
|
// result = r0 - return value. Already contains "false"
|
|
// cnt1 = r10 - amount of elements left to check, reduced by wordSize
|
|
// r3-r5 are reserved temporary registers
|
|
// Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
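  // Hedged summary of the comparison strategy inferred from the code below:
  // corresponding 8-byte words of a1 and a2 are eor-ed and the results
  // or-accumulated, so a single cbnz decides several words at once. Any nonzero
  // accumulator means the arrays differ and we branch to NOT_EQUAL with result
  // still holding "false".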
|
|
address generate_large_array_equals() {
|
|
Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
|
|
tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
|
|
tmp7 = r12, tmp8 = r13;
|
|
Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
|
|
SMALL_LOOP, POST_LOOP;
|
|
const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
|
|
// calculate if at least 32 prefetched bytes are used
|
|
int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
|
|
int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
|
|
RegSet spilled_regs = RegSet::range(tmp6, tmp8);
|
|
assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
|
|
tmp5, tmp6, tmp7, tmp8);
|
|
|
|
__ align(CodeEntryAlignment);
|
|
|
|
StubId stub_id = StubId::stubgen_large_array_equals_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
address entry = __ pc();
|
|
__ enter();
|
|
__ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
|
|
// also advance pointers to use post-increment instead of pre-increment
|
|
__ add(a1, a1, wordSize);
|
|
__ add(a2, a2, wordSize);
|
|
if (AvoidUnalignedAccesses) {
|
|
      // Both implementations (SIMD and non-SIMD) use relatively large load
      // instructions (ld1/ldp), which carry a big penalty (up to 2x execution
      // time) on some CPUs when the address is not at least 16-byte aligned.
      // Arrays are currently 8-byte aligned, so if needed we do an additional
      // 8-byte load for at least the 1st address to make it 16-byte aligned.
|
|
Label ALIGNED16;
|
|
__ tbz(a1, 3, ALIGNED16);
|
|
__ ldr(tmp1, Address(__ post(a1, wordSize)));
|
|
__ ldr(tmp2, Address(__ post(a2, wordSize)));
|
|
__ sub(cnt1, cnt1, wordSize);
|
|
__ eor(tmp1, tmp1, tmp2);
|
|
__ cbnz(tmp1, NOT_EQUAL_NO_POP);
|
|
__ bind(ALIGNED16);
|
|
}
|
|
if (UseSIMDForArrayEquals) {
|
|
if (SoftwarePrefetchHintDistance >= 0) {
|
|
__ subs(tmp1, cnt1, prefetchLoopThreshold);
|
|
__ br(__ LE, NO_PREFETCH_LARGE_LOOP);
|
|
generate_large_array_equals_loop_simd(prefetchLoopThreshold,
|
|
/* prfm = */ true, NOT_EQUAL);
|
|
__ subs(zr, cnt1, nonPrefetchLoopThreshold);
|
|
__ br(__ LT, TAIL);
|
|
}
|
|
__ bind(NO_PREFETCH_LARGE_LOOP);
|
|
generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
|
|
/* prfm = */ false, NOT_EQUAL);
|
|
} else {
|
|
__ push(spilled_regs, sp);
|
|
if (SoftwarePrefetchHintDistance >= 0) {
|
|
__ subs(tmp1, cnt1, prefetchLoopThreshold);
|
|
__ br(__ LE, NO_PREFETCH_LARGE_LOOP);
|
|
generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
|
|
/* prfm = */ true, NOT_EQUAL);
|
|
__ subs(zr, cnt1, nonPrefetchLoopThreshold);
|
|
__ br(__ LT, TAIL);
|
|
}
|
|
__ bind(NO_PREFETCH_LARGE_LOOP);
|
|
generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
|
|
/* prfm = */ false, NOT_EQUAL);
|
|
}
|
|
__ bind(TAIL);
|
|
__ cbz(cnt1, EQUAL);
|
|
__ subs(cnt1, cnt1, wordSize);
|
|
__ br(__ LE, POST_LOOP);
|
|
__ bind(SMALL_LOOP);
|
|
__ ldr(tmp1, Address(__ post(a1, wordSize)));
|
|
__ ldr(tmp2, Address(__ post(a2, wordSize)));
|
|
__ subs(cnt1, cnt1, wordSize);
|
|
__ eor(tmp1, tmp1, tmp2);
|
|
__ cbnz(tmp1, NOT_EQUAL);
|
|
__ br(__ GT, SMALL_LOOP);
|
|
__ bind(POST_LOOP);
|
|
__ ldr(tmp1, Address(a1, cnt1));
|
|
__ ldr(tmp2, Address(a2, cnt1));
|
|
__ eor(tmp1, tmp1, tmp2);
|
|
__ cbnz(tmp1, NOT_EQUAL);
|
|
__ bind(EQUAL);
|
|
__ mov(result, true);
|
|
__ bind(NOT_EQUAL);
|
|
if (!UseSIMDForArrayEquals) {
|
|
__ pop(spilled_regs, sp);
|
|
}
|
|
__ bind(NOT_EQUAL_NO_POP);
|
|
__ leave();
|
|
__ ret(lr);
|
|
return entry;
|
|
}
|
|
|
|
// result = r0 - return value. Contains initial hashcode value on entry.
|
|
// ary = r1 - array address
|
|
// cnt = r2 - elements count
|
|
// Clobbers: v0-v13, rscratch1, rscratch2
|
|
address generate_large_arrays_hashcode(BasicType eltype) {
|
|
const Register result = r0, ary = r1, cnt = r2;
|
|
const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
|
|
const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
|
|
const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0>
|
|
const FloatRegister vpowm = v13;
|
|
|
|
ARRAYS_HASHCODE_REGISTERS;
|
|
|
|
Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
|
|
|
|
unsigned int vf; // vectorization factor
|
|
bool multiply_by_halves;
|
|
Assembler::SIMD_Arrangement load_arrangement;
|
|
switch (eltype) {
|
|
case T_BOOLEAN:
|
|
case T_BYTE:
|
|
load_arrangement = Assembler::T8B;
|
|
multiply_by_halves = true;
|
|
vf = 8;
|
|
break;
|
|
case T_CHAR:
|
|
case T_SHORT:
|
|
load_arrangement = Assembler::T8H;
|
|
multiply_by_halves = true;
|
|
vf = 8;
|
|
break;
|
|
case T_INT:
|
|
load_arrangement = Assembler::T4S;
|
|
multiply_by_halves = false;
|
|
vf = 4;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
// Unroll factor
|
|
const unsigned uf = 4;
|
|
|
|
// Effective vectorization factor
|
|
const unsigned evf = vf * uf;
|
|
|
|
__ align(CodeEntryAlignment);
|
|
|
|
StubId stub_id;
|
|
switch (eltype) {
|
|
case T_BOOLEAN:
|
|
stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
|
|
break;
|
|
case T_BYTE:
|
|
stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
|
|
break;
|
|
case T_CHAR:
|
|
stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
|
|
break;
|
|
case T_SHORT:
|
|
stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
|
|
break;
|
|
case T_INT:
|
|
stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
|
|
break;
|
|
default:
|
|
stub_id = StubId::NO_STUBID;
|
|
ShouldNotReachHere();
|
|
};
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
address entry = __ pc();
|
|
__ enter();
|
|
|
|
    // Put the 0th to 3rd powers of 31 together into a single SIMD register. The
    // register will be used in the SMALL and LARGE loops' epilogues. The
    // initialization is hoisted here and the register's value should not change
    // throughout both loops.
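    // Hedged note on the vectorization: the scalar hash recurrence is
    //   h = 31 * h + a[i]
    // and applying it vf times at once gives
    //   h = h * 31^vf + a[i] * 31^(vf-1) + ... + a[i+vf-1]
    // which is, in effect, what the multiply by vpowm followed by the
    // (widening) adds implements across the four lanes; the epilogues then
    // multiply the lane accumulators by <31^3, 31^2, 31, 1> (vpow) and sum
    // them.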
|
|
__ movw(rscratch1, intpow(31U, 3));
|
|
__ mov(vpow, Assembler::S, 0, rscratch1);
|
|
__ movw(rscratch1, intpow(31U, 2));
|
|
__ mov(vpow, Assembler::S, 1, rscratch1);
|
|
__ movw(rscratch1, intpow(31U, 1));
|
|
__ mov(vpow, Assembler::S, 2, rscratch1);
|
|
__ movw(rscratch1, intpow(31U, 0));
|
|
__ mov(vpow, Assembler::S, 3, rscratch1);
|
|
|
|
__ mov(vmul0, Assembler::T16B, 0);
|
|
__ mov(vmul0, Assembler::S, 3, result);
|
|
|
|
__ andr(rscratch2, cnt, (uf - 1) * vf);
|
|
__ cbz(rscratch2, LARGE_LOOP_PREHEADER);
|
|
|
|
__ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
|
|
__ mov(vpowm, Assembler::S, 0, rscratch1);
|
|
|
|
// SMALL LOOP
|
|
__ bind(SMALL_LOOP);
|
|
|
|
__ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
|
|
__ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
|
|
__ subsw(rscratch2, rscratch2, vf);
|
|
|
|
if (load_arrangement == Assembler::T8B) {
|
|
// Extend 8B to 8H to be able to use vector multiply
|
|
// instructions
|
|
assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
|
|
if (is_signed_subword_type(eltype)) {
|
|
__ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
|
|
} else {
|
|
__ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
|
|
}
|
|
}
|
|
|
|
switch (load_arrangement) {
|
|
case Assembler::T4S:
|
|
__ addv(vmul0, load_arrangement, vmul0, vdata0);
|
|
break;
|
|
case Assembler::T8B:
|
|
case Assembler::T8H:
|
|
assert(is_subword_type(eltype), "subword type expected");
|
|
if (is_signed_subword_type(eltype)) {
|
|
__ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
|
|
} else {
|
|
__ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
|
|
}
|
|
break;
|
|
default:
|
|
__ should_not_reach_here();
|
|
}
|
|
|
|
// Process the upper half of a vector
|
|
if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
|
|
__ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
|
|
if (is_signed_subword_type(eltype)) {
|
|
__ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
|
|
} else {
|
|
__ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
|
|
}
|
|
}
|
|
|
|
__ br(Assembler::HI, SMALL_LOOP);
|
|
|
|
    // SMALL LOOP'S EPILOGUE
|
|
__ lsr(rscratch2, cnt, exact_log2(evf));
|
|
__ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
|
|
|
|
__ mulv(vmul0, Assembler::T4S, vmul0, vpow);
|
|
__ addv(vmul0, Assembler::T4S, vmul0);
|
|
__ umov(result, vmul0, Assembler::S, 0);
|
|
|
|
// TAIL
|
|
__ bind(TAIL);
|
|
|
|
    // The andr computes cnt % vf. The subtract, with its operand shifted left
    // by 3 (4 on Cortex-A53), branches past vf - 1 - (cnt % vf) pairs of
    // load + madd insns, i.e. it only executes cnt % vf load + madd pairs.
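    // Hedged example: with vf == 8 and cnt % vf == 3, the computed branch lands
    // three load + madd pairs before BR_BASE, so exactly the 3 tail elements
    // are folded into result.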
|
|
assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
|
|
__ andr(rscratch2, cnt, vf - 1);
|
|
__ bind(TAIL_SHORTCUT);
|
|
__ adr(rscratch1, BR_BASE);
|
|
    // For Cortex-A53 the shift amount is 4 because 2 nops are generated per iteration.
|
|
__ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
|
|
__ movw(rscratch2, 0x1f);
|
|
__ br(rscratch1);
|
|
|
|
for (size_t i = 0; i < vf - 1; ++i) {
|
|
__ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
|
|
eltype);
|
|
__ maddw(result, result, rscratch2, rscratch1);
|
|
// maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
|
|
// Generate 2nd nop to have 4 instructions per iteration.
|
|
if (VM_Version::supports_a53mac()) {
|
|
__ nop();
|
|
}
|
|
}
|
|
__ bind(BR_BASE);
|
|
|
|
__ leave();
|
|
__ ret(lr);
|
|
|
|
// LARGE LOOP
|
|
__ bind(LARGE_LOOP_PREHEADER);
|
|
|
|
__ lsr(rscratch2, cnt, exact_log2(evf));
|
|
|
|
if (multiply_by_halves) {
|
|
// 31^4 - multiplier between lower and upper parts of a register
|
|
__ movw(rscratch1, intpow(31U, vf / 2));
|
|
__ mov(vpowm, Assembler::S, 1, rscratch1);
|
|
      // 31^28 - the remainder of the iteration multiplier, 28 = 32 - 4
|
|
__ movw(rscratch1, intpow(31U, evf - vf / 2));
|
|
__ mov(vpowm, Assembler::S, 0, rscratch1);
|
|
} else {
|
|
// 31^16
|
|
__ movw(rscratch1, intpow(31U, evf));
|
|
__ mov(vpowm, Assembler::S, 0, rscratch1);
|
|
}
|
|
|
|
__ mov(vmul3, Assembler::T16B, 0);
|
|
__ mov(vmul2, Assembler::T16B, 0);
|
|
__ mov(vmul1, Assembler::T16B, 0);
|
|
|
|
__ bind(LARGE_LOOP);
|
|
|
|
__ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
|
|
__ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
|
|
__ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
|
|
__ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
|
|
|
|
__ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
|
|
Address(__ post(ary, evf * type2aelembytes(eltype))));
|
|
|
|
if (load_arrangement == Assembler::T8B) {
|
|
// Extend 8B to 8H to be able to use vector multiply
|
|
// instructions
|
|
assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
|
|
if (is_signed_subword_type(eltype)) {
|
|
__ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
|
|
__ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
|
|
__ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
|
|
__ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
|
|
} else {
|
|
__ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
|
|
__ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
|
|
__ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
|
|
__ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
|
|
}
|
|
}
|
|
|
|
switch (load_arrangement) {
|
|
case Assembler::T4S:
|
|
__ addv(vmul3, load_arrangement, vmul3, vdata3);
|
|
__ addv(vmul2, load_arrangement, vmul2, vdata2);
|
|
__ addv(vmul1, load_arrangement, vmul1, vdata1);
|
|
__ addv(vmul0, load_arrangement, vmul0, vdata0);
|
|
break;
|
|
case Assembler::T8B:
|
|
case Assembler::T8H:
|
|
assert(is_subword_type(eltype), "subword type expected");
|
|
if (is_signed_subword_type(eltype)) {
|
|
__ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
|
|
__ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
|
|
__ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
|
|
__ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
|
|
} else {
|
|
__ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
|
|
__ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
|
|
__ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
|
|
__ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
|
|
}
|
|
break;
|
|
default:
|
|
__ should_not_reach_here();
|
|
}
|
|
|
|
// Process the upper half of a vector
|
|
if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
|
|
__ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
|
|
__ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
|
|
__ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
|
|
__ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
|
|
if (is_signed_subword_type(eltype)) {
|
|
__ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
|
|
__ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
|
|
__ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
|
|
__ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
|
|
} else {
|
|
__ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
|
|
__ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
|
|
__ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
|
|
__ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
|
|
}
|
|
}
|
|
|
|
__ subsw(rscratch2, rscratch2, 1);
|
|
__ br(Assembler::HI, LARGE_LOOP);
|
|
|
|
__ mulv(vmul3, Assembler::T4S, vmul3, vpow);
|
|
__ addv(vmul3, Assembler::T4S, vmul3);
|
|
__ umov(result, vmul3, Assembler::S, 0);
|
|
|
|
__ mov(rscratch2, intpow(31U, vf));
|
|
|
|
__ mulv(vmul2, Assembler::T4S, vmul2, vpow);
|
|
__ addv(vmul2, Assembler::T4S, vmul2);
|
|
__ umov(rscratch1, vmul2, Assembler::S, 0);
|
|
__ maddw(result, result, rscratch2, rscratch1);
|
|
|
|
__ mulv(vmul1, Assembler::T4S, vmul1, vpow);
|
|
__ addv(vmul1, Assembler::T4S, vmul1);
|
|
__ umov(rscratch1, vmul1, Assembler::S, 0);
|
|
__ maddw(result, result, rscratch2, rscratch1);
|
|
|
|
__ mulv(vmul0, Assembler::T4S, vmul0, vpow);
|
|
__ addv(vmul0, Assembler::T4S, vmul0);
|
|
__ umov(rscratch1, vmul0, Assembler::S, 0);
|
|
__ maddw(result, result, rscratch2, rscratch1);
|
|
|
|
__ andr(rscratch2, cnt, vf - 1);
|
|
__ cbnz(rscratch2, TAIL_SHORTCUT);
|
|
|
|
__ leave();
|
|
__ ret(lr);
|
|
|
|
return entry;
|
|
}
|
|
|
|
address generate_dsin_dcos(bool isCos) {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
__ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
|
|
(address)StubRoutines::aarch64::_two_over_pi,
|
|
(address)StubRoutines::aarch64::_pio2,
|
|
(address)StubRoutines::aarch64::_dsin_coef,
|
|
(address)StubRoutines::aarch64::_dcos_coef);
|
|
return start;
|
|
}
|
|
|
|
  // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
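  // Hedged summary: 16 Latin1 bytes are loaded into vtmp, zip1/zip2 with the
  // zero register vtmpZ widen them to 16 UTF-16 characters, and these are then
  // compared 8 bytes (4 characters) at a time against 64-bit words loaded from
  // the UTF-16 string.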
|
|
void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
|
|
Label &DIFF2) {
|
|
Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
|
|
FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
|
|
|
|
__ ldrq(vtmp, Address(__ post(tmp2, 16)));
|
|
__ ldr(tmpU, Address(__ post(cnt1, 8)));
|
|
__ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
|
|
// now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
|
|
|
|
__ fmovd(tmpL, vtmp3);
|
|
__ eor(rscratch2, tmp3, tmpL);
|
|
__ cbnz(rscratch2, DIFF2);
|
|
|
|
__ ldr(tmp3, Address(__ post(cnt1, 8)));
|
|
__ umov(tmpL, vtmp3, __ D, 1);
|
|
__ eor(rscratch2, tmpU, tmpL);
|
|
__ cbnz(rscratch2, DIFF1);
|
|
|
|
__ zip2(vtmp, __ T16B, vtmp, vtmpZ);
|
|
__ ldr(tmpU, Address(__ post(cnt1, 8)));
|
|
__ fmovd(tmpL, vtmp);
|
|
__ eor(rscratch2, tmp3, tmpL);
|
|
__ cbnz(rscratch2, DIFF2);
|
|
|
|
__ ldr(tmp3, Address(__ post(cnt1, 8)));
|
|
__ umov(tmpL, vtmp, __ D, 1);
|
|
__ eor(rscratch2, tmpU, tmpL);
|
|
__ cbnz(rscratch2, DIFF1);
|
|
}
|
|
|
|
// r0 = result
|
|
// r1 = str1
|
|
// r2 = cnt1
|
|
// r3 = str2
|
|
// r4 = cnt2
|
|
// r10 = tmp1
|
|
// r11 = tmp2
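  // Hedged note: isLU means str1 is Latin1 and str2 is UTF-16 (UL is the
  // opposite). In either case the Latin1 side is widened with zip1 against a
  // zero vector before the 8-byte comparisons, so both variants share the
  // compare_string_16_x_LU helper.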
|
|
address generate_compare_long_string_different_encoding(bool isLU) {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
|
|
StubCodeMark mark(this, stub_id);
|
|
address entry = __ pc();
|
|
Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
|
|
DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
|
|
LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
|
|
Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
|
|
tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
|
|
FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
|
|
RegSet spilled_regs = RegSet::of(tmp3, tmp4);
|
|
|
|
int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
|
|
|
|
__ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
|
|
// cnt2 == amount of characters left to compare
|
|
    // Check the already-loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
|
|
__ zip1(vtmp, __ T8B, vtmp, vtmpZ);
|
|
__ add(str1, str1, isLU ? wordSize/2 : wordSize);
|
|
__ add(str2, str2, isLU ? wordSize : wordSize/2);
|
|
__ fmovd(isLU ? tmp1 : tmp2, vtmp);
|
|
__ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
|
|
__ eor(rscratch2, tmp1, tmp2);
|
|
__ mov(rscratch1, tmp2);
|
|
__ cbnz(rscratch2, CALCULATE_DIFFERENCE);
|
|
Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
|
|
tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
|
|
__ push(spilled_regs, sp);
|
|
__ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
|
|
__ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
|
|
|
|
__ ldr(tmp3, Address(__ post(cnt1, 8)));
|
|
|
|
if (SoftwarePrefetchHintDistance >= 0) {
|
|
__ subs(rscratch2, cnt2, prefetchLoopExitCondition);
|
|
__ br(__ LT, NO_PREFETCH);
|
|
__ bind(LARGE_LOOP_PREFETCH);
|
|
__ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
|
|
__ mov(tmp4, 2);
|
|
__ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
|
|
__ bind(LARGE_LOOP_PREFETCH_REPEAT1);
|
|
compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
|
|
__ subs(tmp4, tmp4, 1);
|
|
__ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
|
|
__ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
|
|
__ mov(tmp4, 2);
|
|
__ bind(LARGE_LOOP_PREFETCH_REPEAT2);
|
|
compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
|
|
__ subs(tmp4, tmp4, 1);
|
|
__ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
|
|
__ sub(cnt2, cnt2, 64);
|
|
__ subs(rscratch2, cnt2, prefetchLoopExitCondition);
|
|
__ br(__ GE, LARGE_LOOP_PREFETCH);
|
|
}
|
|
__ cbz(cnt2, LOAD_LAST); // no characters left except last load
|
|
__ bind(NO_PREFETCH);
|
|
__ subs(cnt2, cnt2, 16);
|
|
__ br(__ LT, TAIL);
|
|
__ align(OptoLoopAlignment);
|
|
__ bind(SMALL_LOOP); // smaller loop
|
|
__ subs(cnt2, cnt2, 16);
|
|
compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
|
|
__ br(__ GE, SMALL_LOOP);
|
|
__ cmn(cnt2, (u1)16);
|
|
__ br(__ EQ, LOAD_LAST);
|
|
__ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
|
|
__ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
|
|
__ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
|
|
__ ldr(tmp3, Address(cnt1, -8));
|
|
compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
|
|
__ b(LOAD_LAST);
|
|
__ bind(DIFF2);
|
|
__ mov(tmpU, tmp3);
|
|
__ bind(DIFF1);
|
|
__ pop(spilled_regs, sp);
|
|
__ b(CALCULATE_DIFFERENCE);
|
|
__ bind(LOAD_LAST);
|
|
    // The last 4 UTF-16 characters are already pre-loaded into tmp3 by
    // compare_string_16_x_LU, so there is no need to load them again.
|
|
__ mov(tmpU, tmp3);
|
|
__ pop(spilled_regs, sp);
|
|
|
|
// tmp2 points to the address of the last 4 Latin1 characters right now
|
|
__ ldrs(vtmp, Address(tmp2));
|
|
__ zip1(vtmp, __ T8B, vtmp, vtmpZ);
|
|
__ fmovd(tmpL, vtmp);
|
|
|
|
__ eor(rscratch2, tmpU, tmpL);
|
|
__ cbz(rscratch2, DONE);
|
|
|
|
// Find the first different characters in the longwords and
|
|
// compute their difference.
|
|
__ bind(CALCULATE_DIFFERENCE);
|
|
__ rev(rscratch2, rscratch2);
|
|
__ clz(rscratch2, rscratch2);
|
|
__ andr(rscratch2, rscratch2, -16);
|
|
__ lsrv(tmp1, tmp1, rscratch2);
|
|
__ uxthw(tmp1, tmp1);
|
|
__ lsrv(rscratch1, rscratch1, rscratch2);
|
|
__ uxthw(rscratch1, rscratch1);
|
|
__ subw(result, tmp1, rscratch1);
|
|
__ bind(DONE);
|
|
__ ret(lr);
|
|
return entry;
|
|
}
|
|
|
|
// r0 = input (float16)
|
|
// v0 = result (float)
|
|
// v1 = temporary float register
|
|
address generate_float16ToFloat() {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_hf2f_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address entry = __ pc();
|
|
BLOCK_COMMENT("Entry:");
|
|
__ flt16_to_flt(v0, r0, v1);
|
|
__ ret(lr);
|
|
return entry;
|
|
}
|
|
|
|
// v0 = input (float)
|
|
// r0 = result (float16)
|
|
// v1 = temporary float register
|
|
address generate_floatToFloat16() {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_f2hf_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address entry = __ pc();
|
|
BLOCK_COMMENT("Entry:");
|
|
__ flt_to_flt16(r0, v0, v1);
|
|
__ ret(lr);
|
|
return entry;
|
|
}
|
|
|
|
address generate_method_entry_barrier() {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_method_entry_barrier_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
Label deoptimize_label;
|
|
|
|
address start = __ pc();
|
|
|
|
BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
|
|
|
|
if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
|
|
BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
|
|
// We can get here despite the nmethod being good, if we have not
|
|
// yet applied our cross modification fence (or data fence).
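      // (Hedged note: the instructions below publish the current global
      // patching epoch into the thread-local slot 4 bytes past the disarmed
      // guard value, then issue an isb and a LoadLoad barrier so that later
      // loads and instruction fetches observe the concurrently patched
      // code/data.)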
|
|
Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
|
|
__ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
|
|
__ ldrw(rscratch2, rscratch2);
|
|
__ strw(rscratch2, thread_epoch_addr);
|
|
__ isb();
|
|
__ membar(__ LoadLoad);
|
|
}
|
|
|
|
__ set_last_Java_frame(sp, rfp, lr, rscratch1);
|
|
|
|
__ enter();
|
|
__ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr
|
|
|
|
__ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc}
|
|
|
|
__ push_call_clobbered_registers();
|
|
|
|
__ mov(c_rarg0, rscratch2);
|
|
__ call_VM_leaf
|
|
(CAST_FROM_FN_PTR
|
|
(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
|
|
|
|
__ reset_last_Java_frame(true);
|
|
|
|
__ mov(rscratch1, r0);
|
|
|
|
__ pop_call_clobbered_registers();
|
|
|
|
__ cbnz(rscratch1, deoptimize_label);
|
|
|
|
__ leave();
|
|
__ ret(lr);
|
|
|
|
__ BIND(deoptimize_label);
|
|
|
|
__ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
|
|
__ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
|
|
|
|
__ mov(sp, rscratch1);
|
|
__ br(rscratch2);
|
|
|
|
return start;
|
|
}
|
|
|
|
// r0 = result
|
|
// r1 = str1
|
|
// r2 = cnt1
|
|
// r3 = str2
|
|
// r4 = cnt2
|
|
// r10 = tmp1
|
|
// r11 = tmp2
|
|
address generate_compare_long_string_same_encoding(bool isLL) {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
|
|
StubCodeMark mark(this, stub_id);
|
|
address entry = __ pc();
|
|
Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
|
|
tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
|
|
|
|
Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
|
|
|
|
    // Exit the large loop when fewer than 64 bytes are left to read or we are
    // about to prefetch memory beyond the array boundary.
|
|
int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
|
|
|
|
    // The caller already pre-loaded 8 bytes before jumping to the stub, so compare them directly
|
|
__ eor(rscratch2, tmp1, tmp2);
|
|
__ cbnz(rscratch2, CAL_DIFFERENCE);
|
|
|
|
__ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
|
|
// update pointers, because of previous read
|
|
__ add(str1, str1, wordSize);
|
|
__ add(str2, str2, wordSize);
|
|
if (SoftwarePrefetchHintDistance >= 0) {
|
|
__ align(OptoLoopAlignment);
|
|
__ bind(LARGE_LOOP_PREFETCH);
|
|
__ prfm(Address(str1, SoftwarePrefetchHintDistance));
|
|
__ prfm(Address(str2, SoftwarePrefetchHintDistance));
|
|
|
|
for (int i = 0; i < 4; i++) {
|
|
__ ldp(tmp1, tmp1h, Address(str1, i * 16));
|
|
__ ldp(tmp2, tmp2h, Address(str2, i * 16));
|
|
__ cmp(tmp1, tmp2);
|
|
__ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
|
|
__ br(Assembler::NE, DIFF);
|
|
}
|
|
__ sub(cnt2, cnt2, isLL ? 64 : 32);
|
|
__ add(str1, str1, 64);
|
|
__ add(str2, str2, 64);
|
|
__ subs(rscratch2, cnt2, largeLoopExitCondition);
|
|
__ br(Assembler::GE, LARGE_LOOP_PREFETCH);
|
|
__ cbz(cnt2, LENGTH_DIFF); // no more chars left?
|
|
}
|
|
|
|
__ subs(rscratch1, cnt2, isLL ? 16 : 8);
|
|
__ br(Assembler::LE, LESS16);
|
|
__ align(OptoLoopAlignment);
|
|
__ bind(LOOP_COMPARE16);
|
|
__ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
|
|
__ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
|
|
__ cmp(tmp1, tmp2);
|
|
__ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
|
|
__ br(Assembler::NE, DIFF);
|
|
__ sub(cnt2, cnt2, isLL ? 16 : 8);
|
|
__ subs(rscratch2, cnt2, isLL ? 16 : 8);
|
|
__ br(Assembler::LT, LESS16);
|
|
|
|
__ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
|
|
__ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
|
|
__ cmp(tmp1, tmp2);
|
|
__ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
|
|
__ br(Assembler::NE, DIFF);
|
|
__ sub(cnt2, cnt2, isLL ? 16 : 8);
|
|
__ subs(rscratch2, cnt2, isLL ? 16 : 8);
|
|
__ br(Assembler::GE, LOOP_COMPARE16);
|
|
__ cbz(cnt2, LENGTH_DIFF);
|
|
|
|
__ bind(LESS16);
|
|
    // compare 8 bytes (8 Latin1 / 4 UTF-16 characters) at a time
|
|
__ subs(cnt2, cnt2, isLL ? 8 : 4);
|
|
__ br(Assembler::LE, LESS8);
|
|
__ ldr(tmp1, Address(__ post(str1, 8)));
|
|
__ ldr(tmp2, Address(__ post(str2, 8)));
|
|
__ eor(rscratch2, tmp1, tmp2);
|
|
__ cbnz(rscratch2, CAL_DIFFERENCE);
|
|
__ sub(cnt2, cnt2, isLL ? 8 : 4);
|
|
|
|
__ bind(LESS8); // directly load last 8 bytes
|
|
if (!isLL) {
|
|
__ add(cnt2, cnt2, cnt2);
|
|
}
|
|
__ ldr(tmp1, Address(str1, cnt2));
|
|
__ ldr(tmp2, Address(str2, cnt2));
|
|
__ eor(rscratch2, tmp1, tmp2);
|
|
__ cbz(rscratch2, LENGTH_DIFF);
|
|
__ b(CAL_DIFFERENCE);
|
|
|
|
__ bind(DIFF);
|
|
__ cmp(tmp1, tmp2);
|
|
__ csel(tmp1, tmp1, tmp1h, Assembler::NE);
|
|
__ csel(tmp2, tmp2, tmp2h, Assembler::NE);
|
|
// reuse rscratch2 register for the result of eor instruction
|
|
__ eor(rscratch2, tmp1, tmp2);
|
|
|
|
__ bind(CAL_DIFFERENCE);
|
|
__ rev(rscratch2, rscratch2);
|
|
__ clz(rscratch2, rscratch2);
|
|
__ andr(rscratch2, rscratch2, isLL ? -8 : -16);
|
|
__ lsrv(tmp1, tmp1, rscratch2);
|
|
__ lsrv(tmp2, tmp2, rscratch2);
|
|
if (isLL) {
|
|
__ uxtbw(tmp1, tmp1);
|
|
__ uxtbw(tmp2, tmp2);
|
|
} else {
|
|
__ uxthw(tmp1, tmp1);
|
|
__ uxthw(tmp2, tmp2);
|
|
}
|
|
__ subw(result, tmp1, tmp2);
|
|
|
|
__ bind(LENGTH_DIFF);
|
|
__ ret(lr);
|
|
return entry;
|
|
}
|
|
|
|
enum string_compare_mode {
|
|
LL,
|
|
LU,
|
|
UL,
|
|
UU,
|
|
};
|
|
|
|
// The following registers are declared in aarch64.ad
|
|
// r0 = result
|
|
// r1 = str1
|
|
// r2 = cnt1
|
|
// r3 = str2
|
|
// r4 = cnt2
|
|
// r10 = tmp1
|
|
// r11 = tmp2
|
|
// z0 = ztmp1
|
|
// z1 = ztmp2
|
|
// p0 = pgtmp1
|
|
// p1 = pgtmp2
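  // Hedged note on the SVE variant: each iteration processes one full vector of
  // characters under a whilelt-generated predicate, and the final partial
  // vector is handled by re-issuing whilelt, so no scalar tail loop is needed.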
|
|
address generate_compare_long_string_sve(string_compare_mode mode) {
|
|
StubId stub_id;
|
|
switch (mode) {
|
|
case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break;
|
|
case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
|
|
case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
|
|
case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
|
|
default: ShouldNotReachHere();
|
|
}
|
|
|
|
__ align(CodeEntryAlignment);
|
|
address entry = __ pc();
|
|
Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
|
|
tmp1 = r10, tmp2 = r11;
|
|
|
|
Label LOOP, DONE, MISMATCH;
|
|
Register vec_len = tmp1;
|
|
Register idx = tmp2;
|
|
// The minimum of the string lengths has been stored in cnt2.
|
|
Register cnt = cnt2;
|
|
FloatRegister ztmp1 = z0, ztmp2 = z1;
|
|
PRegister pgtmp1 = p0, pgtmp2 = p1;
|
|
|
|
#define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \
|
|
switch (mode) { \
|
|
case LL: \
|
|
__ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \
|
|
__ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \
|
|
break; \
|
|
case LU: \
|
|
__ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \
|
|
__ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
|
|
break; \
|
|
case UL: \
|
|
__ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
|
|
__ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \
|
|
break; \
|
|
case UU: \
|
|
__ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
|
|
__ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
|
|
break; \
|
|
default: \
|
|
ShouldNotReachHere(); \
|
|
}
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
__ mov(idx, 0);
|
|
__ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
|
|
|
|
if (mode == LL) {
|
|
__ sve_cntb(vec_len);
|
|
} else {
|
|
__ sve_cnth(vec_len);
|
|
}
|
|
|
|
__ sub(rscratch1, cnt, vec_len);
|
|
|
|
__ bind(LOOP);
|
|
|
|
// main loop
|
|
LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
|
|
__ add(idx, idx, vec_len);
|
|
// Compare strings.
|
|
__ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
|
|
__ br(__ NE, MISMATCH);
|
|
__ cmp(idx, rscratch1);
|
|
__ br(__ LT, LOOP);
|
|
|
|
// post loop, last iteration
|
|
__ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
|
|
|
|
LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
|
|
__ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
|
|
__ br(__ EQ, DONE);
|
|
|
|
__ bind(MISMATCH);
|
|
|
|
// Crop the vector to find its location.
|
|
__ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
|
|
// Extract the first different characters of each string.
|
|
__ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
|
|
__ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
|
|
|
|
// Compute the difference of the first different characters.
|
|
__ sub(result, rscratch1, rscratch2);
|
|
|
|
__ bind(DONE);
|
|
__ ret(lr);
|
|
#undef LOAD_PAIR
|
|
return entry;
|
|
}
|
|
|
|
void generate_compare_long_strings() {
|
|
if (UseSVE == 0) {
|
|
StubRoutines::aarch64::_compare_long_string_LL
|
|
= generate_compare_long_string_same_encoding(true);
|
|
StubRoutines::aarch64::_compare_long_string_UU
|
|
= generate_compare_long_string_same_encoding(false);
|
|
StubRoutines::aarch64::_compare_long_string_LU
|
|
= generate_compare_long_string_different_encoding(true);
|
|
StubRoutines::aarch64::_compare_long_string_UL
|
|
= generate_compare_long_string_different_encoding(false);
|
|
} else {
|
|
StubRoutines::aarch64::_compare_long_string_LL
|
|
= generate_compare_long_string_sve(LL);
|
|
StubRoutines::aarch64::_compare_long_string_UU
|
|
= generate_compare_long_string_sve(UU);
|
|
StubRoutines::aarch64::_compare_long_string_LU
|
|
= generate_compare_long_string_sve(LU);
|
|
StubRoutines::aarch64::_compare_long_string_UL
|
|
= generate_compare_long_string_sve(UL);
|
|
}
|
|
}
|
|
|
|
// R0 = result
|
|
// R1 = str2
|
|
// R2 = cnt1
|
|
// R3 = str1
|
|
// R4 = cnt2
|
|
// Clobbers: rscratch1, rscratch2, v0, v1, rflags
|
|
//
|
|
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since its
  //    length is >= 8) in order to skip the initial load (helps on systems
  //    with a single load pipeline)
  // 2) we can use a "fast" algorithm for finding the first pattern character
  //    with fewer branches (1 branch per loaded register instead of one branch
  //    per character); this is where constants like 0x0101...01,
  //    0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after the 1st register of the source string is loaded and analyzed, it
  //    can be reused to search for every occurrence of the 1st character,
  //    saving a few loads compared with a simpler-but-slower implementation
  // 4) to avoid lots of push/pop operations, the code below heavily reuses,
  //    re-initializes and compresses register values, which makes the code
  //    larger and a bit less readable; however, most of the extra operations
  //    are issued during loads or branches, so the penalty is minimal
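  // Hedged aside on the bit trick behind those constants: after xor-ing a
  // loaded word x with the broadcast first pattern character, a byte (or char)
  // lane matches iff it is zero, and
  //   (x - 0x0101010101010101) & ~x & 0x8080808080808080
  // has the lane's top bit set exactly for zero lanes. The code below uses the
  // equivalent form bics(x - lows, x | 0x7f7f...), with 0x0001.../0x7fff... as
  // the 16-bit-lane counterparts.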
|
|
address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
|
|
StubId stub_id;
|
|
if (str1_isL) {
|
|
if (str2_isL) {
|
|
stub_id = StubId::stubgen_string_indexof_linear_ll_id;
|
|
} else {
|
|
stub_id = StubId::stubgen_string_indexof_linear_ul_id;
|
|
}
|
|
} else {
|
|
if (str2_isL) {
|
|
ShouldNotReachHere();
|
|
} else {
|
|
stub_id = StubId::stubgen_string_indexof_linear_uu_id;
|
|
}
|
|
}
|
|
__ align(CodeEntryAlignment);
|
|
StubCodeMark mark(this, stub_id);
|
|
address entry = __ pc();
|
|
|
|
int str1_chr_size = str1_isL ? 1 : 2;
|
|
int str2_chr_size = str2_isL ? 1 : 2;
|
|
int str1_chr_shift = str1_isL ? 0 : 1;
|
|
int str2_chr_shift = str2_isL ? 0 : 1;
|
|
bool isL = str1_isL && str2_isL;
|
|
// parameters
|
|
Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
|
|
// temporary registers
|
|
Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
|
|
RegSet spilled_regs = RegSet::range(tmp1, tmp4);
|
|
// redefinitions
|
|
Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
|
|
|
|
__ push(spilled_regs, sp);
|
|
Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
|
|
L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
|
|
L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
|
|
L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
|
|
L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
|
|
L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
|
|
// Read whole register from str1. It is safe, because length >=8 here
|
|
__ ldr(ch1, Address(str1));
|
|
// Read whole register from str2. It is safe, because length >=8 here
|
|
__ ldr(ch2, Address(str2));
|
|
__ sub(cnt2, cnt2, cnt1);
|
|
__ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
|
|
if (str1_isL != str2_isL) {
|
|
__ eor(v0, __ T16B, v0, v0);
|
|
}
|
|
__ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
|
|
__ mul(first, first, tmp1);
|
|
    // check if we have less than one register's worth of characters left to check
|
|
__ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
|
|
if (str1_isL != str2_isL) {
|
|
__ fmovd(v1, ch1);
|
|
}
|
|
__ br(__ LE, L_SMALL);
|
|
__ eor(ch2, first, ch2);
|
|
if (str1_isL != str2_isL) {
|
|
__ zip1(v1, __ T16B, v1, v0);
|
|
}
|
|
__ sub(tmp2, ch2, tmp1);
|
|
__ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
|
|
__ bics(tmp2, tmp2, ch2);
|
|
if (str1_isL != str2_isL) {
|
|
__ fmovd(ch1, v1);
|
|
}
|
|
__ br(__ NE, L_HAS_ZERO);
|
|
__ subs(cnt2, cnt2, wordSize/str2_chr_size);
|
|
__ add(result, result, wordSize/str2_chr_size);
|
|
__ add(str2, str2, wordSize);
|
|
__ br(__ LT, L_POST_LOOP);
|
|
__ BIND(L_LOOP);
|
|
__ ldr(ch2, Address(str2));
|
|
__ eor(ch2, first, ch2);
|
|
__ sub(tmp2, ch2, tmp1);
|
|
__ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
|
|
__ bics(tmp2, tmp2, ch2);
|
|
__ br(__ NE, L_HAS_ZERO);
|
|
__ BIND(L_LOOP_PROCEED);
|
|
__ subs(cnt2, cnt2, wordSize/str2_chr_size);
|
|
__ add(str2, str2, wordSize);
|
|
__ add(result, result, wordSize/str2_chr_size);
|
|
__ br(__ GE, L_LOOP);
|
|
__ BIND(L_POST_LOOP);
|
|
__ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
|
|
__ br(__ LE, NOMATCH);
|
|
__ ldr(ch2, Address(str2));
|
|
__ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
|
|
__ eor(ch2, first, ch2);
|
|
__ sub(tmp2, ch2, tmp1);
|
|
__ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
|
|
__ mov(tmp4, -1); // all bits set
|
|
__ b(L_SMALL_PROCEED);
|
|
__ align(OptoLoopAlignment);
|
|
__ BIND(L_SMALL);
|
|
__ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
|
|
__ eor(ch2, first, ch2);
|
|
if (str1_isL != str2_isL) {
|
|
__ zip1(v1, __ T16B, v1, v0);
|
|
}
|
|
__ sub(tmp2, ch2, tmp1);
|
|
__ mov(tmp4, -1); // all bits set
|
|
__ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
|
|
if (str1_isL != str2_isL) {
|
|
__ fmovd(ch1, v1); // move converted 4 symbols
|
|
}
|
|
__ BIND(L_SMALL_PROCEED);
|
|
__ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
|
|
__ bic(tmp2, tmp2, ch2);
|
|
__ ands(tmp2, tmp2, tmp4); // clear useless bits and check
|
|
__ rbit(tmp2, tmp2);
|
|
__ br(__ EQ, NOMATCH);
|
|
__ BIND(L_SMALL_HAS_ZERO_LOOP);
|
|
    __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
|
|
__ cmp(cnt1, u1(wordSize/str2_chr_size));
|
|
__ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
|
|
if (str2_isL) { // LL
|
|
__ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
|
|
__ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
|
|
__ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
|
|
__ add(result, result, tmp4, __ LSR, LogBitsPerByte);
|
|
__ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
|
|
} else {
|
|
__ mov(ch2, 0xE); // all bits in byte set except last one
|
|
__ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
|
|
__ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
|
|
__ lslv(tmp2, tmp2, tmp4);
|
|
__ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
|
|
__ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
|
|
__ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
|
|
__ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
|
|
}
|
|
__ cmp(ch1, ch2);
|
|
__ mov(tmp4, wordSize/str2_chr_size);
|
|
__ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
|
|
__ BIND(L_SMALL_CMP_LOOP);
|
|
str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
|
|
: __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
|
|
str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
|
|
: __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
|
|
__ add(tmp4, tmp4, 1);
|
|
__ cmp(tmp4, cnt1);
|
|
__ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
|
|
__ cmp(first, ch2);
|
|
__ br(__ EQ, L_SMALL_CMP_LOOP);
|
|
__ BIND(L_SMALL_CMP_LOOP_NOMATCH);
|
|
__ cbz(tmp2, NOMATCH); // no more matches. exit
|
|
__ clz(tmp4, tmp2);
|
|
__ add(result, result, 1); // advance index
|
|
__ add(str2, str2, str2_chr_size); // advance pointer
|
|
__ b(L_SMALL_HAS_ZERO_LOOP);
|
|
__ align(OptoLoopAlignment);
|
|
__ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
|
|
__ cmp(first, ch2);
|
|
__ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
|
|
__ b(DONE);
|
|
__ align(OptoLoopAlignment);
|
|
__ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
|
|
if (str2_isL) { // LL
|
|
__ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
|
|
__ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
|
|
__ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
|
|
__ add(result, result, tmp4, __ LSR, LogBitsPerByte);
|
|
__ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
|
|
} else {
|
|
__ mov(ch2, 0xE); // all bits in byte set except last one
|
|
__ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
|
|
__ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
|
|
__ lslv(tmp2, tmp2, tmp4);
|
|
__ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
|
|
__ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
|
|
__ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
|
|
__ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
|
|
}
|
|
__ cmp(ch1, ch2);
|
|
__ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
|
|
__ b(DONE);
|
|
__ align(OptoLoopAlignment);
|
|
__ BIND(L_HAS_ZERO);
|
|
__ rbit(tmp2, tmp2);
|
|
    __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
|
|
    // Now compress the counters (cnt2 and cnt1) into one register. This is fine
    // because both counters are 32-bit and are not changed in this loop; they
    // are restored on exit, so cnt1 can be re-used within the loop.
|
|
__ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
|
|
__ sub(result, result, 1);
|
|
__ BIND(L_HAS_ZERO_LOOP);
|
|
__ mov(cnt1, wordSize/str2_chr_size);
|
|
__ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
|
|
__ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
|
|
if (str2_isL) {
|
|
__ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
|
|
__ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
|
|
__ lslv(tmp2, tmp2, tmp4);
|
|
__ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
|
|
__ add(tmp4, tmp4, 1);
|
|
__ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
|
|
__ lsl(tmp2, tmp2, 1);
|
|
__ mov(tmp4, wordSize/str2_chr_size);
|
|
} else {
|
|
__ mov(ch2, 0xE);
|
|
__ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
|
|
__ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
|
|
__ lslv(tmp2, tmp2, tmp4);
|
|
__ add(tmp4, tmp4, 1);
|
|
__ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
|
|
__ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
|
|
__ lsl(tmp2, tmp2, 1);
|
|
__ mov(tmp4, wordSize/str2_chr_size);
|
|
__ sub(str2, str2, str2_chr_size);
|
|
}
|
|
__ cmp(ch1, ch2);
|
|
__ mov(tmp4, wordSize/str2_chr_size);
|
|
__ br(__ NE, L_CMP_LOOP_NOMATCH);
|
|
__ BIND(L_CMP_LOOP);
|
|
str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
|
|
: __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
|
|
str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
|
|
: __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
|
|
__ add(tmp4, tmp4, 1);
|
|
__ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
|
|
__ br(__ GE, L_CMP_LOOP_LAST_CMP);
|
|
__ cmp(cnt1, ch2);
|
|
__ br(__ EQ, L_CMP_LOOP);
|
|
__ BIND(L_CMP_LOOP_NOMATCH);
|
|
    // reached when the current candidate position did not match
|
|
__ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
|
|
__ clz(tmp4, tmp2);
|
|
__ add(str2, str2, str2_chr_size); // advance pointer
|
|
__ b(L_HAS_ZERO_LOOP);
|
|
__ align(OptoLoopAlignment);
|
|
__ BIND(L_CMP_LOOP_LAST_CMP);
|
|
__ cmp(cnt1, ch2);
|
|
__ br(__ NE, L_CMP_LOOP_NOMATCH);
|
|
__ b(DONE);
|
|
__ align(OptoLoopAlignment);
|
|
__ BIND(L_CMP_LOOP_LAST_CMP2);
|
|
if (str2_isL) {
|
|
__ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
|
|
__ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
|
|
__ lslv(tmp2, tmp2, tmp4);
|
|
__ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
|
|
__ add(tmp4, tmp4, 1);
|
|
__ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
|
|
__ lsl(tmp2, tmp2, 1);
|
|
} else {
|
|
__ mov(ch2, 0xE);
|
|
__ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
|
|
__ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
|
|
__ lslv(tmp2, tmp2, tmp4);
|
|
__ add(tmp4, tmp4, 1);
|
|
__ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
|
|
__ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
|
|
__ lsl(tmp2, tmp2, 1);
|
|
__ sub(str2, str2, str2_chr_size);
|
|
}
|
|
__ cmp(ch1, ch2);
|
|
__ br(__ NE, L_CMP_LOOP_NOMATCH);
|
|
__ b(DONE);
|
|
__ align(OptoLoopAlignment);
|
|
__ BIND(L_HAS_ZERO_LOOP_NOMATCH);
|
|
// 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
|
|
// L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
|
|
// so, result was increased at max by wordSize/str2_chr_size - 1, so,
|
|
// respective high bit wasn't changed. L_LOOP_PROCEED will increase
|
|
// result by analyzed characters value, so, we can just reset lower bits
|
|
// in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
|
|
// 2) restore cnt1 and cnt2 values from "compressed" cnt2
|
|
// 3) advance str2 value to represent next str2 octet. result & 7/3 is
|
|
// index of last analyzed substring inside current octet. So, str2 in at
|
|
// respective start address. We need to advance it to next octet
|
|
__ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
|
|
__ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
|
|
__ bfm(result, zr, 0, 2 - str2_chr_shift);
|
|
__ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
|
|
__ movw(cnt2, cnt2);
|
|
__ b(L_LOOP_PROCEED);
|
|
__ align(OptoLoopAlignment);
|
|
__ BIND(NOMATCH);
|
|
__ mov(result, -1);
|
|
__ BIND(DONE);
|
|
__ pop(spilled_regs, sp);
|
|
__ ret(lr);
|
|
return entry;
|
|
}
|
|
|
|
void generate_string_indexof_stubs() {
|
|
StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
|
|
StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
|
|
StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
|
|
}
|
|
|
|
void inflate_and_store_2_fp_registers(bool generatePrfm,
|
|
FloatRegister src1, FloatRegister src2) {
|
|
Register dst = r1;
|
|
__ zip1(v1, __ T16B, src1, v0);
|
|
__ zip2(v2, __ T16B, src1, v0);
|
|
if (generatePrfm) {
|
|
__ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
|
|
}
|
|
__ zip1(v3, __ T16B, src2, v0);
|
|
__ zip2(v4, __ T16B, src2, v0);
|
|
__ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
|
|
}
|
|
|
|
// R0 = src
|
|
// R1 = dst
|
|
// R2 = len
|
|
// R3 = len >> 3
|
|
// V0 = 0
|
|
// v1 = loaded 8 bytes
|
|
// Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
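  // "Inflate" here means widening each Latin-1 byte to a 16-bit char by
  // interleaving the source bytes with zero bytes (zip1/zip2 against v0 == 0).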
|
|
address generate_large_byte_array_inflate() {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address entry = __ pc();
|
|
Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
|
|
Register src = r0, dst = r1, len = r2, octetCounter = r3;
|
|
const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
|
|
|
|
    // Do one more 8-byte read so that the source address is 16-byte aligned
    // in most cases, and so that a single store instruction can be used.
__ ldrd(v2, __ post(src, 8));
|
|
__ sub(octetCounter, octetCounter, 2);
|
|
__ zip1(v1, __ T16B, v1, v0);
|
|
__ zip1(v2, __ T16B, v2, v0);
|
|
__ st1(v1, v2, __ T16B, __ post(dst, 32));
|
|
__ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
|
|
__ subs(rscratch1, octetCounter, large_loop_threshold);
|
|
__ br(__ LE, LOOP_START);
|
|
__ b(LOOP_PRFM_START);
|
|
__ bind(LOOP_PRFM);
|
|
__ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
|
|
__ bind(LOOP_PRFM_START);
|
|
__ prfm(Address(src, SoftwarePrefetchHintDistance));
|
|
__ sub(octetCounter, octetCounter, 8);
|
|
__ subs(rscratch1, octetCounter, large_loop_threshold);
|
|
inflate_and_store_2_fp_registers(true, v3, v4);
|
|
inflate_and_store_2_fp_registers(true, v5, v6);
|
|
__ br(__ GT, LOOP_PRFM);
|
|
__ cmp(octetCounter, (u1)8);
|
|
__ br(__ LT, DONE);
|
|
__ bind(LOOP);
|
|
__ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
|
|
__ bind(LOOP_START);
|
|
__ sub(octetCounter, octetCounter, 8);
|
|
__ cmp(octetCounter, (u1)8);
|
|
inflate_and_store_2_fp_registers(false, v3, v4);
|
|
inflate_and_store_2_fp_registers(false, v5, v6);
|
|
__ br(__ GE, LOOP);
|
|
__ bind(DONE);
|
|
__ ret(lr);
|
|
return entry;
|
|
}
|
|
|
|
/**
|
|
* Arguments:
|
|
*
|
|
* Input:
|
|
* c_rarg0 - current state address
|
|
* c_rarg1 - H key address
|
|
* c_rarg2 - data address
|
|
* c_rarg3 - number of blocks
|
|
*
|
|
* Output:
|
|
* Updated state at c_rarg0
|
|
*/
|
|
address generate_ghash_processBlocks() {
|
|
// Bafflingly, GCM uses little-endian for the byte order, but
|
|
// big-endian for the bit order. For example, the polynomial 1 is
|
|
// represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
|
|
//
|
|
// So, we must either reverse the bytes in each word and do
|
|
// everything big-endian or reverse the bits in each byte and do
|
|
// it little-endian. On AArch64 it's more idiomatic to reverse
|
|
// the bits in each byte (we have an instruction, RBIT, to do
|
|
// that) and keep the data in little-endian bit order through the
|
|
// calculation, bit-reversing the inputs and outputs.
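    //
    // In scalar terms, each iteration of the loop below computes
    //   state = (state ^ data_block) * subkeyH   in GF(2^128),
    // reduced modulo z^128 + z^7 + z^2 + z + 1, with every value kept in the
    // bit-reversed form described above.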
|
|
|
|
StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
Label polynomial; // local data generated at end of stub
|
|
__ align(CodeEntryAlignment);
|
|
address start = __ pc();
|
|
|
|
Register state = c_rarg0;
|
|
Register subkeyH = c_rarg1;
|
|
Register data = c_rarg2;
|
|
Register blocks = c_rarg3;
|
|
|
|
FloatRegister vzr = v30;
|
|
__ eor(vzr, __ T16B, vzr, vzr); // zero register
|
|
|
|
__ adr(rscratch1, polynomial);
|
|
__ ldrq(v24, rscratch1); // The field polynomial
|
|
|
|
__ ldrq(v0, Address(state));
|
|
__ ldrq(v1, Address(subkeyH));
|
|
|
|
__ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
|
|
__ rbit(v0, __ T16B, v0);
|
|
__ rev64(v1, __ T16B, v1);
|
|
__ rbit(v1, __ T16B, v1);
|
|
|
|
__ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
|
|
__ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
|
|
|
|
{
|
|
Label L_ghash_loop;
|
|
__ bind(L_ghash_loop);
|
|
|
|
__ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
|
|
// reversing each byte
|
|
__ rbit(v2, __ T16B, v2);
|
|
__ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
|
|
|
|
// Multiply state in v2 by subkey in v1
|
|
__ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
|
|
/*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
|
|
/*temps*/v6, v3, /*reuse/clobber b*/v2);
|
|
// Reduce v7:v5 by the field polynomial
|
|
__ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
|
|
|
|
__ sub(blocks, blocks, 1);
|
|
__ cbnz(blocks, L_ghash_loop);
|
|
}
|
|
|
|
// The bit-reversed result is at this point in v0
|
|
__ rev64(v0, __ T16B, v0);
|
|
__ rbit(v0, __ T16B, v0);
|
|
|
|
__ st1(v0, __ T16B, state);
|
|
__ ret(lr);
|
|
|
|
// bind label and generate local polynomial data
|
|
__ align(wordSize * 2);
|
|
__ bind(polynomial);
|
|
__ emit_int64(0x87); // The low-order bits of the field
|
|
// polynomial (i.e. p = z^7+z^2+z+1)
|
|
// repeated in the low and high parts of a
|
|
// 128-bit vector
|
|
__ emit_int64(0x87);
|
|
|
|
return start;
|
|
}
|
|
|
|
address generate_ghash_processBlocks_wide() {
|
|
address small = generate_ghash_processBlocks();
|
|
|
|
StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
Label polynomial; // local data generated after stub
|
|
__ align(CodeEntryAlignment);
|
|
address start = __ pc();
|
|
|
|
Register state = c_rarg0;
|
|
Register subkeyH = c_rarg1;
|
|
Register data = c_rarg2;
|
|
Register blocks = c_rarg3;
|
|
|
|
const int unroll = 4;
|
|
|
|
__ cmp(blocks, (unsigned char)(unroll * 2));
|
|
__ br(__ LT, small);
|
|
|
|
if (unroll > 1) {
|
|
// Save state before entering routine
|
|
__ sub(sp, sp, 4 * 16);
|
|
__ st1(v12, v13, v14, v15, __ T16B, Address(sp));
|
|
__ sub(sp, sp, 4 * 16);
|
|
__ st1(v8, v9, v10, v11, __ T16B, Address(sp));
|
|
}
|
|
|
|
__ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
|
|
|
|
if (unroll > 1) {
|
|
// And restore state
|
|
__ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
|
|
__ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
|
|
}
|
|
|
|
__ cmp(blocks, (unsigned char)0);
|
|
__ br(__ GT, small);
|
|
|
|
__ ret(lr);
|
|
|
|
// bind label and generate polynomial data
|
|
__ align(wordSize * 2);
|
|
__ bind(polynomial);
|
|
__ emit_int64(0x87); // The low-order bits of the field
|
|
// polynomial (i.e. p = z^7+z^2+z+1)
|
|
// repeated in the low and high parts of a
|
|
// 128-bit vector
|
|
__ emit_int64(0x87);
|
|
|
|
return start;
|
|
|
|
}
|
|
|
|
void generate_base64_encode_simdround(Register src, Register dst,
|
|
FloatRegister codec, u8 size) {
|
|
|
|
FloatRegister in0 = v4, in1 = v5, in2 = v6;
|
|
FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
|
|
FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
|
|
|
|
Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
|
|
|
|
__ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
|
|
|
|
__ ushr(ind0, arrangement, in0, 2);
|
|
|
|
__ ushr(ind1, arrangement, in1, 2);
|
|
__ shl(in0, arrangement, in0, 6);
|
|
__ orr(ind1, arrangement, ind1, in0);
|
|
__ ushr(ind1, arrangement, ind1, 2);
|
|
|
|
__ ushr(ind2, arrangement, in2, 4);
|
|
__ shl(in1, arrangement, in1, 4);
|
|
__ orr(ind2, arrangement, in1, ind2);
|
|
__ ushr(ind2, arrangement, ind2, 2);
|
|
|
|
__ shl(ind3, arrangement, in2, 2);
|
|
__ ushr(ind3, arrangement, ind3, 2);
|
|
|
|
__ tbl(out0, arrangement, codec, 4, ind0);
|
|
__ tbl(out1, arrangement, codec, 4, ind1);
|
|
__ tbl(out2, arrangement, codec, 4, ind2);
|
|
__ tbl(out3, arrangement, codec, 4, ind3);
|
|
|
|
__ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
|
|
}
|
|
|
|
/**
|
|
* Arguments:
|
|
*
|
|
* Input:
|
|
* c_rarg0 - src_start
|
|
* c_rarg1 - src_offset
|
|
* c_rarg2 - src_length
|
|
* c_rarg3 - dest_start
|
|
* c_rarg4 - dest_offset
|
|
* c_rarg5 - isURL
|
|
*
|
|
*/
|
|
address generate_base64_encodeBlock() {
|
|
|
|
static const char toBase64[64] = {
|
|
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
|
|
'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
|
|
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
|
|
'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
|
|
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
|
|
};
|
|
|
|
static const char toBase64URL[64] = {
|
|
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
|
|
'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
|
|
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
|
|
'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
|
|
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
|
|
};
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
Register src = c_rarg0; // source array
|
|
Register soff = c_rarg1; // source start offset
|
|
Register send = c_rarg2; // source end offset
|
|
Register dst = c_rarg3; // dest array
|
|
Register doff = c_rarg4; // position for writing to dest array
|
|
Register isURL = c_rarg5; // Base64 or URL character set
|
|
|
|
// c_rarg6 and c_rarg7 are free to use as temps
|
|
Register codec = c_rarg6;
|
|
Register length = c_rarg7;
|
|
|
|
Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
|
|
|
|
__ add(src, src, soff);
|
|
__ add(dst, dst, doff);
|
|
__ sub(length, send, soff);
|
|
|
|
// load the codec base address
|
|
__ lea(codec, ExternalAddress((address) toBase64));
|
|
__ cbz(isURL, ProcessData);
|
|
__ lea(codec, ExternalAddress((address) toBase64URL));
|
|
|
|
__ BIND(ProcessData);
|
|
|
|
    // too short to form up a SIMD loop; fall back to the scalar path
__ cmp(length, (u1)24);
|
|
__ br(Assembler::LT, Process3B);
|
|
|
|
__ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
|
|
|
|
__ BIND(Process48B);
|
|
__ cmp(length, (u1)48);
|
|
__ br(Assembler::LT, Process24B);
|
|
generate_base64_encode_simdround(src, dst, v0, 16);
|
|
__ sub(length, length, 48);
|
|
__ b(Process48B);
|
|
|
|
__ BIND(Process24B);
|
|
__ cmp(length, (u1)24);
|
|
__ br(Assembler::LT, SIMDExit);
|
|
generate_base64_encode_simdround(src, dst, v0, 8);
|
|
__ sub(length, length, 24);
|
|
|
|
__ BIND(SIMDExit);
|
|
__ cbz(length, Exit);
|
|
|
|
__ BIND(Process3B);
|
|
// 3 src bytes, 24 bits
|
|
__ ldrb(r10, __ post(src, 1));
|
|
__ ldrb(r11, __ post(src, 1));
|
|
__ ldrb(r12, __ post(src, 1));
|
|
__ orrw(r11, r11, r10, Assembler::LSL, 8);
|
|
__ orrw(r12, r12, r11, Assembler::LSL, 8);
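    // r12 now holds the three source bytes as a single 24-bit value (first
    // byte most significant); the four 6-bit fields extracted below are the
    // indices into the codec table.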
|
|
// codec index
|
|
__ ubfmw(r15, r12, 18, 23);
|
|
__ ubfmw(r14, r12, 12, 17);
|
|
__ ubfmw(r13, r12, 6, 11);
|
|
__ andw(r12, r12, 63);
|
|
// get the code based on the codec
|
|
__ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
|
|
__ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
|
|
__ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
|
|
__ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
|
|
__ strb(r15, __ post(dst, 1));
|
|
__ strb(r14, __ post(dst, 1));
|
|
__ strb(r13, __ post(dst, 1));
|
|
__ strb(r12, __ post(dst, 1));
|
|
__ sub(length, length, 3);
|
|
__ cbnz(length, Process3B);
|
|
|
|
__ BIND(Exit);
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
void generate_base64_decode_simdround(Register src, Register dst,
|
|
FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
|
|
|
|
FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19;
|
|
FloatRegister out0 = v20, out1 = v21, out2 = v22;
|
|
|
|
FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
|
|
FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
|
|
|
|
Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
|
|
|
|
Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
|
|
|
|
__ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
|
|
|
|
    // We need an unsigned saturating subtract so that every input value in
    // the range [0, 63] maps to index 0, and hence to the value 0, in the
    // higher-half lookup.
__ uqsubv(decH0, __ T16B, in0, v27);
|
|
__ uqsubv(decH1, __ T16B, in1, v27);
|
|
__ uqsubv(decH2, __ T16B, in2, v27);
|
|
__ uqsubv(decH3, __ T16B, in3, v27);
|
|
|
|
// lower half lookup
|
|
__ tbl(decL0, arrangement, codecL, 4, in0);
|
|
__ tbl(decL1, arrangement, codecL, 4, in1);
|
|
__ tbl(decL2, arrangement, codecL, 4, in2);
|
|
__ tbl(decL3, arrangement, codecL, 4, in3);
|
|
|
|
// higher half lookup
|
|
__ tbx(decH0, arrangement, codecH, 4, decH0);
|
|
__ tbx(decH1, arrangement, codecH, 4, decH1);
|
|
__ tbx(decH2, arrangement, codecH, 4, decH2);
|
|
__ tbx(decH3, arrangement, codecH, 4, decH3);
|
|
|
|
// combine lower and higher
|
|
__ orr(decL0, arrangement, decL0, decH0);
|
|
__ orr(decL1, arrangement, decL1, decH1);
|
|
__ orr(decL2, arrangement, decL2, decH2);
|
|
__ orr(decL3, arrangement, decL3, decH3);
|
|
|
|
// check illegal inputs, value larger than 63 (maximum of 6 bits)
|
|
__ cm(Assembler::HI, decH0, arrangement, decL0, v27);
|
|
__ cm(Assembler::HI, decH1, arrangement, decL1, v27);
|
|
__ cm(Assembler::HI, decH2, arrangement, decL2, v27);
|
|
__ cm(Assembler::HI, decH3, arrangement, decL3, v27);
|
|
__ orr(in0, arrangement, decH0, decH1);
|
|
__ orr(in1, arrangement, decH2, decH3);
|
|
__ orr(in2, arrangement, in0, in1);
|
|
__ umaxv(in3, arrangement, in2);
|
|
__ umov(rscratch2, in3, __ B, 0);
|
|
|
|
// get the data to output
|
|
__ shl(out0, arrangement, decL0, 2);
|
|
__ ushr(out1, arrangement, decL1, 4);
|
|
__ orr(out0, arrangement, out0, out1);
|
|
__ shl(out1, arrangement, decL1, 4);
|
|
__ ushr(out2, arrangement, decL2, 2);
|
|
__ orr(out1, arrangement, out1, out2);
|
|
__ shl(out2, arrangement, decL2, 6);
|
|
__ orr(out2, arrangement, out2, decL3);
|
|
|
|
__ cbz(rscratch2, NoIllegalData);
|
|
|
|
// handle illegal input
|
|
__ umov(r10, in2, __ D, 0);
|
|
if (size == 16) {
|
|
__ cbnz(r10, ErrorInLowerHalf);
|
|
|
|
// illegal input is in higher half, store the lower half now.
|
|
__ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
|
|
|
|
__ umov(r10, in2, __ D, 1);
|
|
__ umov(r11, out0, __ D, 1);
|
|
__ umov(r12, out1, __ D, 1);
|
|
__ umov(r13, out2, __ D, 1);
|
|
__ b(StoreLegalData);
|
|
|
|
__ BIND(ErrorInLowerHalf);
|
|
}
|
|
__ umov(r11, out0, __ D, 0);
|
|
__ umov(r12, out1, __ D, 0);
|
|
__ umov(r13, out2, __ D, 0);
|
|
|
|
__ BIND(StoreLegalData);
|
|
__ tbnz(r10, 5, Exit); // 0xff indicates illegal input
|
|
__ strb(r11, __ post(dst, 1));
|
|
__ strb(r12, __ post(dst, 1));
|
|
__ strb(r13, __ post(dst, 1));
|
|
__ lsr(r10, r10, 8);
|
|
__ lsr(r11, r11, 8);
|
|
__ lsr(r12, r12, 8);
|
|
__ lsr(r13, r13, 8);
|
|
__ b(StoreLegalData);
|
|
|
|
__ BIND(NoIllegalData);
|
|
__ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
|
|
}
|
|
|
|
|
|
/**
|
|
* Arguments:
|
|
*
|
|
* Input:
|
|
* c_rarg0 - src_start
|
|
* c_rarg1 - src_offset
|
|
* c_rarg2 - src_length
|
|
* c_rarg3 - dest_start
|
|
* c_rarg4 - dest_offset
|
|
* c_rarg5 - isURL
|
|
* c_rarg6 - isMIME
|
|
*
|
|
*/
|
|
address generate_base64_decodeBlock() {
|
|
|
|
// The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
|
|
// on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
|
|
// titled "Base64 decoding".
|
|
|
|
// Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64,
|
|
    // except that the trailing character '=' is also treated as an illegal value in this
    // intrinsic: java.util.Base64.fromBase64['='] == -2, while fromBase(URL)64ForNoSIMD['='] == 255 here.
static const uint8_t fromBase64ForNoSIMD[256] = {
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
|
|
52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
|
|
15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
|
|
41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
};
|
|
|
|
static const uint8_t fromBase64URLForNoSIMD[256] = {
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
|
|
52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
|
|
15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
|
|
255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
|
|
41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
};
|
|
|
|
    // A legal Base64 code value is in the range [0, 127]. We need two table
    // lookups with tbl/tbx and combine the results to get the decoded data.
    // The 1st table vector lookup uses tbl, so out-of-range indices are set
    // to 0 in the destination. The 2nd table vector lookup uses tbx, so
    // out-of-range indices are left unchanged in the destination. Input
    // [64..126] is mapped to index [65, 127] in the second lookup. The value
    // at index 64 is set to 0, so that we know the decoded data was already
    // obtained by the 1st lookup.
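    //
    // Roughly, for an input byte b the SIMD round below computes
    //   low  = codecL[b]               (0, via tbl, when b > 63)
    //   high = codecH[usat(b - 63)]    (0 when b <= 63, since codecH[0] == 0)
    //   decoded = low | high
    // and any combined value above 63 flags b as illegal input.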
static const uint8_t fromBase64ForSIMD[128] = {
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
|
|
52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
|
|
14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
|
|
40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
|
|
};
|
|
|
|
static const uint8_t fromBase64URLForSIMD[128] = {
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
|
|
52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
|
|
0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
|
|
14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
|
|
63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
|
|
40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
|
|
};
|
|
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
Register src = c_rarg0; // source array
|
|
Register soff = c_rarg1; // source start offset
|
|
Register send = c_rarg2; // source end offset
|
|
Register dst = c_rarg3; // dest array
|
|
Register doff = c_rarg4; // position for writing to dest array
|
|
Register isURL = c_rarg5; // Base64 or URL character set
|
|
Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation
|
|
|
|
Register length = send; // reuse send as length of source data to process
|
|
|
|
Register simd_codec = c_rarg6;
|
|
Register nosimd_codec = c_rarg7;
|
|
|
|
Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
|
|
|
|
__ enter();
|
|
|
|
__ add(src, src, soff);
|
|
__ add(dst, dst, doff);
|
|
|
|
__ mov(doff, dst);
|
|
|
|
__ sub(length, send, soff);
|
|
__ bfm(length, zr, 0, 1);
|
|
|
|
__ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
|
|
__ cbz(isURL, ProcessData);
|
|
__ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
|
|
|
|
__ BIND(ProcessData);
|
|
__ mov(rscratch1, length);
|
|
__ cmp(length, (u1)144); // 144 = 80 + 64
|
|
__ br(Assembler::LT, Process4B);
|
|
|
|
// In the MIME case, the line length cannot be more than 76
|
|
// bytes (see RFC 2045). This is too short a block for SIMD
|
|
// to be worthwhile, so we use non-SIMD here.
|
|
__ movw(rscratch1, 79);
|
|
|
|
__ BIND(Process4B);
|
|
__ ldrw(r14, __ post(src, 4));
|
|
__ ubfxw(r10, r14, 0, 8);
|
|
__ ubfxw(r11, r14, 8, 8);
|
|
__ ubfxw(r12, r14, 16, 8);
|
|
__ ubfxw(r13, r14, 24, 8);
|
|
    // look up the decoded values in the table
__ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
|
|
__ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
|
|
__ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
|
|
__ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
|
|
// error detection, 255u indicates an illegal input
|
|
__ orrw(r14, r10, r11);
|
|
__ orrw(r15, r12, r13);
|
|
__ orrw(r14, r14, r15);
|
|
__ tbnz(r14, 7, Exit);
|
|
// recover the data
|
|
__ lslw(r14, r10, 10);
|
|
__ bfiw(r14, r11, 4, 6);
|
|
__ bfmw(r14, r12, 2, 5);
|
|
__ rev16w(r14, r14);
|
|
__ bfiw(r13, r12, 6, 2);
|
|
__ strh(r14, __ post(dst, 2));
|
|
__ strb(r13, __ post(dst, 1));
|
|
// non-simd loop
|
|
__ subsw(rscratch1, rscratch1, 4);
|
|
__ br(Assembler::GT, Process4B);
|
|
|
|
    // rscratch1 == -1 here iff we came from the 80-byte pre-processing loop
    // above (rscratch1 started at 79); otherwise rscratch1 == 0.
__ cbzw(rscratch1, Exit);
|
|
__ sub(length, length, 80);
|
|
|
|
__ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
|
|
__ cbz(isURL, SIMDEnter);
|
|
__ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
|
|
|
|
__ BIND(SIMDEnter);
|
|
__ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
|
|
__ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
|
|
__ mov(rscratch1, 63);
|
|
__ dup(v27, __ T16B, rscratch1);
|
|
|
|
__ BIND(Process64B);
|
|
__ cmp(length, (u1)64);
|
|
__ br(Assembler::LT, Process32B);
|
|
generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
|
|
__ sub(length, length, 64);
|
|
__ b(Process64B);
|
|
|
|
__ BIND(Process32B);
|
|
__ cmp(length, (u1)32);
|
|
__ br(Assembler::LT, SIMDExit);
|
|
generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
|
|
__ sub(length, length, 32);
|
|
__ b(Process32B);
|
|
|
|
__ BIND(SIMDExit);
|
|
__ cbz(length, Exit);
|
|
__ movw(rscratch1, length);
|
|
__ b(Process4B);
|
|
|
|
__ BIND(Exit);
|
|
__ sub(c_rarg0, dst, doff);
|
|
|
|
__ leave();
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Support for spin waits.
|
|
address generate_spin_wait() {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_spin_wait_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
__ spin_wait();
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
void generate_lookup_secondary_supers_table_stub() {
|
|
StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
const Register
|
|
r_super_klass = r0,
|
|
r_array_base = r1,
|
|
r_array_length = r2,
|
|
r_array_index = r3,
|
|
r_sub_klass = r4,
|
|
r_bitmap = rscratch2,
|
|
result = r5;
|
|
const FloatRegister
|
|
vtemp = v0;
|
|
|
|
for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
|
|
StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
|
|
Label L_success;
|
|
__ enter();
|
|
__ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
|
|
r_array_base, r_array_length, r_array_index,
|
|
vtemp, result, slot,
|
|
/*stub_is_near*/true);
|
|
__ leave();
|
|
__ ret(lr);
|
|
}
|
|
}
|
|
|
|
// Slow path implementation for UseSecondarySupersTable.
|
|
address generate_lookup_secondary_supers_table_slow_path_stub() {
|
|
StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
address start = __ pc();
|
|
const Register
|
|
r_super_klass = r0, // argument
|
|
r_array_base = r1, // argument
|
|
temp1 = r2, // temp
|
|
r_array_index = r3, // argument
|
|
r_bitmap = rscratch2, // argument
|
|
result = r5; // argument
|
|
|
|
__ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
|
|
|
|
// ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
|
|
//
|
|
// If LSE is in use, generate LSE versions of all the stubs. The
|
|
// non-LSE versions are in atomic_aarch64.S.
|
|
|
|
// class AtomicStubMark records the entry point of a stub and the
|
|
// stub pointer which will point to it. The stub pointer is set to
|
|
// the entry point when ~AtomicStubMark() is called, which must be
|
|
// after ICache::invalidate_range. This ensures safe publication of
|
|
// the generated code.
|
|
class AtomicStubMark {
|
|
address _entry_point;
|
|
aarch64_atomic_stub_t *_stub;
|
|
MacroAssembler *_masm;
|
|
public:
|
|
AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
|
|
_masm = masm;
|
|
__ align(32);
|
|
_entry_point = __ pc();
|
|
_stub = stub;
|
|
}
|
|
~AtomicStubMark() {
|
|
*_stub = (aarch64_atomic_stub_t)_entry_point;
|
|
}
|
|
};
|
|
|
|
// NB: For memory_order_conservative we need a trailing membar after
|
|
// LSE atomic operations but not a leading membar.
|
|
//
|
|
// We don't need a leading membar because a clause in the Arm ARM
|
|
// says:
|
|
//
|
|
// Barrier-ordered-before
|
|
//
|
|
// Barrier instructions order prior Memory effects before subsequent
|
|
// Memory effects generated by the same Observer. A read or a write
|
|
  // RW1 is Barrier-ordered-before a read or a write RW2 from the same
  // Observer if and only if RW1 appears in program order before RW2
  // and [ ... ] at least one of RW1 and RW2 is generated by an atomic
|
|
// instruction with both Acquire and Release semantics.
|
|
//
|
|
// All the atomic instructions {ldaddal, swapal, casal} have Acquire
|
|
// and Release semantics, therefore we don't need a leading
|
|
// barrier. However, there is no corresponding Barrier-ordered-after
|
|
// relationship, therefore we need a trailing membar to prevent a
|
|
// later store or load from being reordered with the store in an
|
|
// atomic instruction.
|
|
//
|
|
// This was checked by using the herd7 consistency model simulator
|
|
// (http://diy.inria.fr/) with this test case:
|
|
//
|
|
// AArch64 LseCas
|
|
// { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
|
|
// P0 | P1;
|
|
// LDR W4, [X2] | MOV W3, #0;
|
|
// DMB LD | MOV W4, #1;
|
|
// LDR W3, [X1] | CASAL W3, W4, [X1];
|
|
// | DMB ISH;
|
|
// | STR W4, [X2];
|
|
// exists
|
|
// (0:X3=0 /\ 0:X4=1)
|
|
//
|
|
// If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
|
|
// with the store to x in P1. Without the DMB in P1 this may happen.
|
|
//
|
|
// At the time of writing we don't know of any AArch64 hardware that
|
|
// reorders stores in this way, but the Reference Manual permits it.
|
|
|
|
void gen_cas_entry(Assembler::operand_size size,
|
|
atomic_memory_order order) {
|
|
Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
|
|
exchange_val = c_rarg2;
|
|
bool acquire, release;
|
|
switch (order) {
|
|
case memory_order_relaxed:
|
|
acquire = false;
|
|
release = false;
|
|
break;
|
|
case memory_order_release:
|
|
acquire = false;
|
|
release = true;
|
|
break;
|
|
default:
|
|
acquire = true;
|
|
release = true;
|
|
break;
|
|
}
|
|
__ mov(prev, compare_val);
|
|
__ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
|
|
if (order == memory_order_conservative) {
|
|
__ membar(Assembler::StoreStore|Assembler::StoreLoad);
|
|
}
|
|
if (size == Assembler::xword) {
|
|
__ mov(r0, prev);
|
|
} else {
|
|
__ movw(r0, prev);
|
|
}
|
|
__ ret(lr);
|
|
}
|
|
|
|
void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
|
|
Register prev = r2, addr = c_rarg0, incr = c_rarg1;
|
|
// If not relaxed, then default to conservative. Relaxed is the only
|
|
// case we use enough to be worth specializing.
|
|
if (order == memory_order_relaxed) {
|
|
__ ldadd(size, incr, prev, addr);
|
|
} else {
|
|
__ ldaddal(size, incr, prev, addr);
|
|
__ membar(Assembler::StoreStore|Assembler::StoreLoad);
|
|
}
|
|
if (size == Assembler::xword) {
|
|
__ mov(r0, prev);
|
|
} else {
|
|
__ movw(r0, prev);
|
|
}
|
|
__ ret(lr);
|
|
}
|
|
|
|
void gen_swpal_entry(Assembler::operand_size size) {
|
|
Register prev = r2, addr = c_rarg0, incr = c_rarg1;
|
|
__ swpal(size, incr, prev, addr);
|
|
__ membar(Assembler::StoreStore|Assembler::StoreLoad);
|
|
if (size == Assembler::xword) {
|
|
__ mov(r0, prev);
|
|
} else {
|
|
__ movw(r0, prev);
|
|
}
|
|
__ ret(lr);
|
|
}
|
|
|
|
void generate_atomic_entry_points() {
|
|
if (! UseLSE) {
|
|
return;
|
|
}
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_atomic_entry_points_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address first_entry = __ pc();
|
|
|
|
// ADD, memory_order_conservative
|
|
AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
|
|
gen_ldadd_entry(Assembler::word, memory_order_conservative);
|
|
AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
|
|
gen_ldadd_entry(Assembler::xword, memory_order_conservative);
|
|
|
|
// ADD, memory_order_relaxed
|
|
AtomicStubMark mark_fetch_add_4_relaxed
|
|
(_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
|
|
gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
|
|
AtomicStubMark mark_fetch_add_8_relaxed
|
|
(_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
|
|
gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
|
|
|
|
// XCHG, memory_order_conservative
|
|
AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
|
|
gen_swpal_entry(Assembler::word);
|
|
AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
|
|
gen_swpal_entry(Assembler::xword);
|
|
|
|
// CAS, memory_order_conservative
|
|
AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
|
|
gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
|
|
AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
|
|
gen_cas_entry(MacroAssembler::word, memory_order_conservative);
|
|
AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
|
|
gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
|
|
|
|
// CAS, memory_order_relaxed
|
|
AtomicStubMark mark_cmpxchg_1_relaxed
|
|
(_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
|
|
gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
|
|
AtomicStubMark mark_cmpxchg_4_relaxed
|
|
(_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
|
|
gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
|
|
AtomicStubMark mark_cmpxchg_8_relaxed
|
|
(_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
|
|
gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
|
|
|
|
AtomicStubMark mark_cmpxchg_4_release
|
|
(_masm, &aarch64_atomic_cmpxchg_4_release_impl);
|
|
gen_cas_entry(MacroAssembler::word, memory_order_release);
|
|
AtomicStubMark mark_cmpxchg_8_release
|
|
(_masm, &aarch64_atomic_cmpxchg_8_release_impl);
|
|
gen_cas_entry(MacroAssembler::xword, memory_order_release);
|
|
|
|
AtomicStubMark mark_cmpxchg_4_seq_cst
|
|
(_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
|
|
gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
|
|
AtomicStubMark mark_cmpxchg_8_seq_cst
|
|
(_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
|
|
gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
|
|
|
|
ICache::invalidate_range(first_entry, __ pc() - first_entry);
|
|
}
|
|
#endif // LINUX
|
|
|
|
address generate_cont_thaw(Continuation::thaw_kind kind) {
|
|
bool return_barrier = Continuation::is_thaw_return_barrier(kind);
|
|
bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
|
|
|
|
address start = __ pc();
|
|
|
|
if (return_barrier) {
|
|
__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
|
|
__ mov(sp, rscratch1);
|
|
}
|
|
assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
|
|
|
|
if (return_barrier) {
|
|
// preserve possible return value from a method returning to the return barrier
|
|
__ fmovd(rscratch1, v0);
|
|
__ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
|
|
}
|
|
|
|
__ movw(c_rarg1, (return_barrier ? 1 : 0));
|
|
__ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
|
|
__ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
|
|
|
|
if (return_barrier) {
|
|
// restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
|
|
__ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
|
|
__ fmovd(v0, rscratch1);
|
|
}
|
|
assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
|
|
|
|
|
|
Label thaw_success;
|
|
// rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
|
|
__ cbnz(rscratch2, thaw_success);
|
|
__ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
|
|
__ br(rscratch1);
|
|
__ bind(thaw_success);
|
|
|
|
// make room for the thawed frames
|
|
__ sub(rscratch1, sp, rscratch2);
|
|
__ andr(rscratch1, rscratch1, -16); // align
|
|
__ mov(sp, rscratch1);
|
|
|
|
if (return_barrier) {
|
|
// save original return value -- again
|
|
__ fmovd(rscratch1, v0);
|
|
__ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
|
|
}
|
|
|
|
// If we want, we can templatize thaw by kind, and have three different entries
|
|
__ movw(c_rarg1, (uint32_t)kind);
|
|
|
|
__ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
|
|
__ mov(rscratch2, r0); // r0 is the sp of the yielding frame
|
|
|
|
if (return_barrier) {
|
|
// restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
|
|
__ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
|
|
__ fmovd(v0, rscratch1);
|
|
} else {
|
|
__ mov(r0, zr); // return 0 (success) from doYield
|
|
}
|
|
|
|
    // we're now on the yield frame (which is at an address above us because the sp has been pushed down)
__ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
|
|
__ mov(rfp, sp);
|
|
|
|
if (return_barrier_exception) {
|
|
__ ldr(c_rarg1, Address(rfp, wordSize)); // return address
|
|
__ authenticate_return_address(c_rarg1);
|
|
__ verify_oop(r0);
|
|
// save return value containing the exception oop in callee-saved R19
|
|
__ mov(r19, r0);
|
|
|
|
__ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
|
|
|
|
// Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
|
|
// __ reinitialize_ptrue();
|
|
|
|
// see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
|
|
|
|
__ mov(r1, r0); // the exception handler
|
|
__ mov(r0, r19); // restore return value containing the exception oop
|
|
__ verify_oop(r0);
|
|
|
|
__ leave();
|
|
__ mov(r3, lr);
|
|
__ br(r1); // the exception handler
|
|
} else {
|
|
// We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
|
|
__ leave();
|
|
__ ret(lr);
|
|
}
|
|
|
|
return start;
|
|
}
|
|
|
|
address generate_cont_thaw() {
|
|
if (!Continuations::enabled()) return nullptr;
|
|
|
|
StubId stub_id = StubId::stubgen_cont_thaw_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
generate_cont_thaw(Continuation::thaw_top);
|
|
return start;
|
|
}
|
|
|
|
address generate_cont_returnBarrier() {
|
|
if (!Continuations::enabled()) return nullptr;
|
|
|
|
// TODO: will probably need multiple return barriers depending on return type
|
|
StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
generate_cont_thaw(Continuation::thaw_return_barrier);
|
|
|
|
return start;
|
|
}
|
|
|
|
address generate_cont_returnBarrier_exception() {
|
|
if (!Continuations::enabled()) return nullptr;
|
|
|
|
StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
generate_cont_thaw(Continuation::thaw_return_barrier_exception);
|
|
|
|
return start;
|
|
}
|
|
|
|
address generate_cont_preempt_stub() {
|
|
if (!Continuations::enabled()) return nullptr;
|
|
StubId stub_id = StubId::stubgen_cont_preempt_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
__ reset_last_Java_frame(true);
|
|
|
|
// Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
|
|
__ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
|
|
__ mov(sp, rscratch2);
|
|
|
|
Label preemption_cancelled;
|
|
__ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
|
|
__ cbnz(rscratch1, preemption_cancelled);
|
|
|
|
// Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
|
|
SharedRuntime::continuation_enter_cleanup(_masm);
|
|
__ leave();
|
|
__ ret(lr);
|
|
|
|
// We acquired the monitor after freezing the frames so call thaw to continue execution.
|
|
__ bind(preemption_cancelled);
|
|
__ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
|
|
__ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
|
|
__ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
|
|
__ ldr(rscratch1, Address(rscratch1));
|
|
__ br(rscratch1);
|
|
|
|
return start;
|
|
}
|
|
|
|
// In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
|
|
// are represented as long[5], with BITS_PER_LIMB = 26.
|
|
// Pack five 26-bit limbs into three 64-bit registers.
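  // i.e. (5 * 26 = 130 bits in total):
  //   dest0 = limb0 | limb1 << 26 | limb2 << 52        (low 64 bits)
  //   dest1 = limb2 >> 12 | limb3 << 14 | limb4 << 40  (next 64 bits)
  //   dest2 = limb4 >> 24                              (top 2 bits)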
|
|
void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
|
|
__ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits
|
|
__ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits
|
|
__ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
|
|
__ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits
|
|
|
|
__ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits
|
|
__ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits
|
|
__ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
|
|
__ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits
|
|
|
|
if (dest2->is_valid()) {
|
|
__ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits
|
|
} else {
|
|
#ifdef ASSERT
|
|
Label OK;
|
|
__ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits
|
|
__ br(__ EQ, OK);
|
|
__ stop("high bits of Poly1305 integer should be zero");
|
|
__ should_not_reach_here();
|
|
__ bind(OK);
|
|
#endif
|
|
}
|
|
}
|
|
|
|
// As above, but return only a 128-bit integer, packed into two
|
|
// 64-bit registers.
|
|
void pack_26(Register dest0, Register dest1, Register src) {
|
|
pack_26(dest0, dest1, noreg, src);
|
|
}
|
|
|
|
// Multiply and multiply-accumulate unsigned 64-bit registers.
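  //   wide_mul:  prod_hi:prod_lo  = n * m   (full 128-bit product)
  //   wide_madd: sum_hi:sum_lo   += n * m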
|
|
void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
|
|
__ mul(prod_lo, n, m);
|
|
__ umulh(prod_hi, n, m);
|
|
}
|
|
void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
|
|
wide_mul(rscratch1, rscratch2, n, m);
|
|
__ adds(sum_lo, sum_lo, rscratch1);
|
|
__ adc(sum_hi, sum_hi, rscratch2);
|
|
}
|
|
|
|
// Poly1305, RFC 7539
|
|
|
|
// See https://loup-vaillant.fr/tutorials/poly1305-design for a
|
|
// description of the tricks used to simplify and accelerate this
|
|
// computation.
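  //
  // In outline, for each full 16-byte block m the accumulator is updated as
  //   acc = ((acc + m + 2^128) * r) mod (2^130 - 5)
  // which is what the block loop below computes, with the accumulator held
  // in three 64-bit registers.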
|
|
|
|
address generate_poly1305_processBlocks() {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
Label here;
|
|
__ enter();
|
|
RegSet callee_saved = RegSet::range(r19, r28);
|
|
__ push(callee_saved, sp);
|
|
|
|
RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
|
|
|
|
// Arguments
|
|
const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
|
|
|
|
// R_n is the 128-bit randomly-generated key, packed into two
|
|
// registers. The caller passes this key to us as long[5], with
|
|
// BITS_PER_LIMB = 26.
|
|
const Register R_0 = *++regs, R_1 = *++regs;
|
|
pack_26(R_0, R_1, r_start);
|
|
|
|
// RR_n is (R_n >> 2) * 5
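    // ((R_n >> 2) * 5 is the usual folding constant for arithmetic modulo
    //  2^130 - 5: anything at or above bit 130 wraps around multiplied by 5.
    //  See the tutorial referenced above.)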
|
|
const Register RR_0 = *++regs, RR_1 = *++regs;
|
|
__ lsr(RR_0, R_0, 2);
|
|
__ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
|
|
__ lsr(RR_1, R_1, 2);
|
|
__ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
|
|
|
|
// U_n is the current checksum
|
|
const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
|
|
pack_26(U_0, U_1, U_2, acc_start);
|
|
|
|
static constexpr int BLOCK_LENGTH = 16;
|
|
Label DONE, LOOP;
|
|
|
|
__ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
|
|
__ br(Assembler::LT, DONE); {
|
|
__ bind(LOOP);
|
|
|
|
// S_n is to be the sum of U_n and the next block of data
|
|
const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
|
|
__ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
|
|
__ adds(S_0, U_0, S_0);
|
|
__ adcs(S_1, U_1, S_1);
|
|
__ adc(S_2, U_2, zr);
|
|
__ add(S_2, S_2, 1);
|
|
|
|
const Register U_0HI = *++regs, U_1HI = *++regs;
|
|
|
|
// NB: this logic depends on some of the special properties of
|
|
// Poly1305 keys. In particular, because we know that the top
|
|
// four bits of R_0 and R_1 are zero, we can add together
|
|
// partial products without any risk of needing to propagate a
|
|
// carry out.
|
|
wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
|
|
wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1);
|
|
__ andr(U_2, R_0, 3);
|
|
__ mul(U_2, S_2, U_2);
|
|
|
|
// Recycle registers S_0, S_1, S_2
|
|
regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
|
|
|
|
// Partial reduction mod 2**130 - 5
|
|
__ adds(U_1, U_0HI, U_1);
|
|
__ adc(U_2, U_1HI, U_2);
|
|
// Sum now in U_2:U_1:U_0.
|
|
// Dead: U_0HI, U_1HI.
|
|
regs = (regs.remaining() + U_0HI + U_1HI).begin();
|
|
|
|
// U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
|
|
|
|
// First, U_2:U_1:U_0 += (U_2 >> 2)
|
|
__ lsr(rscratch1, U_2, 2);
|
|
__ andr(U_2, U_2, (u8)3);
|
|
__ adds(U_0, U_0, rscratch1);
|
|
__ adcs(U_1, U_1, zr);
|
|
__ adc(U_2, U_2, zr);
|
|
// Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
|
|
__ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
|
|
__ adcs(U_1, U_1, zr);
|
|
__ adc(U_2, U_2, zr);
|
|
|
|
__ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
|
|
__ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
|
|
__ br(~ Assembler::LT, LOOP);
|
|
}
|
|
|
|
// Further reduce modulo 2^130 - 5
|
|
__ lsr(rscratch1, U_2, 2);
|
|
__ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
|
|
__ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
|
|
__ adcs(U_1, U_1, zr);
|
|
__ andr(U_2, U_2, (u1)3);
|
|
__ adc(U_2, U_2, zr);
|
|
|
|
// Unpack the sum into five 26-bit limbs and write to memory.
|
|
__ ubfiz(rscratch1, U_0, 0, 26);
|
|
__ ubfx(rscratch2, U_0, 26, 26);
|
|
__ stp(rscratch1, rscratch2, Address(acc_start));
|
|
__ ubfx(rscratch1, U_0, 52, 12);
|
|
__ bfi(rscratch1, U_1, 12, 14);
|
|
__ ubfx(rscratch2, U_1, 14, 26);
|
|
__ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
|
|
__ ubfx(rscratch1, U_1, 40, 24);
|
|
__ bfi(rscratch1, U_2, 24, 3);
|
|
__ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
|
|
|
|
__ bind(DONE);
|
|
__ pop(callee_saved, sp);
|
|
__ leave();
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
// exception handler for upcall stubs
|
|
address generate_upcall_stub_exception_handler() {
|
|
StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
// Native caller has no idea how to handle exceptions,
|
|
// so we just crash here. Up to callee to catch exceptions.
|
|
__ verify_oop(r0);
|
|
__ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
|
|
__ blr(rscratch1);
|
|
__ should_not_reach_here();
|
|
|
|
return start;
|
|
}
|
|
|
|
// load Method* target of MethodHandle
|
|
// j_rarg0 = jobject receiver
|
|
// rmethod = result
|
|
address generate_upcall_stub_load_target() {
|
|
StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
__ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
|
|
// Load target method from receiver
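    // i.e. follow MethodHandle.form -> LambdaForm.vmentry -> MemberName.method
    // -> ResolvedMethodName.vmtarget to reach the Method*.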
|
|
__ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
|
|
__ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
|
|
__ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
|
|
__ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
|
|
Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
|
|
noreg, noreg);
|
|
__ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
|
|
|
|
__ ret(lr);
|
|
|
|
return start;
|
|
}
|
|
|
|
#undef __
|
|
#define __ masm->
|
|
|
|
class MontgomeryMultiplyGenerator : public MacroAssembler {
|
|
|
|
Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
|
|
Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
|
|
|
|
RegSet _toSave;
|
|
bool _squaring;
|
|
|
|
public:
|
|
MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
|
|
: MacroAssembler(as->code()), _squaring(squaring) {
|
|
|
|
// Register allocation
|
|
|
|
RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
|
|
Pa_base = *regs; // Argument registers
|
|
if (squaring)
|
|
Pb_base = Pa_base;
|
|
else
|
|
Pb_base = *++regs;
|
|
Pn_base = *++regs;
|
|
Rlen= *++regs;
|
|
inv = *++regs;
|
|
Pm_base = *++regs;
|
|
|
|
// Working registers:
|
|
Ra = *++regs; // The current digit of a, b, n, and m.
|
|
Rb = *++regs;
|
|
Rm = *++regs;
|
|
Rn = *++regs;
|
|
|
|
Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m.
|
|
Pb = *++regs;
|
|
Pm = *++regs;
|
|
Pn = *++regs;
|
|
|
|
t0 = *++regs; // Three registers which form a
|
|
    t1 = *++regs; // triple-precision accumulator.
t2 = *++regs;
|
|
|
|
Ri = *++regs; // Inner and outer loop indexes.
|
|
Rj = *++regs;
|
|
|
|
Rhi_ab = *++regs; // Product registers: low and high parts
|
|
Rlo_ab = *++regs; // of a*b and m*n.
|
|
Rhi_mn = *++regs;
|
|
Rlo_mn = *++regs;
|
|
|
|
// r19 and up are callee-saved.
|
|
_toSave = RegSet::range(r19, *regs) + Pm_base;
|
|
}
|
|
|
|
private:
|
|
void save_regs() {
|
|
push(_toSave, sp);
|
|
}
|
|
|
|
void restore_regs() {
|
|
pop(_toSave, sp);
|
|
}
|
|
|
|
template <typename T>
|
|
void unroll_2(Register count, T block) {
|
|
Label loop, end, odd;
|
|
tbnz(count, 0, odd);
|
|
cbz(count, end);
|
|
align(16);
|
|
bind(loop);
|
|
(this->*block)();
|
|
bind(odd);
|
|
(this->*block)();
|
|
subs(count, count, 2);
|
|
br(Assembler::GT, loop);
|
|
bind(end);
|
|
}
|
|
|
|
template <typename T>
|
|
void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
|
|
Label loop, end, odd;
|
|
tbnz(count, 0, odd);
|
|
cbz(count, end);
|
|
align(16);
|
|
bind(loop);
|
|
(this->*block)(d, s, tmp);
|
|
bind(odd);
|
|
(this->*block)(d, s, tmp);
|
|
subs(count, count, 2);
|
|
br(Assembler::GT, loop);
|
|
bind(end);
|
|
}
|
|
|
|
void pre1(RegisterOrConstant i) {
|
|
block_comment("pre1");
|
|
// Pa = Pa_base;
|
|
// Pb = Pb_base + i;
|
|
// Pm = Pm_base;
|
|
// Pn = Pn_base + i;
|
|
// Ra = *Pa;
|
|
// Rb = *Pb;
|
|
// Rm = *Pm;
|
|
// Rn = *Pn;
|
|
ldr(Ra, Address(Pa_base));
|
|
ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
|
|
ldr(Rm, Address(Pm_base));
|
|
ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
|
|
lea(Pa, Address(Pa_base));
|
|
lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
|
|
lea(Pm, Address(Pm_base));
|
|
lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
|
|
|
|
// Zero the m*n result.
|
|
mov(Rhi_mn, zr);
|
|
mov(Rlo_mn, zr);
|
|
}
|
|
|
|
// The core multiply-accumulate step of a Montgomery
|
|
// multiplication. The idea is to schedule operations as a
|
|
// pipeline so that instructions with long latencies (loads and
|
|
// multiplies) have time to complete before their results are
|
|
// used. This most benefits in-order implementations of the
|
|
// architecture but out-of-order ones also benefit.
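  // Each step adds one a*b partial product and one m*n partial product into
  // the triple-precision accumulator (t2:t1:t0); the m*n product accumulated
  // here is the one started in the previous iteration, so its latency is
  // hidden by the intervening loads and multiplies.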
|
|
void step() {
|
|
block_comment("step");
|
|
// MACC(Ra, Rb, t0, t1, t2);
|
|
// Ra = *++Pa;
|
|
// Rb = *--Pb;
|
|
umulh(Rhi_ab, Ra, Rb);
|
|
mul(Rlo_ab, Ra, Rb);
|
|
ldr(Ra, pre(Pa, wordSize));
|
|
ldr(Rb, pre(Pb, -wordSize));
|
|
acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
|
|
// previous iteration.
|
|
// MACC(Rm, Rn, t0, t1, t2);
|
|
// Rm = *++Pm;
|
|
// Rn = *--Pn;
|
|
umulh(Rhi_mn, Rm, Rn);
|
|
mul(Rlo_mn, Rm, Rn);
|
|
ldr(Rm, pre(Pm, wordSize));
|
|
ldr(Rn, pre(Pn, -wordSize));
|
|
acc(Rhi_ab, Rlo_ab, t0, t1, t2);
|
|
}
|
|
|
|
void post1() {
|
|
block_comment("post1");
|
|
|
|
// MACC(Ra, Rb, t0, t1, t2);
|
|
// Ra = *++Pa;
|
|
// Rb = *--Pb;
|
|
umulh(Rhi_ab, Ra, Rb);
|
|
mul(Rlo_ab, Ra, Rb);
|
|
acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
|
|
acc(Rhi_ab, Rlo_ab, t0, t1, t2);
|
|
|
|
// *Pm = Rm = t0 * inv;
|
|
mul(Rm, t0, inv);
|
|
str(Rm, Address(Pm));
|
|
|
|
// MACC(Rm, Rn, t0, t1, t2);
|
|
// t0 = t1; t1 = t2; t2 = 0;
|
|
umulh(Rhi_mn, Rm, Rn);
|
|
|
|
#ifndef PRODUCT
|
|
// assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
|
|
{
|
|
mul(Rlo_mn, Rm, Rn);
|
|
add(Rlo_mn, t0, Rlo_mn);
|
|
Label ok;
|
|
cbz(Rlo_mn, ok); {
|
|
stop("broken Montgomery multiply");
|
|
} bind(ok);
|
|
}
|
|
#endif
|
|
// We have very carefully set things up so that
|
|
// m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
|
|
// the lower half of Rm * Rn because we know the result already:
|
|
// it must be -t0. t0 + (-t0) must generate a carry iff
|
|
// t0 != 0. So, rather than do a mul and an adds we just set
|
|
// the carry flag iff t0 is nonzero.
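    // (If t0 == 0 then t0 + (-t0) == 0 with no carry out; if t0 != 0 then
    // t0 + (2^64 - t0) == 2^64, which does carry out. subs(zr, t0, 1) sets
    // the carry flag in exactly the same cases.)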
|
|
//
|
|
// mul(Rlo_mn, Rm, Rn);
|
|
// adds(zr, t0, Rlo_mn);
|
|
subs(zr, t0, 1); // Set carry iff t0 is nonzero
|
|
adcs(t0, t1, Rhi_mn);
|
|
adc(t1, t2, zr);
|
|
mov(t2, zr);
|
|
}
|
|
|
|
void pre2(RegisterOrConstant i, RegisterOrConstant len) {
|
|
block_comment("pre2");
|
|
// Pa = Pa_base + i-len;
|
|
// Pb = Pb_base + len;
|
|
// Pm = Pm_base + i-len;
|
|
// Pn = Pn_base + len;
|
|
|
|
if (i.is_register()) {
|
|
sub(Rj, i.as_register(), len);
|
|
} else {
|
|
mov(Rj, i.as_constant());
|
|
sub(Rj, Rj, len);
|
|
}
|
|
// Rj == i-len
|
|
|
|
lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
|
|
lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
|
|
lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
|
|
lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
|
|
|
|
// Ra = *++Pa;
|
|
// Rb = *--Pb;
|
|
// Rm = *++Pm;
|
|
// Rn = *--Pn;
|
|
ldr(Ra, pre(Pa, wordSize));
|
|
ldr(Rb, pre(Pb, -wordSize));
|
|
ldr(Rm, pre(Pm, wordSize));
|
|
ldr(Rn, pre(Pn, -wordSize));
|
|
|
|
mov(Rhi_mn, zr);
|
|
mov(Rlo_mn, zr);
|
|
}
|
|
|
|
void post2(RegisterOrConstant i, RegisterOrConstant len) {
|
|
block_comment("post2");
|
|
if (i.is_constant()) {
|
|
mov(Rj, i.as_constant()-len.as_constant());
|
|
} else {
|
|
sub(Rj, i.as_register(), len);
|
|
}
|
|
|
|
adds(t0, t0, Rlo_mn); // The pending m*n, low part
|
|
|
|
// As soon as we know the least significant digit of our result,
|
|
// store it.
|
|
// Pm_base[i-len] = t0;
|
|
str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
|
|
|
|
// t0 = t1; t1 = t2; t2 = 0;
|
|
adcs(t0, t1, Rhi_mn); // The pending m*n, high part
|
|
adc(t1, t2, zr);
|
|
mov(t2, zr);
|
|
}
|
|
|
|
// A carry in t0 after Montgomery multiplication means that we
|
|
// should subtract multiples of n from our result in m. We'll
|
|
// keep doing that until there is no carry.
|
|
void normalize(RegisterOrConstant len) {
|
|
block_comment("normalize");
|
|
// while (t0)
|
|
// t0 = sub(Pm_base, Pn_base, t0, len);
      Label loop, post, again;
      Register cnt = t1, i = t2; // Re-use registers; we're done with them now
      cbz(t0, post); {
        bind(again); {
          mov(i, zr);
          mov(cnt, len);
          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
          subs(zr, zr, zr); // set carry flag, i.e. no borrow
          align(16);
          bind(loop); {
            sbcs(Rm, Rm, Rn);
            str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            add(i, i, 1);
            ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
            sub(cnt, cnt, 1);
          } cbnz(cnt, loop);
          sbc(t0, t0, zr);
        } cbnz(t0, again);
      } bind(post);
    }

    // Move memory at s to d, reversing words.
    //    Increments d to end of copied memory
    //    Destroys tmp1, tmp2
    //    Preserves len
    //    Leaves s pointing to the address which was in d at start
    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
      assert(tmp1->encoding() < r19->encoding(), "register corruption");
      assert(tmp2->encoding() < r19->encoding(), "register corruption");

      lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
      mov(tmp1, len);
      unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
      sub(s, d, len, ext::uxtw, LogBytesPerWord);
    }
    // where
    void reverse1(Register d, Register s, Register tmp) {
      ldr(tmp, pre(s, -wordSize));
      ror(tmp, tmp, 32);
      str(tmp, post(d, wordSize));
    }
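    // In C, approximately (a sketch of reverse() above, for exposition only):
    // the words are copied in reverse order, and each 64-bit word has its
    // 32-bit halves swapped by the ror.
    //
    //   julong *end = d + len;
    //   s += len;
    //   while (d < end) {
    //     julong w = *--s;
    //     *d++ = (w >> 32) | (w << 32);   // ror by 32
    //   }
    //   s = end - len;   // i.e. the address that was in d on entry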

    void step_squaring() {
      // An extra ACC
      step();
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    void last_squaring(RegisterOrConstant i) {
      Label dont;
      // if ((i & 1) == 0) {
      tbnz(i.as_register(), 0, dont); {
        // MACC(Ra, Rb, t0, t1, t2);
        // Ra = *++Pa;
        // Rb = *--Pb;
        umulh(Rhi_ab, Ra, Rb);
        mul(Rlo_ab, Ra, Rb);
        acc(Rhi_ab, Rlo_ab, t0, t1, t2);
      } bind(dont);
    }

    void extra_step_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
    }

    void post1_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0. t0 + (-t0) must generate a carry iff
      // t0 != 0. So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    void acc(Register Rhi, Register Rlo,
             Register t0, Register t1, Register t2) {
      adds(t0, t0, Rlo);
      adcs(t1, t1, Rhi);
      adc(t2, t2, zr);
    }

  public:
    /**
     * Fast Montgomery multiplication. The derivation of the
     * algorithm is in A Cryptographic Library for the Motorola
     * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
     *
     * Arguments:
     *
     * Inputs for multiplication:
     *   c_rarg0 - int array elements a
     *   c_rarg1 - int array elements b
     *   c_rarg2 - int array elements n (the modulus)
     *   c_rarg3 - int length
     *   c_rarg4 - int inv
     *   c_rarg5 - int array elements m (the result)
     *
     * Inputs for squaring:
     *   c_rarg0 - int array elements a
     *   c_rarg1 - int array elements n (the modulus)
     *   c_rarg2 - int length
     *   c_rarg3 - int inv
     *   c_rarg4 - int array elements m (the result)
     *
     */
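    // Roughly speaking, with inv == -n^-1 (mod 2^64) the generated stub
    // computes the Montgomery product m = a * b * R^-1 (mod n), where
    // R = 2^(32*len) for the int length passed in.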
    address generate_multiply() {
      Label argh, nothing;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      cbzw(Rlen, nothing);

      enter();

      // Make room.
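      // (The scratch area has to hold reversed copies of a, b and n plus
      // the result m: four arrays of Rlen ints, i.e. Rlen * 4 * sizeof(jint)
      // bytes, which the check below caps at 512 * 16 = 8192.)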
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go. We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        if (!_squaring)
          reverse(Ra, Pb_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        ldr(Rn, Address(Pn_base, 0));
        mul(Rlo_mn, Rn, inv);
        subs(zr, Rlo_mn, -1);
        Label ok;
        br(EQ, ok); {
          stop("broken inverse in Montgomery multiply");
        } bind(ok);
      }
#endif

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        cmpw(Ri, Rlen);
        br(Assembler::GE, end);

        bind(loop);
        pre1(Ri);

        block_comment(" for (j = i; j; j--) {"); {
          movw(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment(" } // j");

        post1();
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen);
        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment(" for (j = len*2-i-1; j; j--) {"); {
          lslw(Rj, Rlen, 1);
          subw(Rj, Rj, Ri);
          subw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment(" } // j");

        post2(Ri, Rlen);
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::LT, loop);
        bind(end);
      }
      block_comment("} // i");

      normalize(Rlen);

      mov(Ra, Pm_base); // Save Pm_base in Ra
      restore_regs(); // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      bind(nothing);
      ret(lr);

      return entry;
    }
    // In C, approximately:
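    // (MACC(A, B, t0, t1, t2) in the pseudocode below adds the 128-bit
    // product A*B into the triple-precision accumulator t0..t2.)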

    // void
    // montgomery_multiply(julong Pa_base[], julong Pb_base[],
    //                     julong Pn_base[], julong Pm_base[],
    //                     julong inv, int len) {
    //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   julong *Pa, *Pb, *Pn, *Pm;
    //   julong Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pb_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = i;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
    //     MACC(Ra, Rb, t0, t1, t2);
    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pb_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = len*2-i-1;
    //     for (j = i-len+1; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }

    /**
     * Fast Montgomery squaring. This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication so it should be up to
     * 25% faster. However, its loop control is more complex and it
     * may actually run slower on some machines.
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0 - int array elements a
     *   c_rarg1 - int array elements n (the modulus)
     *   c_rarg2 - int length
     *   c_rarg3 - int inv
     *   c_rarg4 - int array elements m (the result)
     *
     */
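    // (The saving comes from the cross products: a[i]*a[j] == a[j]*a[i], so
    // each pair is computed once and accumulated twice via MACC2. That
    // halves the a*a multiplies while the m*n multiplies are unchanged:
    // roughly 1.5*len^2 instead of 2*len^2, i.e. 25% fewer.)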
    address generate_square() {
      Label argh;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go. We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        block_comment("for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment(" } // j");

        last_squaring(Ri);

        block_comment(" for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment(" } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment(" } // j");

        last_squaring(Ri);

        block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment(" } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);

      mov(Ra, Pm_base); // Save Pm_base in Ra
      restore_regs(); // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:
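    // (MACC2(A, B, t0, t1, t2) accumulates 2*A*B, i.e. the product counted
    // twice, which is how the squaring loops below visit each pair (j, i-j)
    // only once.)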

    // void
    // montgomery_square(julong Pa_base[], julong Pn_base[],
    //                   julong Pm_base[], julong inv, int len) {
    //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   julong *Pa, *Pb, *Pn, *Pm;
    //   julong Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
  };

  // Initialization
  void generate_preuniverse_stubs() {
    // preuniverse stubs are not needed for aarch64
  }

  void generate_initial_stubs() {
    // Generate the initial stubs and initialize the entry points that
    // exist on all platforms. Note: this is code that could be shared
    // among different platforms - however the benefit seems to be
    // smaller than the disadvantage of having a much more complicated
    // generator structure. See also the comment in stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Initialize table for copy memory (arraycopy) check.
    if (UnsafeMemoryAccess::_table == nullptr) {
      UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
    }

    if (UseCRC32Intrinsics) {
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
      StubRoutines::_hf2f = generate_float16ToFloat();
      StubRoutines::_f2hf = generate_floatToFloat16();
    }
  }

  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
    StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
  }

  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    StubRoutines::_method_entry_barrier = generate_method_entry_barrier();

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
    StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

    generate_atomic_entry_points();

#endif // LINUX

#ifdef COMPILER2
    if (UseSecondarySupersTable) {
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (! InlineSecondarySupersTest) {
        generate_lookup_secondary_supers_table_stub();
      }
    }
#endif

    StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();

    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
  }

  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

    if (UseSVE == 0) {
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
    }

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    // arrays_hashcode stub for large arrays.
    StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
    StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
    StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
    StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
    StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubId stub_id = StubId::stubgen_montgomerySquare_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    if (UseKyberIntrinsics) {
      StubRoutines::_kyberNtt = generate_kyberNtt();
      StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
      StubRoutines::_kyberNttMult = generate_kyberNttMult();
      StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
      StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
      StubRoutines::_kyber12To16 = generate_kyber12To16();
      StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
    }

    if (UseDilithiumIntrinsics) {
      StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
      StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
      StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
      StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
      StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
    }
    if (UseSHA3Intrinsics) {

      StubRoutines::_double_keccak = generate_double_keccak();
      if (UseSIMDForSHA3Intrinsic) {
        StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
        StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
      } else {
        StubRoutines::_sha3_implCompress = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
        StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
      }
    }

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
    switch(blob_id) {
    case BlobId::stubgen_preuniverse_id:
      generate_preuniverse_stubs();
      break;
    case BlobId::stubgen_initial_id:
      generate_initial_stubs();
      break;
    case BlobId::stubgen_continuation_id:
      generate_continuation_stubs();
      break;
    case BlobId::stubgen_compiler_id:
      generate_compiler_stubs();
      break;
    case BlobId::stubgen_final_id:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected blob id: %s", StubInfo::name(blob_id));
      break;
    };
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
  StubGenerator g(code, blob_id);
}


#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2); \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
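
// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to a declaration of
// the default implementation (provided in atomic_aarch64.S) and a function
// pointer initialized to it:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;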

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX