/*
 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
 * Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/macroAssembler.hpp"
|
|
#include "asm/macroAssembler.inline.hpp"
|
|
#include "compiler/oopMap.hpp"
|
|
#include "gc/shared/barrierSet.hpp"
|
|
#include "gc/shared/barrierSetAssembler.hpp"
|
|
#include "interpreter/interpreter.hpp"
|
|
#include "memory/universe.hpp"
|
|
#include "nativeInst_riscv.hpp"
|
|
#include "oops/instanceOop.hpp"
|
|
#include "oops/method.hpp"
|
|
#include "oops/objArrayKlass.hpp"
|
|
#include "oops/oop.inline.hpp"
|
|
#include "prims/methodHandles.hpp"
|
|
#include "prims/upcallLinker.hpp"
|
|
#include "runtime/continuation.hpp"
|
|
#include "runtime/continuationEntry.inline.hpp"
|
|
#include "runtime/frame.inline.hpp"
|
|
#include "runtime/handles.inline.hpp"
|
|
#include "runtime/javaThread.hpp"
|
|
#include "runtime/sharedRuntime.hpp"
|
|
#include "runtime/stubCodeGenerator.hpp"
|
|
#include "runtime/stubRoutines.hpp"
|
|
#include "utilities/align.hpp"
|
|
#include "utilities/powerOfTwo.hpp"
|
|
#ifdef COMPILER2
|
|
#include "opto/runtime.hpp"
|
|
#endif
|
|
|
|
// Declaration and definition of StubGenerator (no .hpp file).
|
|
// For a more detailed description of the stub routine structure
|
|
// see the comment in stubRoutines.hpp
|
|
|
|
#undef __
|
|
#define __ _masm->
|
|
|
|
#ifdef PRODUCT
|
|
#define BLOCK_COMMENT(str) /* nothing */
|
|
#else
|
|
#define BLOCK_COMMENT(str) __ block_comment(str)
|
|
#endif
|
|
|
|
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
|
|
|
|
// Stub Code definitions
|
|
|
|
class StubGenerator: public StubCodeGenerator {
|
|
private:
|
|
|
|
#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save x1 (ra) as the return PC at the base of the frame and
  // link x8 (fp) below it as the frame pointer installing sp (x2)
  // into fp.
  //
  // we save x10-x17, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save x5 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save x6-x7 and x28-x31 which both C and Java treat as
  // volatile
  //
  // we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary
  // registers and C expects to be callee-save
  //
  // so the stub frame looks like this when we enter Java code
  //
  //      [ return_from_Java     ] <--- sp
  //      [ argument word n      ]
  //      ...
  //  -35 [ argument word 1      ]
  //  -34 [ saved FRM in Floating-point Control and Status Register ] <--- sp_after_call
  //  -33 [ saved f27            ]
  //  -32 [ saved f26            ]
  //  -31 [ saved f25            ]
  //  -30 [ saved f24            ]
  //  -29 [ saved f23            ]
  //  -28 [ saved f22            ]
  //  -27 [ saved f21            ]
  //  -26 [ saved f20            ]
  //  -25 [ saved f19            ]
  //  -24 [ saved f18            ]
  //  -23 [ saved f9             ]
  //  -22 [ saved f8             ]
  //  -21 [ saved x27            ]
  //  -20 [ saved x26            ]
  //  -19 [ saved x25            ]
  //  -18 [ saved x24            ]
  //  -17 [ saved x23            ]
  //  -16 [ saved x22            ]
  //  -15 [ saved x21            ]
  //  -14 [ saved x20            ]
  //  -13 [ saved x19            ]
  //  -12 [ saved x18            ]
  //  -11 [ saved x9             ]
  //  -10 [ call wrapper   (x10) ]
  //   -9 [ result         (x11) ]
  //   -8 [ result type    (x12) ]
  //   -7 [ method         (x13) ]
  //   -6 [ entry point    (x14) ]
  //   -5 [ parameters     (x15) ]
  //   -4 [ parameter size (x16) ]
  //   -3 [ thread         (x17) ]
  //   -2 [ saved fp       (x8)  ]
  //   -1 [ saved ra       (x1)  ]
  //    0 [                      ] <--- fp == saved sp (x2)

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -34,

    frm_off            = sp_after_call_off,
    f27_off            = -33,
    f26_off            = -32,
    f25_off            = -31,
    f24_off            = -30,
    f23_off            = -29,
    f22_off            = -28,
    f21_off            = -27,
    f20_off            = -26,
    f19_off            = -25,
    f18_off            = -24,
    f9_off             = -23,
    f8_off             = -22,

    x27_off            = -21,
    x26_off            = -20,
    x25_off            = -19,
    x24_off            = -18,
    x23_off            = -17,
    x22_off            = -16,
    x21_off            = -15,
    x20_off            = -14,
    x19_off            = -13,
    x18_off            = -12,
    x9_off             = -11,

    call_wrapper_off   = -10,
    result_off         = -9,
    result_type_off    = -8,
    method_off         = -7,
    entry_point_off    = -6,
    parameters_off     = -5,
    parameter_size_off = -4,
    thread_off         = -3,
    fp_f               = -2,
    retaddr_off        = -1,
  };

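  // A rough sketch (not the exact VM declaration) of how C++ code reaches
  // this stub: StubRoutines::call_stub() is cast to a function pointer whose
  // eight parameters mirror c_rarg0..c_rarg7 in the layout above, e.g.
  //
  //   typedef void (*CallStub)(address call_wrapper, intptr_t* result,
  //                            int result_type /* BasicType */, Method* method,
  //                            address entry_point, intptr_t* parameters,
  //                            int parameter_size, Thread* thread);
  //   ((CallStub)StubRoutines::call_stub())(wrapper, res, type, m,
  //                                         entry, parms, size, thread);
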
  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubGenStubId stub_id = StubGenStubId::call_stub_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    const Address sp_after_call (fp, sp_after_call_off  * wordSize);

    const Address frm_save      (fp, frm_off            * wordSize);
    const Address call_wrapper  (fp, call_wrapper_off   * wordSize);
    const Address result        (fp, result_off         * wordSize);
    const Address result_type   (fp, result_type_off    * wordSize);
    const Address method        (fp, method_off         * wordSize);
    const Address entry_point   (fp, entry_point_off    * wordSize);
    const Address parameters    (fp, parameters_off     * wordSize);
    const Address parameter_size(fp, parameter_size_off * wordSize);

    const Address thread        (fp, thread_off         * wordSize);

    const Address f27_save      (fp, f27_off            * wordSize);
    const Address f26_save      (fp, f26_off            * wordSize);
    const Address f25_save      (fp, f25_off            * wordSize);
    const Address f24_save      (fp, f24_off            * wordSize);
    const Address f23_save      (fp, f23_off            * wordSize);
    const Address f22_save      (fp, f22_off            * wordSize);
    const Address f21_save      (fp, f21_off            * wordSize);
    const Address f20_save      (fp, f20_off            * wordSize);
    const Address f19_save      (fp, f19_off            * wordSize);
    const Address f18_save      (fp, f18_off            * wordSize);
    const Address f9_save       (fp, f9_off             * wordSize);
    const Address f8_save       (fp, f8_off             * wordSize);

    const Address x27_save      (fp, x27_off            * wordSize);
    const Address x26_save      (fp, x26_off            * wordSize);
    const Address x25_save      (fp, x25_off            * wordSize);
    const Address x24_save      (fp, x24_off            * wordSize);
    const Address x23_save      (fp, x23_off            * wordSize);
    const Address x22_save      (fp, x22_off            * wordSize);
    const Address x21_save      (fp, x21_off            * wordSize);
    const Address x20_save      (fp, x20_off            * wordSize);
    const Address x19_save      (fp, x19_off            * wordSize);
    const Address x18_save      (fp, x18_off            * wordSize);

    const Address x9_save       (fp, x9_off             * wordSize);

    // stub code

    address riscv_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ addi(sp, fp, sp_after_call_off * wordSize);

    // save register parameters and Java temporary/global registers
    // n.b. we save thread even though it gets installed in
    // xthread because we want to sanity check tp later
    __ sd(c_rarg7, thread);
    __ sw(c_rarg6, parameter_size);
    __ sd(c_rarg5, parameters);
    __ sd(c_rarg4, entry_point);
    __ sd(c_rarg3, method);
    __ sd(c_rarg2, result_type);
    __ sd(c_rarg1, result);
    __ sd(c_rarg0, call_wrapper);

    __ sd(x9, x9_save);

    __ sd(x18, x18_save);
    __ sd(x19, x19_save);
    __ sd(x20, x20_save);
    __ sd(x21, x21_save);
    __ sd(x22, x22_save);
    __ sd(x23, x23_save);
    __ sd(x24, x24_save);
    __ sd(x25, x25_save);
    __ sd(x26, x26_save);
    __ sd(x27, x27_save);

    __ fsd(f8,  f8_save);
    __ fsd(f9,  f9_save);
    __ fsd(f18, f18_save);
    __ fsd(f19, f19_save);
    __ fsd(f20, f20_save);
    __ fsd(f21, f21_save);
    __ fsd(f22, f22_save);
    __ fsd(f23, f23_save);
    __ fsd(f24, f24_save);
    __ fsd(f25, f25_save);
    __ fsd(f26, f26_save);
    __ fsd(f27, f27_save);

    __ frrm(t0);
    __ sd(t0, frm_save);
    // Set frm to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes.
    Label skip_fsrmi;
    guarantee(__ RoundingMode::rne == 0, "must be");
    __ beqz(t0, skip_fsrmi);
    __ fsrmi(__ RoundingMode::rne);
    __ bind(skip_fsrmi);

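    // Note: Java float semantics require round-to-nearest-even (rne). Because
    // the rne encoding is 0 (checked by the guarantee above), a saved frm of
    // zero means the mode is already correct and the fsrmi CSR write is skipped.
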
    // install Java thread in global register now we have saved
    // whatever value it held
    __ mv(xthread, c_rarg7);

    // And method
    __ mv(xmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset())));
      __ beqz(t0, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mv(esp, sp);
    __ slli(t0, c_rarg6, LogBytesPerWord);
    __ sub(t0, sp, t0); // Move SP out of the way
    __ andi(sp, t0, -2 * wordSize);
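    // Note: -2 * wordSize is -16, so the andi above rounds sp down to the
    // 16-byte stack alignment required by the RISC-V psABI.
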
BLOCK_COMMENT("pass parameters if any");
|
|
Label parameters_done;
|
|
// parameter count is still in c_rarg6
|
|
// and parameter pointer identifying param 1 is in c_rarg5
|
|
__ beqz(c_rarg6, parameters_done);
|
|
|
|
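    // Copy the parameter words one at a time from the parameter array onto
    // the freshly aligned stack; each push_reg stores one word and moves sp
    // down by one word, draining the count in c_rarg6 to zero.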
    address loop = __ pc();
    __ ld(t0, Address(c_rarg5, 0));
    __ addi(c_rarg5, c_rarg5, wordSize);
    __ subi(c_rarg6, c_rarg6, 1);
    __ push_reg(t0);
    __ bgtz(c_rarg6, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methodOop, and current sp
    //      xmethod: Method*
    //      x19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mv(x19_sender_sp, sp);
    __ jalr(c_rarg4);

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in x10
    // and a floating result in j_farg0
    __ ld(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ld(j_rarg1, result_type);
    __ mv(t0, (u1)T_OBJECT);
    __ beq(j_rarg1, t0, is_long);
    __ mv(t0, (u1)T_LONG);
    __ beq(j_rarg1, t0, is_long);
    __ mv(t0, (u1)T_FLOAT);
    __ beq(j_rarg1, t0, is_float);
    __ mv(t0, (u1)T_DOUBLE);
    __ beq(j_rarg1, t0, is_double);

    // handle T_INT case
    __ sw(x10, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ addi(esp, fp, sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ld(t0, thread);
      __ bne(xthread, t0, S);
      __ get_thread(t0);
      __ beq(xthread, t0, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(xthread);

    // restore callee-save registers
    __ fld(f27, f27_save);
    __ fld(f26, f26_save);
    __ fld(f25, f25_save);
    __ fld(f24, f24_save);
    __ fld(f23, f23_save);
    __ fld(f22, f22_save);
    __ fld(f21, f21_save);
    __ fld(f20, f20_save);
    __ fld(f19, f19_save);
    __ fld(f18, f18_save);
    __ fld(f9,  f9_save);
    __ fld(f8,  f8_save);

    __ ld(x27, x27_save);
    __ ld(x26, x26_save);
    __ ld(x25, x25_save);
    __ ld(x24, x24_save);
    __ ld(x23, x23_save);
    __ ld(x22, x22_save);
    __ ld(x21, x21_save);
    __ ld(x20, x20_save);
    __ ld(x19, x19_save);
    __ ld(x18, x18_save);

    __ ld(x9, x9_save);

    // restore frm
    Label skip_fsrm;
    __ ld(t0, frm_save);
    __ frrm(t1);
    __ beq(t0, t1, skip_fsrm);
    __ fsrm(t0);
    __ bind(skip_fsrm);

    __ ld(c_rarg0, call_wrapper);
    __ ld(c_rarg1, result);
    __ ld(c_rarg2, result_type);
    __ ld(c_rarg3, method);
    __ ld(c_rarg4, entry_point);
    __ ld(c_rarg5, parameters);
    __ ld(c_rarg6, parameter_size);
    __ ld(c_rarg7, thread);

    // leave frame and return to caller
    __ leave();
    __ ret();

    // handle return types different from T_INT

    __ BIND(is_long);
    __ sd(x10, Address(j_rarg2, 0));
    __ j(exit);

    __ BIND(is_float);
    __ fsw(j_farg0, Address(j_rarg2, 0), t0);
    __ j(exit);

    __ BIND(is_double);
    __ fsd(j_farg0, Address(j_rarg2, 0), t0);
    __ j(exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // sp.
  //
  // x10: exception oop

  address generate_catch_exception() {
    StubGenStubId stub_id = StubGenStubId::catch_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // same as in generate_call_stub():
    const Address thread(fp, thread_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ld(t0, thread);
      __ bne(xthread, t0, S);
      __ get_thread(t0);
      __ beq(xthread, t0, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(x10);

    __ sd(x10, Address(xthread, Thread::pending_exception_offset()));
    __ mv(t0, (address)__FILE__);
    __ sd(t0, Address(xthread, Thread::exception_file_offset()));
    __ mv(t0, (int)__LINE__);
    __ sw(t0, Address(xthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr,
           "_call_stub_return_address must have been generated before");
    __ j(RuntimeAddress(StubRoutines::_call_stub_return_address));

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // x10: exception
  // x13: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in RA !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubGenStubId stub_id = StubGenStubId::forward_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // Upon entry, RA points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
      __ bnez(t0, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into x9

    // call the VM to find the handler address associated with the
    // caller address. pass thread in x10 and caller pc (ret address)
    // in x11. n.b. the caller pc is in ra, unlike x86 where it is on
    // the stack.
    __ mv(c_rarg1, ra);
    // ra will be trashed by the VM call so we move it to x9
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mv(x9, ra);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                    SharedRuntime::exception_handler_for_return_address),
                    xthread, c_rarg1);
    // we should not really care that ra is no longer the callee
    // address. we saved the value the handler needs in x9 so we can
    // just copy it to x13. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore ra here to satisfy that assert.
    __ mv(ra, x9);
    // setup x10 & x13 & clear pending exception
    __ mv(x13, x9);
    __ mv(x9, x10);
    __ ld(x10, Address(xthread, Thread::pending_exception_offset()));
    __ sd(zr, Address(xthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ bnez(x10, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // x10: exception
    // x13: throwing pc
    // x9: exception handler
    __ verify_oop(x10);
    __ jr(x9);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    x10: oop to verify
  //    t0: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved ra
  //    [tos + 3]: saved t1
  //    [tos + 4]: saved x10
  //    [tos + 5]: saved t0
  address generate_verify_oop() {

    StubGenStubId stub_id = StubGenStubId::verify_oop_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Label exit, error;

    __ push_reg(RegSet::of(c_rarg2, c_rarg3), sp); // save c_rarg2 and c_rarg3

    __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ld(c_rarg3, Address(c_rarg2));
    __ addi(c_rarg3, c_rarg3, 1);
    __ sd(c_rarg3, Address(c_rarg2));

    // object is in x10
    // make sure object is 'reasonable'
    __ beqz(x10, exit); // if obj is null it is OK

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    bs_asm->check_oop(_masm, x10, c_rarg2, c_rarg3, error);

    // return if everything seems ok
    __ bind(exit);

    __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp);  // pop c_rarg2 and c_rarg3
    __ ret();

    // handle errors
    __ bind(error);
    __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp);  // pop c_rarg2 and c_rarg3

    __ push_reg(RegSet::range(x0, x31), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mv(c_rarg0, t0);             // pass address of error message
    __ mv(c_rarg1, ra);             // pass return address
    __ mv(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ rt_call(CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ ebreak();

    return start;
  }

  // The inner part of zero_words().
  //
  // Inputs:
  // x28: the HeapWord-aligned base address of an array to zero.
  // x29: the count in HeapWords, x29 > 0.
  //
  // Returns x28 and x29, adjusted for the caller to clear.
  // x28: the base address of the tail of words left to clear.
  // x29: the number of words in the tail.
  //      x29 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;

    const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31;

    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::zero_blocks_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    if (UseBlockZeroing) {
      // Ensure count >= 2*CacheLineSize so that it still deserves a cbo.zero
      // after alignment.
      Label small;
      int low_limit = MAX2(2 * CacheLineSize, BlockZeroingLowLimit) / wordSize;
      __ mv(tmp1, low_limit);
      __ blt(cnt, tmp1, small);
      __ zero_dcache_blocks(base, cnt, tmp1, tmp2);
      __ bind(small);
    }

    {
      // Clear the remaining blocks.
      Label loop;
      __ mv(tmp1, MacroAssembler::zero_words_block_size);
      __ blt(cnt, tmp1, done);
      __ bind(loop);
      for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
        __ sd(zr, Address(base, i * wordSize));
      }
      __ addi(base, base, MacroAssembler::zero_words_block_size * wordSize);
      __ subi(cnt, cnt, MacroAssembler::zero_words_block_size);
      __ bge(cnt, tmp1, loop);
      __ bind(done);
    }

    __ ret();

    return start;
  }

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(StubGenStubId stub_id, Label &start,
                           Register s, Register d, Register count) {
    BasicType type;
    copy_direction direction;
    switch (stub_id) {
    case copy_byte_f_id:
      direction = copy_forwards;
      type = T_BYTE;
      break;
    case copy_byte_b_id:
      direction = copy_backwards;
      type = T_BYTE;
      break;
    default:
      ShouldNotReachHere();
    }
    int unit = wordSize * direction;
    int bias = wordSize;

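    // Note: unit is +wordSize or -wordSize depending on direction, so the same
    // Address(reg, k * unit) sequences below walk memory either forwards or
    // backwards; the one-word bias applied to s and d for forward copies makes
    // the 1*unit..8*unit offsets land exactly on the first eight words.
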
    const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16,
                   tmp_reg4 = x17, tmp_reg5 = x7,  tmp_reg6 = x28, tmp_reg7 = x29;

    const Register stride = x30;

    assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3,
                               tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7);
    assert_different_registers(s, d, count, t0);

    Label again, drain;
    StubCodeMark mark(this, stub_id);
    __ align(CodeEntryAlignment);
    __ bind(start);

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;

      __ mv(t0, 8);
      __ bge(count, t0, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    __ ld(tmp_reg0, Address(s, 1 * unit));
    __ ld(tmp_reg1, Address(s, 2 * unit));
    __ ld(tmp_reg2, Address(s, 3 * unit));
    __ ld(tmp_reg3, Address(s, 4 * unit));
    __ ld(tmp_reg4, Address(s, 5 * unit));
    __ ld(tmp_reg5, Address(s, 6 * unit));
    __ ld(tmp_reg6, Address(s, 7 * unit));
    __ ld(tmp_reg7, Address(s, 8 * unit));
    __ addi(s, s, 8 * unit);

    __ subi(count, count, 16);
    __ bltz(count, drain);

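    // Software-pipelined loop: eight words are already in flight from the
    // loads above. Subtracting 16 (not 8) checks that a further eight words
    // remain for the next iteration's loads; if not, branch to 'drain',
    // which only stores the words already loaded.
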
    __ bind(again);

    __ sd(tmp_reg0, Address(d, 1 * unit));
    __ sd(tmp_reg1, Address(d, 2 * unit));
    __ sd(tmp_reg2, Address(d, 3 * unit));
    __ sd(tmp_reg3, Address(d, 4 * unit));
    __ sd(tmp_reg4, Address(d, 5 * unit));
    __ sd(tmp_reg5, Address(d, 6 * unit));
    __ sd(tmp_reg6, Address(d, 7 * unit));
    __ sd(tmp_reg7, Address(d, 8 * unit));

    __ ld(tmp_reg0, Address(s, 1 * unit));
    __ ld(tmp_reg1, Address(s, 2 * unit));
    __ ld(tmp_reg2, Address(s, 3 * unit));
    __ ld(tmp_reg3, Address(s, 4 * unit));
    __ ld(tmp_reg4, Address(s, 5 * unit));
    __ ld(tmp_reg5, Address(s, 6 * unit));
    __ ld(tmp_reg6, Address(s, 7 * unit));
    __ ld(tmp_reg7, Address(s, 8 * unit));

    __ addi(s, s, 8 * unit);
    __ addi(d, d, 8 * unit);

    __ subi(count, count, 8);
    __ bgez(count, again);

    // Drain
    __ bind(drain);

    __ sd(tmp_reg0, Address(d, 1 * unit));
    __ sd(tmp_reg1, Address(d, 2 * unit));
    __ sd(tmp_reg2, Address(d, 3 * unit));
    __ sd(tmp_reg3, Address(d, 4 * unit));
    __ sd(tmp_reg4, Address(d, 5 * unit));
    __ sd(tmp_reg5, Address(d, 6 * unit));
    __ sd(tmp_reg6, Address(d, 7 * unit));
    __ sd(tmp_reg7, Address(d, 8 * unit));
    __ addi(d, d, 8 * unit);

    {
      Label L1, L2;
      __ test_bit(t0, count, 2);
      __ beqz(t0, L1);

      __ ld(tmp_reg0, Address(s, 1 * unit));
      __ ld(tmp_reg1, Address(s, 2 * unit));
      __ ld(tmp_reg2, Address(s, 3 * unit));
      __ ld(tmp_reg3, Address(s, 4 * unit));
      __ addi(s, s, 4 * unit);

      __ sd(tmp_reg0, Address(d, 1 * unit));
      __ sd(tmp_reg1, Address(d, 2 * unit));
      __ sd(tmp_reg2, Address(d, 3 * unit));
      __ sd(tmp_reg3, Address(d, 4 * unit));
      __ addi(d, d, 4 * unit);

      __ bind(L1);

      if (direction == copy_forwards) {
        __ addi(s, s, bias);
        __ addi(d, d, bias);
      }

      __ test_bit(t0, count, 1);
      __ beqz(t0, L2);
      if (direction == copy_backwards) {
        __ addi(s, s, 2 * unit);
        __ ld(tmp_reg0, Address(s));
        __ ld(tmp_reg1, Address(s, wordSize));
        __ addi(d, d, 2 * unit);
        __ sd(tmp_reg0, Address(d));
        __ sd(tmp_reg1, Address(d, wordSize));
      } else {
        __ ld(tmp_reg0, Address(s));
        __ ld(tmp_reg1, Address(s, wordSize));
        __ addi(s, s, 2 * unit);
        __ sd(tmp_reg0, Address(d));
        __ sd(tmp_reg1, Address(d, wordSize));
        __ addi(d, d, 2 * unit);
      }
      __ bind(L2);
    }

    __ ret();
  }

  Label copy_f, copy_b;

  typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp);

  void copy_memory_v(Register s, Register d, Register count, int step) {
    bool is_backward = step < 0;
    int granularity = uabs(step);

    const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17;
    assert_different_registers(s, d, cnt, vl, tmp1, tmp2);
    Assembler::SEW sew = Assembler::elembytes_to_sew(granularity);
    Label loop_forward, loop_backward, done;

    __ mv(dst, d);
    __ mv(src, s);
    __ mv(cnt, count);

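    // Strip-mined RVV loop: vsetvli sets vl = min(cnt, VLMAX) for the given
    // element width (with m8 grouping), so each iteration moves up to vl
    // elements. For a backward (potentially overlapping) copy the loop only
    // copies directly when the whole remainder fits in one vector operation
    // (vl == cnt); otherwise it branches to loop_backward, which copies
    // vl-sized chunks from the high end of the range downwards.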
    __ bind(loop_forward);
    __ vsetvli(vl, cnt, sew, Assembler::m8);
    if (is_backward) {
      __ bne(vl, cnt, loop_backward);
    }

    __ vlex_v(v0, src, sew);
    __ sub(cnt, cnt, vl);
    if (sew != Assembler::e8) {
      // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
      __ slli(vl, vl, sew);
    }
    __ add(src, src, vl);

    __ vsex_v(v0, dst, sew);
    __ add(dst, dst, vl);
    __ bnez(cnt, loop_forward);

    if (is_backward) {
      __ j(done);

      __ bind(loop_backward);
      __ sub(t0, cnt, vl);
      if (sew != Assembler::e8) {
        // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
        __ slli(t0, t0, sew);
      }
      __ add(tmp1, s, t0);
      __ vlex_v(v0, tmp1, sew);
      __ add(tmp2, d, t0);
      __ vsex_v(v0, tmp2, sew);
      __ sub(cnt, cnt, vl);
      __ bnez(cnt, loop_forward);
      __ bind(done);
    }
  }

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.
  //
  void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                   Register s, Register d, Register count, int step) {
    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    if (UseRVV && (!is_reference_type(type) || bs_asm->supports_rvv_arraycopy())) {
      return copy_memory_v(s, d, count, step);
    }

    bool is_backwards = step < 0;
    int granularity = uabs(step);

    const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13;
    const Register gct1 = x28, gct2 = x29, gct3 = t2;

    Label same_aligned;
    Label copy_big, copy32_loop, copy8_loop, copy_small, done;

    // The size of copy32_loop body increases significantly with ZGC GC barriers.
    // Need conditional far branches to reach a point beyond the loop in this case.
    bool is_far = UseZGC;

    __ beqz(count, done, is_far);
    __ slli(cnt, count, exact_log2(granularity));
    if (is_backwards) {
      __ add(src, s, cnt);
      __ add(dst, d, cnt);
    } else {
      __ mv(src, s);
      __ mv(dst, d);
    }

    if (is_aligned) {
      __ subi(t0, cnt, 32);
      __ bgez(t0, copy32_loop);
      __ subi(t0, cnt, 8);
      __ bgez(t0, copy8_loop, is_far);
      __ j(copy_small);
    } else {
      __ mv(t0, 16);
      __ blt(cnt, t0, copy_small, is_far);

      __ xorr(t0, src, dst);
      __ andi(t0, t0, 0b111);
      __ bnez(t0, copy_small, is_far);

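      // src and dst can only be stepped to a common 8-byte boundary when
      // their addresses agree in the low three bits, i.e.
      // (src ^ dst) & 0b111 == 0; otherwise fall back to element-by-element
      // copying in copy_small.
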
      __ bind(same_aligned);
      __ andi(t0, src, 0b111);
      __ beqz(t0, copy_big);
      if (is_backwards) {
        __ addi(src, src, step);
        __ addi(dst, dst, step);
      }
      bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
      bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
      if (!is_backwards) {
        __ addi(src, src, step);
        __ addi(dst, dst, step);
      }
      __ subi(cnt, cnt, granularity);
      __ beqz(cnt, done, is_far);
      __ j(same_aligned);

      __ bind(copy_big);
      __ mv(t0, 32);
      __ blt(cnt, t0, copy8_loop, is_far);
    }

    __ bind(copy32_loop);
    if (is_backwards) {
      __ subi(src, src, wordSize * 4);
      __ subi(dst, dst, wordSize * 4);
    }
    // we first load 32 bytes, then write it, so the direction here doesn't matter
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src),     gct1);
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp4, Address(src, 8),  gct1);
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp5, Address(src, 16), gct1);
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp6, Address(src, 24), gct1);

    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst),     tmp3, gct1, gct2, gct3);
    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 8),  tmp4, gct1, gct2, gct3);
    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 16), tmp5, gct1, gct2, gct3);
    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 24), tmp6, gct1, gct2, gct3);

    if (!is_backwards) {
      __ addi(src, src, wordSize * 4);
      __ addi(dst, dst, wordSize * 4);
    }
    __ subi(t0, cnt, 32 + wordSize * 4);
    __ subi(cnt, cnt, wordSize * 4);
    __ bgez(t0, copy32_loop); // cnt >= 32, do next loop

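    // Note: t0 = cnt - 64 was computed before cnt was decremented by 32, so
    // the bgez above loops only while a full 32-byte iteration will still
    // remain after this one.
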
    __ beqz(cnt, done); // if that's all - done

    __ subi(t0, cnt, 8); // if not - copy the remainder
    __ bltz(t0, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop

    __ bind(copy8_loop);
    if (is_backwards) {
      __ subi(src, src, wordSize);
      __ subi(dst, dst, wordSize);
    }
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);

    if (!is_backwards) {
      __ addi(src, src, wordSize);
      __ addi(dst, dst, wordSize);
    }
    __ subi(t0, cnt, 8 + wordSize);
    __ subi(cnt, cnt, wordSize);
    __ bgez(t0, copy8_loop); // cnt >= 8, do next loop

    __ beqz(cnt, done); // if that's all - done

    __ bind(copy_small);
    if (is_backwards) {
      __ addi(src, src, step);
      __ addi(dst, dst, step);
    }

    bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
    bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);

    if (!is_backwards) {
      __ addi(src, src, step);
      __ addi(dst, dst, step);
    }
    __ subi(cnt, cnt, granularity);
    __ bgtz(cnt, copy_small);

    __ bind(done);
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers t0 and t1.
  void verify_oop_array(size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mv(t1, zr);
    __ slli(t0, count, exact_log2(size));
    __ bind(loop);
    __ bgeu(t1, t0, end);

    __ add(temp, a, t1);
    if (size == (size_t)wordSize) {
      __ ld(temp, Address(temp, 0));
      __ verify_oop(temp);
    } else {
      __ lwu(temp, Address(temp, 0));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(t1, t1, size);
    __ j(loop);
    __ bind(end);
  }

  // Arguments:
  //   stub_id - is used to name the stub and identify all details of
  //             how to perform the copy.
  //
  //   entry - is assigned to the stub's post push entry point unless
  //           it is null
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects: entry is set to the (post push) entry point so it
  //               can be used by the corresponding conjoint copy
  //               method
  //
  address generate_disjoint_copy(StubGenStubId stub_id, address* entry) {
    size_t size;
    bool aligned;
    bool is_oop;
    bool dest_uninitialized;
    switch (stub_id) {
    case jbyte_disjoint_arraycopy_id:
      size = sizeof(jbyte);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case arrayof_jbyte_disjoint_arraycopy_id:
      size = sizeof(jbyte);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case jshort_disjoint_arraycopy_id:
      size = sizeof(jshort);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case arrayof_jshort_disjoint_arraycopy_id:
      size = sizeof(jshort);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case jint_disjoint_arraycopy_id:
      size = sizeof(jint);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case arrayof_jint_disjoint_arraycopy_id:
      size = sizeof(jint);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case jlong_disjoint_arraycopy_id:
      // since this is always aligned we can (should!) use the same
      // stub as for case arrayof_jlong_disjoint_arraycopy
      ShouldNotReachHere();
      break;
    case arrayof_jlong_disjoint_arraycopy_id:
      size = sizeof(jlong);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case oop_disjoint_arraycopy_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case arrayof_oop_disjoint_arraycopy_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case oop_disjoint_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    case arrayof_oop_disjoint_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    default:
      ShouldNotReachHere();
      break;
    }

    const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push_reg(RegSet::of(d, count), sp);
    }

    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
    }

    if (is_oop) {
      __ pop_reg(RegSet::of(d, count), sp);
      if (VerifyOops) {
        verify_oop_array(size, d, count, t2);
      }
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());

    __ leave();
    __ mv(x10, zr); // return 0
    __ ret();
    return start;
  }

  // Arguments:
  //   stub_id - is used to name the stub and identify all details of
  //             how to perform the copy.
  //
  //   nooverlap_target - identifies the (post push) entry for the
  //                      corresponding disjoint copy routine which can be
  //                      jumped to if the ranges do not actually overlap
  //
  //   entry - is assigned to the stub's post push entry point unless
  //           it is null
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   entry is set to the no-overlap entry point so it can be used by
  //   some other conjoint copy method
  //
  address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) {
    const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    int size;
    bool aligned;
    bool is_oop;
    bool dest_uninitialized;
    switch (stub_id) {
    case jbyte_arraycopy_id:
      size = sizeof(jbyte);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case arrayof_jbyte_arraycopy_id:
      size = sizeof(jbyte);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case jshort_arraycopy_id:
      size = sizeof(jshort);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case arrayof_jshort_arraycopy_id:
      size = sizeof(jshort);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case jint_arraycopy_id:
      size = sizeof(jint);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case arrayof_jint_arraycopy_id:
      size = sizeof(jint);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case jlong_arraycopy_id:
      // since this is always aligned we can (should!) use the same
      // stub as for case arrayof_jlong_disjoint_arraycopy
      ShouldNotReachHere();
      break;
    case arrayof_jlong_arraycopy_id:
      size = sizeof(jlong);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case oop_arraycopy_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case arrayof_oop_arraycopy_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case oop_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    case arrayof_oop_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    default:
      ShouldNotReachHere();
    }

    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(t0, d, s);
    __ slli(t1, count, exact_log2(size));
    Label L_continue;
    __ bltu(t0, t1, L_continue);
    __ j(nooverlap_target);
    __ bind(L_continue);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push_reg(RegSet::of(d, count), sp);
    }

    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
    }

    if (is_oop) {
      __ pop_reg(RegSet::of(d, count), sp);
      if (VerifyOops) {
        verify_oop_array(size, d, count, t2);
      }
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
    __ leave();
    __ mv(x10, zr); // return 0
    __ ret();
    return start;
  }

  // Helper for generating a dynamic type check.
  // Smashes t0, t1.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Register result,
                           Register tmp1,
                           Register tmp2,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, tmp1, tmp2, &L_success, nullptr);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    x10 ==  0  -  success
  //    x10 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(StubGenStubId stub_id, address* entry) {
    bool dest_uninitialized;
    switch (stub_id) {
    case checkcast_arraycopy_id:
      dest_uninitialized = false;
      break;
    case checkcast_arraycopy_uninit_id:
      dest_uninitialized = true;
      break;
    default:
      ShouldNotReachHere();
    }

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0; // source array address
    const Register to          = c_rarg1; // destination array address
    const Register count       = c_rarg2; // elements count
    const Register ckoff       = c_rarg3; // super_check_offset
    const Register ckval       = c_rarg4; // super_klass

    RegSet wb_pre_saved_regs   = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs  = RegSet::of(count);

    // Registers used as temps (x7, x9, x18 are save-on-entry)
    const Register count_save  = x19; // orig elements count
    const Register start_to    = x18; // destination array start address
    const Register copied_oop  = x7;  // actual oop copied
    const Register r9_klass    = x9;  // oop._klass

    // Registers used as gc temps (x15, x16, x17 are save-on-call)
    const Register gct1 = x15, gct2 = x16, gct3 = x17;

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r9_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // Caller of this entry point must set up the argument registers.
    if (entry != nullptr) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // Empty array:  Nothing to do
    __ beqz(count, L_done);

    __ push_reg(RegSet::of(x7, x9, x18, x19), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ lwu(start_to, Address(ckval, sco_offset));
      __ beq(ckoff, start_to, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }

    bool is_oop = true;
    int element_size = UseCompressedOops ? 4 : 8;

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);

    // save the original count
    __ mv(count_save, count);

    // Copy from low to high addresses
    __ mv(start_to, to); // Save destination array start address
    __ j(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for count to 0 do
    //     copied_oop = load_heap_oop(from++)
    //     ... generate_type_check ...
    //     store_heap_oop(to++, copied_oop)
    //   end

    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
                      Address(to, 0), copied_oop,
                      gct1, gct2, gct3);
    __ addi(to, to, UseCompressedOops ? 4 : 8);
    __ subi(count, count, 1);
    __ beqz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
                     copied_oop, Address(from, 0),
                     gct1);
    __ addi(from, from, UseCompressedOops ? 4 : 8);
    __ beqz(copied_oop, L_store_element);

    __ load_klass(r9_klass, copied_oop); // query the object klass

    BLOCK_COMMENT("type_check:");
    generate_type_check(r9_klass,        /*sub_klass*/
                        ckoff,           /*super_check_offset*/
                        ckval,           /*super_klass*/
                        x10,             /*result*/
                        gct1,            /*tmp1*/
                        gct2,            /*tmp2*/
                        L_store_element);

    // Fall through on failure!

    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_orig = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ sub(count, count_save, count);     // K = partially copied oop count
    __ xori(count, count, -1);            // report (-1^K) to caller
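    // xori with -1 is bitwise NOT, so the value returned is ~K = -K - 1:
    // still distinguishable from 0 (full success), and the caller can
    // recover the partial transfer count as ~x10.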
    __ beqz(count, L_done_pop);

    __ BIND(L_do_card_marks);
    bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0, wb_post_saved_regs);

    __ bind(L_done_pop);
    __ pop_reg(RegSet::of(x7, x9, x18, x19), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mv(x10, count);
    __ leave();
    __ ret();

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(t0, temp);

    // if [src_pos + length > arrayOop(src)->length()] then FAIL
    __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ bgtu(temp, t0, L_failed);

    // if [dst_pos + length > arrayOop(dst)->length()] then FAIL
    __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ bgtu(temp, t0, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ zext(src_pos, src_pos, 32);
    __ zext(dst_pos, dst_pos, 32);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  //
  //  Generate 'unsafe' array copy stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t argument instead of an element count.
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
  //
  address generate_unsafe_copy(address byte_copy_entry,
                               address short_copy_entry,
                               address int_copy_entry,
                               address long_copy_entry) {
    assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
                int_copy_entry != nullptr && long_copy_entry != nullptr);
    Label L_long_aligned, L_int_aligned, L_short_aligned;
    const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

    __ orr(t0, s, d);
    __ orr(t0, t0, count);

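    // OR-ing the source address, destination address and byte count folds
    // the three alignment tests into one: the combined low bits are zero
    // only if every operand is aligned to the boundary being tested.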
    __ andi(t0, t0, BytesPerLong - 1);
    __ beqz(t0, L_long_aligned);
    __ andi(t0, t0, BytesPerInt - 1);
    __ beqz(t0, L_int_aligned);
    __ test_bit(t0, t0, 0);
    __ beqz(t0, L_short_aligned);
    __ j(RuntimeAddress(byte_copy_entry));

    __ BIND(L_short_aligned);
    __ srli(count, count, LogBytesPerShort);  // size => short_count
    __ j(RuntimeAddress(short_copy_entry));
    __ BIND(L_int_aligned);
    __ srli(count, count, LogBytesPerInt);    // size => int_count
    __ j(RuntimeAddress(int_copy_entry));
    __ BIND(L_long_aligned);
    __ srli(count, count, LogBytesPerLong);   // size => long_count
    __ j(RuntimeAddress(long_copy_entry));

    return start;
  }

  //
  //  Generate generic array copy stubs
  //
  //  Input:
  //    c_rarg0    -  src oop
  //    c_rarg1    -  src_pos (32-bits)
  //    c_rarg2    -  dst oop
  //    c_rarg3    -  dst_pos (32-bits)
  //    c_rarg4    -  element count (32-bits)
  //
  //  Output:
  //    x10 ==  0  -  success
  //    x10 == -1^K - failure, where K is partial transfer count
  //
  address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
                                address int_copy_entry, address oop_copy_entry,
                                address long_copy_entry, address checkcast_copy_entry) {
    assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
                int_copy_entry != nullptr && oop_copy_entry != nullptr &&
                long_copy_entry != nullptr && checkcast_copy_entry != nullptr);
    Label L_failed, L_failed_0, L_objArray;
    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

    // Input registers
    const Register src        = c_rarg0; // source array oop
    const Register src_pos    = c_rarg1; // source position
    const Register dst        = c_rarg2; // destination array oop
    const Register dst_pos    = c_rarg3; // destination position
    const Register length     = c_rarg4;

    // Registers used as temps
    const Register dst_klass = c_rarg5;

    __ align(CodeEntryAlignment);

    StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id;
    StubCodeMark mark(this, stub_id);

    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    //-----------------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length  must not be negative.
    // (5) src klass and dst klass should be the same and not null.
    // (6) src and dst should be arrays.
    // (7) src_pos + length must not exceed length of src.
    // (8) dst_pos + length must not exceed length of dst.
    //

    // if src is null then return -1
    __ beqz(src, L_failed);

    // if [src_pos < 0] then return -1
    __ sext(t0, src_pos, 32);
    __ bltz(t0, L_failed);

    // if dst is null then return -1
    __ beqz(dst, L_failed);

    // if [dst_pos < 0] then return -1
    __ sext(t0, dst_pos, 32);
    __ bltz(t0, L_failed);

    // registers used as temp
    const Register scratch_length    = x28; // elements count to copy
    const Register scratch_src_klass = x29; // array klass
    const Register lh                = x30; // layout helper

    // if [length < 0] then return -1
    __ sext(scratch_length, length, 32); // length (elements count, 32-bits value)
    __ bltz(scratch_length, L_failed);

    __ load_klass(scratch_src_klass, src);
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert klasses not null {");
      Label L1, L2;
      __ bnez(scratch_src_klass, L2);   // it is broken if klass is null
      __ bind(L1);
      __ stop("broken null klass");
      __ bind(L2);
      __ load_klass(t0, dst, t1);
      __ beqz(t0, L1);     // this would be broken also
      BLOCK_COMMENT("} assert klasses not null done");
    }
#endif

    // Load layout helper (32-bits)
    //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
    //
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

    const int lh_offset = in_bytes(Klass::layout_helper_offset());

    // Handle objArrays completely differently...
    const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
    __ lw(lh, Address(scratch_src_klass, lh_offset));
    __ mv(t0, objArray_lh);
    __ beq(lh, t0, L_objArray);

    // if [src->klass() != dst->klass()] then return -1
    __ load_klass(t1, dst);
    __ bne(t1, scratch_src_klass, L_failed);

    // if !src->is_Array() then return -1
    // i.e. (lh >= 0)
    __ bgez(lh, L_failed);

    // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert primitive array {");
      Label L;
      __ mv(t1, (int32_t)(Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
      __ bge(lh, t1, L);
      __ stop("must be a primitive array");
      __ bind(L);
      BLOCK_COMMENT("} assert primitive array done");
    }
#endif

    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           t1, L_failed);

    // TypeArrayKlass
    //
    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
    //

    const Register t0_offset = t0;    // array offset
    const Register x30_elsize = lh;   // element size

    // Get array_header_in_bytes()
    int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1);
    int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
    __ slli(t0_offset, lh, XLEN - lh_header_size_msb);          // left shift to remove 24 ~ 32;
    __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset

__ add(src, src, t0_offset); // src array offset
|
|
__ add(dst, dst, t0_offset); // dst array offset
|
|
BLOCK_COMMENT("choose copy loop based on element size");
|
|
|
|
// next registers should be set before the jump to corresponding stub
|
|
const Register from = c_rarg0; // source array address
|
|
const Register to = c_rarg1; // destination array address
|
|
const Register count = c_rarg2; // elements count
|
|
|
|
// 'from', 'to', 'count' registers should be set in such order
|
|
// since they are the same as 'src', 'src_pos', 'dst'.
|
|
|
|
assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
|
|
|
|
// The possible values of elsize are 0-3, i.e. exact_log2(element
|
|
// size in bytes). We do a simple bitwise binary search.
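    // (Worked example, illustrative: for jshort, elsize == 1, so bit 1 is
    // clear and bit 0 is set; the first test falls through and the second
    // branches to L_copy_shorts. For jlong, elsize == 3, both tests are
    // taken and control reaches L_copy_longs via L_copy_ints.)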
    __ BIND(L_copy_bytes);
    __ test_bit(t0, x30_elsize, 1);
    __ bnez(t0, L_copy_ints);
    __ test_bit(t0, x30_elsize, 0);
    __ bnez(t0, L_copy_shorts);
    __ add(from, src, src_pos); // src_addr
    __ add(to, dst, dst_pos);   // dst_addr
    __ sext(count, scratch_length, 32); // length
    __ j(RuntimeAddress(byte_copy_entry));

    __ BIND(L_copy_shorts);
    __ shadd(from, src_pos, src, t0, 1); // src_addr
    __ shadd(to, dst_pos, dst, t0, 1);   // dst_addr
    __ sext(count, scratch_length, 32);  // length
    __ j(RuntimeAddress(short_copy_entry));

    __ BIND(L_copy_ints);
    __ test_bit(t0, x30_elsize, 0);
    __ bnez(t0, L_copy_longs);
    __ shadd(from, src_pos, src, t0, 2); // src_addr
    __ shadd(to, dst_pos, dst, t0, 2);   // dst_addr
    __ sext(count, scratch_length, 32);  // length
    __ j(RuntimeAddress(int_copy_entry));

    __ BIND(L_copy_longs);
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert long copy {");
      Label L;
      __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x30_elsize
      __ sext(lh, lh, 32);
      __ mv(t0, LogBytesPerLong);
      __ beq(x30_elsize, t0, L);
      __ stop("must be long copy, but elsize is wrong");
      __ bind(L);
      BLOCK_COMMENT("} assert long copy done");
    }
#endif
    __ shadd(from, src_pos, src, t0, 3); // src_addr
    __ shadd(to, dst_pos, dst, t0, 3);   // dst_addr
    __ sext(count, scratch_length, 32);  // length
    __ j(RuntimeAddress(long_copy_entry));

    // ObjArrayKlass
    __ BIND(L_objArray);
    // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]

    Label L_plain_copy, L_checkcast_copy;
    // test array classes for subtyping
    __ load_klass(t2, dst);
    __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality

    // Identically typed arrays can be copied without element-wise checks.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           t1, L_failed);

    __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
    __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
    __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ sext(count, scratch_length, 32); // length
    __ BIND(L_plain_copy);
    __ j(RuntimeAddress(oop_copy_entry));

    __ BIND(L_checkcast_copy);
    // live at this point: scratch_src_klass, scratch_length, t2 (dst_klass)
    {
      // Before looking at dst.length, make sure dst is also an objArray.
      __ lwu(t0, Address(t2, lh_offset));
      __ mv(t1, objArray_lh);
      __ bne(t0, t1, L_failed);

      // It is safe to examine both src.length and dst.length.
      arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                             t2, L_failed);

      __ load_klass(dst_klass, dst); // reload

      // Marshal the base address arguments now, freeing registers.
      __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
      __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
      __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ sext(count, length, 32); // length (reloaded)
      const Register sco_temp = c_rarg3; // this register is free now
      assert_different_registers(from, to, count, sco_temp,
                                 dst_klass, scratch_src_klass);

      // Generate the type check.
      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ lwu(sco_temp, Address(dst_klass, sco_offset));

      // Smashes t0, t1
      generate_type_check(scratch_src_klass, sco_temp, dst_klass, noreg, noreg, noreg, L_plain_copy);

      // Fetch destination element klass from the ObjArrayKlass header.
      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
      __ ld(dst_klass, Address(dst_klass, ek_offset));
      __ lwu(sco_temp, Address(dst_klass, sco_offset));

      // the checkcast_copy loop needs two extra arguments:
      assert(c_rarg3 == sco_temp, "#3 already in place");
      // Set up arguments for checkcast_copy_entry.
      __ mv(c_rarg4, dst_klass); // dst.klass.element_klass
      __ j(RuntimeAddress(checkcast_copy_entry));
    }

    __ BIND(L_failed);
    __ mv(x10, -1);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret();

    return start;
  }

  //
  // Generate stub for array fill. If "aligned" is true, the
  // "to" address is assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //   to:    c_rarg0
  //   value: c_rarg1
  //   count: c_rarg2 treated as signed
  //
  address generate_fill(StubGenStubId stub_id) {
    BasicType t;
    bool aligned;

    switch (stub_id) {
    case jbyte_fill_id:
      t = T_BYTE;
      aligned = false;
      break;
    case jshort_fill_id:
      t = T_SHORT;
      aligned = false;
      break;
    case jint_fill_id:
      t = T_INT;
      aligned = false;
      break;
    case arrayof_jbyte_fill_id:
      t = T_BYTE;
      aligned = true;
      break;
    case arrayof_jshort_fill_id:
      t = T_SHORT;
      aligned = true;
      break;
    case arrayof_jint_fill_id:
      t = T_INT;
      aligned = true;
      break;
    default:
      ShouldNotReachHere();
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    BLOCK_COMMENT("Entry:");

    const Register to        = c_rarg0; // destination array address
    const Register value     = c_rarg1; // value
    const Register count     = c_rarg2; // elements count

    const Register bz_base   = x28;     // base for block_zero routine
    const Register cnt_words = x29;     // temp register
    const Register tmp_reg   = t1;

    __ enter();

    Label L_fill_elements;

    int shift = -1;
    switch (t) {
    case T_BYTE:
      shift = 0;
      // Short arrays (< 8 bytes) fill by element
      __ mv(tmp_reg, 8 >> shift);
      __ bltu(count, tmp_reg, L_fill_elements);

      // Zero extend value
      // 8 bit -> 16 bit
      __ zext(value, value, 8);
      __ slli(tmp_reg, value, 8);
      __ orr(value, value, tmp_reg);

      // 16 bit -> 32 bit
      __ slli(tmp_reg, value, 16);
      __ orr(value, value, tmp_reg);
      break;
    case T_SHORT:
      shift = 1;
      // Short arrays (< 8 bytes) fill by element
      __ mv(tmp_reg, 8 >> shift);
      __ bltu(count, tmp_reg, L_fill_elements);

      // Zero extend value
      // 16 bit -> 32 bit
      __ zext(value, value, 16);
      __ slli(tmp_reg, value, 16);
      __ orr(value, value, tmp_reg);
      break;
    case T_INT:
      shift = 2;
      // Short arrays (< 8 bytes) fill by element
      __ mv(tmp_reg, 8 >> shift);
      __ bltu(count, tmp_reg, L_fill_elements);
      break;
    default: ShouldNotReachHere();
    }
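    // (Illustrative only: the zero-extend/shift/or sequences above
    // broadcast the fill value into every lane of a word; in C, for a
    // byte fill, roughly:
    //   uint64_t v = (uint8_t)value;
    //   v |= v << 8; v |= v << 16; v |= v << 32;  // 0x2a -> 0x2a2a2a2a2a2a2a2a
    // The final 32 -> 64 bit step is done just before fill_words below.)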

    // Align source address at 8 bytes address boundary.
    Label L_skip_align1, L_skip_align2, L_skip_align4;
    if (!aligned) {
      switch (t) {
      case T_BYTE:
        // One byte misalignment happens only for byte arrays.
        __ test_bit(t0, to, 0);
        __ beqz(t0, L_skip_align1);
        __ sb(value, Address(to, 0));
        __ addi(to, to, 1);
        __ subiw(count, count, 1);
        __ bind(L_skip_align1);
        // Fallthrough
      case T_SHORT:
        // Two bytes misalignment happens only for byte and short (char) arrays.
        __ test_bit(t0, to, 1);
        __ beqz(t0, L_skip_align2);
        __ sh(value, Address(to, 0));
        __ addi(to, to, 2);
        __ subiw(count, count, 2 >> shift);
        __ bind(L_skip_align2);
        // Fallthrough
      case T_INT:
        // Align to 8 bytes, we know we are 4 byte aligned to start.
        __ test_bit(t0, to, 2);
        __ beqz(t0, L_skip_align4);
        __ sw(value, Address(to, 0));
        __ addi(to, to, 4);
        __ subiw(count, count, 4 >> shift);
        __ bind(L_skip_align4);
        break;
      default: ShouldNotReachHere();
      }
    }

    //
    // Fill large chunks
    //
    __ srliw(cnt_words, count, 3 - shift); // number of words

    // 32 bit -> 64 bit
    __ zext(value, value, 32);
    __ slli(tmp_reg, value, 32);
    __ orr(value, value, tmp_reg);

    __ slli(tmp_reg, cnt_words, 3 - shift);
    __ subw(count, count, tmp_reg);
    {
      __ fill_words(to, cnt_words, value);
    }

    // Remaining count is less than 8 bytes and address is heapword aligned.
    Label L_fill_2, L_fill_4, L_exit1;
    switch (t) {
    case T_BYTE:
      __ test_bit(t0, count, 0);
      __ beqz(t0, L_fill_2);
      __ sb(value, Address(to, 0));
      __ addi(to, to, 1);
      __ bind(L_fill_2);
      __ test_bit(t0, count, 1);
      __ beqz(t0, L_fill_4);
      __ sh(value, Address(to, 0));
      __ addi(to, to, 2);
      __ bind(L_fill_4);
      __ test_bit(t0, count, 2);
      __ beqz(t0, L_exit1);
      __ sw(value, Address(to, 0));
      break;
    case T_SHORT:
      __ test_bit(t0, count, 0);
      __ beqz(t0, L_fill_4);
      __ sh(value, Address(to, 0));
      __ addi(to, to, 2);
      __ bind(L_fill_4);
      __ test_bit(t0, count, 1);
      __ beqz(t0, L_exit1);
      __ sw(value, Address(to, 0));
      break;
    case T_INT:
      __ beqz(count, L_exit1);
      __ sw(value, Address(to, 0));
      break;
    default: ShouldNotReachHere();
    }
    __ bind(L_exit1);
    __ leave();
    __ ret();

    // Handle fills of fewer than 8 bytes.
    Label L_loop1, L_loop2, L_exit2;
    __ bind(L_fill_elements);
    __ beqz(count, L_exit2);
    switch (t) {
    case T_BYTE:
      __ bind(L_loop1);
      __ sb(value, Address(to, 0));
      __ addi(to, to, 1);
      __ subiw(count, count, 1);
      __ bnez(count, L_loop1);
      break;
    case T_SHORT:
      __ bind(L_loop2);
      __ sh(value, Address(to, 0));
      __ addi(to, to, 2);
      __ subiw(count, count, 2 >> shift);
      __ bnez(count, L_loop2);
      break;
    case T_INT:
      __ sw(value, Address(to, 0));
      break;
    default: ShouldNotReachHere();
    }
    __ bind(L_exit2);
    __ leave();
    __ ret();

    return start;
  }

  void generate_arraycopy_stubs() {
    address entry                     = nullptr;
    address entry_jbyte_arraycopy     = nullptr;
    address entry_jshort_arraycopy    = nullptr;
    address entry_jint_arraycopy      = nullptr;
    address entry_oop_arraycopy       = nullptr;
    address entry_jlong_arraycopy     = nullptr;
    address entry_checkcast_arraycopy = nullptr;

    generate_copy_longs(StubGenStubId::copy_byte_f_id, copy_f, c_rarg0, c_rarg1, t1);
    generate_copy_longs(StubGenStubId::copy_byte_b_id, copy_b, c_rarg0, c_rarg1, t1);

    StubRoutines::riscv::_zero_blocks = generate_zero_blocks();

    //*** jbyte
    // Always need aligned and unaligned versions
    StubRoutines::_jbyte_disjoint_arraycopy          = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry);
    StubRoutines::_jbyte_arraycopy                   = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry);
    StubRoutines::_arrayof_jbyte_arraycopy           = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr);

    //*** jshort
    // Always need aligned and unaligned versions
    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry);
    StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry);
    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr);

    //*** jint
    // Aligned versions
    StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry);
    StubRoutines::_arrayof_jint_arraycopy            = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
    // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
    // entry_jint_arraycopy always points to the unaligned version
    StubRoutines::_jint_disjoint_arraycopy           = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry);
    StubRoutines::_jint_arraycopy                    = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy);

    //*** jlong
    // It is always aligned
    StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry);
    StubRoutines::_arrayof_jlong_arraycopy           = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
    StubRoutines::_jlong_disjoint_arraycopy          = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
    StubRoutines::_jlong_arraycopy                   = StubRoutines::_arrayof_jlong_arraycopy;

    //*** oops
    StubRoutines::_arrayof_oop_disjoint_arraycopy
      = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry);
    StubRoutines::_arrayof_oop_arraycopy
      = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
    // Aligned versions without pre-barriers
    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
      = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
    StubRoutines::_arrayof_oop_arraycopy_uninit
      = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr);

    StubRoutines::_oop_disjoint_arraycopy        = StubRoutines::_arrayof_oop_disjoint_arraycopy;
    StubRoutines::_oop_arraycopy                 = StubRoutines::_arrayof_oop_arraycopy;
    StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
    StubRoutines::_oop_arraycopy_uninit          = StubRoutines::_arrayof_oop_arraycopy_uninit;

    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy);
    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr);

    StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy,
                                                           entry_jshort_arraycopy,
                                                           entry_jint_arraycopy,
                                                           entry_jlong_arraycopy);

    StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy,
                                                             entry_jshort_arraycopy,
                                                             entry_jint_arraycopy,
                                                             entry_oop_arraycopy,
                                                             entry_jlong_arraycopy,
                                                             entry_checkcast_arraycopy);

    StubRoutines::_jbyte_fill          = generate_fill(StubGenStubId::jbyte_fill_id);
    StubRoutines::_jshort_fill         = generate_fill(StubGenStubId::jshort_fill_id);
    StubRoutines::_jint_fill           = generate_fill(StubGenStubId::jint_fill_id);
    StubRoutines::_arrayof_jbyte_fill  = generate_fill(StubGenStubId::arrayof_jbyte_fill_id);
    StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id);
    StubRoutines::_arrayof_jint_fill   = generate_fill(StubGenStubId::arrayof_jint_fill_id);
  }

  void generate_aes_loadkeys(const Register &key, VectorRegister *working_vregs, int rounds) {
    const int step = 16;
    for (int i = 0; i < rounds; i++) {
      __ vle32_v(working_vregs[i], key);
      // The keys are stored in a little-endian array, while we need to
      // operate on them in big-endian, so perform an endian swap here
      // with the vrev8.v instruction.
      __ vrev8_v(working_vregs[i], working_vregs[i]);
      __ addi(key, key, step);
    }
  }

  void generate_aes_encrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
    assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");

    __ vxor_vv(res, res, working_vregs[0]);
    for (int i = 1; i < rounds - 1; i++) {
      __ vaesem_vv(res, working_vregs[i]);
    }
    __ vaesef_vv(res, working_vregs[rounds - 1]);
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0 - source byte array address
  //   c_rarg1 - destination byte array address
  //   c_rarg2 - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
    assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");

    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id;
    StubCodeMark mark(this, stub_id);

    Label L_aes128, L_aes192;

    const Register from   = c_rarg0; // source array address
    const Register to     = c_rarg1; // destination array address
    const Register key    = c_rarg2; // key array address
    const Register keylen = c_rarg3;

    VectorRegister working_vregs[] = {
      v4, v5, v6, v7, v8, v9, v10, v11,
      v12, v13, v14, v15, v16, v17, v18
    };
    const VectorRegister res = v19;

    address start = __ pc();
    __ enter();

    __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
    __ vle32_v(res, from);

    __ mv(t2, 52);
    __ blt(keylen, t2, L_aes128);
    __ beq(keylen, t2, L_aes192);
    // Else we fall through to the biggest case (256-bit key size)
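    // (Illustrative note: keylen is the expanded key length in ints,
    // 4 * (rounds + 1), i.e. 44/52/60 for AES-128/192/256, so a single
    // compare against 52 distinguishes all three key sizes.)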

    // Note: the following function performs key += 15*16
    generate_aes_loadkeys(key, working_vregs, 15);
    generate_aes_encrypt(res, working_vregs, 15);
    __ vse32_v(res, to);
    __ mv(c_rarg0, 0);
    __ leave();
    __ ret();

    __ bind(L_aes192);
    // Note: the following function performs key += 13*16
    generate_aes_loadkeys(key, working_vregs, 13);
    generate_aes_encrypt(res, working_vregs, 13);
    __ vse32_v(res, to);
    __ mv(c_rarg0, 0);
    __ leave();
    __ ret();

    __ bind(L_aes128);
    // Note: the following function performs key += 11*16
    generate_aes_loadkeys(key, working_vregs, 11);
    generate_aes_encrypt(res, working_vregs, 11);
    __ vse32_v(res, to);
    __ mv(c_rarg0, 0);
    __ leave();
    __ ret();

    return start;
  }

  void generate_aes_decrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
    assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");

    __ vxor_vv(res, res, working_vregs[rounds - 1]);
    for (int i = rounds - 2; i > 0; i--) {
      __ vaesdm_vv(res, working_vregs[i]);
    }
    __ vaesdf_vv(res, working_vregs[0]);
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0 - source byte array address
  //   c_rarg1 - destination byte array address
  //   c_rarg2 - K (key) in little endian int array
  //
  address generate_aescrypt_decryptBlock() {
    assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");

    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id;
    StubCodeMark mark(this, stub_id);

    Label L_aes128, L_aes192;

    const Register from   = c_rarg0; // source array address
    const Register to     = c_rarg1; // destination array address
    const Register key    = c_rarg2; // key array address
    const Register keylen = c_rarg3;

    VectorRegister working_vregs[] = {
      v4, v5, v6, v7, v8, v9, v10, v11,
      v12, v13, v14, v15, v16, v17, v18
    };
    const VectorRegister res = v19;

    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
    __ vle32_v(res, from);

    __ mv(t2, 52);
    __ blt(keylen, t2, L_aes128);
    __ beq(keylen, t2, L_aes192);
    // Else we fall through to the biggest case (256-bit key size)

    // Note: the following function performs key += 15*16
    generate_aes_loadkeys(key, working_vregs, 15);
    generate_aes_decrypt(res, working_vregs, 15);
    __ vse32_v(res, to);
    __ mv(c_rarg0, 0);
    __ leave();
    __ ret();

    __ bind(L_aes192);
    // Note: the following function performs key += 13*16
    generate_aes_loadkeys(key, working_vregs, 13);
    generate_aes_decrypt(res, working_vregs, 13);
    __ vse32_v(res, to);
    __ mv(c_rarg0, 0);
    __ leave();
    __ ret();

    __ bind(L_aes128);
    // Note: the following function performs key += 11*16
    generate_aes_loadkeys(key, working_vregs, 11);
    generate_aes_decrypt(res, working_vregs, 11);
    __ vse32_v(res, to);
    __ mv(c_rarg0, 0);
    __ leave();
    __ ret();

    return start;
  }

  // code for comparing 8 characters of strings with Latin1 and Utf16 encoding
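  // A rough sketch of one step (illustrative only): 8 Latin1 bytes are
  // loaded into tmpLval, inflate_lo32/inflate_hi32 widen 4 of them at a
  // time to UTF-16, and each widened half is XORed against 8 bytes
  // (4 chars) of the UTF-16 string; a nonzero XOR means the strings
  // differ within those 4 characters.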
  void compare_string_8_x_LU(Register tmpL, Register tmpU,
                             Register strL, Register strU, Label& DIFF) {
    const Register tmp = x30, tmpLval = x12;

    int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
    assert((base_offset % (UseCompactObjectHeaders ? 4 :
                           (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be");

#ifdef ASSERT
    if (AvoidUnalignedAccesses) {
      Label align_ok;
      __ andi(t0, strL, 0x7);
      __ beqz(t0, align_ok);
      __ stop("bad alignment");
      __ bind(align_ok);
    }
#endif
    __ ld(tmpLval, Address(strL));
    __ addi(strL, strL, wordSize);

    // compare first 4 characters
    __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8);
    __ addi(strU, strU, wordSize);
    __ inflate_lo32(tmpL, tmpLval);
    __ xorr(tmp, tmpU, tmpL);
    __ bnez(tmp, DIFF);

    // compare second 4 characters
    __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8);
    __ addi(strU, strU, wordSize);
    __ inflate_hi32(tmpL, tmpLval);
    __ xorr(tmp, tmpU, tmpL);
    __ bnez(tmp, DIFF);
  }

  // x10 = result
  // x11 = str1
  // x12 = cnt1
  // x13 = str2
  // x14 = cnt2
  // x28 = tmp1
  // x29 = tmp2
  // x30 = tmp3
  address generate_compare_long_string_different_encoding(StubGenStubId stub_id) {
    bool isLU;
    switch (stub_id) {
    case compare_long_string_LU_id:
      isLU = true;
      break;
    case compare_long_string_UL_id:
      isLU = false;
      break;
    default:
      ShouldNotReachHere();
    };
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();
    Label SMALL_LOOP, TAIL, LOAD_LAST, DONE, CALCULATE_DIFFERENCE;
    const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14,
                   tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12;

    int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
    assert((base_offset % (UseCompactObjectHeaders ? 4 :
                           (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be");

    Register strU = isLU ? str2 : str1,
             strL = isLU ? str1 : str2,
             tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison
             tmpL = isLU ? tmp1 : tmp2; // where to keep L for comparison

    if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
      // Load 4 bytes from strL to make sure main loop is 8-byte aligned
      // cnt2 is >= 68 here, no need to check it for >= 0
      __ lwu(tmpL, Address(strL));
      __ addi(strL, strL, wordSize / 2);
      __ load_long_misaligned(tmpU, Address(strU), tmp4, (base_offset % 8) != 0 ? 4 : 8);
      __ addi(strU, strU, wordSize);
      __ inflate_lo32(tmp3, tmpL);
      __ mv(tmpL, tmp3);
      __ xorr(tmp3, tmpU, tmpL);
      __ bnez(tmp3, CALCULATE_DIFFERENCE);
      __ subi(cnt2, cnt2, wordSize / 2);
    }

    // we are now 8-byte aligned on strL when AvoidUnalignedAccesses is true
    __ subi(cnt2, cnt2, wordSize * 2);
    __ bltz(cnt2, TAIL);
    __ bind(SMALL_LOOP); // smaller loop
    __ subi(cnt2, cnt2, wordSize * 2);
    compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
    compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
    __ bgez(cnt2, SMALL_LOOP);
    __ addi(t0, cnt2, wordSize * 2);
    __ beqz(t0, DONE);
    __ bind(TAIL); // 1..15 characters left
    // Aligned access. Load bytes in portions - 4, 2, 1.

    __ addi(t0, cnt2, wordSize);
    __ addi(cnt2, cnt2, wordSize * 2); // amount of characters left to process
    __ bltz(t0, LOAD_LAST);
    // remaining characters are greater than or equal to 8, we can do one compare_string_8_x_LU
    compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
    __ subi(cnt2, cnt2, wordSize);
    __ beqz(cnt2, DONE); // no characters left
    __ bind(LOAD_LAST); // cnt2 = 1..7 characters left

    __ subi(cnt2, cnt2, wordSize); // cnt2 is now an offset in strL which points to last 8 bytes
    __ slli(t0, cnt2, 1);          // t0 is now an offset in strU which points to last 16 bytes
    __ add(strL, strL, cnt2);      // Address of last 8 bytes in Latin1 string
    __ add(strU, strU, t0);        // Address of last 16 bytes in UTF-16 string
    __ load_int_misaligned(tmpL, Address(strL), t0, false);
    __ load_long_misaligned(tmpU, Address(strU), t0, 2);
    __ inflate_lo32(tmp3, tmpL);
    __ mv(tmpL, tmp3);
    __ xorr(tmp3, tmpU, tmpL);
    __ bnez(tmp3, CALCULATE_DIFFERENCE);

    __ addi(strL, strL, wordSize / 2); // Address of last 4 bytes in Latin1 string
    __ addi(strU, strU, wordSize);     // Address of last 8 bytes in UTF-16 string
    __ load_int_misaligned(tmpL, Address(strL), t0, false);
    __ load_long_misaligned(tmpU, Address(strU), t0, 2);
    __ inflate_lo32(tmp3, tmpL);
    __ mv(tmpL, tmp3);
    __ xorr(tmp3, tmpU, tmpL);
    __ bnez(tmp3, CALCULATE_DIFFERENCE);
    __ j(DONE); // no characters left

    // Find the first different characters in the longwords and
    // compute their difference.
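    // (Sketch, illustrative: with tz = number of trailing zero bits of
    // the XOR value rounded down to a char boundary,
    //   result = ((tmp1 >> tz) & 0xffff) - ((tmp2 >> tz) & 0xffff)
    // i.e. the difference of the first differing pair of chars.)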
    __ bind(CALCULATE_DIFFERENCE);
    // count bits of trailing zero chars
    __ ctzc_bits(tmp4, tmp3);
    __ srl(tmp1, tmp1, tmp4);
    __ srl(tmp2, tmp2, tmp4);
    __ zext(tmp1, tmp1, 16);
    __ zext(tmp2, tmp2, 16);
    __ sub(result, tmp1, tmp2);
    __ bind(DONE);
    __ ret();
    return entry;
  }

  address generate_method_entry_barrier() {
    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id;
    StubCodeMark mark(this, stub_id);

    Label deoptimize_label;

    address start = __ pc();

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();

    if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
      BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
      Address thread_epoch_addr(xthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
      __ la(t1, ExternalAddress(bs_asm->patching_epoch_addr()));
      __ lwu(t1, t1);
      __ sw(t1, thread_epoch_addr);
      // There are two ways this can work:
      // - The writer did a system icache shootdown after the instruction stream update.
      //   Hence do nothing.
      // - The writer trusts us to make sure our icache is in sync before entering.
      //   Hence use cmodx fence (fence.i, may change).
      if (UseCtxFencei) {
        __ cmodx_fence();
      }
      __ membar(__ LoadLoad);
    }

    __ set_last_Java_frame(sp, fp, ra);

    __ enter();
    __ addi(t1, sp, wordSize);

    __ subi(sp, sp, 4 * wordSize);

    __ push_call_clobbered_registers();

    __ mv(c_rarg0, t1);
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);

    __ reset_last_Java_frame(true);

    __ mv(t0, x10);

    __ pop_call_clobbered_registers();

    __ bnez(t0, deoptimize_label);

    __ leave();
    __ ret();

    __ BIND(deoptimize_label);

    __ ld(t0, Address(sp, 0));
    __ ld(fp, Address(sp, wordSize));
    __ ld(ra, Address(sp, wordSize * 2));
    __ ld(t1, Address(sp, wordSize * 3));

    __ mv(sp, t0);
    __ jr(t1);

    return start;
  }

  // x10 = result
  // x11 = str1
  // x12 = cnt1
  // x13 = str2
  // x14 = cnt2
  // x28 = tmp1
  // x29 = tmp2
  // x30 = tmp3
  // x31 = tmp4
  address generate_compare_long_string_same_encoding(StubGenStubId stub_id) {
    bool isLL;
    switch (stub_id) {
    case compare_long_string_LL_id:
      isLL = true;
      break;
    case compare_long_string_UU_id:
      isLL = false;
      break;
    default:
      ShouldNotReachHere();
    };
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();
    Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL,
          LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF;
    const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
                   tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
    RegSet spilled_regs = RegSet::of(tmp4, tmp5);

    // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
    // update cnt2 counter with already loaded 8 bytes
    __ subi(cnt2, cnt2, wordSize / (isLL ? 1 : 2));
    // update pointers, because of previous read
    __ addi(str1, str1, wordSize);
    __ addi(str2, str2, wordSize);
    // less than 16 bytes left?
    __ subi(cnt2, cnt2, isLL ? 16 : 8);
    __ push_reg(spilled_regs, sp);
    __ bltz(cnt2, TAIL);
    __ bind(SMALL_LOOP);
    // compare 16 bytes of strings with same encoding
    __ ld(tmp5, Address(str1));
    __ addi(str1, str1, 8);
    __ xorr(tmp4, tmp1, tmp2);
    __ ld(cnt1, Address(str2));
    __ addi(str2, str2, 8);
    __ bnez(tmp4, DIFF);
    __ ld(tmp1, Address(str1));
    __ addi(str1, str1, 8);
    __ xorr(tmp4, tmp5, cnt1);
    __ ld(tmp2, Address(str2));
    __ addi(str2, str2, 8);
    __ bnez(tmp4, DIFF2);

    __ subi(cnt2, cnt2, isLL ? 16 : 8);
    __ bgez(cnt2, SMALL_LOOP);
    __ bind(TAIL);
    __ addi(cnt2, cnt2, isLL ? 16 : 8);
    __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF);
    __ subi(cnt2, cnt2, isLL ? 8 : 4);
    __ blez(cnt2, CHECK_LAST);
    __ xorr(tmp4, tmp1, tmp2);
    __ bnez(tmp4, DIFF);
    __ ld(tmp1, Address(str1));
    __ addi(str1, str1, 8);
    __ ld(tmp2, Address(str2));
    __ addi(str2, str2, 8);
    __ subi(cnt2, cnt2, isLL ? 8 : 4);
    __ bind(CHECK_LAST);
    if (!isLL) {
      __ add(cnt2, cnt2, cnt2); // now in bytes
    }
    __ xorr(tmp4, tmp1, tmp2);
    __ bnez(tmp4, DIFF);
    __ add(str1, str1, cnt2);
    __ load_long_misaligned(tmp5, Address(str1), tmp3, isLL ? 1 : 2);
    __ add(str2, str2, cnt2);
    __ load_long_misaligned(cnt1, Address(str2), tmp3, isLL ? 1 : 2);
    __ xorr(tmp4, tmp5, cnt1);
    __ beqz(tmp4, LENGTH_DIFF);
    // Find the first different characters in the longwords and
    // compute their difference.
    __ bind(DIFF2);
    // count bits of trailing zero chars
    __ ctzc_bits(tmp3, tmp4, isLL);
    __ srl(tmp5, tmp5, tmp3);
    __ srl(cnt1, cnt1, tmp3);
    if (isLL) {
      __ zext(tmp5, tmp5, 8);
      __ zext(cnt1, cnt1, 8);
    } else {
      __ zext(tmp5, tmp5, 16);
      __ zext(cnt1, cnt1, 16);
    }
    __ sub(result, tmp5, cnt1);
    __ j(LENGTH_DIFF);
    __ bind(DIFF);
    // count bits of trailing zero chars
    __ ctzc_bits(tmp3, tmp4, isLL);
    __ srl(tmp1, tmp1, tmp3);
    __ srl(tmp2, tmp2, tmp3);
    if (isLL) {
      __ zext(tmp1, tmp1, 8);
      __ zext(tmp2, tmp2, 8);
    } else {
      __ zext(tmp1, tmp1, 16);
      __ zext(tmp2, tmp2, 16);
    }
    __ sub(result, tmp1, tmp2);
    __ j(LENGTH_DIFF);
    __ bind(LAST_CHECK_AND_LENGTH_DIFF);
    __ xorr(tmp4, tmp1, tmp2);
    __ bnez(tmp4, DIFF);
    __ bind(LENGTH_DIFF);
    __ pop_reg(spilled_regs, sp);
    __ ret();
    return entry;
  }

  void generate_compare_long_strings() {
    StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(StubGenStubId::compare_long_string_LL_id);
    StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(StubGenStubId::compare_long_string_UU_id);
    StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(StubGenStubId::compare_long_string_LU_id);
    StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(StubGenStubId::compare_long_string_UL_id);
  }

  // x10 result
  // x11 src
  // x12 src count
  // x13 pattern
  // x14 pattern count
  address generate_string_indexof_linear(StubGenStubId stub_id)
  {
    bool needle_isL;
    bool haystack_isL;
    switch (stub_id) {
    case string_indexof_linear_ll_id:
      needle_isL = true;
      haystack_isL = true;
      break;
    case string_indexof_linear_ul_id:
      needle_isL = true;
      haystack_isL = false;
      break;
    case string_indexof_linear_uu_id:
      needle_isL = false;
      haystack_isL = false;
      break;
    default:
      ShouldNotReachHere();
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();

    int needle_chr_size = needle_isL ? 1 : 2;
    int haystack_chr_size = haystack_isL ? 1 : 2;
    int needle_chr_shift = needle_isL ? 0 : 1;
    int haystack_chr_shift = haystack_isL ? 0 : 1;
    bool isL = needle_isL && haystack_isL;
    // parameters
    Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14;
    // temporary registers
    Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25;
    // redefinitions
    Register ch1 = x28, ch2 = x29;
    RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29);

    __ push_reg(spilled_regs, sp);

    Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
          L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
          L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
          L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
          L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
          L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;

    __ ld(ch1, Address(needle));
    __ ld(ch2, Address(haystack));
    // src.length - pattern.length
    __ sub(haystack_len, haystack_len, needle_len);

    // first is needle[0]
    __ zext(first, ch1, needle_isL ? 8 : 16);

    uint64_t mask0101 = UCONST64(0x0101010101010101);
    uint64_t mask0001 = UCONST64(0x0001000100010001);
    __ mv(mask1, haystack_isL ? mask0101 : mask0001);
    __ mul(first, first, mask1);
    uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
    uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
    __ mv(mask2, haystack_isL ? mask7f7f : mask7fff);
    if (needle_isL != haystack_isL) {
      __ mv(tmp, ch1);
    }
    __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size - 1);
    __ blez(haystack_len, L_SMALL);

    if (needle_isL != haystack_isL) {
      __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
    }
    // xorr, sub, orr, notr, andr
    // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i]
    // eg:
    // first:      aa aa aa aa aa aa aa aa
    // ch2:        aa aa li nx jd ka aa aa
    // match_mask: 80 80 00 00 00 00 80 80
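    // (A sketch of the SWAR trick compute_match_mask applies, with the
    // Latin1 masks above, illustrative only:
    //   x          = ch2 ^ first;                 // 0x00 byte where equal
    //   match_mask = (x - 0x0101010101010101) & ~x & 0x8080808080808080;
    // leaving 0x80 exactly in the byte lanes where ch2 matched first.)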
    __ compute_match_mask(ch2, first, match_mask, mask1, mask2);

    // search first char of needle, if success, goto L_HAS_ZERO;
    __ bnez(match_mask, L_HAS_ZERO);
    __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size);
    __ addi(result, result, wordSize / haystack_chr_size);
    __ addi(haystack, haystack, wordSize);
    __ bltz(haystack_len, L_POST_LOOP);

    __ bind(L_LOOP);
    __ ld(ch2, Address(haystack));
    __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
    __ bnez(match_mask, L_HAS_ZERO);

    __ bind(L_LOOP_PROCEED);
    __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size);
    __ addi(haystack, haystack, wordSize);
    __ addi(result, result, wordSize / haystack_chr_size);
    __ bgez(haystack_len, L_LOOP);

    __ bind(L_POST_LOOP);
    __ mv(ch2, -wordSize / haystack_chr_size);
    __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check
    __ ld(ch2, Address(haystack));
    __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
    __ neg(haystack_len, haystack_len);
    __ xorr(ch2, first, ch2);
    __ sub(match_mask, ch2, mask1);
    __ orr(ch2, ch2, mask2);
    __ mv(trailing_zeros, -1); // all bits set
    __ j(L_SMALL_PROCEED);

    __ align(OptoLoopAlignment);
    __ bind(L_SMALL);
    __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
    __ neg(haystack_len, haystack_len);
    if (needle_isL != haystack_isL) {
      __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
    }
    __ xorr(ch2, first, ch2);
    __ sub(match_mask, ch2, mask1);
    __ orr(ch2, ch2, mask2);
    __ mv(trailing_zeros, -1); // all bits set

    __ bind(L_SMALL_PROCEED);
    __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits.
    __ notr(ch2, ch2);
    __ andr(match_mask, match_mask, ch2);
    __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check
    __ beqz(match_mask, NOMATCH);

    __ bind(L_SMALL_HAS_ZERO_LOOP);
    // count bits of trailing zero chars
    __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, ch2, tmp);
    __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
    __ mv(ch2, wordSize / haystack_chr_size);
    __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2);
    __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
    __ mv(trailing_zeros, wordSize / haystack_chr_size);
    __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);

    __ bind(L_SMALL_CMP_LOOP);
    __ shadd(first, trailing_zeros, needle, first, needle_chr_shift);
    __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
    needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first));
    haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
    __ addi(trailing_zeros, trailing_zeros, 1);
    __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP);
    __ beq(first, ch2, L_SMALL_CMP_LOOP);

    __ bind(L_SMALL_CMP_LOOP_NOMATCH);
    __ beqz(match_mask, NOMATCH);
    // count bits of trailing zero chars
    __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
    __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
    __ addi(result, result, 1);
    __ addi(haystack, haystack, haystack_chr_size);
    __ j(L_SMALL_HAS_ZERO_LOOP);

    __ align(OptoLoopAlignment);
    __ bind(L_SMALL_CMP_LOOP_LAST_CMP);
    __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH);
    __ j(DONE);

    __ align(OptoLoopAlignment);
    __ bind(L_SMALL_CMP_LOOP_LAST_CMP2);
    __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
    __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
    __ j(DONE);

    __ align(OptoLoopAlignment);
    __ bind(L_HAS_ZERO);
    // count bits of trailing zero chars
    __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
    __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
    __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2);
    __ orr(haystack_len, haystack_len, needle_len); // restore needle_len(32bits)
    __ subi(result, result, 1); // array index from 0, so result -= 1

    __ bind(L_HAS_ZERO_LOOP);
    __ mv(needle_len, wordSize / haystack_chr_size);
    __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2);
    __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2);
    // load next 8 bytes from haystack, and increase result index
    __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
    __ addi(result, result, 1);
    __ mv(trailing_zeros, wordSize / haystack_chr_size);
    __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);

    // compare one char
    __ bind(L_CMP_LOOP);
    __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift);
    needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len));
    __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
    haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
    __ addi(trailing_zeros, trailing_zeros, 1); // next char index
    __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2);
    __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP);
    __ beq(needle_len, ch2, L_CMP_LOOP);

    __ bind(L_CMP_LOOP_NOMATCH);
    __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH);
    // count bits of trailing zero chars
    __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, needle_len, ch2);
    __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
    __ addi(haystack, haystack, haystack_chr_size);
    __ j(L_HAS_ZERO_LOOP);

    __ align(OptoLoopAlignment);
    __ bind(L_CMP_LOOP_LAST_CMP);
    __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH);
    __ j(DONE);

    __ align(OptoLoopAlignment);
    __ bind(L_CMP_LOOP_LAST_CMP2);
    __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
    __ addi(result, result, 1);
    __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
    __ j(DONE);

    __ align(OptoLoopAlignment);
    __ bind(L_HAS_ZERO_LOOP_NOMATCH);
    // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
    //    until the L_HAS_ZERO block. The byte octet was analyzed in
    //    L_HAS_ZERO_LOOP, so result was increased by at most
    //    wordSize/str2_chr_size - 1 and the respective high bits were not
    //    changed. L_LOOP_PROCEED will increase result by the number of
    //    analyzed characters, so we can just reset the lower bits of result
    //    here: clear the 2 lower bits for UU/UL and the 3 lower bits for LL.
    // 2) Restore the needle_len and haystack_len values from the
    //    "compressed" haystack_len.
    // 3) Advance haystack to the next haystack octet. result & 7/3 is the
    //    index of the last analyzed substring inside the current octet, so
    //    haystack is at the respective start address and must be advanced
    //    to the next octet.
    __ andi(match_mask, result, wordSize / haystack_chr_size - 1);
    __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2);
    __ andi(result, result, haystack_isL ? -8 : -4);
    __ slli(tmp, match_mask, haystack_chr_shift);
    __ sub(haystack, haystack, tmp);
    __ sext(haystack_len, haystack_len, 32);
    __ j(L_LOOP_PROCEED);

    __ align(OptoLoopAlignment);
    __ bind(NOMATCH);
    __ mv(result, -1);

    __ bind(DONE);
    __ pop_reg(spilled_regs, sp);
    __ ret();
    return entry;
  }

  void generate_string_indexof_stubs()
  {
    StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(StubGenStubId::string_indexof_linear_ll_id);
    StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(StubGenStubId::string_indexof_linear_uu_id);
    StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(StubGenStubId::string_indexof_linear_ul_id);
  }

#ifdef COMPILER2
  void generate_lookup_secondary_supers_table_stub() {
    StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id;
    StubCodeMark mark(this, stub_id);

    const Register
        r_super_klass  = x10,
        r_array_base   = x11,
        r_array_length = x12,
        r_array_index  = x13,
        r_sub_klass    = x14,
        result         = x15,
        r_bitmap       = x16;

    for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
      StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
      Label L_success;
      __ enter();
      __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, result,
                                             r_array_base, r_array_length, r_array_index,
                                             r_bitmap, slot, /*stub_is_near*/true);
      __ leave();
      __ ret();
    }
  }

  // Slow path implementation for UseSecondarySupersTable.
  address generate_lookup_secondary_supers_table_slow_path_stub() {
    StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id;
    StubCodeMark mark(this, stub_id);

    address start = __ pc();
    const Register
        r_super_klass = x10, // argument
        r_array_base  = x11, // argument
        temp1         = x12, // tmp
        r_array_index = x13, // argument
        result        = x15, // argument
        r_bitmap      = x16; // argument

    __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
    __ ret();

    return start;
  }

  address generate_mulAdd()
  {
    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::mulAdd_id;
    StubCodeMark mark(this, stub_id);

    address entry = __ pc();

    const Register out    = x10;
    const Register in     = x11;
    const Register offset = x12;
    const Register len    = x13;
    const Register k      = x14;
    const Register tmp    = x28;

    BLOCK_COMMENT("Entry:");
    __ enter();
    __ mul_add(out, in, offset, len, k, tmp);
    __ leave();
    __ ret();

    return entry;
  }

  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0   - x address
   *   c_rarg1   - x length
   *   c_rarg2   - y address
   *   c_rarg3   - y length
   *   c_rarg4   - z address
   */
  address generate_multiplyToLen()
  {
    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::multiplyToLen_id;
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();

    const Register x    = x10;
    const Register xlen = x11;
    const Register y    = x12;
    const Register ylen = x13;
    const Register z    = x14;

    const Register tmp0 = x15;
    const Register tmp1 = x16;
    const Register tmp2 = x17;
    const Register tmp3 = x7;
    const Register tmp4 = x28;
    const Register tmp5 = x29;
    const Register tmp6 = x30;
    const Register tmp7 = x31;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret();

    return entry;
  }

  address generate_squareToLen()
  {
    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::squareToLen_id;
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();

    const Register x    = x10;
    const Register xlen = x11;
    const Register z    = x12;
    const Register y    = x14; // == x
    const Register ylen = x15; // == xlen

    const Register tmp0 = x13; // zlen, unused
    const Register tmp1 = x16;
    const Register tmp2 = x17;
    const Register tmp3 = x7;
    const Register tmp4 = x28;
    const Register tmp5 = x29;
    const Register tmp6 = x30;
    const Register tmp7 = x31;

    BLOCK_COMMENT("Entry:");
    __ enter();
    __ mv(y, x);
    __ mv(ylen, xlen);
    __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ leave();
    __ ret();

    return entry;
  }

  // Arguments:
  //
  // Input:
  //   c_rarg0   - newArr address
  //   c_rarg1   - oldArr address
  //   c_rarg2   - newIdx
  //   c_rarg3   - shiftCount
  //   c_rarg4   - numIter
  //
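  // Illustrative semantics (a sketch of what the vector loop below
  // computes, mirroring BigInteger's left-shift worker; >>> is the
  // unsigned shift):
  //   for (int i = 0; i < numIter; i++) {
  //     newArr[newIdx + i] = (oldArr[i] << shiftCount)
  //                        | (oldArr[i + 1] >>> (32 - shiftCount));
  //   }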
  address generate_bigIntegerLeftShift() {
    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id;
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();

    Label loop, exit;

    Register newArr     = c_rarg0;
    Register oldArr     = c_rarg1;
    Register newIdx     = c_rarg2;
    Register shiftCount = c_rarg3;
    Register numIter    = c_rarg4;

    Register shiftRevCount = c_rarg5;
    Register oldArrNext    = t1;

    __ beqz(numIter, exit);
    __ shadd(newArr, newIdx, newArr, t0, 2);

    __ mv(shiftRevCount, 32);
    __ sub(shiftRevCount, shiftRevCount, shiftCount);

    __ bind(loop);
    __ addi(oldArrNext, oldArr, 4);
    __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4);
    __ vle32_v(v0, oldArr);
    __ vle32_v(v4, oldArrNext);
    __ vsll_vx(v0, v0, shiftCount);
    __ vsrl_vx(v4, v4, shiftRevCount);
    __ vor_vv(v0, v0, v4);
    __ vse32_v(v0, newArr);
    __ sub(numIter, numIter, t0);
    __ shadd(oldArr, t0, oldArr, t1, 2);
    __ shadd(newArr, t0, newArr, t1, 2);
    __ bnez(numIter, loop);

    __ bind(exit);
    __ ret();

    return entry;
  }

  // Arguments:
  //
  // Input:
  //   c_rarg0   - newArr address
  //   c_rarg1   - oldArr address
  //   c_rarg2   - newIdx
  //   c_rarg3   - shiftCount
  //   c_rarg4   - numIter
  //
  address generate_bigIntegerRightShift() {
    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id;
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();

    Label loop, exit;

    Register newArr     = c_rarg0;
    Register oldArr     = c_rarg1;
    Register newIdx     = c_rarg2;
    Register shiftCount = c_rarg3;
    Register numIter    = c_rarg4;
    Register idx        = numIter;

    Register shiftRevCount = c_rarg5;
    Register oldArrNext    = c_rarg6;
    Register newArrCur     = t0;
    Register oldArrCur     = t1;

    __ beqz(idx, exit);
    __ shadd(newArr, newIdx, newArr, t0, 2);

    __ mv(shiftRevCount, 32);
    __ sub(shiftRevCount, shiftRevCount, shiftCount);

    __ bind(loop);
    __ vsetvli(t0, idx, Assembler::e32, Assembler::m4);
    __ sub(idx, idx, t0);
    __ shadd(oldArrNext, idx, oldArr, t1, 2);
    __ shadd(newArrCur, idx, newArr, t1, 2);
    __ addi(oldArrCur, oldArrNext, 4);
    __ vle32_v(v0, oldArrCur);
    __ vle32_v(v4, oldArrNext);
    __ vsrl_vx(v0, v0, shiftCount);
    __ vsll_vx(v4, v4, shiftRevCount);
    __ vor_vv(v0, v0, v4);
    __ vse32_v(v0, newArrCur);
    __ bnez(idx, loop);

    __ bind(exit);
    __ ret();

    return entry;
  }
#endif

#ifdef COMPILER2
  class MontgomeryMultiplyGenerator : public MacroAssembler {

    Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
      Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj;

    RegSet _toSave;
    bool _squaring;

  public:
    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
      : MacroAssembler(as->code()), _squaring(squaring) {

      // Register allocation

      RegSetIterator<Register> regs = RegSet::range(x10, x26).begin();
      Pa_base = *regs;   // Argument registers
      if (squaring) {
        Pb_base = Pa_base;
      } else {
        Pb_base = *++regs;
      }
      Pn_base = *++regs;
      Rlen = *++regs;
      inv = *++regs;
      Pm_base = *++regs;

      // Working registers:
      Ra = *++regs;      // The current digit of a, b, n, and m.
      Rb = *++regs;
      Rm = *++regs;
      Rn = *++regs;

      Pa = *++regs;      // Pointers to the current/next digit of a, b, n, and m.
      Pb = *++regs;
      Pm = *++regs;
      Pn = *++regs;

      tmp0 = *++regs;    // Three registers which form a
      tmp1 = *++regs;    // triple-precision accumulator.
      tmp2 = *++regs;

      Ri = x6;           // Inner and outer loop indexes.
      Rj = x7;

      Rhi_ab = x28;      // Product registers: low and high parts
      Rlo_ab = x29;      // of a*b and m*n.
      Rhi_mn = x30;
      Rlo_mn = x31;

      // x18 and up are callee-saved.
      _toSave = RegSet::range(x18, *regs) + Pm_base;
    }

  private:
    void save_regs() {
      push_reg(_toSave, sp);
    }

    void restore_regs() {
      pop_reg(_toSave, sp);
    }

    template <typename T>
    void unroll_2(Register count, T block) {
      Label loop, end, odd;
      beqz(count, end);
      test_bit(t0, count, 0);
      bnez(t0, odd);
      align(16);
      bind(loop);
      (this->*block)();
      bind(odd);
      (this->*block)();
      subi(count, count, 2);
      bgtz(count, loop);
      bind(end);
    }

    template <typename T>
    void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
      Label loop, end, odd;
      beqz(count, end);
      test_bit(tmp, count, 0);
      bnez(tmp, odd);
      align(16);
      bind(loop);
      (this->*block)(d, s, tmp);
      bind(odd);
      (this->*block)(d, s, tmp);
      subi(count, count, 2);
      bgtz(count, loop);
      bind(end);
    }

    void pre1(RegisterOrConstant i) {
      block_comment("pre1");
      // Pa = Pa_base;
      // Pb = Pb_base + i;
      // Pm = Pm_base;
      // Pn = Pn_base + i;
      // Ra = *Pa;
      // Rb = *Pb;
      // Rm = *Pm;
      // Rn = *Pn;
      if (i.is_register()) {
        slli(t0, i.as_register(), LogBytesPerWord);
      } else {
        mv(t0, i.as_constant());
        slli(t0, t0, LogBytesPerWord);
      }

      mv(Pa, Pa_base);
      add(Pb, Pb_base, t0);
      mv(Pm, Pm_base);
      add(Pn, Pn_base, t0);

      ld(Ra, Address(Pa));
      ld(Rb, Address(Pb));
      ld(Rm, Address(Pm));
      ld(Rn, Address(Pn));

      // Zero the m*n result.
      mv(Rhi_mn, zr);
      mv(Rlo_mn, zr);
    }

    // The core multiply-accumulate step of a Montgomery
    // multiplication. The idea is to schedule operations as a
    // pipeline so that instructions with long latencies (loads and
    // multiplies) have time to complete before their results are
    // used. This most benefits in-order implementations of the
    // architecture but out-of-order ones also benefit.
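    // (Illustrative: MACC(a, b, t0, t1, t2) in the comments below means
    // accumulating the full 128-bit product a*b into the triple-precision
    // accumulator t2:t1:t0, roughly
    //   p = (unsigned __int128)a * b;
    //   t0 += (uint64_t)p;  t1 += (uint64_t)(p >> 64) + carry;  t2 += carry;
    // split here into mulhu/mul plus the acc() helper.)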
    void step() {
      block_comment("step");
      // MACC(Ra, Rb, tmp0, tmp1, tmp2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      mulhu(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      addi(Pa, Pa, wordSize);
      ld(Ra, Address(Pa));
      subi(Pb, Pb, wordSize);
      ld(Rb, Address(Pb));
      acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the
                                             // previous iteration.
      // MACC(Rm, Rn, tmp0, tmp1, tmp2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      mulhu(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      addi(Pm, Pm, wordSize);
      ld(Rm, Address(Pm));
      subi(Pn, Pn, wordSize);
      ld(Rn, Address(Pn));
      acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
    }

    void post1() {
      block_comment("post1");

      // MACC(Ra, Rb, tmp0, tmp1, tmp2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      mulhu(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n
      acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);

      // *Pm = Rm = tmp0 * inv;
      mul(Rm, tmp0, inv);
      sd(Rm, Address(Pm));

      // MACC(Rm, Rn, tmp0, tmp1, tmp2);
      // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
      mulhu(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, tmp0, Rlo_mn);
        Label ok;
        beqz(Rlo_mn, ok);
        stop("broken Montgomery multiply");
        bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff
      // tmp0 != 0. So, rather than do a mul and a cad we just set
      // the carry flag iff tmp0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // cad(zr, tmp0, Rlo_mn);
      subi(t0, tmp0, 1);
      sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
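      // (Illustrative: tmp0 - 1 <u tmp0 holds for every value except
      // tmp0 == 0, where the subtraction wraps, so t0 is exactly the
      // carry out of tmp0 + (-tmp0).)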
|
|
cadc(tmp0, tmp1, Rhi_mn, t0);
|
|
adc(tmp1, tmp2, zr, t0);
|
|
mv(tmp2, zr);
|
|
}
|
|
|
|
void pre2(Register i, Register len) {
|
|
block_comment("pre2");
|
|
// Pa = Pa_base + i-len;
|
|
// Pb = Pb_base + len;
|
|
// Pm = Pm_base + i-len;
|
|
// Pn = Pn_base + len;
|
|
|
|
sub(Rj, i, len);
|
|
// Rj == i-len
|
|
|
|
// Ra as temp register
|
|
slli(Ra, Rj, LogBytesPerWord);
|
|
add(Pa, Pa_base, Ra);
|
|
add(Pm, Pm_base, Ra);
|
|
slli(Ra, len, LogBytesPerWord);
|
|
add(Pb, Pb_base, Ra);
|
|
add(Pn, Pn_base, Ra);
|
|
|
|
// Ra = *++Pa;
|
|
// Rb = *--Pb;
|
|
// Rm = *++Pm;
|
|
// Rn = *--Pn;
|
|
addi(Pa, Pa, wordSize);
|
|
ld(Ra, Address(Pa));
|
|
subi(Pb, Pb, wordSize);
|
|
ld(Rb, Address(Pb));
|
|
addi(Pm, Pm, wordSize);
|
|
ld(Rm, Address(Pm));
|
|
subi(Pn, Pn, wordSize);
|
|
ld(Rn, Address(Pn));
|
|
|
|
mv(Rhi_mn, zr);
|
|
mv(Rlo_mn, zr);
|
|
}
|
|
|
|
void post2(Register i, Register len) {
|
|
block_comment("post2");
|
|
sub(Rj, i, len);
|
|
|
|
cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part
|
|
|
|
// As soon as we know the least significant digit of our result,
|
|
// store it.
|
|
// Pm_base[i-len] = tmp0;
|
|
// Rj as temp register
|
|
slli(Rj, Rj, LogBytesPerWord);
|
|
add(Rj, Pm_base, Rj);
|
|
sd(tmp0, Address(Rj));
|
|
|
|
// tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
|
|
cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part
|
|
adc(tmp1, tmp2, zr, t0);
|
|
mv(tmp2, zr);
|
|
}
|
|
|
|
// A carry in tmp0 after Montgomery multiplication means that we
|
|
// should subtract multiples of n from our result in m. We'll
|
|
// keep doing that until there is no carry.
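    // Equivalent C sketch (illustration only; subtraction is done as
    // addition of the one's complement plus a carried-in 1):
    //
    //   while (tmp0 != 0) {
    //     unsigned __int128 t = 1;                 // 1 = "no borrow"
    //     for (i = 0; i < len; i++) {
    //       t += (unsigned __int128)m[i] + ~n[i];  // m[i] - n[i] - borrow
    //       m[i] = (uint64_t)t;
    //       t >>= 64;
    //     }
    //     tmp0 = tmp0 - 1 + (uint64_t)t;           // fold in final borrow
    //   }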
    void normalize(Register len) {
      block_comment("normalize");
      // while (tmp0)
      //   tmp0 = sub(Pm_base, Pn_base, tmp0, len);
      Label loop, post, again;
      Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now
      beqz(tmp0, post); {
        bind(again); {
          mv(i, zr);
          mv(cnt, len);
          slli(Rn, i, LogBytesPerWord);
          add(Rm, Pm_base, Rn);
          ld(Rm, Address(Rm));
          add(Rn, Pn_base, Rn);
          ld(Rn, Address(Rn));
          mv(t0, 1); // set carry flag, i.e. no borrow
          align(16);
          bind(loop); {
            notr(Rn, Rn);
            add(Rm, Rm, t0);
            add(Rm, Rm, Rn);
            sltu(t0, Rm, Rn);
            slli(Rn, i, LogBytesPerWord); // Rn as temp register
            add(Rn, Pm_base, Rn);
            sd(Rm, Address(Rn));
            addi(i, i, 1);
            slli(Rn, i, LogBytesPerWord);
            add(Rm, Pm_base, Rn);
            ld(Rm, Address(Rm));
            add(Rn, Pn_base, Rn);
            ld(Rn, Address(Rn));
            subi(cnt, cnt, 1);
          } bnez(cnt, loop);
          subi(tmp0, tmp0, 1);
          add(tmp0, tmp0, t0);
        } bnez(tmp0, again);
      } bind(post);
    }

    // Move memory at s to d, reversing words.
    //    Increments d to end of copied memory
    //    Destroys tmp1, tmp2
    //    Preserves len
    //    Leaves s pointing to the address which was in d at start
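    // Net effect as a C sketch over 32-bit digits (illustration only;
    // len counts 64-bit words):
    //
    //   for (size_t i = 0; i < 2 * len; i++)
    //     ((uint32_t *)d)[i] = ((uint32_t *)s)[2 * len - 1 - i];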
    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
      assert(tmp1->encoding() < x28->encoding(), "register corruption");
      assert(tmp2->encoding() < x28->encoding(), "register corruption");

      shadd(s, len, s, tmp1, LogBytesPerWord);
      mv(tmp1, len);
      unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
      slli(tmp1, len, LogBytesPerWord);
      sub(s, d, tmp1);
    }
    // [63...0] -> [31...0][63...32]
    void reverse1(Register d, Register s, Register tmp) {
      subi(s, s, wordSize);
      ld(tmp, Address(s));
      ror(tmp, tmp, 32, t0);
      sd(tmp, Address(d));
      addi(d, d, wordSize);
    }

    void step_squaring() {
      // An extra ACC
      step();
      acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
    }

    void last_squaring(Register i) {
      Label dont;
      // if ((i & 1) == 0) {
      test_bit(t0, i, 0);
      bnez(t0, dont); {
        // MACC(Ra, Rb, tmp0, tmp1, tmp2);
        // Ra = *++Pa;
        // Rb = *--Pb;
        mulhu(Rhi_ab, Ra, Rb);
        mul(Rlo_ab, Ra, Rb);
        acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
      } bind(dont);
    }

    void extra_step_squaring() {
      acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n

      // MACC(Rm, Rn, tmp0, tmp1, tmp2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      mulhu(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      addi(Pm, Pm, wordSize);
      ld(Rm, Address(Pm));
      subi(Pn, Pn, wordSize);
      ld(Rn, Address(Pn));
    }

    void post1_squaring() {
      acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n

      // *Pm = Rm = tmp0 * inv;
      mul(Rm, tmp0, inv);
      sd(Rm, Address(Pm));

      // MACC(Rm, Rn, tmp0, tmp1, tmp2);
      // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
      mulhu(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, tmp0, Rlo_mn);
        Label ok;
        beqz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff
      // tmp0 != 0. So, rather than do a mul and a cad we just set
      // the carry flag iff tmp0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // cad(zr, tmp0, Rlo_mn);
      subi(t0, tmp0, 1);
      sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
      cadc(tmp0, tmp1, Rhi_mn, t0);
      adc(tmp1, tmp2, zr, t0);
      mv(tmp2, zr);
    }

    // use t0 as carry
    void acc(Register Rhi, Register Rlo,
             Register tmp0, Register tmp1, Register tmp2) {
      cad(tmp0, tmp0, Rlo, t0);
      cadc(tmp1, tmp1, Rhi, t0);
      adc(tmp2, tmp2, zr, t0);
    }

  public:
    /**
     * Fast Montgomery multiplication. The derivation of the
     * algorithm is in A Cryptographic Library for the Motorola
     * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
     *
     * Arguments:
     *
     * Inputs for multiplication:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements b
     *   c_rarg2   - int array elements n (the modulus)
     *   c_rarg3   - int length
     *   c_rarg4   - int inv
     *   c_rarg5   - int array elements m (the result)
     *
     * Inputs for squaring:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
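    // For orientation, a word-level C sketch of the first half of the
    // product-scanning loop generated below (illustration only; `acc` is
    // the 192-bit accumulator kept in {tmp2,tmp1,tmp0}):
    //
    //   for (i = 0; i < len; i++) {
    //     for (j = 0; j < i; j++)
    //       acc += (uint128)a[j] * b[i-j] + (uint128)m[j] * n[i-j];
    //     acc += (uint128)a[i] * b[0];
    //     m[i] = (uint64_t)acc * inv;               // zeroes the low word:
    //     acc = (acc + (uint128)m[i] * n[0]) >> 64; // (acc + m[i]*n[0]) % 2^64 == 0
    //   }
    //
    // The second half (i = len .. 2*len-1) drains the remaining columns
    // into m[i-len], and normalize() subtracts n while a carry remains.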
    address generate_multiply() {
      Label argh, nothing;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      beqz(Rlen, nothing);

      enter();

      // Make room.
      mv(Ra, 512);
      bgt(Rlen, Ra, argh);
      slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
      sub(Ra, sp, Ra);
      andi(sp, Ra, -2 * wordSize);

      srliw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go. We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, Ri, Rj);
        if (!_squaring)
          reverse(Ra, Pb_base, Rlen, Ri, Rj);
        reverse(Ra, Pn_base, Rlen, Ri, Rj);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        ld(Rn, Address(Pn_base));
        mul(Rlo_mn, Rn, inv);
        mv(t0, -1);
        Label ok;
        beq(Rlo_mn, t0, ok);
        stop("broken inverse in Montgomery multiply");
        bind(ok);
      }
#endif

      mv(Pm_base, Ra);

      mv(tmp0, zr);
      mv(tmp1, zr);
      mv(tmp2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mv(Ri, zr); {
        Label loop, end;
        bge(Ri, Rlen, end);

        bind(loop);
        pre1(Ri);

        block_comment("  for (j = i; j; j--) {"); {
          mv(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post1();
        addiw(Ri, Ri, 1);
        blt(Ri, Rlen, loop);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mv(Ri, Rlen); {
        Label loop, end;
        slli(t0, Rlen, 1);
        bge(Ri, t0, end);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment("  for (j = len*2-i-1; j; j--) {"); {
          slliw(Rj, Rlen, 1);
          subw(Rj, Rj, Ri);
          subiw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        addiw(Ri, Ri, 1);
        slli(t0, Rlen, 1);
        blt(Ri, t0, loop);
        bind(end);
      }
      block_comment("} // i");

      normalize(Rlen);

      mv(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();   // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, Ri, Rj);

      leave();
      bind(nothing);
      ret();

      return entry;
    }

    /**
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_square() {
      Label argh;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room.
      mv(Ra, 512);
      bgt(Rlen, Ra, argh);
      slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
      sub(Ra, sp, Ra);
      andi(sp, Ra, -2 * wordSize);

      srliw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go. We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, Ri, Rj);
        reverse(Ra, Pn_base, Rlen, Ri, Rj);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      mv(Pm_base, Ra);

      mv(tmp0, zr);
      mv(tmp1, zr);
      mv(tmp2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mv(Ri, zr); {
        Label loop, end;
        bind(loop);
        bge(Ri, Rlen, end);

        pre1(Ri);

        block_comment("for (j = (i+1)/2; j; j--) {"); {
          addi(Rj, Ri, 1);
          srliw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = i/2; j; j--) {"); {
          srliw(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post1_squaring();
        addi(Ri, Ri, 1);
        blt(Ri, Rlen, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mv(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        slli(t0, Rlen, 1);
        bge(Ri, t0, end);

        pre2(Ri, Rlen);

        block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
          slli(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          subi(Rj, Rj, 1);
          srliw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
          slli(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          srliw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        addi(Ri, Ri, 1);
        slli(t0, Rlen, 1);
        blt(Ri, t0, loop);

        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);

      mv(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();   // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, Ri, Rj);

      leave();
      ret();

      return entry;
    }
  };

#endif // COMPILER2

  address generate_cont_thaw(Continuation::thaw_kind kind) {
    bool return_barrier = Continuation::is_thaw_return_barrier(kind);
    bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);

    address start = __ pc();

    if (return_barrier) {
      __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
    }

#ifndef PRODUCT
    {
      Label OK;
      __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
      __ beq(sp, t0, OK);
      __ stop("incorrect sp");
      __ bind(OK);
    }
#endif

    if (return_barrier) {
      // preserve possible return value from a method returning to the return barrier
      __ subi(sp, sp, 2 * wordSize);
      __ fsd(f10, Address(sp, 0 * wordSize));
      __ sd(x10, Address(sp, 1 * wordSize));
    }

    __ mv(c_rarg1, (return_barrier ? 1 : 0));
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), xthread, c_rarg1);
    __ mv(t1, x10); // x10 contains the size of the frames to thaw, 0 if overflow or no more frames

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      __ ld(x10, Address(sp, 1 * wordSize));
      __ fld(f10, Address(sp, 0 * wordSize));
      __ addi(sp, sp, 2 * wordSize);
    }

#ifndef PRODUCT
    {
      Label OK;
      __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
      __ beq(sp, t0, OK);
      __ stop("incorrect sp");
      __ bind(OK);
    }
#endif

    Label thaw_success;
    // t1 contains the size of the frames to thaw, 0 if overflow or no more frames
    __ bnez(t1, thaw_success);
    __ j(RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
    __ bind(thaw_success);

    // make room for the thawed frames
    __ sub(t0, sp, t1);
    __ andi(sp, t0, -16); // align

    if (return_barrier) {
      // save original return value -- again
      __ subi(sp, sp, 2 * wordSize);
      __ fsd(f10, Address(sp, 0 * wordSize));
      __ sd(x10, Address(sp, 1 * wordSize));
    }

    // If we want, we can templatize thaw by kind, and have three different entries
    __ mv(c_rarg1, kind);

    __ call_VM_leaf(Continuation::thaw_entry(), xthread, c_rarg1);
    __ mv(t1, x10); // x10 is the sp of the yielding frame

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      __ ld(x10, Address(sp, 1 * wordSize));
      __ fld(f10, Address(sp, 0 * wordSize));
      __ addi(sp, sp, 2 * wordSize);
    } else {
      __ mv(x10, zr); // return 0 (success) from doYield
    }

    // we're now on the yield frame (which is in an address above us b/c sp has been pushed down)
    __ mv(fp, t1);
    __ subi(sp, t1, 2 * wordSize); // now pointing to fp spill

    if (return_barrier_exception) {
      __ ld(c_rarg1, Address(fp, -1 * wordSize)); // return address
      __ verify_oop(x10);
      __ mv(x9, x10); // save return value containing the exception oop in callee-saved x9

      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), xthread, c_rarg1);

      // see OptoRuntime::generate_exception_blob: x10 -- exception oop, x13 -- exception pc

      __ mv(x11, x10); // the exception handler
      __ mv(x10, x9); // restore return value containing the exception oop
      __ verify_oop(x10);

      __ leave();
      __ mv(x13, ra);
      __ jr(x11); // the exception handler
    } else {
      // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
      __ leave();
      __ ret();
    }

    return start;
  }

  address generate_cont_thaw() {
    if (!Continuations::enabled()) return nullptr;

    StubGenStubId stub_id = StubGenStubId::cont_thaw_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    generate_cont_thaw(Continuation::thaw_top);
    return start;
  }

  address generate_cont_returnBarrier() {
    if (!Continuations::enabled()) return nullptr;

    // TODO: will probably need multiple return barriers depending on return type
    StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    generate_cont_thaw(Continuation::thaw_return_barrier);

    return start;
  }

  address generate_cont_returnBarrier_exception() {
    if (!Continuations::enabled()) return nullptr;

    StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    generate_cont_thaw(Continuation::thaw_return_barrier_exception);

    return start;
  }

  address generate_cont_preempt_stub() {
    if (!Continuations::enabled()) return nullptr;
    StubGenStubId stub_id = StubGenStubId::cont_preempt_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    __ reset_last_Java_frame(true);

    // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
    __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));

    Label preemption_cancelled;
    __ lbu(t0, Address(xthread, JavaThread::preemption_cancelled_offset()));
    __ bnez(t0, preemption_cancelled);

    // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
    SharedRuntime::continuation_enter_cleanup(_masm);
    __ leave();
    __ ret();

    // We acquired the monitor after freezing the frames so call thaw to continue execution.
    __ bind(preemption_cancelled);
    __ sb(zr, Address(xthread, JavaThread::preemption_cancelled_offset()));
    __ la(fp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size() + 2 * wordSize)));
    __ la(t1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
    __ ld(t1, Address(t1));
    __ jr(t1);

    return start;
  }

#if COMPILER2_OR_JVMCI

#undef __
#define __ this->

  class Sha2Generator : public MacroAssembler {
    StubCodeGenerator* _cgen;
  public:
    Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {}
    address generate_sha256_implCompress(StubGenStubId stub_id) {
      return generate_sha2_implCompress(Assembler::e32, stub_id);
    }
    address generate_sha512_implCompress(StubGenStubId stub_id) {
      return generate_sha2_implCompress(Assembler::e64, stub_id);
    }
  private:

    void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
      if (vset_sew == Assembler::e32) __ vle32_v(vr, sr);
      else                            __ vle64_v(vr, sr);
    }

    void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
      if (vset_sew == Assembler::e32) __ vse32_v(vr, sr);
      else                            __ vse64_v(vr, sr);
    }

    // Overview of the logic in each "quad round".
    //
    // The code below repeats 16/20 times the logic implementing four rounds
    // of the SHA-256/512 core loop as documented by NIST: 16/20 "quad rounds"
    // implementing the 64/80 single rounds.
    //
    //  // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0])
    //  // Output:
    //  //   vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
    //  vl1reXX.v vTmp1, ofs
    //
    //  // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b)
    //  addi ofs, ofs, 16/32
    //
    //  // Add constants to message schedule words:
    //  // Input
    //  //   vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
    //  //   vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0];
    //  // Output
    //  //   vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
    //  vadd.vv vTmp0, vTmp1, vW0
    //
    //  // 2 rounds of working variables updates.
    //  //   vState1[t+4] <- vState1[t], vState0[t], vTmp0[t]
    //  // Input:
    //  //   vState1 = {c[t],d[t],g[t],h[t]}   " = vState1[t] "
    //  //   vState0 = {a[t],b[t],e[t],f[t]}
    //  //   vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
    //  // Output:
    //  //   vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}  " = vState0[t+2] "
    //  //           = {h[t+4],g[t+4],d[t+4],c[t+4]}  " = vState1[t+4] "
    //  vsha2cl.vv vState1, vState0, vTmp0
    //
    //  // 2 rounds of working variables updates.
    //  //   vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t]
    //  // Input
    //  //   vState0 = {a[t],b[t],e[t],f[t]}       " = vState0[t] "
    //  //           = {h[t+2],g[t+2],d[t+2],c[t+2]}  " = vState1[t+2] "
    //  //   vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}  " = vState0[t+2] "
    //  //   vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
    //  // Output:
    //  //   vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]}  " = vState0[t+4] "
    //  vsha2ch.vv vState0, vState1, vTmp0
    //
    //  // Combine 2QW into 1QW
    //  //
    //  // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs
    //  //     vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3]
    //  // and it can only take 3 vectors as inputs. Hence we need to combine
    //  // vW1[0] and vW2[1..3] in a single vector.
    //  //
    //  // vmerge Vt4, Vt1, Vt2, V0
    //  // Input
    //  //  V0 = mask // first word from vW2, 1..3 words from vW1
    //  //  vW2 = {Wt-8, Wt-7, Wt-6, Wt-5}
    //  //  vW1 = {Wt-12, Wt-11, Wt-10, Wt-9}
    //  // Output
    //  //  Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5}
    //  vmerge.vvm vTmp0, vW2, vW1, v0
    //
    //  // Generate next Four Message Schedule Words (hence allowing for 4 more rounds)
    //  // Input
    //  //  vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]}     W[ 3: 0]
    //  //  vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]}     W[15:12]
    //  //  vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]}   W[11: 9,4]
    //  // Output (next four message schedule words)
    //  //  vW0 = {W[t+19], W[t+18], W[t+17], W[t+16]}     W[19:16]
    //  vsha2ms.vv vW0, vTmp0, vW3
    //
    // BEFORE
    //  vW0 - vW3 hold the message schedule words (initially the block words)
    //    vW0 = W[ 3: 0]   "oldest"
    //    vW1 = W[ 7: 4]
    //    vW2 = W[11: 8]
    //    vW3 = W[15:12]   "newest"
    //
    //  vState0 - vState1 hold the working state variables
    //    vState0 = {a[t],b[t],e[t],f[t]}   // initially {H5,H4,H1,H0}
    //    vState1 = {c[t],d[t],g[t],h[t]}   // initially {H7,H6,H3,H2}
    //
    // AFTER
    //  vW0 - vW3 hold the message schedule words (initially the block words)
    //    vW1 = W[ 7: 4]   "oldest"
    //    vW2 = W[11: 8]
    //    vW3 = W[15:12]
    //    vW0 = W[19:16]   "newest"
    //
    //  vState0 and vState1 hold the working state variables
    //    vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]}
    //    vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]}
    //
    // The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round,
    // hence the uses of those vectors rotate in each round, and we get back to the
    // initial configuration every 4 quad-rounds. We could avoid those changes at
    // the cost of moving those vectors at the end of each quad-round.
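    // For reference, the scalar recurrences being vectorized (SHA-256
    // shown; SHA-512 is analogous with 64-bit words and rotates):
    //   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
    //   sigma0(x) = rotr(x, 7) ^ rotr(x, 18) ^ (x >> 3)
    //   sigma1(x) = rotr(x, 17) ^ rotr(x, 19) ^ (x >> 10)
    // Each vsha2ms produces four consecutive W[t]; each vsha2cl/vsha2ch
    // pair advances the eight working variables by four rounds.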
    void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4,
                         Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh,
                         bool gen_words = true, bool step_const = true) {
      __ vleXX_v(vset_sew, vtemp, scalarconst);
      if (step_const) {
        __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32);
      }
      __ vadd_vv(vtemp2, vtemp, rot1);
      __ vsha2cl_vv(v_cdgh, v_abef, vtemp2);
      __ vsha2ch_vv(v_abef, v_cdgh, vtemp2);
      if (gen_words) {
        __ vmerge_vvm(vtemp2, rot3, rot2);
        __ vsha2ms_vv(rot1, vtemp2, rot4);
      }
    }

    // Arguments:
    //
    // Inputs:
    //   c_rarg0   - byte[]  source+offset
    //   c_rarg1   - int[]   SHA.state
    //   c_rarg2   - int     offset
    //   c_rarg3   - int     limit
    //
    address generate_sha2_implCompress(Assembler::SEW vset_sew, StubGenStubId stub_id) {
      alignas(64) static const uint32_t round_consts_256[64] = {
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
      };
      alignas(64) static const uint64_t round_consts_512[80] = {
        0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl,
        0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l,
        0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l,
        0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l,
        0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l,
        0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l,
        0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l,
        0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l,
        0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl,
        0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l,
        0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl,
        0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl,
        0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l,
        0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l,
        0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l,
        0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l,
        0x19a4c116b8d2d0c8l, 0x1e376c085141ab53l, 0x2748774cdf8eeb99l,
        0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl,
        0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl,
        0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl,
        0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l,
        0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l,
        0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal,
        0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl,
        0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl,
        0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al,
        0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l
      };
      const int const_add = vset_sew == Assembler::e32 ? 16 : 32;

      bool multi_block;
      switch (stub_id) {
      case sha256_implCompress_id:
        assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
        multi_block = false;
        break;
      case sha256_implCompressMB_id:
        assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
        multi_block = true;
        break;
      case sha512_implCompress_id:
        assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
        multi_block = false;
        break;
      case sha512_implCompressMB_id:
        assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
        multi_block = true;
        break;
      default:
        ShouldNotReachHere();
      };
      __ align(CodeEntryAlignment);
      StubCodeMark mark(_cgen, stub_id);
      address start = __ pc();

      Register buf   = c_rarg0;
      Register state = c_rarg1;
      Register ofs   = c_rarg2;
      Register limit = c_rarg3;
      Register consts   = t2; // caller saved
      Register state_c  = x28; // caller saved
      VectorRegister vindex  = v2;
      VectorRegister vW0     = v4;
      VectorRegister vW1     = v6;
      VectorRegister vW2     = v8;
      VectorRegister vW3     = v10;
      VectorRegister vState0 = v12;
      VectorRegister vState1 = v14;
      VectorRegister vHash0  = v16;
      VectorRegister vHash1  = v18;
      VectorRegister vTmp0   = v20;
      VectorRegister vTmp1   = v22;

      Label multi_block_loop;

      __ enter();

      address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512;
      la(consts, ExternalAddress(constant_table));

      // Register use in this function:
      //
      // VECTORS
      //  vW0 - vW3 (512/1024-bits / 4*128/256 bits / 4*4*32/64 bits), hold the message
      //             schedule words (Wt). They start with the message block
      //             content (W0 to W15), then further words in the message
      //             schedule generated via vsha2ms from previous Wt.
      //   Initially:
      //     vW0 = W[  3:0] = { W3,  W2,  W1,  W0}
      //     vW1 = W[  7:4] = { W7,  W6,  W5,  W4}
      //     vW2 = W[ 11:8] = {W11, W10,  W9,  W8}
      //     vW3 = W[15:12] = {W15, W14, W13, W12}
      //
      //  vState0 - vState1 hold the working state variables (a, b, ..., h)
      //    vState0 = {f[t],e[t],b[t],a[t]}
      //    vState1 = {h[t],g[t],d[t],c[t]}
      //   Initially:
      //    vState0 = {H5i-1, H4i-1, H1i-1, H0i-1}
      //    vState1 = {H7i-1, H6i-1, H3i-1, H2i-1}
      //
      //  v0 = masks for vrgather/vmerge. Single value during the 16 rounds.
      //
      //  vTmp0 = temporary, Wt+Kt
      //  vTmp1 = temporary, Kt
      //
      //  vHash0/vHash1 = hold the initial values of the hash, byte-swapped.
      //
      // During most of the function the vector state is configured so that each
      // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits).

      // vsha2ch/vsha2cl uses EGW of 4*SEW.
      //   SHA256 SEW = e32, EGW = 128-bits
      //   SHA512 SEW = e64, EGW = 256-bits
      //
      // VLEN is required to be at least 128.
      // For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256)
      //
      // m1: LMUL=1/2
      // ta: tail agnostic (don't care about those lanes)
      // ma: mask agnostic (don't care about those lanes)
      // x0 is not written; we know the number of vector elements.

      if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128
        __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta);
      } else {
        __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta);
      }

      int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul;
      __ li(t0, indexes);
      __ vmv_v_x(vindex, t0);
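      // Decoding the e32 constant as little-endian index bytes
      // (illustration): 0x14, 0x10, 0x04, 0x00 are the byte offsets of
      // state[5](f), state[4](e), state[1](b), state[0](a), so the
      // vluxei8 gathers below yield {f,e,b,a} and, via state_c, {h,g,d,c}.
      // The e64 constant 0x00082028 encodes the same pattern with
      // 8-byte elements.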

      // Step-over a,b, so we are pointing to c.
      // const_add is the size of 4 state variables, so const_add/2 skips 2 of them (a, b).
      __ addi(state_c, state, const_add/2);

      // Use index-load to get {f,e,b,a},{h,g,d,c}
      __ vluxei8_v(vState0, state, vindex);
      __ vluxei8_v(vState1, state_c, vindex);

      __ bind(multi_block_loop);

      // Capture the initial H values in vHash0 and vHash1 to allow for computing
      // the resulting H', since H' = H+{a',b',c',...,h'}.
      __ vmv_v_v(vHash0, vState0);
      __ vmv_v_v(vHash1, vState1);

      // Load the 512/1024-bits of the message block in vW0-vW3 and perform
      // an endian swap on each 4/8 bytes element.
      //
      // If Zvkb is not implemented one can use vrgather
      // with an index sequence to byte-swap.
      //  sequence = [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12]
      //   <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate
      //  this sequence. 'vid' gives us the N.
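      //  (e.g. 0 ^ 3 = 3, 1 ^ 3 = 2, ..., 12 ^ 3 = 15, matching the
      //  sequence above.)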
      __ vleXX_v(vset_sew, vW0, buf);
      __ vrev8_v(vW0, vW0);
      __ addi(buf, buf, const_add);
      __ vleXX_v(vset_sew, vW1, buf);
      __ vrev8_v(vW1, vW1);
      __ addi(buf, buf, const_add);
      __ vleXX_v(vset_sew, vW2, buf);
      __ vrev8_v(vW2, vW2);
      __ addi(buf, buf, const_add);
      __ vleXX_v(vset_sew, vW3, buf);
      __ vrev8_v(vW3, vW3);
      __ addi(buf, buf, const_add);

      // Set v0 up for the vmerge that replaces the first word (idx==0)
      __ vid_v(v0);
      __ vmseq_vi(v0, v0, 0x0);  // v0.mask[i] = (i == 0 ? 1 : 0)

      VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3};
      int rot_pos = 0;
      // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11 (+3, vW3->vW0->vW1->vW2)
      const int qr_end = vset_sew == Assembler::e32 ? 12 : 16;
      for (int i = 0; i < qr_end; i++) {
        sha2_quad_round(vset_sew,
                        rotation_regs[(rot_pos + 0) & 0x3],
                        rotation_regs[(rot_pos + 1) & 0x3],
                        rotation_regs[(rot_pos + 2) & 0x3],
                        rotation_regs[(rot_pos + 3) & 0x3],
                        consts,
                        vTmp1, vTmp0, vState0, vState1);
        ++rot_pos;
      }
      // Quad-round #12 (+0, vW0->vW1->vW2->vW3) ... #15 (+3, vW3->vW0->vW1->vW2)
      // Note that we stop generating new message schedule words (Wt, vW0-13)
      // as we already generated all the words we end up consuming (i.e., W[63:60]).
      const int qr_c_end = qr_end + 4;
      for (int i = qr_end; i < qr_c_end; i++) {
        sha2_quad_round(vset_sew,
                        rotation_regs[(rot_pos + 0) & 0x3],
                        rotation_regs[(rot_pos + 1) & 0x3],
                        rotation_regs[(rot_pos + 2) & 0x3],
                        rotation_regs[(rot_pos + 3) & 0x3],
                        consts,
                        vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1));
        ++rot_pos;
      }

      //--------------------------------------------------------------------------------
      // Compute the updated hash value H'
      //   H' = H + {h',g',...,b',a'}
      //      = {h,g,...,b,a} + {h',g',...,b',a'}
      //      = {h+h',g+g',...,b+b',a+a'}

      // H' = H+{a',b',c',...,h'}
      __ vadd_vv(vState0, vHash0, vState0);
      __ vadd_vv(vState1, vHash1, vState1);

      if (multi_block) {
        int total_adds = vset_sew == Assembler::e32 ? 240 : 608;
        __ subi(consts, consts, total_adds);
        __ addi(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128);
        __ ble(ofs, limit, multi_block_loop);
        __ mv(c_rarg0, ofs); // return ofs
      }

      // Store H[0..7] = {a,b,c,d,e,f,g,h} from
      //  vState0 = {f,e,b,a}
      //  vState1 = {h,g,d,c}
      __ vsuxei8_v(vState0, state, vindex);
      __ vsuxei8_v(vState1, state_c, vindex);

      __ leave();
      __ ret();

      return start;
    }
  };

#undef __
#define __ _masm->

  // Set of L registers that correspond to a contiguous memory area.
  // Each 64-bit register typically corresponds to 2 32-bit integers.
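  // For instance, BufRegCache (L = 8) below caches a 64-byte MD5 block:
  // after gen_load, _regs[i] holds the little-endian pair
  // ((uint64_t)data[2*i+1] << 32) | data[2*i], and add_u32(dest, k) picks
  // the low or high half depending on the parity of k.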
  template <uint L>
  class RegCache {
  private:
    MacroAssembler *_masm;
    Register _regs[L];

  public:
    RegCache(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == L, "%u registers are used to cache %u 4-byte data", rs.size(), 2 * L);
      auto it = rs.begin();
      for (auto &r: _regs) {
        r = *it;
        ++it;
      }
    }

    // generate load for the i'th register
    void gen_load(uint i, Register base) {
      assert(i < L, "invalid i: %u", i);
      __ ld(_regs[i], Address(base, 8 * i));
    }

    // add i'th 32-bit integer to dest
    void add_u32(const Register dest, uint i, const Register rtmp = t0) {
      assert(i < 2 * L, "invalid i: %u", i);

      if (is_even(i)) {
        // Use the bottom 32 bits. No need to mask off the top 32 bits
        // as addw will do the right thing.
        __ addw(dest, dest, _regs[i / 2]);
      } else {
        // Use the top 32 bits by right-shifting them.
        __ srli(rtmp, _regs[i / 2], 32);
        __ addw(dest, dest, rtmp);
      }
    }
  };

  typedef RegCache<8> BufRegCache;

  // a += value + x + ac;
  // a = Integer.rotateLeft(a, s) + b;
  void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache,
                               Register a, Register b, Register c, Register d,
                               int k, int s, int t,
                               Register value) {
    // a += ac
    __ addw(a, a, t, t1);

    // a += x;
    reg_cache.add_u32(a, k);
    // a += value;
    __ addw(a, a, value);

    // a = Integer.rotateLeft(a, s) + b;
    __ rolw(a, a, s);
    __ addw(a, a, b);
  }
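  // Each of the four round helpers below implements the standard MD5
  // step (scalar reference, for illustration):
  //   a = b + rotl32(a + f(b, c, d) + x[k] + ac, s)
  // with f being F, G, H or I depending on the round.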

  // a += ((b & c) | ((~b) & d)) + x + ac;
  // a = Integer.rotateLeft(a, s) + b;
  void md5_FF(BufRegCache& reg_cache,
              Register a, Register b, Register c, Register d,
              int k, int s, int t,
              Register rtmp1, Register rtmp2) {
    // rtmp1 = b & c
    __ andr(rtmp1, b, c);

    // rtmp2 = (~b) & d
    __ andn(rtmp2, d, b);

    // rtmp1 = (b & c) | ((~b) & d)
    __ orr(rtmp1, rtmp1, rtmp2);

    m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
  }

  // a += ((b & d) | (c & (~d))) + x + ac;
  // a = Integer.rotateLeft(a, s) + b;
  void md5_GG(BufRegCache& reg_cache,
              Register a, Register b, Register c, Register d,
              int k, int s, int t,
              Register rtmp1, Register rtmp2) {
    // rtmp1 = b & d
    __ andr(rtmp1, b, d);

    // rtmp2 = c & (~d)
    __ andn(rtmp2, c, d);

    // rtmp1 = (b & d) | (c & (~d))
    __ orr(rtmp1, rtmp1, rtmp2);

    m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
  }

  // a += ((b ^ c) ^ d) + x + ac;
  // a = Integer.rotateLeft(a, s) + b;
  void md5_HH(BufRegCache& reg_cache,
              Register a, Register b, Register c, Register d,
              int k, int s, int t,
              Register rtmp1, Register rtmp2) {
    // rtmp1 = (b ^ c) ^ d
    __ xorr(rtmp2, b, c);
    __ xorr(rtmp1, rtmp2, d);

    m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
  }

  // a += (c ^ (b | (~d))) + x + ac;
  // a = Integer.rotateLeft(a, s) + b;
  void md5_II(BufRegCache& reg_cache,
              Register a, Register b, Register c, Register d,
              int k, int s, int t,
              Register rtmp1, Register rtmp2) {
    // rtmp1 = c ^ (b | (~d))
    __ orn(rtmp2, b, d);
    __ xorr(rtmp1, c, rtmp2);

    m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset  (multi_block == True)
  //   c_rarg3   - int     limit   (multi_block == True)
  //
  // Registers:
  //    x0   zero  (zero)
  //    x1     ra  (return address)
  //    x2     sp  (stack pointer)
  //    x3     gp  (global pointer)
  //    x4     tp  (thread pointer)
  //    x5     t0  (tmp register)
  //    x6     t1  (tmp register)
  //    x7     t2  state0
  //    x8  f0/s0  (frame pointer)
  //    x9     s1
  //   x10     a0  rtmp1 / c_rarg0
  //   x11     a1  rtmp2 / c_rarg1
  //   x12     a2  a     / c_rarg2
  //   x13     a3  b     / c_rarg3
  //   x14     a4  c
  //   x15     a5  d
  //   x16     a6  buf
  //   x17     a7  state
  //   x18     s2  ofs     [saved-reg]  (multi_block == True)
  //   x19     s3  limit   [saved-reg]  (multi_block == True)
  //   x20     s4  state1  [saved-reg]
  //   x21     s5  state2  [saved-reg]
  //   x22     s6  state3  [saved-reg]
  //   x23     s7
  //   x24     s8  buf0    [saved-reg]
  //   x25     s9  buf1    [saved-reg]
  //   x26    s10  buf2    [saved-reg]
  //   x27    s11  buf3    [saved-reg]
  //   x28     t3  buf4
  //   x29     t4  buf5
  //   x30     t5  buf6
  //   x31     t6  buf7
  address generate_md5_implCompress(StubGenStubId stub_id) {
    __ align(CodeEntryAlignment);
    bool multi_block;
    switch (stub_id) {
    case md5_implCompress_id:
      multi_block = false;
      break;
    case md5_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    };
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // rotation constants
    const int S11 = 7;
    const int S12 = 12;
    const int S13 = 17;
    const int S14 = 22;
    const int S21 = 5;
    const int S22 = 9;
    const int S23 = 14;
    const int S24 = 20;
    const int S31 = 4;
    const int S32 = 11;
    const int S33 = 16;
    const int S34 = 23;
    const int S41 = 6;
    const int S42 = 10;
    const int S43 = 15;
    const int S44 = 21;

    const int64_t mask32 = 0xffffffff;

    Register buf_arg   = c_rarg0; // a0
    Register state_arg = c_rarg1; // a1
    Register ofs_arg   = c_rarg2; // a2
    Register limit_arg = c_rarg3; // a3

    // we'll copy the args to these registers to free up a0-a3
    // to use for other values manipulated by instructions
    // that can be compressed
    Register buf   = x16; // a6
    Register state = x17; // a7
    Register ofs   = x18; // s2
    Register limit = x19; // s3

    // using x12->15 to allow compressed instructions
    Register a = x12; // a2
    Register b = x13; // a3
    Register c = x14; // a4
    Register d = x15; // a5

    Register state0 = x7;  // t2
    Register state1 = x20; // s4
    Register state2 = x21; // s5
    Register state3 = x22; // s6

    // using x10->x11 to allow compressed instructions
    Register rtmp1 = x10; // a0
    Register rtmp2 = x11; // a1

    RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11
    RegSet reg_cache_regs;
    reg_cache_regs += reg_cache_saved_regs;
    reg_cache_regs += RegSet::of(t3, t4, t5, t6);
    BufRegCache reg_cache(_masm, reg_cache_regs);

    RegSet saved_regs;
    if (multi_block) {
      saved_regs += RegSet::of(ofs, limit);
    }
    saved_regs += RegSet::of(state1, state2, state3);
    saved_regs += reg_cache_saved_regs;

    __ push_reg(saved_regs, sp);

    __ mv(buf, buf_arg);
    __ mv(state, state_arg);
    if (multi_block) {
      __ mv(ofs, ofs_arg);
      __ mv(limit, limit_arg);
    }

    // to minimize the number of memory operations:
    // read the 4 state 4-byte values in pairs, with a single ld,
    // and split them into 2 registers.
    //
    // As the core MD5 algorithm works on 32-bit words, the code below
    // does not care about the content of the higher 32 bits in state[x].
    // Based on this observation, we can apply a further optimization:
    // simply ignore the higher 32 bits in state0/state2, rather than
    // zeroing them explicitly with extra instructions.
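    // e.g. after the two little-endian loads below (illustration):
    //   state0 = (uint64_t)state[1] << 32 | state[0];  state1 = state[1]
    //   state2 = (uint64_t)state[3] << 32 | state[2];  state3 = state[3]
    // and only the low 32 bits of state0/state2 are used from here on.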
    __ ld(state0, Address(state));
    __ srli(state1, state0, 32);
    __ ld(state2, Address(state, 8));
    __ srli(state3, state2, 32);

    Label md5_loop;
    __ BIND(md5_loop);

    __ mv(a, state0);
    __ mv(b, state1);
    __ mv(c, state2);
    __ mv(d, state3);

    // Round 1
    reg_cache.gen_load(0, buf);
    md5_FF(reg_cache, a, b, c, d,  0, S11, 0xd76aa478, rtmp1, rtmp2);
    md5_FF(reg_cache, d, a, b, c,  1, S12, 0xe8c7b756, rtmp1, rtmp2);
    reg_cache.gen_load(1, buf);
    md5_FF(reg_cache, c, d, a, b,  2, S13, 0x242070db, rtmp1, rtmp2);
    md5_FF(reg_cache, b, c, d, a,  3, S14, 0xc1bdceee, rtmp1, rtmp2);
    reg_cache.gen_load(2, buf);
    md5_FF(reg_cache, a, b, c, d,  4, S11, 0xf57c0faf, rtmp1, rtmp2);
    md5_FF(reg_cache, d, a, b, c,  5, S12, 0x4787c62a, rtmp1, rtmp2);
    reg_cache.gen_load(3, buf);
    md5_FF(reg_cache, c, d, a, b,  6, S13, 0xa8304613, rtmp1, rtmp2);
    md5_FF(reg_cache, b, c, d, a,  7, S14, 0xfd469501, rtmp1, rtmp2);
    reg_cache.gen_load(4, buf);
    md5_FF(reg_cache, a, b, c, d,  8, S11, 0x698098d8, rtmp1, rtmp2);
    md5_FF(reg_cache, d, a, b, c,  9, S12, 0x8b44f7af, rtmp1, rtmp2);
    reg_cache.gen_load(5, buf);
    md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2);
    md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2);
    reg_cache.gen_load(6, buf);
    md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2);
    md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2);
    reg_cache.gen_load(7, buf);
    md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2);
    md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2);

    // Round 2
    md5_GG(reg_cache, a, b, c, d,  1, S21, 0xf61e2562, rtmp1, rtmp2);
    md5_GG(reg_cache, d, a, b, c,  6, S22, 0xc040b340, rtmp1, rtmp2);
    md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2);
    md5_GG(reg_cache, b, c, d, a,  0, S24, 0xe9b6c7aa, rtmp1, rtmp2);
    md5_GG(reg_cache, a, b, c, d,  5, S21, 0xd62f105d, rtmp1, rtmp2);
    md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2);
    md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2);
    md5_GG(reg_cache, b, c, d, a,  4, S24, 0xe7d3fbc8, rtmp1, rtmp2);
    md5_GG(reg_cache, a, b, c, d,  9, S21, 0x21e1cde6, rtmp1, rtmp2);
    md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2);
    md5_GG(reg_cache, c, d, a, b,  3, S23, 0xf4d50d87, rtmp1, rtmp2);
    md5_GG(reg_cache, b, c, d, a,  8, S24, 0x455a14ed, rtmp1, rtmp2);
    md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2);
    md5_GG(reg_cache, d, a, b, c,  2, S22, 0xfcefa3f8, rtmp1, rtmp2);
    md5_GG(reg_cache, c, d, a, b,  7, S23, 0x676f02d9, rtmp1, rtmp2);
    md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2);

    // Round 3
    md5_HH(reg_cache, a, b, c, d,  5, S31, 0xfffa3942, rtmp1, rtmp2);
    md5_HH(reg_cache, d, a, b, c,  8, S32, 0x8771f681, rtmp1, rtmp2);
    md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2);
    md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2);
    md5_HH(reg_cache, a, b, c, d,  1, S31, 0xa4beea44, rtmp1, rtmp2);
    md5_HH(reg_cache, d, a, b, c,  4, S32, 0x4bdecfa9, rtmp1, rtmp2);
    md5_HH(reg_cache, c, d, a, b,  7, S33, 0xf6bb4b60, rtmp1, rtmp2);
    md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2);
    md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2);
    md5_HH(reg_cache, d, a, b, c,  0, S32, 0xeaa127fa, rtmp1, rtmp2);
    md5_HH(reg_cache, c, d, a, b,  3, S33, 0xd4ef3085, rtmp1, rtmp2);
    md5_HH(reg_cache, b, c, d, a,  6, S34, 0x04881d05, rtmp1, rtmp2);
    md5_HH(reg_cache, a, b, c, d,  9, S31, 0xd9d4d039, rtmp1, rtmp2);
    md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2);
    md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2);
    md5_HH(reg_cache, b, c, d, a,  2, S34, 0xc4ac5665, rtmp1, rtmp2);

    // Round 4
    md5_II(reg_cache, a, b, c, d,  0, S41, 0xf4292244, rtmp1, rtmp2);
    md5_II(reg_cache, d, a, b, c,  7, S42, 0x432aff97, rtmp1, rtmp2);
    md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2);
    md5_II(reg_cache, b, c, d, a,  5, S44, 0xfc93a039, rtmp1, rtmp2);
    md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2);
    md5_II(reg_cache, d, a, b, c,  3, S42, 0x8f0ccc92, rtmp1, rtmp2);
    md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2);
    md5_II(reg_cache, b, c, d, a,  1, S44, 0x85845dd1, rtmp1, rtmp2);
    md5_II(reg_cache, a, b, c, d,  8, S41, 0x6fa87e4f, rtmp1, rtmp2);
    md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2);
    md5_II(reg_cache, c, d, a, b,  6, S43, 0xa3014314, rtmp1, rtmp2);
    md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2);
    md5_II(reg_cache, a, b, c, d,  4, S41, 0xf7537e82, rtmp1, rtmp2);
    md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2);
    md5_II(reg_cache, c, d, a, b,  2, S43, 0x2ad7d2bb, rtmp1, rtmp2);
    md5_II(reg_cache, b, c, d, a,  9, S44, 0xeb86d391, rtmp1, rtmp2);

    __ addw(state0, state0, a);
    __ addw(state1, state1, b);
    __ addw(state2, state2, c);
    __ addw(state3, state3, d);

    if (multi_block) {
      __ addi(buf, buf, 64);
      __ addi(ofs, ofs, 64);
      // if (ofs <= limit) goto md5_loop
      __ bge(limit, ofs, md5_loop);
      __ mv(c_rarg0, ofs); // return ofs
    }

    // to minimize the number of memory operations:
    // write back the 4 state 4-byte values in pairs, with a single sd
    __ mv(t0, mask32);
    __ andr(state0, state0, t0);
    __ slli(state1, state1, 32);
    __ orr(state0, state0, state1);
    __ sd(state0, Address(state));
    __ andr(state2, state2, t0);
    __ slli(state3, state3, 32);
    __ orr(state2, state2, state3);
    __ sd(state2, Address(state, 8));

    __ pop_reg(saved_regs, sp);
    __ ret();

    return (address) start;
  }

  /**
   * Perform the quarter round calculations on values contained within four vector registers.
   *
   * @param aVec the SIMD register containing only the "a" values
   * @param bVec the SIMD register containing only the "b" values
   * @param cVec the SIMD register containing only the "c" values
   * @param dVec the SIMD register containing only the "d" values
   * @param tmp_vr temporary vector register holding intermediate values.
   */
  void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
                              VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
    // a += b, d ^= a, d <<<= 16
    __ vadd_vv(aVec, aVec, bVec);
    __ vxor_vv(dVec, dVec, aVec);
    __ vrole32_vi(dVec, 16, tmp_vr);

    // c += d, b ^= c, b <<<= 12
    __ vadd_vv(cVec, cVec, dVec);
    __ vxor_vv(bVec, bVec, cVec);
    __ vrole32_vi(bVec, 12, tmp_vr);

    // a += b, d ^= a, d <<<= 8
    __ vadd_vv(aVec, aVec, bVec);
    __ vxor_vv(dVec, dVec, aVec);
    __ vrole32_vi(dVec, 8, tmp_vr);

    // c += d, b ^= c, b <<<= 7
    __ vadd_vv(cVec, cVec, dVec);
    __ vxor_vv(bVec, bVec, cVec);
    __ vrole32_vi(bVec, 7, tmp_vr);
  }
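  // The eight quarter-round calls per iteration in the block function
  // below follow the standard ChaCha20 pattern (RFC 8439): four "column"
  // rounds on state indices (0,4,8,12) ... (3,7,11,15), then four
  // "diagonal" rounds on (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14).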

  /**
   * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
   *
   *  Input arguments:
   *  c_rarg0   - state, the starting state
   *  c_rarg1   - key_stream, the array that will hold the result of the ChaCha20 block function
   *
   *  Implementation Note:
   *   Parallelization is achieved by loading individual state elements into vectors for N blocks.
   *   N depends on the single vector register length.
   */
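  // For instance (illustrative numbers): with VLEN = 128 and e32/m1,
  // vl = 4, so 4 blocks (256 bytes of key stream) are produced per call;
  // with VLEN = 512, vl = 16 and the full 1024-byte buffer is filled.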
  address generate_chacha20Block() {
    Label L_Rounds;

    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::chacha20Block_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const int states_len = 16;
    const int step = 4;
    const Register state = c_rarg0;
    const Register key_stream = c_rarg1;
    const Register tmp_addr = t0;
    const Register length = t1;

    // Organize vector registers in an array that facilitates
    // putting repetitive opcodes into loop structures below.
    const VectorRegister work_vrs[16] = {
      v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    };
    const VectorRegister tmp_vr = v16;
    const VectorRegister counter_vr = v17;

    {
      // Put 16 here, as com.sun.crypto.provider.ChaCha20Cipher.KS_MAX_LEN is 1024
      // at the Java level.
      __ vsetivli(length, 16, Assembler::e32, Assembler::m1);
    }

    // Load from source state.
    // Every element in source state is duplicated to all elements in the corresponding vector.
    __ mv(tmp_addr, state);
    for (int i = 0; i < states_len; i += 1) {
      __ vlse32_v(work_vrs[i], tmp_addr, zr);
      __ addi(tmp_addr, tmp_addr, step);
    }
    // Adjust counter for every individual block.
    __ vid_v(counter_vr);
    __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);

    // Perform 10 iterations of the 8 quarter round set
    {
      const Register loop = t2; // share t2 with other non-overlapping usages.
      __ mv(loop, 10);
      __ BIND(L_Rounds);

      chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8],  work_vrs[12], tmp_vr);
      chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9],  work_vrs[13], tmp_vr);
      chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr);
      chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr);

      chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr);
      chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr);
      chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8],  work_vrs[13], tmp_vr);
      chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9],  work_vrs[14], tmp_vr);

      __ subi(loop, loop, 1);
      __ bnez(loop, L_Rounds);
    }

    // Add the original state into the end working state.
    // We do this by first duplicating every element in source state array to the corresponding
    // vector, then adding it to the post-loop working state.
    __ mv(tmp_addr, state);
    for (int i = 0; i < states_len; i += 1) {
      __ vlse32_v(tmp_vr, tmp_addr, zr);
      __ addi(tmp_addr, tmp_addr, step);
      __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr);
    }
    // Add the counter overlay onto work_vrs[12] at the end.
    __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);

    // Store result to key stream.
    {
      const Register stride = t2; // share t2 with other non-overlapping usages.
      // Every block occupies 64 bytes, so we use 64 as stride of the vector store.
      __ mv(stride, 64);
      for (int i = 0; i < states_len; i += 1) {
        __ vsse32_v(work_vrs[i], key_stream, stride);
        __ addi(key_stream, key_stream, step);
      }
    }

    // Return length of output key_stream
    __ slli(c_rarg0, length, 6);

    __ leave();
    __ ret();

    return (address) start;
  }
|
|
|
|
|
|
// ------------------------ SHA-1 intrinsic ------------------------
|
|
|
|
// K't =
|
|
// 5a827999, 0 <= t <= 19
|
|
// 6ed9eba1, 20 <= t <= 39
|
|
// 8f1bbcdc, 40 <= t <= 59
|
|
// ca62c1d6, 60 <= t <= 79
|
|
void sha1_prepare_k(Register cur_k, int round) {
|
|
assert(round >= 0 && round < 80, "must be");
|
|
|
|
static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};
|
|
if ((round % 20) == 0) {
|
|
__ mv(cur_k, ks[round/20]);
|
|
}
|
|
}
|
|
|
|
// W't =
|
|
// M't, 0 <= t <= 15
|
|
// ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
|
|
void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) {
|
|
assert(round >= 0 && round < 80, "must be");
|
|
|
|
if (round < 16) {
|
|
// in the first 16 rounds, in ws[], every register contains 2 W't, e.g.
|
|
// in ws[0], high part contains W't-0, low part contains W't-1,
|
|
// in ws[1], high part contains W't-2, low part contains W't-3,
|
|
// ...
|
|
// in ws[7], high part contains W't-14, low part contains W't-15.
|
|
|
|
if ((round % 2) == 0) {
|
|
__ ld(ws[round/2], Address(buf, (round/2) * 8));
|
|
// reverse bytes, as SHA-1 is defined in big-endian.
|
|
__ revb(ws[round/2], ws[round/2]);
|
|
__ srli(cur_w, ws[round/2], 32);
|
|
} else {
|
|
__ mv(cur_w, ws[round/2]);
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
if ((round % 2) == 0) {
|
|
int idx = 16;
|
|
// W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
|
|
__ srli(t1, ws[(idx-8)/2], 32);
|
|
__ xorr(t0, ws[(idx-3)/2], t1);
|
|
|
|
__ srli(t1, ws[(idx-14)/2], 32);
|
|
__ srli(cur_w, ws[(idx-16)/2], 32);
|
|
__ xorr(cur_w, cur_w, t1);
|
|
|
|
__ xorr(cur_w, cur_w, t0);
|
|
__ rolw(cur_w, cur_w, 1, t0);
|
|
|
|
// copy the cur_w value to ws[8].
|
|
// now, valid w't values are at:
|
|
// w0: ws[0]'s lower 32 bits
|
|
// w1 ~ w14: ws[1] ~ ws[7]
|
|
// w15: ws[8]'s higher 32 bits
|
|
__ slli(ws[idx/2], cur_w, 32);
|
|
|
|
return;
|
|
}
|
|
|
|
int idx = 17;
|
|
// W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
|
|
__ srli(t1, ws[(idx-3)/2], 32);
|
|
__ xorr(t0, t1, ws[(idx-8)/2]);
|
|
|
|
__ xorr(cur_w, ws[(idx-16)/2], ws[(idx-14)/2]);
|
|
|
|
__ xorr(cur_w, cur_w, t0);
|
|
__ rolw(cur_w, cur_w, 1, t0);
|
|
|
|
// copy the cur_w value to ws[8]
|
|
__ zext(cur_w, cur_w, 32);
|
|
__ orr(ws[idx/2], ws[idx/2], cur_w);
|
|
|
|
// shift the w't registers, so they start from ws[0] again.
|
|
// now, valid w't values are at:
|
|
// w0 ~ w15: ws[0] ~ ws[7]
|
|
Register ws_0 = ws[0];
|
|
for (int i = 0; i < 16/2; i++) {
|
|
ws[i] = ws[i+1];
|
|
}
|
|
ws[8] = ws_0;
|
|
}
|
|
|
|
// f't(x, y, z) =
|
|
// Ch(x, y, z) = (x & y) ^ (~x & z) , 0 <= t <= 19
|
|
// Parity(x, y, z) = x ^ y ^ z , 20 <= t <= 39
|
|
// Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) , 40 <= t <= 59
|
|
// Parity(x, y, z) = x ^ y ^ z , 60 <= t <= 79
|
|
void sha1_f(Register dst, Register x, Register y, Register z, int round) {
|
|
assert(round >= 0 && round < 80, "must be");
|
|
assert_different_registers(dst, x, y, z, t0, t1);
|
|
|
|
if (round < 20) {
|
|
// (x & y) ^ (~x & z)
|
|
__ andr(t0, x, y);
|
|
__ andn(dst, z, x);
|
|
__ xorr(dst, dst, t0);
|
|
} else if (round >= 40 && round < 60) {
|
|
// (x & y) ^ (x & z) ^ (y & z)
|
|
__ andr(t0, x, y);
|
|
__ andr(t1, x, z);
|
|
__ andr(dst, y, z);
|
|
__ xorr(dst, dst, t0);
|
|
__ xorr(dst, dst, t1);
|
|
} else {
|
|
// x ^ y ^ z
|
|
__ xorr(dst, x, y);
|
|
__ xorr(dst, dst, z);
|
|
}
|
|
}
|
|
|
|
// T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
|
|
// e = d
|
|
// d = c
|
|
// c = ROTL'30(b)
|
|
// b = a
|
|
// a = T
|
|
void sha1_process_round(Register a, Register b, Register c, Register d, Register e,
|
|
Register cur_k, Register cur_w, Register tmp, int round) {
|
|
assert(round >= 0 && round < 80, "must be");
|
|
assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0);
|
|
|
|
// T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
|
|
|
|
// cur_w will be recalculated at the beginning of each round,
|
|
// so, we can reuse it as a temp register here.
|
|
Register tmp2 = cur_w;
|
|
|
|
// reuse e as a temporary register, as we will mv new value into it later
|
|
Register tmp3 = e;
|
|
__ add(tmp2, cur_k, tmp2);
|
|
__ add(tmp3, tmp3, tmp2);
|
|
__ rolw(tmp2, a, 5, t0);
|
|
|
|
sha1_f(tmp, b, c, d, round);
|
|
|
|
__ add(tmp2, tmp2, tmp);
|
|
__ add(tmp2, tmp2, tmp3);
|
|
|
|
// e = d
|
|
// d = c
|
|
// c = ROTL'30(b)
|
|
// b = a
|
|
// a = T
|
|
__ mv(e, d);
|
|
__ mv(d, c);
|
|
|
|
__ rolw(c, b, 30);
|
|
__ mv(b, a);
|
|
__ mv(a, tmp2);
|
|
}
|
|
|
|
  // H(i)0 = a + H(i-1)0
  // H(i)1 = b + H(i-1)1
  // H(i)2 = c + H(i-1)2
  // H(i)3 = d + H(i-1)3
  // H(i)4 = e + H(i-1)4
  void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e,
                              Register prev_ab, Register prev_cd, Register prev_e) {
    assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e);

    __ add(a, a, prev_ab);
    __ srli(prev_ab, prev_ab, 32);
    __ add(b, b, prev_ab);

    __ add(c, c, prev_cd);
    __ srli(prev_cd, prev_cd, 32);
    __ add(d, d, prev_cd);

    __ add(e, e, prev_e);
  }

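  // Since prev_ab packs the previous a in its low word and b in its high
  // word (and likewise prev_cd, see sha1_preserve_prev_abcde below), the
  // update above is, in scalar terms:
  //
  //   a += prev_ab & 0xffffffff;   b += prev_ab >> 32;
  //   c += prev_cd & 0xffffffff;   d += prev_cd >> 32;
  //   e += prev_e;
  //
  // Only the low 32 bits of each sum are meaningful; any garbage in the
  // high halves is ignored by the 32-bit round operations.
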
  void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e,
                                Register prev_ab, Register prev_cd, Register prev_e) {
    assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0);

    __ slli(t0, b, 32);
    __ zext(prev_ab, a, 32);
    __ orr(prev_ab, prev_ab, t0);

    __ slli(t0, d, 32);
    __ zext(prev_cd, c, 32);
    __ orr(prev_cd, prev_cd, t0);

    __ mv(prev_e, e);
  }

  // Intrinsic for:
  //   void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs)
  //   void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit)
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0: byte[] src array + offset
  //   c_rarg1: int[]  SHA.state
  //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
  //   c_rarg2: int    offset
  //   c_rarg3: int    limit
  //
  // Outputs:
  //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
  //   c_rarg0: int offset, when (multi_block == true)
  //
  address generate_sha1_implCompress(StubGenStubId stub_id) {
    bool multi_block;
    switch (stub_id) {
    case sha1_implCompress_id:
      multi_block = false;
      break;
    case sha1_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    };
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);

    address start = __ pc();
    __ enter();

    RegSet saved_regs = RegSet::range(x18, x27);
    if (multi_block) {
      // use x9 as src below.
      saved_regs += RegSet::of(x9);
    }
    __ push_reg(saved_regs, sp);

    // c_rarg0 - c_rarg3: x10 - x13
    Register buf    = c_rarg0;
    Register state  = c_rarg1;
    Register offset = c_rarg2;
    Register limit  = c_rarg3;
    // use src to contain the original start point of the array.
    Register src    = x9;

    if (multi_block) {
      __ sub(limit, limit, offset);
      __ add(limit, limit, buf);
      __ sub(src, buf, offset);
    }

    // [args-reg]:  x14 - x17
    // [temp-reg]:  x28 - x31
    // [saved-reg]: x18 - x27

    // h0/1/2/3/4
    const Register a = x14, b = x15, c = x16, d = x17, e = x28;
    // w0, w1, ... w15
    // put two adjacent w's in one register:
    //   one in the high word part, another in the low word part
    // depending on the round (even or odd), the w't values reside in
    // different slots of ws[]. w0 ~ w15 either reside in
    //   ws[0] ~ ws[7], where
    //     w0 at higher 32 bits of ws[0],
    //     w1 at lower 32 bits of ws[0],
    //     ...
    //     w14 at higher 32 bits of ws[7],
    //     w15 at lower 32 bits of ws[7].
    // or reside in
    //   w0:       ws[0]'s lower 32 bits
    //   w1 ~ w14: ws[1] ~ ws[7]
    //   w15:      ws[8]'s higher 32 bits
    Register ws[9] = {x29, x30, x31, x18,
                      x19, x20, x21, x22,
                      x23}; // auxiliary register for calculating w's value
    // current k't's value
    const Register cur_k = x24;
    // current w't's value
    const Register cur_w = x25;
    // values of a, b, c, d, e in the previous round
    const Register prev_ab = x26, prev_cd = x27;
    const Register prev_e = offset; // reuse offset/c_rarg2

    // load 5 words state into a, b, c, d, e.
    //
    // To minimize the number of memory operations, we apply the following
    // optimization: read the 4-byte state values (a/b/c/d) in pairs, each
    // pair with a single ld, and split it into 2 registers.
    //
    // Also, as the core algorithm of SHA-1 works on 32-bit words, the
    // following code does not care about the content of the higher 32 bits
    // in a/b/c/d/e. Based on this observation, we can apply a further
    // optimization: simply ignore the higher 32 bits of a/c/e, rather than
    // zeroing them explicitly with extra instructions.
    __ ld(a, Address(state, 0));
    __ srli(b, a, 32);
    __ ld(c, Address(state, 8));
    __ srli(d, c, 32);
    __ lw(e, Address(state, 16));

    Label L_sha1_loop;
    if (multi_block) {
      __ BIND(L_sha1_loop);
    }

    sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e);

    for (int round = 0; round < 80; round++) {
      // prepare K't value
      sha1_prepare_k(cur_k, round);

      // prepare W't value
      sha1_prepare_w(cur_w, ws, buf, round);

      // one round process
      sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round);
    }

    // compute the intermediate hash value
    sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e);

    if (multi_block) {
      int64_t block_bytes = 16 * 4;
      __ addi(buf, buf, block_bytes);

      __ bge(limit, buf, L_sha1_loop, true);
    }

    // store back the state.
    __ zext(a, a, 32);
    __ slli(b, b, 32);
    __ orr(a, a, b);
    __ sd(a, Address(state, 0));
    __ zext(c, c, 32);
    __ slli(d, d, 32);
    __ orr(c, c, d);
    __ sd(c, Address(state, 8));
    __ sw(e, Address(state, 16));

    // return offset
    if (multi_block) {
      __ sub(c_rarg0, buf, src);
    }

    __ pop_reg(saved_regs, sp);

    __ leave();
    __ ret();

    return (address) start;
  }

  /**
   * vector registers:
   * input VectorRegister's:  inputV1-V3, for m2 they could be v2, v4, v6, for m1 they could be v1, v2, v3
   * index VectorRegister's:  idxV1-V4, for m2 they could be v8, v10, v12, v14, for m1 they could be v4, v5, v6, v7
   * output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v8, v9, v10, v11
   *
   * NOTE: each field will occupy a vector register group
   */
  void base64_vector_encode_round(Register src, Register dst, Register codec,
                                  Register size, Register stepSrc, Register stepDst,
                                  VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3,
                                  VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
                                  VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, VectorRegister outputV4,
                                  Assembler::LMUL lmul) {
    // set vector register type/len
    __ vsetvli(x0, size, Assembler::e8, lmul);

    // segmented load src into v registers: mem(src) => vr(3)
    __ vlseg3e8_v(inputV1, src);

    // src = src + register_group_len_bytes * 3
    __ add(src, src, stepSrc);

    // encoding
    // 1. compute index into lookup table: vr(3) => vr(4)
    __ vsrl_vi(idxV1, inputV1, 2);

    __ vsrl_vi(idxV2, inputV2, 2);
    __ vsll_vi(inputV1, inputV1, 6);
    __ vor_vv(idxV2, idxV2, inputV1);
    __ vsrl_vi(idxV2, idxV2, 2);

    __ vsrl_vi(idxV3, inputV3, 4);
    __ vsll_vi(inputV2, inputV2, 4);
    __ vor_vv(idxV3, inputV2, idxV3);
    __ vsrl_vi(idxV3, idxV3, 2);

    __ vsll_vi(idxV4, inputV3, 2);
    __ vsrl_vi(idxV4, idxV4, 2);

    // 2. indexed load: vr(4) => vr(4)
    __ vluxei8_v(outputV1, codec, idxV1);
    __ vluxei8_v(outputV2, codec, idxV2);
    __ vluxei8_v(outputV3, codec, idxV3);
    __ vluxei8_v(outputV4, codec, idxV4);

    // segmented store encoded data in v registers back to dst: vr(4) => mem(dst)
    __ vsseg4e8_v(outputV1, dst);

    // dst = dst + register_group_len_bytes * 4
    __ add(dst, dst, stepDst);
  }

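  // Per element, the index computation above is the standard 6-bit split;
  // a scalar sketch (b0/b1/b2 are the three input bytes):
  //
  //   idx1 =  b0 >> 2;
  //   idx2 = ((b0 << 4) | (b1 >> 4)) & 0x3f;
  //   idx3 = ((b1 << 2) | (b2 >> 6)) & 0x3f;
  //   idx4 =  b2 & 0x3f;
  //
  // The vector code reaches the same masks with shift pairs: on 8-bit
  // elements, (x << 2) >> 2 clears the top two bits.
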
  /**
   * void j.u.Base64.Encoder.encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
   *
   * Input arguments:
   * c_rarg0 - src, source array
   * c_rarg1 - sp, src start offset
   * c_rarg2 - sl, src end offset
   * c_rarg3 - dst, dest array
   * c_rarg4 - dp, dst start offset
   * c_rarg5 - isURL, Base64 or URL character set
   */
  address generate_base64_encodeBlock() {
    alignas(64) static const char toBase64[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
    };

    alignas(64) static const char toBase64URL[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
    };

    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    Register src    = c_rarg0;
    Register soff   = c_rarg1;
    Register send   = c_rarg2;
    Register dst    = c_rarg3;
    Register doff   = c_rarg4;
    Register isURL  = c_rarg5;

    Register codec  = c_rarg6;
    Register length = c_rarg7; // total length of src data in bytes

    Label ProcessData, Exit;

    // length should be a multiple of 3
    __ sub(length, send, soff);
    // real src/dst to process data
    __ add(src, src, soff);
    __ add(dst, dst, doff);

    // load the codec base address
    __ la(codec, ExternalAddress((address) toBase64));
    __ beqz(isURL, ProcessData);
    __ la(codec, ExternalAddress((address) toBase64URL));
    __ BIND(ProcessData);

    // vector version
    if (UseRVV) {
      Label ProcessM2, ProcessM1, ProcessScalar;

      Register size      = soff;
      Register stepSrcM1 = send;
      Register stepSrcM2 = doff;
      Register stepDst   = isURL;

      __ mv(size, MaxVectorSize * 2);
      __ mv(stepSrcM1, MaxVectorSize * 3);
      __ slli(stepSrcM2, stepSrcM1, 1);
      __ mv(stepDst, MaxVectorSize * 2 * 4);

      __ blt(length, stepSrcM2, ProcessM1);

      __ BIND(ProcessM2);
      base64_vector_encode_round(src, dst, codec,
                                 size, stepSrcM2, stepDst,
                                 v2, v4, v6,         // inputs
                                 v8, v10, v12, v14,  // indexes
                                 v16, v18, v20, v22, // outputs
                                 Assembler::m2);

      __ sub(length, length, stepSrcM2);
      __ bge(length, stepSrcM2, ProcessM2);

      __ BIND(ProcessM1);
      __ blt(length, stepSrcM1, ProcessScalar);

      __ srli(size, size, 1);
      __ srli(stepDst, stepDst, 1);
      base64_vector_encode_round(src, dst, codec,
                                 size, stepSrcM1, stepDst,
                                 v1, v2, v3,       // inputs
                                 v4, v5, v6, v7,   // indexes
                                 v8, v9, v10, v11, // outputs
                                 Assembler::m1);
      __ sub(length, length, stepSrcM1);

      __ BIND(ProcessScalar);
    }

    // scalar version
    {
      Register byte1 = soff, byte0 = send, byte2 = doff;
      Register combined24Bits = isURL;

      __ beqz(length, Exit);

      Label ScalarLoop;
      __ BIND(ScalarLoop);
      {
        // plain:   [byte0[7:0] : byte1[7:0] : byte2[7:0]] =>
        // encoded: [byte0[7:2] : byte0[1:0]+byte1[7:4] : byte1[3:0]+byte2[7:6] : byte2[5:0]]

        // load 3 bytes src data
        __ lbu(byte0, Address(src, 0));
        __ lbu(byte1, Address(src, 1));
        __ lbu(byte2, Address(src, 2));
        __ addi(src, src, 3);

        // construct 24 bits from 3 bytes
        __ slliw(byte0, byte0, 16);
        __ slliw(byte1, byte1, 8);
        __ orr(combined24Bits, byte0, byte1);
        __ orr(combined24Bits, combined24Bits, byte2);

        // get codec index and encode (i.e. load from codec by index)
        __ slliw(byte0, combined24Bits, 8);
        __ srliw(byte0, byte0, 26);
        __ add(byte0, codec, byte0);
        __ lbu(byte0, byte0);

        __ slliw(byte1, combined24Bits, 14);
        __ srliw(byte1, byte1, 26);
        __ add(byte1, codec, byte1);
        __ lbu(byte1, byte1);

        __ slliw(byte2, combined24Bits, 20);
        __ srliw(byte2, byte2, 26);
        __ add(byte2, codec, byte2);
        __ lbu(byte2, byte2);

        __ andi(combined24Bits, combined24Bits, 0x3f);
        __ add(combined24Bits, codec, combined24Bits);
        __ lbu(combined24Bits, combined24Bits);

        // store 4 bytes encoded data
        __ sb(byte0, Address(dst, 0));
        __ sb(byte1, Address(dst, 1));
        __ sb(byte2, Address(dst, 2));
        __ sb(combined24Bits, Address(dst, 3));

        __ subi(length, length, 3);
        __ addi(dst, dst, 4);
        // loop back
        __ bnez(length, ScalarLoop);
      }
    }

    __ BIND(Exit);

    __ leave();
    __ ret();

    return (address) start;
  }

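  // Worked example for the scalar path: encoding "Man" (0x4d 0x61 0x6e)
  // gives combined24Bits = 0x4d616e; the four 6-bit indices are 0x13,
  // 0x16, 0x05 and 0x2e, which look up to "TWFu" in toBase64.
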
  /**
   * vector registers (register numbers per the call sites below):
   * input VectorRegister's:  inputV1-V4, for m2 they could be v2, v4, v6, v8, for m1 they could be v1, v2, v3, v4
   * index VectorRegister's:  idxV1-V4, for m2 they could be v10, v12, v14, v16, for m1 they could be v5, v6, v7, v8
   * output VectorRegister's: outputV1-V3, for m2 they could be v18, v20, v22, for m1 they could be v9, v10, v11
   *
   * NOTE: each field will occupy a single vector register group
   */
  void base64_vector_decode_round(Register src, Register dst, Register codec,
                                  Register size, Register stepSrc, Register stepDst, Register failedIdx,
                                  VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4,
                                  VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
                                  VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3,
                                  Assembler::LMUL lmul) {
    // set vector register type/len
    __ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta);

    // segmented load src into v registers: mem(src) => vr(4)
    __ vlseg4e8_v(inputV1, src);

    // src = src + register_group_len_bytes * 4
    __ add(src, src, stepSrc);

    // decoding
    // 1. indexed load: vr(4) => vr(4)
    __ vluxei8_v(idxV1, codec, inputV1);
    __ vluxei8_v(idxV2, codec, inputV2);
    __ vluxei8_v(idxV3, codec, inputV3);
    __ vluxei8_v(idxV4, codec, inputV4);

    // 2. check for invalid data
    __ vor_vv(outputV1, idxV1, idxV2);
    __ vor_vv(outputV2, idxV3, idxV4);
    __ vor_vv(outputV1, outputV1, outputV2);
    __ vmseq_vi(v0, outputV1, -1);
    __ vfirst_m(failedIdx, v0);
    Label NoFailure, FailureAtIdx0;
    // the only valid negative value of failedIdx is -1, meaning no failure
    __ bltz(failedIdx, NoFailure);
    // when the first element (at index 0) fails, no need to process any data
    __ beqz(failedIdx, FailureAtIdx0);
    __ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu);
    __ slli(stepDst, failedIdx, 1);
    __ add(stepDst, failedIdx, stepDst);
    __ BIND(NoFailure);

    // 3. compute the decoded data: vr(4) => vr(3)
    __ vsll_vi(idxV1, idxV1, 2);
    __ vsrl_vi(outputV1, idxV2, 4);
    __ vor_vv(outputV1, outputV1, idxV1);

    __ vsll_vi(idxV2, idxV2, 4);
    __ vsrl_vi(outputV2, idxV3, 2);
    __ vor_vv(outputV2, outputV2, idxV2);

    __ vsll_vi(idxV3, idxV3, 6);
    __ vor_vv(outputV3, idxV4, idxV3);

    // segmented store decoded data in v registers back to dst: vr(3) => mem(dst)
    __ vsseg3e8_v(outputV1, dst);

    // dst = dst + register_group_len_bytes * 3
    __ add(dst, dst, stepDst);
    __ BIND(FailureAtIdx0);
  }

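  // The recombination in step 3 is, per element, the usual 6-bit merge;
  // a scalar sketch (i0..i3 are the four decoded 6-bit values):
  //
  //   b0 = (i0 << 2) | (i1 >> 4);
  //   b1 = (i1 << 4) | (i2 >> 2);
  //   b2 = (i2 << 6) |  i3;
  //
  // On 8-bit vector elements the shifts discard the excess high bits, so
  // no extra masking is needed.
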
  /**
   * int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
   *
   * Input arguments:
   * c_rarg0 - src, source array
   * c_rarg1 - sp, src start offset
   * c_rarg2 - sl, src end offset
   * c_rarg3 - dst, dest array
   * c_rarg4 - dp, dst start offset
   * c_rarg5 - isURL, Base64 or URL character set
   * c_rarg6 - isMIME, Decoding MIME block
   */
  address generate_base64_decodeBlock() {

    static const uint8_t fromBase64[256] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
      52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
      15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
      255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
      41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    };

    static const uint8_t fromBase64URL[256] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
      52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
      15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
      255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
      41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    };

    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    Register src    = c_rarg0;
    Register soff   = c_rarg1;
    Register send   = c_rarg2;
    Register dst    = c_rarg3;
    Register doff   = c_rarg4;
    Register isURL  = c_rarg5;
    Register isMIME = c_rarg6;

    Register codec     = c_rarg7;
    Register dstBackup = t6;
    Register length    = t3; // total length of src data in bytes

    Label ProcessData, Exit;
    Label ProcessScalar, ScalarLoop;

    // the length passed in (send - soff) is guaranteed to be > 4, and in this
    // intrinsic we only process data whose length is a multiple of 4; the
    // Java level does not guarantee that, so enforce it explicitly here
    __ sub(length, send, soff);
    __ andi(length, length, -4);
    // real src/dst to process data
    __ add(src, src, soff);
    __ add(dst, dst, doff);
    // backup of dst, used to calculate the return value at exit
    __ mv(dstBackup, dst);

    // load the codec base address
    __ la(codec, ExternalAddress((address) fromBase64));
    __ beqz(isURL, ProcessData);
    __ la(codec, ExternalAddress((address) fromBase64URL));
    __ BIND(ProcessData);

    // vector version
    if (UseRVV) {
      // the MIME case has a default line-length limit of 76, which can be
      // smaller than (send - soff), so in the MIME case we go through
      // the scalar code path directly.
      __ bnez(isMIME, ScalarLoop);

      Label ProcessM1, ProcessM2;

      Register failedIdx = soff;
      Register stepSrcM1 = send;
      Register stepSrcM2 = doff;
      Register stepDst   = isURL;
      Register size      = t4;

      __ mv(size, MaxVectorSize * 2);
      __ mv(stepSrcM1, MaxVectorSize * 4);
      __ slli(stepSrcM2, stepSrcM1, 1);
      __ mv(stepDst, MaxVectorSize * 2 * 3);

      __ blt(length, stepSrcM2, ProcessM1);


      // Assembler::m2
      __ BIND(ProcessM2);
      base64_vector_decode_round(src, dst, codec,
                                 size, stepSrcM2, stepDst, failedIdx,
                                 v2, v4, v6, v8,     // inputs
                                 v10, v12, v14, v16, // indexes
                                 v18, v20, v22,      // outputs
                                 Assembler::m2);
      __ sub(length, length, stepSrcM2);

      // error check
      // the only valid negative value of failedIdx is -1, meaning no failure
      __ bgez(failedIdx, Exit);

      __ bge(length, stepSrcM2, ProcessM2);


      // Assembler::m1
      __ BIND(ProcessM1);
      __ blt(length, stepSrcM1, ProcessScalar);

      __ srli(size, size, 1);
      __ srli(stepDst, stepDst, 1);
      base64_vector_decode_round(src, dst, codec,
                                 size, stepSrcM1, stepDst, failedIdx,
                                 v1, v2, v3, v4, // inputs
                                 v5, v6, v7, v8, // indexes
                                 v9, v10, v11,   // outputs
                                 Assembler::m1);
      __ sub(length, length, stepSrcM1);

      // error check
      // the only valid negative value of failedIdx is -1, meaning no failure
      __ bgez(failedIdx, Exit);

      __ BIND(ProcessScalar);
      __ beqz(length, Exit);
    }

    // scalar version
    {
      Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL;
      Register combined32Bits = t4;

      // encoded: [byte0[5:0] : byte1[5:0] : byte2[5:0] : byte3[5:0]] =>
      // plain:   [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]]
      __ BIND(ScalarLoop);

      // load 4 bytes encoded src data
      __ lbu(byte0, Address(src, 0));
      __ lbu(byte1, Address(src, 1));
      __ lbu(byte2, Address(src, 2));
      __ lbu(byte3, Address(src, 3));
      __ addi(src, src, 4);

      // get codec index and decode (i.e. load from codec by index)
      __ add(byte0, codec, byte0);
      __ add(byte1, codec, byte1);
      __ lb(byte0, Address(byte0, 0));
      __ lb(byte1, Address(byte1, 0));
      __ add(byte2, codec, byte2);
      __ add(byte3, codec, byte3);
      __ lb(byte2, Address(byte2, 0));
      __ lb(byte3, Address(byte3, 0));
      __ slliw(byte0, byte0, 18);
      __ slliw(byte1, byte1, 12);
      __ orr(byte0, byte0, byte1);
      __ orr(byte0, byte0, byte3);
      __ slliw(byte2, byte2, 6);
      // For performance, combined32Bits is constructed for 2 purposes at the
      // same time:
      //   1. the error check below
      //   2. the decode below
      __ orr(combined32Bits, byte0, byte2);

      // error check
      __ bltz(combined32Bits, Exit);

      // store 3 bytes decoded data
      __ sraiw(byte0, combined32Bits, 16);
      __ sraiw(byte1, combined32Bits, 8);
      __ sb(byte0, Address(dst, 0));
      __ sb(byte1, Address(dst, 1));
      __ sb(combined32Bits, Address(dst, 2));

      __ subi(length, length, 4);
      __ addi(dst, dst, 3);
      // loop back
      __ bnez(length, ScalarLoop);
    }

    __ BIND(Exit);
    __ sub(c_rarg0, dst, dstBackup);

    __ leave();
    __ ret();

    return (address) start;
  }

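  // The sign-based error check in the scalar loop works because invalid
  // characters map to 255u in the tables and lb sign-extends that to -1,
  // so after the shifts and ors any invalid input sets bit 31:
  //
  //   v = (d0 << 18) | (d1 << 12) | (d2 << 6) | d3;  // d* are lb results
  //   if (v < 0)  // some d* was -1 (invalid input), bail out to Exit
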
  void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
                             VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
                             Register temp0, Register temp1, Register temp2, Register temp3,
                             VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) {

    assert((lmul == Assembler::m4 && step == 64) ||
           (lmul == Assembler::m2 && step == 32) ||
           (lmul == Assembler::m1 && step == 16),
           "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16");
    // Below is the function for calculating the Adler32 checksum with a 64-, 32- or
    // 16-byte step; LMUL=m4, m2 or m1 is used accordingly.
    // The results are in v12, v13, ..., v22, v23. The example below is for the 64-byte step case.
    // We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration.
    // In non-vectorized code, we update s1 and s2 as:
    //   s1 <- s1 + b1
    //   s2 <- s2 + s1
    //   s1 <- s1 + b2
    //   s2 <- s2 + s1
    //   ...
    //   s1 <- s1 + b64
    //   s2 <- s2 + s1
    // Putting the above assignments together, we have:
    //   s1_new = s1 + b1 + b2 + ... + b64
    //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) =
    //          = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) =
    //          = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1)

    __ mv(temp3, step);
    // Load data
    __ vsetvli(temp0, temp3, Assembler::e8, lmul);
    __ vle8_v(vbytes, buff);
    __ addi(buff, buff, step);

    // Upper bound reduction sum for s1_new:
    // 0xFF * 64 = 0x3FC0, so:
    // 1. Need to do vector-widening reduction sum
    // 2. It is safe to perform sign-extension during vmv.x.s with 16-bit elements
    __ vwredsumu_vs(vs1acc, vbytes, vzero);
    // Multiplication for s2_new
    __ vwmulu_vv(vs2acc, vtable, vbytes);

    // s2 = s2 + s1 * step (via a shift left by log2(step))
    __ slli(temp1, s1, exact_log2(step));
    __ add(s2, s2, temp1);

    // Summing up calculated results for s2_new
    if (MaxVectorSize > 16) {
      __ vsetvli(temp0, temp3, Assembler::e16, lmul);
    } else {
      // For vlen == 16, half of the vector-widening multiplication result
      // lands in the successor of the vs2acc group, so we need to double the
      // vector register group width in order to reduction-sum all of it
      Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 :
                               (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8;
      __ vsetvli(temp0, temp3, Assembler::e16, lmulx2);
    }
    // Upper bound for reduction sum:
    // 0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for the whole register group, so:
    // 1. Need to do vector-widening reduction sum
    // 2. It is safe to perform sign-extension during vmv.x.s with 32-bit elements
    __ vwredsumu_vs(vtemp1, vs2acc, vzero);

    // Extracting results for:
    // s1_new
    __ vmv_x_s(temp0, vs1acc);
    __ add(s1, s1, temp0);
    // s2_new
    __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1);
    __ vmv_x_s(temp1, vtemp1);
    __ add(s2, s2, temp1);
  }

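  // For reference, the scalar recurrence this helper batches (per
  // RFC 1950; the mod-65521 reduction is applied by the caller of this
  // helper):
  //
  //   for (int i = 0; i < step; i++) {
  //     s1 += b[i];
  //     s2 += s1;
  //   }
  //
  // which collapses to s1 += sum(b) and
  // s2 += s1 * step + dot(b, {step, step-1, ..., 1}), exactly the
  // vwredsumu/vwmulu pair above.
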
  /***
   * int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len)
   *
   * Arguments:
   *
   * Inputs:
   *   c_rarg0 - int   adler
   *   c_rarg1 - byte* buff (b + off)
   *   c_rarg2 - int   len
   *
   * Output:
   *   c_rarg0 - int adler result
   */
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop,
          L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1;

    // Aliases
    Register adler = c_rarg0;
    Register s1    = c_rarg0;
    Register s2    = c_rarg3;
    Register buff  = c_rarg1;
    Register len   = c_rarg2;
    Register nmax  = c_rarg4;
    Register base  = c_rarg5;
    Register count = c_rarg6;
    Register temp0 = t3;
    Register temp1 = t4;
    Register temp2 = t5;
    Register temp3 = t6;

    VectorRegister vzero     = v31;
    VectorRegister vbytes    = v8;  // group: v8, v9, v10, v11
    VectorRegister vs1acc    = v12; // group: v12, v13, v14, v15
    VectorRegister vs2acc    = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23
    VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27
    VectorRegister vtable_32 = v4;  // group: v4, v5
    VectorRegister vtable_16 = v30;
    VectorRegister vtemp1    = v28;
    VectorRegister vtemp2    = v29;

    // Max number of bytes we can process before having to take the mod
    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
    const uint64_t BASE = 0xfff1;
    const uint64_t NMAX = 0x15B0;

    // Loop steps
    int step_64 = 64;
    int step_32 = 32;
    int step_16 = 16;
    int step_1  = 1;

    __ enter(); // Required for proper stackwalking of RuntimeStub frame
    __ mv(temp1, 64);
    __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4);

    // Generating accumulation coefficients for further calculations
    // vtable_64:
    __ vid_v(vtemp1);
    __ vrsub_vx(vtable_64, vtemp1, temp1);
    // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 }

    // vtable_32:
    __ mv(temp1, 32);
    __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2);
    __ vid_v(vtemp1);
    __ vrsub_vx(vtable_32, vtemp1, temp1);
    // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 }

    __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1);
    // vtable_16:
    __ mv(temp1, 16);
    __ vid_v(vtemp1);
    __ vrsub_vx(vtable_16, vtemp1, temp1);
    // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 }

    __ vmv_v_i(vzero, 0);

    __ mv(base, BASE);
    __ mv(nmax, NMAX);

    // s1 is initialized to the lower 16 bits of adler
    // s2 is initialized to the upper 16 bits of adler
    __ srliw(s2, adler, 16); // s2 = ((adler >> 16) & 0xffff)
    __ zext(s1, adler, 16);  // s1 = (adler & 0xffff)

    // The pipelined loop needs at least 16 elements per iteration;
    // it does check this itself, but it is more efficient to skip
    // straight to the cleanup loop
    __ mv(temp0, step_16);
    __ bgeu(len, temp0, L_nmax);
    __ beqz(len, L_combine);

    // Jumping to L_by1_loop
    __ subi(len, len, step_1);
    __ j(L_by1_loop);

    __ bind(L_nmax);
    __ sub(len, len, nmax);
    __ subi(count, nmax, 16);
    __ bltz(len, L_by16);

    // Align L_nmax loop by 64
    __ bind(L_nmax_loop_entry);
    __ subi(count, count, 32);

    __ bind(L_nmax_loop);
    adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
                          vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
                          vtemp1, vtemp2, step_64, Assembler::m4);
    __ subi(count, count, step_64);
    __ bgtz(count, L_nmax_loop);

    // There are three iterations left to do
    adler32_process_bytes(buff, s1, s2, vtable_32, vzero,
                          vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
                          vtemp1, vtemp2, step_32, Assembler::m2);
    adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
                          vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
                          vtemp1, vtemp2, step_16, Assembler::m1);

    // s1 = s1 % BASE
    __ remuw(s1, s1, base);
    // s2 = s2 % BASE
    __ remuw(s2, s2, base);

    __ sub(len, len, nmax);
    __ subi(count, nmax, 16);
    __ bgez(len, L_nmax_loop_entry);

    __ bind(L_by16);
    __ add(len, len, count);
    __ bltz(len, L_by1);
    // Trying to unroll
    __ mv(temp3, step_64);
    __ blt(len, temp3, L_by16_loop);

    __ bind(L_by16_loop_unroll);
    adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
                          vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
                          vtemp1, vtemp2, step_64, Assembler::m4);
    __ subi(len, len, step_64);
    // temp3 still holds 64 here
    __ bge(len, temp3, L_by16_loop_unroll);

    __ bind(L_by16_loop);
    adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
                          vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
                          vtemp1, vtemp2, step_16, Assembler::m1);
    __ subi(len, len, step_16);
    __ bgez(len, L_by16_loop);

    __ bind(L_by1);
    __ addi(len, len, 15);
    __ bltz(len, L_do_mod);

    __ bind(L_by1_loop);
    __ lbu(temp0, Address(buff, 0));
    __ addi(buff, buff, step_1);
    __ add(s1, temp0, s1);
    __ add(s2, s2, s1);
    __ subi(len, len, step_1);
    __ bgez(len, L_by1_loop);

    __ bind(L_do_mod);
    // s1 = s1 % BASE
    __ remuw(s1, s1, base);
    // s2 = s2 % BASE
    __ remuw(s2, s2, base);

    // Combine lower bits and higher bits
    // adler = s1 | (s2 << 16)
    __ bind(L_combine);
    __ slli(s2, s2, 16);
    __ orr(s1, s1, s2);

    __ leave(); // Required for proper stackwalking of RuntimeStub frame
    __ ret();

    return start;
  }

#endif // COMPILER2_OR_JVMCI

  // x10 = input (float16)
  // f10 = result (float)
  // t1  = temporary register
  address generate_float16ToFloat() {
    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::hf2f_id;
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();
    BLOCK_COMMENT("float16ToFloat:");

    FloatRegister dst = f10;
    Register src = x10;
    Label NaN_SLOW;

    assert(VM_Version::supports_float16_float_conversion(), "must");

    // On riscv, NaN needs special processing, as fcvt does not work in that
    // case. Inf, on the other hand, needs no special processing, as fcvt
    // handles it correctly. We still let the slow path process NaN and Inf
    // together: both are rare cases, and making the slow path handle only
    // NaN would sacrifice performance in the normal cases, i.e. the
    // non-NaN and non-Inf cases.

    // check whether it's a NaN or +/- Inf.
    __ mv(t0, 0x7c00);
    __ andr(t1, src, t0);
    // jump to stub processing NaN and Inf cases.
    __ beq(t0, t1, NaN_SLOW);

    // non-NaN and non-Inf cases, just use built-in instructions.
    __ fmv_h_x(dst, src);
    __ fcvt_s_h(dst, dst);
    __ ret();

    __ bind(NaN_SLOW);
    // The following instructions mainly focus on NaN, as riscv does not
    // handle NaN well with fcvt, but the code also works for Inf.

    // construct a NaN in 32 bits from the NaN in 16 bits,
    // we need the payloads of non-canonical NaNs to be preserved.
    __ mv(t1, 0x7f800000);
    // sign-bit was already set via sign-extension if necessary.
    __ slli(t0, src, 13);
    __ orr(t1, t0, t1);
    __ fmv_w_x(dst, t1);

    __ ret();
    return entry;
  }

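  // Bit-level view of the slow path (IEEE 754 binary16 -> binary32 for
  // NaN/Inf): shifting left by 13 moves the 10 payload bits to the top of
  // the 23-bit float mantissa, and or-ing 0x7f800000 forces an all-ones
  // exponent. Roughly, with h being the sign-extended half value:
  //
  //   result = 0x7f800000 | (uint32_t)(h << 13);
  //
  // For Inf the payload is zero, so the same path produces the correct
  // 32-bit +/-Inf pattern.
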
  // f10 = input (float)
  // x10 = result (float16)
  // f11 = temporary float register
  // t1  = temporary register
  address generate_floatToFloat16() {
    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::f2hf_id;
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();
    BLOCK_COMMENT("floatToFloat16:");

    Register dst = x10;
    FloatRegister src = f10, ftmp = f11;
    Label NaN_SLOW;

    assert(VM_Version::supports_float16_float_conversion(), "must");

    // On riscv, NaN needs special processing, as fcvt does not work in that case.

    // check whether it's a NaN.
    // use feq instead of fclass as a performance optimization.
    __ feq_s(t0, src, src);
    // jump to stub processing NaN cases.
    __ beqz(t0, NaN_SLOW);

    // non-NaN cases, just use built-in instructions.
    __ fcvt_h_s(ftmp, src);
    __ fmv_x_h(dst, ftmp);
    __ ret();

    __ bind(NaN_SLOW);
    __ fmv_x_w(dst, src);

    // preserve the payloads of non-canonical NaNs.
    __ srai(dst, dst, 13);
    // preserve the sign bit.
    __ srai(t1, dst, 13);
    __ slli(t1, t1, 10);
    __ mv(t0, 0x3ff);
    __ orr(t1, t1, t0);

    // get the result by merging the sign bit and the payloads of preserved non-canonical NaNs.
    __ andr(dst, dst, t1);

    __ ret();
    return entry;
  }

#ifdef COMPILER2

  static const int64_t right_2_bits = right_n_bits(2);
  static const int64_t right_3_bits = right_n_bits(3);

  // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
  // are represented as long[5], with BITS_PER_LIMB = 26.
  // Pack five 26-bit limbs into three 64-bit registers.
  void poly1305_pack_26(Register dest0, Register dest1, Register dest2, Register src, Register tmp1, Register tmp2) {
    assert_different_registers(dest0, dest1, dest2, src, tmp1, tmp2);

    // The goal is to have a 128-bit value in dest2:dest1:dest0
    __ ld(dest0, Address(src, 0)); // 26 bits in dest0

    __ ld(tmp1, Address(src, sizeof(jlong)));
    __ slli(tmp1, tmp1, 26);
    __ add(dest0, dest0, tmp1); // 52 bits in dest0

    __ ld(tmp2, Address(src, 2 * sizeof(jlong)));
    __ slli(tmp1, tmp2, 52);
    __ add(dest0, dest0, tmp1); // dest0 is full

    __ srli(dest1, tmp2, 12); // 14 bits in dest1

    __ ld(tmp1, Address(src, 3 * sizeof(jlong)));
    __ slli(tmp1, tmp1, 14);
    __ add(dest1, dest1, tmp1); // 40 bits in dest1

    __ ld(tmp1, Address(src, 4 * sizeof(jlong)));
    __ slli(tmp2, tmp1, 40);
    __ add(dest1, dest1, tmp2); // dest1 is full

    if (dest2->is_valid()) {
      __ srli(tmp1, tmp1, 24);
      __ mv(dest2, tmp1); // 2 bits in dest2
    } else {
#ifdef ASSERT
      Label OK;
      __ srli(tmp1, tmp1, 24);
      __ beq(zr, tmp1, OK); // 2 bits
      __ stop("high bits of Poly1305 integer should be zero");
      __ should_not_reach_here();
      __ bind(OK);
#endif
    }
  }

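  // In scalar terms, for 26-bit limbs l[0..4] the 130-bit value is
  // sum(l[i] << (26*i)), and the packing above computes:
  //
  //   dest0 = l[0] | (l[1] << 26) | (l[2] << 52);          // low 64 bits
  //   dest1 = (l[2] >> 12) | (l[3] << 14) | (l[4] << 40);  // next 64 bits
  //   dest2 = l[4] >> 24;                                  // top 2 bits
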
  // As above, but return only a 128-bit integer, packed into two
  // 64-bit registers.
  void poly1305_pack_26(Register dest0, Register dest1, Register src, Register tmp1, Register tmp2) {
    poly1305_pack_26(dest0, dest1, noreg, src, tmp1, tmp2);
  }

  // U_2:U_1:U_0 += (U_2 >> 2) * 5
  void poly1305_reduce(Register U_2, Register U_1, Register U_0, Register tmp1, Register tmp2) {
    assert_different_registers(U_2, U_1, U_0, tmp1, tmp2);

    // First, U_2:U_1:U_0 += (U_2 >> 2)
    __ srli(tmp1, U_2, 2);
    __ cad(U_0, U_0, tmp1, tmp2);    // Add tmp1 to U_0 with carry output to tmp2
    __ andi(U_2, U_2, right_2_bits); // Clear U_2 except for the lowest two bits
    __ cad(U_1, U_1, tmp2, tmp2);    // Add carry to U_1 with carry output to tmp2
    __ add(U_2, U_2, tmp2);

    // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
    __ slli(tmp1, tmp1, 2);
    __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
    __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
    __ add(U_2, U_2, tmp2);
  }

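  // The reduction uses 2^130 = 5 (mod 2^130 - 5): with c = U_2 >> 2 being
  // the coefficient of 2^130, the high part folds back in as 5 * c. The
  // two passes above add c and then 4 * c:
  //
  //   U += (U_2 >> 2);       // * 1
  //   U += (U_2 >> 2) << 2;  // * 4
  //
  // so the result stays only slightly above 130 bits and full reduction
  // can be deferred.
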
  // Poly1305, RFC 7539
  // void com.sun.crypto.provider.Poly1305.processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs)

  // Arguments:
  //   c_rarg0: input_start -- where the input is stored
  //   c_rarg1: length
  //   c_rarg2: acc_start -- where the output will be stored
  //   c_rarg3: r_start -- where the randomly generated 128-bit key is stored

  // See https://loup-vaillant.fr/tutorials/poly1305-design for a
  // description of the tricks used to simplify and accelerate this
  // computation.

  address generate_poly1305_processBlocks() {
    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();
    Label here;

    RegSet saved_regs = RegSet::range(x18, x21);
    RegSetIterator<Register> regs = (RegSet::range(x14, x31) - RegSet::range(x22, x27)).begin();
    __ push_reg(saved_regs, sp);

    // Arguments
    const Register input_start = c_rarg0, length = c_rarg1, acc_start = c_rarg2, r_start = c_rarg3;

    // R_n is the 128-bit randomly-generated key, packed into two
    // registers. The caller passes this key to us as long[5], with
    // BITS_PER_LIMB = 26.
    const Register R_0 = *regs, R_1 = *++regs;
    poly1305_pack_26(R_0, R_1, r_start, t1, t2);

    // RR_n is (R_n >> 2) * 5
    const Register RR_0 = *++regs, RR_1 = *++regs;
    __ srli(t1, R_0, 2);
    __ shadd(RR_0, t1, t1, t2, 2);
    __ srli(t1, R_1, 2);
    __ shadd(RR_1, t1, t1, t2, 2);

    // U_n is the current checksum
    const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
    poly1305_pack_26(U_0, U_1, U_2, acc_start, t1, t2);

    static constexpr int BLOCK_LENGTH = 16;
    Label DONE, LOOP;

    __ mv(t1, BLOCK_LENGTH);
    __ blt(length, t1, DONE); {
      __ bind(LOOP);

      // S_n is to be the sum of U_n and the next block of data
      const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
      __ ld(S_0, Address(input_start, 0));
      __ ld(S_1, Address(input_start, wordSize));

      __ cad(S_0, S_0, U_0, t1);  // Add U_0 to S_0 with carry output to t1
      __ cadc(S_1, S_1, U_1, t1); // Add U_1 with carry to S_1 with carry output to t1
      __ add(S_2, U_2, t1);

      __ addi(S_2, S_2, 1);

      const Register U_0HI = *++regs, U_1HI = *++regs;

      // NB: this logic depends on some of the special properties of
      // Poly1305 keys. In particular, because we know that the top
      // four bits of R_0 and R_1 are zero, we can add together
      // partial products without any risk of needing to propagate a
      // carry out.
      __ wide_mul(U_0, U_0HI, S_0, R_0);
      __ wide_madd(U_0, U_0HI, S_1, RR_1, t1, t2);
      __ wide_madd(U_0, U_0HI, S_2, RR_0, t1, t2);

      __ wide_mul(U_1, U_1HI, S_0, R_1);
      __ wide_madd(U_1, U_1HI, S_1, R_0, t1, t2);
      __ wide_madd(U_1, U_1HI, S_2, RR_1, t1, t2);

      __ andi(U_2, R_0, right_2_bits);
      __ mul(U_2, S_2, U_2);

      // Partial reduction mod 2**130 - 5
      __ cad(U_1, U_1, U_0HI, t1); // Add U_0HI to U_1 with carry output to t1
      __ adc(U_2, U_2, U_1HI, t1);
      // Sum is now in U_2:U_1:U_0.

      // U_2:U_1:U_0 += (U_2 >> 2) * 5
      poly1305_reduce(U_2, U_1, U_0, t1, t2);

      __ subi(length, length, BLOCK_LENGTH);
      __ addi(input_start, input_start, BLOCK_LENGTH);
      __ mv(t1, BLOCK_LENGTH);
      __ bge(length, t1, LOOP);
    }

    // Further reduce modulo 2^130 - 5
    poly1305_reduce(U_2, U_1, U_0, t1, t2);

    // Unpack the sum into five 26-bit limbs and write to memory.
    // The first 26 bits form the first limb
    __ slli(t1, U_0, 38); // Take lowest 26 bits
    __ srli(t1, t1, 38);
    __ sd(t1, Address(acc_start)); // First 26-bit limb

    // Bits 27-52 of U_0 form the second limb
    __ slli(t1, U_0, 12); // Take bits 27-52
    __ srli(t1, t1, 38);
    __ sd(t1, Address(acc_start, sizeof (jlong))); // Second 26-bit limb

    // Getting bits 53-64 of U_0 and bits 1-14 of U_1 into one register
    __ srli(t1, U_0, 52);
    __ slli(t2, U_1, 50);
    __ srli(t2, t2, 38);
    __ add(t1, t1, t2);
    __ sd(t1, Address(acc_start, 2 * sizeof (jlong))); // Third 26-bit limb

    // Storing bits 15-40 of U_1
    __ slli(t1, U_1, 24); // Already used up 14 bits
    __ srli(t1, t1, 38);  // Clear all other bits from t1
    __ sd(t1, Address(acc_start, 3 * sizeof (jlong))); // Fourth 26-bit limb

    // Storing bits 41-64 of U_1 and the first three bits of U_2 in one register
    __ srli(t1, U_1, 40);
    __ andi(t2, U_2, right_3_bits);
    __ slli(t2, t2, 24);
    __ add(t1, t1, t2);
    __ sd(t1, Address(acc_start, 4 * sizeof (jlong))); // Fifth 26-bit limb

    __ bind(DONE);
    __ pop_reg(saved_regs, sp);
    __ leave(); // Required for proper stackwalking
    __ ret();

    return start;
  }

#endif // COMPILER2

  /**
   * Arguments:
   *
   * Inputs:
   *   c_rarg0 - int   crc
   *   c_rarg1 - byte* buf
   *   c_rarg2 - int   length
   *
   * Output:
   *   c_rarg0 - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
    StubCodeMark mark(this, stub_id);

    address start = __ pc();

    // input parameters
    const Register crc = c_rarg0; // crc
    const Register buf = c_rarg1; // source java byte array address
    const Register len = c_rarg2; // length

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32(crc, buf, len,
                    c_rarg3, c_rarg4, c_rarg5, c_rarg6, // tmp's for tables
                    c_rarg7, t2, t3, t4, t5, t6);       // misc tmps

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret();

    return start;
  }

  // exception handler for upcall stubs
  address generate_upcall_stub_exception_handler() {
    StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // The native caller has no idea how to handle exceptions,
    // so we just crash here. It is up to the callee to catch exceptions.
    __ verify_oop(x10); // the exception oop is returned in x10 (a0)
    __ rt_call(CAST_FROM_FN_PTR(address, UpcallLinker::handle_uncaught_exception));
    __ should_not_reach_here();

    return start;
  }

  // load Method* target of MethodHandle
  // j_rarg0 = jobject receiver
  // xmethod = Method* result
  address generate_upcall_stub_load_target() {

    StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    __ resolve_global_jobject(j_rarg0, t0, t1);
    // Load target method from receiver
    __ load_heap_oop(xmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), t0, t1);
    __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_LambdaForm::vmentry_offset()), t0, t1);
    __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_MemberName::method_offset()), t0, t1);
    __ access_load_at(T_ADDRESS, IN_HEAP, xmethod,
                      Address(xmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
                      noreg, noreg);
    __ sd(xmethod, Address(xthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized

    __ ret();

    return start;
  }

#undef __

  // Initialization
  void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points

    // entry points that exist on all platforms. Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    if (UnsafeMemoryAccess::_table == nullptr) {
      UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
    }

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    if (UseCRC32Intrinsics) {
      // set the table address before generating the stubs that use it
      StubRoutines::_crc_table_adr = (address)StubRoutines::riscv::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
      StubRoutines::_hf2f = generate_float16ToFloat();
      StubRoutines::_f2hf = generate_floatToFloat16();
    }
  }

  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw             = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier    = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
    StubRoutines::_cont_preempt_stub     = generate_cont_preempt_stub();
  }

  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    StubRoutines::_method_entry_barrier = generate_method_entry_barrier();

#ifdef COMPILER2
    if (UseSecondarySupersTable) {
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (!InlineSecondarySupersTest) {
        generate_lookup_secondary_supers_table_stub();
      }
    }
#endif // COMPILER2

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
    StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

    StubRoutines::riscv::set_completed();
  }

  void generate_compiler_stubs() {
#ifdef COMPILER2
    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      StubRoutines::_montgomerySquare = g.generate_square();
    }

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
    }

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

    if (UseRVV) {
      StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
    }

    if (UseSHA256Intrinsics) {
      Sha2Generator sha2(_masm, this);
      StubRoutines::_sha256_implCompress   = sha2.generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
      StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
    }

    if (UseSHA512Intrinsics) {
      Sha2Generator sha2(_masm, this);
      StubRoutines::_sha512_implCompress   = sha2.generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
      StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress   = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
    }

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress   = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#endif // COMPILER2
  }

 public:
  StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
    switch(blob_id) {
    case initial_id:
      generate_initial_stubs();
      break;
    case continuation_id:
      generate_continuation_stubs();
      break;
    case compiler_id:
      generate_compiler_stubs();
      break;
    case final_id:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected blob id: %d", blob_id);
      break;
    };
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
  StubGenerator g(code, blob_id);
}