jdk/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp
2025-05-12 03:01:46 +00:00

6733 lines
225 KiB
C++

/*
* Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
* Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_riscv.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp
#undef __
#define __ _masm->
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// Stub Code definitions
class StubGenerator: public StubCodeGenerator {
private:
#ifdef PRODUCT
// Counter updates are compiled out of PRODUCT builds; the macro is an
// expression statement that does nothing.
#define inc_counter_np(counter) ((void)0)
#else
  // Emits an incrementw on the memory location of 'counter', addressed as
  // an ExternalAddress.
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
// Emits a block comment naming the counter followed by the increment.
// The body is wrapped in do { } while (0) so that the expansion is exactly
// one statement that needs the caller's trailing semicolon, the same shape
// as the PRODUCT variant. Without the wrapper, using the macro as the body
// of an unbraced if/else would guard only the block comment.
#define inc_counter_np(counter)                 \
  do {                                          \
    BLOCK_COMMENT("inc_counter " #counter);     \
    inc_counter_np_(counter);                   \
  } while (0)
#endif
// Call stubs are used to call Java from C
//
// Arguments:
// c_rarg0: call wrapper address          (address)
// c_rarg1: result                        (address)
// c_rarg2: result type                   (BasicType)
// c_rarg3: method                        (Method*)
// c_rarg4: (interpreter) entry point     (address)
// c_rarg5: parameters                    (intptr_t*)
// c_rarg6: parameter size (in words)     (int)
// c_rarg7: thread                        (Thread*)
//
// There is no return from the stub itself as any Java result
// is written to result
//
// we save x1 (ra) as the return PC at the base of the frame and
// link x8 (fp) below it as the frame pointer installing sp (x2)
// into fp.
//
// we save x10-x17, which accounts for all the c arguments.
//
// TODO: strictly do we need to save them all? they are treated as
// volatile by C so could we omit saving the ones we are going to
// place in global registers (thread? method?) or those we only use
// during setup of the Java call?
//
// we don't need to save x5 which C uses as an indirect result location
// return register.
//
// we don't need to save x6-x7 and x28-x31 which both C and Java treat as
// volatile
//
// we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary
// registers and C expects to be callee-save
//
// so the stub frame looks like this when we enter Java code
//
// [ return_from_Java ] <--- sp
// [ argument word n ]
// ...
// -35 [ argument word 1 ]
// -34 [ saved FRM in Floating-point Control and Status Register ] <--- sp_after_call
// -33 [ saved f27 ]
// -32 [ saved f26 ]
// -31 [ saved f25 ]
// -30 [ saved f24 ]
// -29 [ saved f23 ]
// -28 [ saved f22 ]
// -27 [ saved f21 ]
// -26 [ saved f20 ]
// -25 [ saved f19 ]
// -24 [ saved f18 ]
// -23 [ saved f9 ]
// -22 [ saved f8 ]
// -21 [ saved x27 ]
// -20 [ saved x26 ]
// -19 [ saved x25 ]
// -18 [ saved x24 ]
// -17 [ saved x23 ]
// -16 [ saved x22 ]
// -15 [ saved x21 ]
// -14 [ saved x20 ]
// -13 [ saved x19 ]
// -12 [ saved x18 ]
// -11 [ saved x9 ]
// -10 [ call wrapper (x10) ]
// -9 [ result (x11) ]
// -8 [ result type (x12) ]
// -7 [ method (x13) ]
// -6 [ entry point (x14) ]
// -5 [ parameters (x15) ]
// -4 [ parameter size (x16) ]
// -3 [ thread (x17) ]
// -2 [ saved fp (x8) ]
// -1 [ saved ra (x1) ]
// 0 [ ] <--- fp == saved sp (x2)
// Call stub stack layout word offsets from fp
enum call_stub_layout {
  // Slots are listed from the frame pointer downwards; every value is a
  // word offset relative to fp.

  // return address and caller's frame pointer
  retaddr_off        = -1,
  fp_f               = -2,

  // the eight C argument registers (x17 down to x10)
  thread_off         = -3,
  parameter_size_off = -4,
  parameters_off     = -5,
  entry_point_off    = -6,
  method_off         = -7,
  result_type_off    = -8,
  result_off         = -9,
  call_wrapper_off   = -10,

  // integer registers saved across the Java call
  x9_off             = -11,
  x18_off            = -12,
  x19_off            = -13,
  x20_off            = -14,
  x21_off            = -15,
  x22_off            = -16,
  x23_off            = -17,
  x24_off            = -18,
  x25_off            = -19,
  x26_off            = -20,
  x27_off            = -21,

  // floating-point registers saved across the Java call
  f8_off             = -22,
  f9_off             = -23,
  f18_off            = -24,
  f19_off            = -25,
  f20_off            = -26,
  f21_off            = -27,
  f22_off            = -28,
  f23_off            = -29,
  f24_off            = -30,
  f25_off            = -31,
  f26_off            = -32,
  f27_off            = -33,

  // lowest slot of the save area: holds the saved rounding mode and is
  // where sp points once the save area has been allocated
  sp_after_call_off  = -34,
  frm_off            = sp_after_call_off
};
// Emits the call stub, the code used to call Java from C.
//
// The emitted code builds the frame described by call_stub_layout, saves
// the C argument registers, x9/x18-x27, f8/f9/f18-f27 and the FP rounding
// mode, copies the Java parameters onto the stack, calls the entry point
// held in c_rarg4, stores the Java result through the 'result' pointer
// according to 'result type', restores the saved state and returns.
//
// return_address - out parameter; set to the pc immediately following the
//                  jalr into Java.
//
// Returns the address of the first instruction of the stub.
address generate_call_stub(address& return_address) {
// The frame layout used here must agree with the constants in frame.
assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
(int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
"adjust this code");
StubGenStubId stub_id = StubGenStubId::call_stub_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
// fp-relative addresses of every slot in the stub frame
const Address sp_after_call (fp, sp_after_call_off * wordSize);
const Address frm_save (fp, frm_off * wordSize);
const Address call_wrapper (fp, call_wrapper_off * wordSize);
const Address result (fp, result_off * wordSize);
const Address result_type (fp, result_type_off * wordSize);
const Address method (fp, method_off * wordSize);
const Address entry_point (fp, entry_point_off * wordSize);
const Address parameters (fp, parameters_off * wordSize);
const Address parameter_size(fp, parameter_size_off * wordSize);
const Address thread (fp, thread_off * wordSize);
const Address f27_save (fp, f27_off * wordSize);
const Address f26_save (fp, f26_off * wordSize);
const Address f25_save (fp, f25_off * wordSize);
const Address f24_save (fp, f24_off * wordSize);
const Address f23_save (fp, f23_off * wordSize);
const Address f22_save (fp, f22_off * wordSize);
const Address f21_save (fp, f21_off * wordSize);
const Address f20_save (fp, f20_off * wordSize);
const Address f19_save (fp, f19_off * wordSize);
const Address f18_save (fp, f18_off * wordSize);
const Address f9_save (fp, f9_off * wordSize);
const Address f8_save (fp, f8_off * wordSize);
const Address x27_save (fp, x27_off * wordSize);
const Address x26_save (fp, x26_off * wordSize);
const Address x25_save (fp, x25_off * wordSize);
const Address x24_save (fp, x24_off * wordSize);
const Address x23_save (fp, x23_off * wordSize);
const Address x22_save (fp, x22_off * wordSize);
const Address x21_save (fp, x21_off * wordSize);
const Address x20_save (fp, x20_off * wordSize);
const Address x19_save (fp, x19_off * wordSize);
const Address x18_save (fp, x18_off * wordSize);
const Address x9_save (fp, x9_off * wordSize);
// stub code
// NOTE(review): riscv_entry is not referenced again in this function.
address riscv_entry = __ pc();
// set up frame and move sp to end of save area
__ enter();
__ addi(sp, fp, sp_after_call_off * wordSize);
// save register parameters and Java temporary/global registers
// n.b. we save thread even though it gets installed in
// xthread because we want to sanity check tp later
__ sd(c_rarg7, thread);
// NOTE(review): parameter_size is written with a 32-bit store here but
// reloaded with a 64-bit ld before the stub returns, so the upper half of
// the reloaded c_rarg6 comes from whatever the stack slot held. Presumably
// harmless because c_rarg6 is an argument register - confirm.
__ sw(c_rarg6, parameter_size);
__ sd(c_rarg5, parameters);
__ sd(c_rarg4, entry_point);
__ sd(c_rarg3, method);
__ sd(c_rarg2, result_type);
__ sd(c_rarg1, result);
__ sd(c_rarg0, call_wrapper);
__ sd(x9, x9_save);
__ sd(x18, x18_save);
__ sd(x19, x19_save);
__ sd(x20, x20_save);
__ sd(x21, x21_save);
__ sd(x22, x22_save);
__ sd(x23, x23_save);
__ sd(x24, x24_save);
__ sd(x25, x25_save);
__ sd(x26, x26_save);
__ sd(x27, x27_save);
__ fsd(f8, f8_save);
__ fsd(f9, f9_save);
__ fsd(f18, f18_save);
__ fsd(f19, f19_save);
__ fsd(f20, f20_save);
__ fsd(f21, f21_save);
__ fsd(f22, f22_save);
__ fsd(f23, f23_save);
__ fsd(f24, f24_save);
__ fsd(f25, f25_save);
__ fsd(f26, f26_save);
__ fsd(f27, f27_save);
// save the caller's rounding mode so it can be restored on exit
__ frrm(t0);
__ sd(t0, frm_save);
// Set frm to the state we need. We do want Round to Nearest. We
// don't want non-IEEE rounding modes.
// The write is skipped when the mode read above is already rne (== 0).
Label skip_fsrmi;
guarantee(__ RoundingMode::rne == 0, "must be");
__ beqz(t0, skip_fsrmi);
__ fsrmi(__ RoundingMode::rne);
__ bind(skip_fsrmi);
// install Java thread in global register now we have saved
// whatever value it held
__ mv(xthread, c_rarg7);
// And method
__ mv(xmethod, c_rarg3);
// set up the heapbase register
__ reinit_heapbase();
#ifdef ASSERT
// make sure we have no pending exceptions
{
Label L;
__ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset())));
__ beqz(t0, L);
__ stop("StubRoutines::call_stub: entered with pending exception");
__ BIND(L);
}
#endif
// pass parameters if any
// esp takes the current sp (the end of the save area); sp is then lowered
// by parameter_size words and rounded down to a multiple of 2 * wordSize.
__ mv(esp, sp);
__ slli(t0, c_rarg6, LogBytesPerWord);
__ sub(t0, sp, t0); // Move SP out of the way
__ andi(sp, t0, -2 * wordSize);
BLOCK_COMMENT("pass parameters if any");
Label parameters_done;
// parameter count is still in c_rarg6
// and parameter pointer identifying param 1 is in c_rarg5
__ beqz(c_rarg6, parameters_done);
// Copy loop: load the next parameter word through c_rarg5, advance the
// pointer, decrement the count and push the word; repeat while words remain.
address loop = __ pc();
__ ld(t0, Address(c_rarg5, 0));
__ addi(c_rarg5, c_rarg5, wordSize);
__ subi(c_rarg6, c_rarg6, 1);
__ push_reg(t0);
__ bgtz(c_rarg6, loop);
__ BIND(parameters_done);
// call Java entry -- passing Method*, and current sp
// xmethod: Method*
// x19_sender_sp: sender sp
BLOCK_COMMENT("call Java function");
__ mv(x19_sender_sp, sp);
__ jalr(c_rarg4);
// save current address for use by exception handling code
return_address = __ pc();
// store result depending on type (everything that is not
// T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
// n.b. this assumes Java returns an integral result in x10
// and a floating result in j_farg0
__ ld(j_rarg2, result);
Label is_long, is_float, is_double, exit;
__ ld(j_rarg1, result_type);
// T_OBJECT shares the is_long handler: both are stored with a 64-bit sd
__ mv(t0, (u1)T_OBJECT);
__ beq(j_rarg1, t0, is_long);
__ mv(t0, (u1)T_LONG);
__ beq(j_rarg1, t0, is_long);
__ mv(t0, (u1)T_FLOAT);
__ beq(j_rarg1, t0, is_float);
__ mv(t0, (u1)T_DOUBLE);
__ beq(j_rarg1, t0, is_double);
// handle T_INT case
__ sw(x10, Address(j_rarg2));
// common exit path; the out-of-line result handlers below jump back here
__ BIND(exit);
// pop parameters
__ addi(esp, fp, sp_after_call_off * wordSize);
#ifdef ASSERT
// verify that threads correspond
// xthread must match both the thread saved in the frame and the value
// produced by get_thread()
{
Label L, S;
__ ld(t0, thread);
__ bne(xthread, t0, S);
__ get_thread(t0);
__ beq(xthread, t0, L);
__ BIND(S);
__ stop("StubRoutines::call_stub: threads must correspond");
__ BIND(L);
}
#endif
__ pop_cont_fastpath(xthread);
// restore callee-save registers
__ fld(f27, f27_save);
__ fld(f26, f26_save);
__ fld(f25, f25_save);
__ fld(f24, f24_save);
__ fld(f23, f23_save);
__ fld(f22, f22_save);
__ fld(f21, f21_save);
__ fld(f20, f20_save);
__ fld(f19, f19_save);
__ fld(f18, f18_save);
__ fld(f9, f9_save);
__ fld(f8, f8_save);
__ ld(x27, x27_save);
__ ld(x26, x26_save);
__ ld(x25, x25_save);
__ ld(x24, x24_save);
__ ld(x23, x23_save);
__ ld(x22, x22_save);
__ ld(x21, x21_save);
__ ld(x20, x20_save);
__ ld(x19, x19_save);
__ ld(x18, x18_save);
__ ld(x9, x9_save);
// restore frm
// the write is skipped when the current mode already equals the saved one
Label skip_fsrm;
__ ld(t0, frm_save);
__ frrm(t1);
__ beq(t0, t1, skip_fsrm);
__ fsrm(t0);
__ bind(skip_fsrm);
// reload the C argument registers from their save slots
__ ld(c_rarg0, call_wrapper);
__ ld(c_rarg1, result);
__ ld(c_rarg2, result_type);
__ ld(c_rarg3, method);
__ ld(c_rarg4, entry_point);
__ ld(c_rarg5, parameters);
__ ld(c_rarg6, parameter_size);
__ ld(c_rarg7, thread);
// leave frame and return to caller
__ leave();
__ ret();
// handle return types different from T_INT
// (emitted after the ret; each handler stores the result and jumps to exit)
__ BIND(is_long);
__ sd(x10, Address(j_rarg2, 0));
__ j(exit);
__ BIND(is_float);
__ fsw(j_farg0, Address(j_rarg2, 0), t0);
__ j(exit);
__ BIND(is_double);
__ fsd(j_farg0, Address(j_rarg2, 0), t0);
__ j(exit);
return start;
}
// Return point for a Java call if there's an exception thrown in
// Java code. The exception is caught and transformed into a
// pending exception stored in JavaThread that can be tested from
// within the VM.
//
// Note: Usually the parameters are removed by the callee. In case
// of an exception crossing an activation frame boundary, that is
// not the case if the callee is compiled code => need to setup the
// sp.
//
// x10: exception oop
address generate_catch_exception() {
  StubCodeMark mark(this, StubGenStubId::catch_exception_id);
  const address stub_entry = __ pc();

#ifdef ASSERT
  // The thread slot lives at the same fp-relative offset that
  // generate_call_stub() uses; check that it, xthread and get_thread()
  // all agree.
  {
    const Address saved_thread(fp, thread_off * wordSize);
    Label threads_match, threads_differ;
    __ ld(t0, saved_thread);
    __ bne(xthread, t0, threads_differ);
    __ get_thread(t0);
    __ beq(xthread, t0, threads_match);
    __ bind(threads_differ);
    __ stop("StubRoutines::catch_exception: threads must correspond");
    __ bind(threads_match);
  }
#endif

  // Turn the exception oop in x10 into the thread's pending exception,
  // tagged with this source file and line.
  const Address pending_exception(xthread, Thread::pending_exception_offset());
  const Address exception_file(xthread, Thread::exception_file_offset());
  const Address exception_line(xthread, Thread::exception_line_offset());

  __ verify_oop(x10);
  __ sd(x10, pending_exception);
  __ mv(t0, (address)__FILE__);
  __ sd(t0, exception_file);
  __ mv(t0, (int)__LINE__);
  __ sw(t0, exception_line);

  // Finish by jumping to the call stub's return address, which must
  // already have been recorded.
  assert(StubRoutines::_call_stub_return_address != nullptr,
         "_call_stub_return_address must have been generated before");
  __ j(RuntimeAddress(StubRoutines::_call_stub_return_address));

  return stub_entry;
}
// Continuation point for runtime calls returning with a pending
// exception. The pending exception check happened in the runtime
// or native call stub. The pending exception in Thread is
// converted into a Java-level exception.
//
// Contract with Java-level exception handlers:
// x10: exception
// x13: throwing pc
//
// NOTE: At entry of this stub, exception-pc must be in RA !!
// NOTE: this is always used as a jump target within generated code,
// so it is emitted as plain generated code with no prolog of its own
address generate_forward_exception() {
  StubCodeMark mark(this, StubGenStubId::forward_exception_id);
  const address stub_entry = __ pc();
  const Address pending_exception(xthread, Thread::pending_exception_offset());

  // On entry ra holds the return address into Java (interpreted or
  // compiled) code, which therefore serves as the throwing pc.
  //
  // Anything pushed before the runtime call is still on the stack; the
  // exception handler resets sp, so it is left alone. A result that may
  // be sitting in registers is likewise ignored.

#ifdef ASSERT
  // this stub must only run while an exception is pending
  {
    Label has_exception;
    __ ld(t0, pending_exception);
    __ bnez(t0, has_exception);
    __ stop("StubRoutines::forward exception: no pending exception (1)");
    __ bind(has_exception);
  }
#endif

  // Ask the VM for the handler that belongs to the caller pc: thread goes
  // in x10, caller pc (the return address, held in ra rather than on the
  // stack) goes in x11.
  __ mv(c_rarg1, ra);
  // The VM call trashes ra, so keep a copy in callee-saved x9; the value
  // is also handed on to the handler afterwards.
  __ mv(x9, ra);

  BLOCK_COMMENT("call exception_handler_for_return_address");
  const address find_handler =
      CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address);
  __ call_VM_leaf(find_handler, xthread, c_rarg1);

  // x9 already carries what the handler needs, but the C2 handler pushes
  // its own frame and calls into the VM, where the pc of the frame above
  // the handler is asserted to belong to a compiled Java method. Putting
  // the original value back into ra keeps that assert satisfied.
  __ mv(ra, x9);

  // x13 <- throwing pc, x9 <- handler address returned in x10,
  // x10 <- pending exception, which is then cleared in the thread
  __ mv(x13, x9);
  __ mv(x9, x10);
  __ ld(x10, pending_exception);
  __ sd(zr, pending_exception);

#ifdef ASSERT
  // the exception just loaded must be non-null
  {
    Label exception_set;
    __ bnez(x10, exception_set);
    __ stop("StubRoutines::forward exception: no pending exception (2)");
    __ bind(exception_set);
  }
#endif

  // Jump to the handler with
  //   x10: exception
  //   x13: throwing pc
  //   x9:  exception handler
  __ verify_oop(x10);
  __ jr(x9);

  return stub_entry;
}
// Non-destructive plausibility checks for oops
//
// Arguments:
// x10: oop to verify
// t0: error message
//
// Stack after saving c_rarg3:
// [tos + 0]: saved c_rarg3
// [tos + 1]: saved c_rarg2
// [tos + 2]: saved ra
// [tos + 3]: saved t1
// [tos + 4]: saved x10
// [tos + 5]: saved t0
address generate_verify_oop() {
  StubCodeMark mark(this, StubGenStubId::verify_oop_id);
  const address stub_entry = __ pc();

  // c_rarg2 and c_rarg3 serve as scratch registers and are saved around
  // their use.
  const RegSet scratch = RegSet::of(c_rarg2, c_rarg3);
  Label oop_ok, oop_bad;

  __ push_reg(scratch, sp);

  // count this verification: load, add one, store back
  __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  __ ld(c_rarg3, Address(c_rarg2));
  __ addi(c_rarg3, c_rarg3, 1);
  __ sd(c_rarg3, Address(c_rarg2));

  // The object to check is in x10. A null oop is accepted outright;
  // everything else is handed to the barrier set's check_oop, which
  // branches to oop_bad on failure.
  __ beqz(x10, oop_ok);

  BarrierSetAssembler* const barrier_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  barrier_asm->check_oop(_masm, x10, c_rarg2, c_rarg3, oop_bad);

  // success: restore the scratch registers and return
  __ bind(oop_ok);
  __ pop_reg(scratch, sp);
  __ ret();

  // failure: restore the scratch registers, dump x0-x31 onto the stack
  // and report through debug64
  __ bind(oop_bad);
  __ pop_reg(scratch, sp);

  __ push_reg(RegSet::range(x0, x31), sp);
  // debug(char* msg, int64_t pc, int64_t regs[])
  __ mv(c_rarg0, t0);   // error message address
  __ mv(c_rarg1, ra);   // return address
  __ mv(c_rarg2, sp);   // address of the register dump on the stack
#ifndef PRODUCT
  assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
  BLOCK_COMMENT("call MacroAssembler::debug");
  __ rt_call(CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  __ ebreak();

  return stub_entry;
}
// The inner part of zero_words().
//
// Inputs:
// x28: the HeapWord-aligned base address of an array to zero.
// x29: the count in HeapWords, x29 > 0.
//
// Returns x28 and x29, adjusted for the caller to clear.
// x28: the base address of the tail of words left to clear.
// x29: the number of words in the tail.
// x29 < MacroAssembler::zero_words_block_size.
address generate_zero_blocks() {
  const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31;
  const int block_words = MacroAssembler::zero_words_block_size;
  Label done;

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, StubGenStubId::zero_blocks_id);
  const address stub_entry = __ pc();

  if (UseBlockZeroing) {
    // Only hand the range to zero_dcache_blocks when the count is at least
    // the low limit (2*CacheLineSize or BlockZeroingLowLimit, whichever is
    // larger, in words), so that a cbo.zero is still worthwhile after
    // alignment.
    Label below_limit;
    int low_limit = MAX2(2 * CacheLineSize, BlockZeroingLowLimit) / wordSize;
    __ mv(tmp1, low_limit);
    __ blt(cnt, tmp1, below_limit);
    __ zero_dcache_blocks(base, cnt, tmp1, tmp2);
    __ bind(below_limit);
  }

  {
    // Zero whole blocks of block_words words with an unrolled run of
    // stores for as long as at least one full block is left.
    Label next_block;
    __ mv(tmp1, block_words);
    __ blt(cnt, tmp1, done);
    __ bind(next_block);
    for (int offset = 0; offset < block_words * wordSize; offset += wordSize) {
      __ sd(zr, Address(base, offset));
    }
    __ addi(base, base, block_words * wordSize);
    __ subi(cnt, cnt, block_words);
    __ bge(cnt, tmp1, next_block);
    __ bind(done);
  }

  __ ret();

  return stub_entry;
}
// Direction of a copy; the numeric value is used as a sign multiplier
// when computing per-word offsets.
enum copy_direction {
  copy_forwards  = 1,
  copy_backwards = -1
};
// Bulk copy of blocks of 8 words.
//
// count is a count of words.
//
// Precondition: count >= 8
//
// Postconditions:
//
// The least significant bit of count contains the remaining count
// of words to copy. The rest of count is trash.
//
// s and d are adjusted to point to the remaining words to copy
//
// Emits a bulk word-copy routine that moves memory in blocks of 8 words.
//
// stub_id - copy_byte_f_id emits the forward variant, copy_byte_b_id the
//           backward variant; any other id is a fatal error.
// start   - label bound at the routine's (aligned) entry point.
// s, d    - registers holding the source and destination addresses.
// count   - register holding the number of words to copy (>= 8).
//
// The emitted code keeps one block of 8 words in flight: it loads a block,
// then repeatedly stores the previous block while loading the next, and
// finally drains the last block. 'count' is biased by -16 up front and
// decremented by 8 per iteration, so its low three bits stay equal to those
// of the original count; bits 2 and 1 then select the trailing 4-word and
// 2-word copies, leaving bit 0 for the caller.
void generate_copy_longs(StubGenStubId stub_id, Label &start,
                         Register s, Register d, Register count) {
  copy_direction direction;
  switch (stub_id) {
  case copy_byte_f_id:
    direction = copy_forwards;
    break;
  case copy_byte_b_id:
    direction = copy_backwards;
    break;
  default:
    ShouldNotReachHere();
  }
  // signed size of one word step, and the pre-bias applied to s and d in
  // the forward case so that offsets 1*unit .. 8*unit address the block
  int unit = wordSize * direction;
  int bias = wordSize;

  const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16,
                 tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29;
  // the same registers as a table, so a block can be moved with a loop
  const Register tmp_regs[] = { tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3,
                                tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7 };
  const int block_words = 8;
  const int half_block_words = 4;

  assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3,
                             tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7);
  assert_different_registers(s, d, count, t0);

  Label again, drain;
  StubCodeMark mark(this, stub_id);
  __ align(CodeEntryAlignment);
  __ bind(start);

  if (direction == copy_forwards) {
    __ sub(s, s, bias);
    __ sub(d, d, bias);
  }

#ifdef ASSERT
  // Make sure we are never given < 8 words
  {
    Label L;
    __ mv(t0, 8);
    __ bge(count, t0, L);
    __ stop("generate_copy_longs called with < 8 words");
    __ bind(L);
  }
#endif

  // prime the pipeline: load the first block
  for (int i = 0; i < block_words; i++) {
    __ ld(tmp_regs[i], Address(s, (i + 1) * unit));
  }
  __ addi(s, s, 8 * unit);

  // fewer than 16 words in total: only the block just loaded is a full one
  __ subi(count, count, 16);
  __ bltz(count, drain);

  __ bind(again);

  // store the block in flight, then load the next one
  for (int i = 0; i < block_words; i++) {
    __ sd(tmp_regs[i], Address(d, (i + 1) * unit));
  }
  for (int i = 0; i < block_words; i++) {
    __ ld(tmp_regs[i], Address(s, (i + 1) * unit));
  }
  __ addi(s, s, 8 * unit);
  __ addi(d, d, 8 * unit);

  __ subi(count, count, 8);
  __ bgez(count, again);

  // Drain: store the last full block
  __ bind(drain);
  for (int i = 0; i < block_words; i++) {
    __ sd(tmp_regs[i], Address(d, (i + 1) * unit));
  }
  __ addi(d, d, 8 * unit);

  {
    Label L1, L2;

    // bit 2 of count set: copy a further 4 words
    __ test_bit(t0, count, 2);
    __ beqz(t0, L1);

    for (int i = 0; i < half_block_words; i++) {
      __ ld(tmp_regs[i], Address(s, (i + 1) * unit));
    }
    __ addi(s, s, 4 * unit);

    for (int i = 0; i < half_block_words; i++) {
      __ sd(tmp_regs[i], Address(d, (i + 1) * unit));
    }
    __ addi(d, d, 4 * unit);

    __ bind(L1);

    // undo the entry bias applied in the forward case
    if (direction == copy_forwards) {
      __ addi(s, s, bias);
      __ addi(d, d, bias);
    }

    // bit 1 of count set: copy a further 2 words
    __ test_bit(t0, count, 1);
    __ beqz(t0, L2);
    if (direction == copy_backwards) {
      // step the pointers down first, then copy the pair at [0] and [wordSize]
      __ addi(s, s, 2 * unit);
      __ ld(tmp_reg0, Address(s));
      __ ld(tmp_reg1, Address(s, wordSize));
      __ addi(d, d, 2 * unit);
      __ sd(tmp_reg0, Address(d));
      __ sd(tmp_reg1, Address(d, wordSize));
    } else {
      // copy the pair at [0] and [wordSize], then step the pointers up
      __ ld(tmp_reg0, Address(s));
      __ ld(tmp_reg1, Address(s, wordSize));
      __ addi(s, s, 2 * unit);
      __ sd(tmp_reg0, Address(d));
      __ sd(tmp_reg1, Address(d, wordSize));
      __ addi(d, d, 2 * unit);
    }

    __ bind(L2);
  }

  __ ret();
}
// Two labels held as members of the stub generator.
// NOTE(review): presumably these are the entry labels handed to
// generate_copy_longs() for the forward and backward variants; the site
// that binds them is not visible in this part of the file - confirm.
Label copy_f, copy_b;
// Pointer-to-member type for a MacroAssembler routine that takes a
// register, an address and a temporary register.
typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp);
// Vector (RVV) flavour of the memory copy.
//
// s, d   - registers holding the source and destination addresses
// count  - register holding the number of elements
// step   - element size in bytes; a negative value requests a backward copy
//
// Each pass asks vsetvli how many elements (chunk) to handle. The forward
// path copies a chunk from the running pointers and advances them. In a
// backward copy, as long as chunk differs from the remaining count, the
// chunk at the tail of the remaining range (addressed from s and d
// directly) is copied instead; once chunk equals the remaining count the
// forward path copies the head and the loop ends.
void copy_memory_v(Register s, Register d, Register count, int step) {
  const bool backward = step < 0;
  const int elem_bytes = uabs(step);
  const Assembler::SEW sew = Assembler::elembytes_to_sew(elem_bytes);
  // with 1-byte elements the element-to-byte shift would be a shift by
  // zero, so it is left out
  const bool needs_scaling = (sew != Assembler::e8);

  const Register src_cur = x30, dst_cur = x31, chunk = x14, remaining = x15,
                 src_tail = x16, dst_tail = x17;
  assert_different_registers(s, d, remaining, chunk, src_tail, dst_tail);

  Label head_pass, tail_pass, finished;

  __ mv(dst_cur, d);
  __ mv(src_cur, s);
  __ mv(remaining, count);

  __ bind(head_pass);
  __ vsetvli(chunk, remaining, sew, Assembler::m8);
  if (backward) {
    __ bne(chunk, remaining, tail_pass);
  }

  // load a chunk, then turn the element count into a byte count to
  // advance the running pointers
  __ vlex_v(v0, src_cur, sew);
  __ sub(remaining, remaining, chunk);
  if (needs_scaling) {
    __ slli(chunk, chunk, sew);
  }
  __ add(src_cur, src_cur, chunk);

  __ vsex_v(v0, dst_cur, sew);
  __ add(dst_cur, dst_cur, chunk);
  __ bnez(remaining, head_pass);

  if (backward) {
    __ j(finished);

    // copy the last 'chunk' elements of the remaining range
    __ bind(tail_pass);
    __ sub(t0, remaining, chunk);
    if (needs_scaling) {
      __ slli(t0, t0, sew);
    }
    __ add(src_tail, s, t0);
    __ vlex_v(v0, src_tail, sew);
    __ add(dst_tail, d, t0);
    __ vsex_v(v0, dst_tail, sew);
    __ sub(remaining, remaining, chunk);
    __ bnez(remaining, head_pass);
    __ bind(finished);
  }
}
// All-singing all-dancing memory copy.
//
// Copy count units of memory from s to d. The size of a unit is
// step, which can be positive or negative depending on the direction
// of copy.
//
// Emits code that copies 'count' units of memory from s to d.
//
// decorators - forwarded to the barrier set's copy_load_at/copy_store_at.
// type       - element type, also forwarded to the barrier set assembler.
// is_aligned - selects the entry path: when true the 32-byte and 8-byte
//              loops are entered directly; when false the code first checks
//              the size and the relative alignment of src and dst.
// s, d       - registers holding the source and destination addresses.
// count      - register holding the number of units.
// step       - unit size in bytes; negative means copy backwards.
//
// When UseRVV is set and the type is not a reference type (or the barrier
// set supports RVV array copy), the work is delegated to copy_memory_v().
void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
Register s, Register d, Register count, int step) {
BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
if (UseRVV && (!is_reference_type(type) || bs_asm->supports_rvv_arraycopy())) {
return copy_memory_v(s, d, count, step);
}
bool is_backwards = step < 0;
int granularity = uabs(step);
// working registers: running pointers, remaining byte count, data temps
const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13;
// temporaries handed to the barrier set assembler's load/store helpers
const Register gct1 = x28, gct2 = x29, gct3 = t2;
Label same_aligned;
Label copy_big, copy32_loop, copy8_loop, copy_small, done;
// The size of copy32_loop body increases significantly with ZGC GC barriers.
// Need conditional far branches to reach a point beyond the loop in this case.
bool is_far = UseZGC;
__ beqz(count, done, is_far);
// cnt = count scaled to bytes
__ slli(cnt, count, exact_log2(granularity));
// a backward copy starts from the end of both ranges
if (is_backwards) {
__ add(src, s, cnt);
__ add(dst, d, cnt);
} else {
__ mv(src, s);
__ mv(dst, d);
}
if (is_aligned) {
// pick the widest loop the byte count allows
__ subi(t0, cnt, 32);
__ bgez(t0, copy32_loop);
__ subi(t0, cnt, 8);
__ bgez(t0, copy8_loop, is_far);
__ j(copy_small);
} else {
// fewer than 16 bytes: copy unit by unit
__ mv(t0, 16);
__ blt(cnt, t0, copy_small, is_far);
// src and dst differ in their low three address bits: copy unit by unit
__ xorr(t0, src, dst);
__ andi(t0, t0, 0b111);
__ bnez(t0, copy_small, is_far);
// copy single units until src reaches an 8-byte boundary
__ bind(same_aligned);
__ andi(t0, src, 0b111);
__ beqz(t0, copy_big);
if (is_backwards) {
__ addi(src, src, step);
__ addi(dst, dst, step);
}
bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
if (!is_backwards) {
__ addi(src, src, step);
__ addi(dst, dst, step);
}
__ subi(cnt, cnt, granularity);
__ beqz(cnt, done, is_far);
__ j(same_aligned);
__ bind(copy_big);
// fewer than 32 bytes left: skip the 32-byte loop
__ mv(t0, 32);
__ blt(cnt, t0, copy8_loop, is_far);
}
// 32 bytes (four 8-byte accesses) per iteration
__ bind(copy32_loop);
if (is_backwards) {
__ subi(src, src, wordSize * 4);
__ subi(dst, dst, wordSize * 4);
}
// we first load 32 bytes, then write it, so the direction here doesn't matter
bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
bs_asm->copy_load_at(_masm, decorators, type, 8, tmp4, Address(src, 8), gct1);
bs_asm->copy_load_at(_masm, decorators, type, 8, tmp5, Address(src, 16), gct1);
bs_asm->copy_load_at(_masm, decorators, type, 8, tmp6, Address(src, 24), gct1);
bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);
bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 8), tmp4, gct1, gct2, gct3);
bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 16), tmp5, gct1, gct2, gct3);
bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 24), tmp6, gct1, gct2, gct3);
if (!is_backwards) {
__ addi(src, src, wordSize * 4);
__ addi(dst, dst, wordSize * 4);
}
// t0 is non-negative when, after this iteration, at least 32 bytes remain
__ subi(t0, cnt, 32 + wordSize * 4);
__ subi(cnt, cnt, wordSize * 4);
__ bgez(t0, copy32_loop); // cnt >= 32, do next loop
__ beqz(cnt, done); // if that's all - done
__ subi(t0, cnt, 8); // if not - copy the remainder
__ bltz(t0, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop
// 8 bytes per iteration
__ bind(copy8_loop);
if (is_backwards) {
__ subi(src, src, wordSize);
__ subi(dst, dst, wordSize);
}
bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);
if (!is_backwards) {
__ addi(src, src, wordSize);
__ addi(dst, dst, wordSize);
}
// t0 is non-negative when, after this iteration, at least 8 bytes remain
__ subi(t0, cnt, 8 + wordSize);
__ subi(cnt, cnt, wordSize);
__ bgez(t0, copy8_loop); // cnt >= 8, do next loop
__ beqz(cnt, done); // if that's all - done
// one unit of 'granularity' bytes per iteration until cnt is used up
__ bind(copy_small);
if (is_backwards) {
__ addi(src, src, step);
__ addi(dst, dst, step);
}
bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
if (!is_backwards) {
__ addi(src, src, step);
__ addi(dst, dst, step);
}
__ subi(cnt, cnt, granularity);
__ bgtz(cnt, copy_small);
__ bind(done);
}
// Scan over array at a for count oops, verifying each one.
// Preserves a and count, clobbers t0 and t1.
// Emits a loop that walks 'count' elements of 'size' bytes starting at
// 'a' and verifies each one as an oop. Word-sized elements are loaded with
// ld and passed to verify_oop; otherwise the element is loaded with lwu
// and run through decode_heap_oop. t0 holds the byte length of the array,
// t1 the running byte offset, and 'temp' the element being checked.
void verify_oop_array(size_t size, Register a, Register count, Register temp) {
  const Register offset = t1, limit = t0;
  const bool is_narrow = (size != (size_t)wordSize);
  Label next_element, finished;

  __ mv(offset, zr);
  __ slli(limit, count, exact_log2(size));

  __ bind(next_element);
  __ bgeu(offset, limit, finished);

  __ add(temp, a, offset);
  if (is_narrow) {
    __ lwu(temp, Address(temp, 0));
    __ decode_heap_oop(temp); // calls verify_oop
  } else {
    __ ld(temp, Address(temp, 0));
    __ verify_oop(temp);
  }

  __ add(offset, offset, size);
  __ j(next_element);

  __ bind(finished);
}
// Arguments:
// stub_id - is used to name the stub and identify all details of
// how to perform the copy.
//
// entry - is assigned to the stub's post push entry point unless
// it is null
//
// Inputs:
// c_rarg0 - source array address
// c_rarg1 - destination array address
// c_rarg2 - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects: entry is set to the (post push) entry point so it
// can be used by the corresponding conjoint copy
// method
//
address generate_disjoint_copy(StubGenStubId stub_id, address* entry) {
  // Copy parameters selected by stub_id:
  //   size               - element size in bytes
  //   aligned            - whether ARRAYCOPY_ALIGNED is added to the decorators
  //   is_oop             - whether the elements are oops (barriers are emitted)
  //   dest_uninitialized - whether IS_DEST_UNINITIALIZED is added
  size_t size;
  bool aligned;
  bool is_oop;
  bool dest_uninitialized;

  switch (stub_id) {
  case jbyte_disjoint_arraycopy_id:
  case arrayof_jbyte_disjoint_arraycopy_id:
    size = sizeof(jbyte);
    aligned = (stub_id == arrayof_jbyte_disjoint_arraycopy_id);
    is_oop = false;
    dest_uninitialized = false;
    break;
  case jshort_disjoint_arraycopy_id:
  case arrayof_jshort_disjoint_arraycopy_id:
    size = sizeof(jshort);
    aligned = (stub_id == arrayof_jshort_disjoint_arraycopy_id);
    is_oop = false;
    dest_uninitialized = false;
    break;
  case jint_disjoint_arraycopy_id:
  case arrayof_jint_disjoint_arraycopy_id:
    size = sizeof(jint);
    aligned = (stub_id == arrayof_jint_disjoint_arraycopy_id);
    is_oop = false;
    dest_uninitialized = false;
    break;
  case jlong_disjoint_arraycopy_id:
    // jlong copies are always aligned, so callers are expected to use the
    // arrayof_jlong_disjoint_arraycopy stub instead of requesting this one.
    ShouldNotReachHere();
    break;
  case arrayof_jlong_disjoint_arraycopy_id:
    size = sizeof(jlong);
    aligned = true;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case oop_disjoint_arraycopy_id:
  case arrayof_oop_disjoint_arraycopy_id:
  case oop_disjoint_arraycopy_uninit_id:
  case arrayof_oop_disjoint_arraycopy_uninit_id:
    // All oop variants share size/alignment; only the "uninit" ones mark
    // the destination as uninitialized.
    size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    aligned = !UseCompressedOops;
    is_oop = true;
    dest_uninitialized = (stub_id == oop_disjoint_arraycopy_uninit_id ||
                          stub_id == arrayof_oop_disjoint_arraycopy_uninit_id);
    break;
  default:
    ShouldNotReachHere();
    break;
  }

  const Register s = c_rarg0;
  const Register d = c_rarg1;
  const Register count = c_rarg2;
  const RegSet dst_and_count = RegSet::of(d, count);
  RegSet prologue_saved = RegSet::of(s, d, count);

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address start = __ pc();
  __ enter();

  // Publish the post-push entry point when the caller asked for it.
  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }

  BarrierSetAssembler *barrier_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  barrier_asm->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, prologue_saved);

  if (is_oop) {
    // d and count are needed again after copy_memory, so keep them on the stack.
    __ push_reg(dst_and_count, sp);
  }

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    const bool record_unsafe_access = !is_oop && (!aligned || sizeof(jlong) == size);
    UnsafeMemoryAccessMark umam(this, record_unsafe_access, true);
    const BasicType copy_type = is_oop ? T_OBJECT : T_BYTE;
    copy_memory(decorators, copy_type, aligned, s, d, count, size);
  }

  if (is_oop) {
    __ pop_reg(dst_and_count, sp);
    if (VerifyOops) {
      verify_oop_array(size, d, count, t2);
    }
  }

  barrier_asm->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());

  __ leave();
  __ mv(x10, zr); // return 0
  __ ret();
  return start;
}
// Arguments:
// stub_id - is used to name the stub and identify all details of
// how to perform the copy.
//
// nooverlap_target - identifies the (post push) entry for the
// corresponding disjoint copy routine which can be
// jumped to if the ranges do not actually overlap
//
// entry - is assigned to the stub's post push entry point unless
// it is null
//
// Inputs:
// c_rarg0 - source array address
// c_rarg1 - destination array address
// c_rarg2 - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects:
// entry is set to the no-overlap entry point so it can be used by
// some other conjoint copy method
//
address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) {
  // Copy parameters selected by stub_id:
  //   size               - element size in bytes
  //   aligned            - whether ARRAYCOPY_ALIGNED is added to the decorators
  //   is_oop             - whether the elements are oops (barriers are emitted)
  //   dest_uninitialized - whether IS_DEST_UNINITIALIZED is added
  int size;
  bool aligned;
  bool is_oop;
  bool dest_uninitialized;

  switch (stub_id) {
  case jbyte_arraycopy_id:
  case arrayof_jbyte_arraycopy_id:
    size = sizeof(jbyte);
    aligned = (stub_id == arrayof_jbyte_arraycopy_id);
    is_oop = false;
    dest_uninitialized = false;
    break;
  case jshort_arraycopy_id:
  case arrayof_jshort_arraycopy_id:
    size = sizeof(jshort);
    aligned = (stub_id == arrayof_jshort_arraycopy_id);
    is_oop = false;
    dest_uninitialized = false;
    break;
  case jint_arraycopy_id:
  case arrayof_jint_arraycopy_id:
    size = sizeof(jint);
    aligned = (stub_id == arrayof_jint_arraycopy_id);
    is_oop = false;
    dest_uninitialized = false;
    break;
  case jlong_arraycopy_id:
    // jlong copies are always aligned, so callers are expected to use the
    // arrayof_jlong_arraycopy stub instead of requesting this one.
    ShouldNotReachHere();
    break;
  case arrayof_jlong_arraycopy_id:
    size = sizeof(jlong);
    aligned = true;
    is_oop = false;
    dest_uninitialized = false;
    break;
  case oop_arraycopy_id:
  case arrayof_oop_arraycopy_id:
  case oop_arraycopy_uninit_id:
  case arrayof_oop_arraycopy_uninit_id:
    // All oop variants share size/alignment; only the "uninit" ones mark
    // the destination as uninitialized.
    size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    aligned = !UseCompressedOops;
    is_oop = true;
    dest_uninitialized = (stub_id == oop_arraycopy_uninit_id ||
                          stub_id == arrayof_oop_arraycopy_uninit_id);
    break;
  default:
    ShouldNotReachHere();
  }

  const Register s = c_rarg0;
  const Register d = c_rarg1;
  const Register count = c_rarg2;
  const RegSet dst_and_count = RegSet::of(d, count);
  RegSet prologue_saved = RegSet::of(s, d, count);

  StubCodeMark mark(this, stub_id);
  address start = __ pc();
  __ enter();

  // Publish the post-push entry point when the caller asked for it.
  if (entry != nullptr) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  // Unsigned test: if (d - s) >= count * size the forward (disjoint) copy
  // is used, so jump to its entry; otherwise fall into the backward copy.
  Label L_backward_copy;
  __ sub(t0, d, s);
  __ slli(t1, count, exact_log2(size));
  __ bltu(t0, t1, L_backward_copy);
  __ j(nooverlap_target);
  __ bind(L_backward_copy);

  DecoratorSet decorators = IN_HEAP | IS_ARRAY;
  if (aligned) {
    decorators |= ARRAYCOPY_ALIGNED;
  }
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }

  BarrierSetAssembler *barrier_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  barrier_asm->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, prologue_saved);

  if (is_oop) {
    // d and count are needed again after copy_memory, so keep them on the stack.
    __ push_reg(dst_and_count, sp);
  }

  {
    // UnsafeMemoryAccess page error: continue after unsafe access
    const bool record_unsafe_access = !is_oop && (!aligned || sizeof(jlong) == size);
    UnsafeMemoryAccessMark umam(this, record_unsafe_access, true);
    const BasicType copy_type = is_oop ? T_OBJECT : T_BYTE;
    // The negated element size is handed to copy_memory for this variant.
    copy_memory(decorators, copy_type, aligned, s, d, count, -size);
  }

  if (is_oop) {
    __ pop_reg(dst_and_count, sp);
    if (VerifyOops) {
      verify_oop_array(size, d, count, t2);
    }
  }

  barrier_asm->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());

  __ leave();
  __ mv(x10, zr); // return 0
  __ ret();
  return start;
}
// Helper for generating a dynamic type check.
// Smashes t0, t1.
// Emits a subtype check of sub_klass against super_klass.
//
//   sub_klass          - klass being tested
//   super_check_offset - register forwarded to the fast-path check
//   super_klass        - klass that sub_klass is tested against
//   result             - not referenced by this helper
//   tmp1, tmp2         - temporaries forwarded to the slow-path check
//   L_success          - branch target taken when the check succeeds
//
// On failure the emitted code falls through to the instruction that
// follows this helper's output.
void generate_type_check(Register sub_klass,
Register super_check_offset,
Register super_klass,
Register result,
Register tmp1,
Register tmp2,
Label& L_success) {
assert_different_registers(sub_klass, super_check_offset, super_klass);
BLOCK_COMMENT("type_check:");
Label L_miss;
// Fast path: branches to L_success or L_miss when it can decide, otherwise
// falls through into the slow path emitted next.
__ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, super_check_offset);
// Slow path: branches to L_success on a hit; no failure label is passed,
// so a miss falls through.
__ check_klass_subtype_slow_path(sub_klass, super_klass, tmp1, tmp2, &L_success, nullptr);
// Fall through on failure!
__ BIND(L_miss);
}
//
// Generate checkcasting array copy stub
//
// Input:
// c_rarg0 - source array address
// c_rarg1 - destination array address
// c_rarg2 - element count, treated as ssize_t, can be zero
// c_rarg3 - size_t ckoff (super_check_offset)
// c_rarg4 - oop ckval (super_klass)
//
// Output:
// x10 == 0 - success
// x10 == -1^K - failure, where K is partial transfer count
//
// Generates the element-wise checkcasting oop copy stub selected by stub_id
// (plain or destination-uninitialized variant). If 'entry' is non-null it
// receives the address just after the frame push. Returns the stub start.
address generate_checkcast_copy(StubGenStubId stub_id, address* entry) {
// Only the "uninit" variant adds IS_DEST_UNINITIALIZED to the decorators.
bool dest_uninitialized;
switch (stub_id) {
case checkcast_arraycopy_id:
dest_uninitialized = false;
break;
case checkcast_arraycopy_uninit_id:
dest_uninitialized = true;
break;
default:
ShouldNotReachHere();
}
Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
// Input registers (after setup_arg_regs)
const Register from = c_rarg0; // source array address
const Register to = c_rarg1; // destination array address
const Register count = c_rarg2; // elementscount
const Register ckoff = c_rarg3; // super_check_offset
const Register ckval = c_rarg4; // super_klass
// Register sets handed to the barrier prologue and epilogue respectively.
RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
RegSet wb_post_saved_regs = RegSet::of(count);
// Registers used as temps (x7, x9, x18 are save-on-entry)
// x7, x9, x18 and x19 are pushed below and popped at L_done_pop.
const Register count_save = x19; // orig elementscount
const Register start_to = x18; // destination array start address
const Register copied_oop = x7; // actual oop copied
const Register r9_klass = x9; // oop._klass
// Registers used as gc temps (x15, x16, x17 are save-on-call)
const Register gct1 = x15, gct2 = x16, gct3 = x17;
//---------------------------------------------------------------
// Assembler stub will be used for this call to arraycopy
// if the two arrays are subtypes of Object[] but the
// destination array type is not equal to or a supertype
// of the source type. Each element must be separately
// checked.
assert_different_registers(from, to, count, ckoff, ckval, start_to,
copied_oop, r9_klass, count_save);
__ align(CodeEntryAlignment);
StubCodeMark mark(this, stub_id);
address start = __ pc();
__ enter(); // required for proper stackwalking of RuntimeStub frame
// Caller of this entry point must set up the argument registers.
if (entry != nullptr) {
*entry = __ pc();
BLOCK_COMMENT("Entry:");
}
// Empty array: Nothing to do
// (L_done is past the register pop, so the push below is skipped too.)
__ beqz(count, L_done);
__ push_reg(RegSet::of(x7, x9, x18, x19), sp);
#ifdef ASSERT
BLOCK_COMMENT("assert consistent ckoff/ckval");
// The ckoff and ckval must be mutually consistent,
// even though caller generates both.
// start_to is used as a scratch register here; it is set for real below.
{ Label L;
int sco_offset = in_bytes(Klass::super_check_offset_offset());
__ lwu(start_to, Address(ckval, sco_offset));
__ beq(ckoff, start_to, L);
__ stop("super_check_offset inconsistent");
__ bind(L);
}
#endif //ASSERT
DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
if (dest_uninitialized) {
decorators |= IS_DEST_UNINITIALIZED;
}
bool is_oop = true;
// Bytes per heap oop element: 4 with compressed oops, 8 otherwise.
int element_size = UseCompressedOops ? 4 : 8;
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
// save the original count
__ mv(count_save, count);
// Copy from low to high addresses
__ mv(start_to, to); // Save destination array start address
__ j(L_load_element);
// ======== begin loop ========
// (Loop is rotated; its entry is L_load_element.)
// Loop control:
// for count to 0 do
// copied_oop = load_heap_oop(from++)
// ... generate_type_check ...
// store_heap_oop(to++, copied_oop)
// end
__ align(OptoLoopAlignment);
__ BIND(L_store_element);
bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
Address(to, 0), copied_oop,
gct1, gct2, gct3);
__ addi(to, to, UseCompressedOops ? 4 : 8);
__ subi(count, count, 1);
// All elements stored: count is now zero, which becomes the return value.
__ beqz(count, L_do_card_marks);
// ======== loop entry is here ========
__ BIND(L_load_element);
bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
copied_oop, Address(from, 0),
gct1);
__ addi(from, from, UseCompressedOops ? 4 : 8);
// A null element is stored without a type check.
__ beqz(copied_oop, L_store_element);
__ load_klass(r9_klass, copied_oop);// query the object klass
BLOCK_COMMENT("type_check:");
generate_type_check(r9_klass, /*sub_klass*/
ckoff, /*super_check_offset*/
ckval, /*super_klass*/
x10, /*result*/
gct1, /*tmp1*/
gct2, /*tmp2*/
L_store_element);
// Fall through on failure!
// ======== end loop ========
// It was a real error; we must depend on the caller to finish the job.
// Register count = remaining oops, count_orig = total oops.
// Emit GC store barriers for the oops we have copied and report
// their number to the caller.
__ sub(count, count_save, count); // K = partially copied oop count
__ xori(count, count, -1); // report (-1^K) to caller
// Skip the barrier epilogue when the value to report is zero.
__ beqz(count, L_done_pop);
__ BIND(L_do_card_marks);
// The epilogue is given the saved destination start and the original count.
bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0, wb_post_saved_regs);
__ bind(L_done_pop);
__ pop_reg(RegSet::of(x7, x9, x18, x19), sp);
inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
__ bind(L_done);
// Return value: 0 on success (count reached zero), otherwise -1^K.
__ mv(x10, count);
__ leave();
__ ret();
return start;
}
// Perform range checks on the proposed arraycopy.
// Kills temp, but nothing else.
// Also, clean the sign bits of src_pos and dst_pos.
void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                            Register src_pos, // source position (c_rarg1)
                            Register dst,     // destination array oop (c_rarg2)
                            Register dst_pos, // destination position (c_rarg3)
                            Register length,
                            Register temp,
                            Label& L_failed) {
  BLOCK_COMMENT("arraycopy_range_checks:");
  assert_different_registers(t0, temp);

  // The same bounds check is emitted first for the source, then for the
  // destination; the tables below pair each array with its position.
  const Register arrays[]    = { src, dst };
  const Register positions[] = { src_pos, dst_pos };
  const int length_offset = arrayOopDesc::length_offset_in_bytes();

  for (int i = 0; i < 2; i++) {
    // if [pos + length > arrayOop(array)->length()] then FAIL
    // (32-bit add, unsigned compare against the array length)
    __ lwu(t0, Address(arrays[i], length_offset));
    __ addw(temp, length, positions[i]);
    __ bgtu(temp, t0, L_failed);
  }

  // Both checks passed: clear the upper 32 bits of src_pos and dst_pos.
  for (int i = 0; i < 2; i++) {
    __ zext(positions[i], positions[i], 32);
  }

  BLOCK_COMMENT("arraycopy_range_checks done");
}
//
// Generate 'unsafe' array copy stub
// Though just as safe as the other stubs, it takes an unscaled
// size_t argument instead of an element count.
//
// Input:
// c_rarg0 - source array address
// c_rarg1 - destination array address
// c_rarg2 - byte count, treated as ssize_t, can be zero
//
// Examines the alignment of the operands and dispatches
// to a long, int, short, or byte copy loop.
//
address generate_unsafe_copy(address byte_copy_entry,
                             address short_copy_entry,
                             address int_copy_entry,
                             address long_copy_entry) {
  assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
              int_copy_entry != nullptr && long_copy_entry != nullptr);

  // Incoming arguments; the third one is a byte count until it is scaled
  // to an element count just before each jump.
  const Register src_addr = c_rarg0;
  const Register dst_addr = c_rarg1;
  const Register nbytes = c_rarg2;
  // Scratch register holding the OR of both addresses and the byte count.
  const Register bits = t0;

  Label L_long_aligned, L_int_aligned, L_short_aligned;

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, StubGenStubId::unsafe_arraycopy_id);
  address start = __ pc();
  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // bump this on entry, not on exit:
  inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

  // A low bit set in any of src, dst or the byte count rules out the
  // corresponding wider copy loop, so test them all at once.
  __ orr(bits, src_addr, dst_addr);
  __ orr(bits, bits, nbytes);

  __ andi(bits, bits, BytesPerLong - 1);
  __ beqz(bits, L_long_aligned);
  __ andi(bits, bits, BytesPerInt - 1);
  __ beqz(bits, L_int_aligned);
  __ test_bit(bits, bits, 0);
  __ beqz(bits, L_short_aligned);
  // Nothing better than byte alignment: the byte count is the element count.
  __ j(RuntimeAddress(byte_copy_entry));

  __ BIND(L_short_aligned);
  __ srli(nbytes, nbytes, LogBytesPerShort); // size => short_count
  __ j(RuntimeAddress(short_copy_entry));

  __ BIND(L_int_aligned);
  __ srli(nbytes, nbytes, LogBytesPerInt); // size => int_count
  __ j(RuntimeAddress(int_copy_entry));

  __ BIND(L_long_aligned);
  __ srli(nbytes, nbytes, LogBytesPerLong); // size => long_count
  __ j(RuntimeAddress(long_copy_entry));

  return start;
}
//
// Generate generic array copy stubs
//
// Input:
// c_rarg0 - src oop
// c_rarg1 - src_pos (32-bits)
// c_rarg2 - dst oop
// c_rarg3 - dst_pos (32-bits)
// c_rarg4 - element count (32-bits)
//
// Output:
// x10 == 0 - success
// x10 == -1^K - failure, where K is partial transfer count
//
// Emits the generic arraycopy stub. After validating its arguments the stub
// tail-jumps to one of the supplied copy entry points:
//   byte/short/int/long_copy_entry - primitive arrays, chosen by element size
//   oop_copy_entry                 - object arrays that need no per-element check
//   checkcast_copy_entry           - object arrays that need per-element checks
// Any failed validation returns -1 in x10. Returns the stub start address.
address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
                              address int_copy_entry, address oop_copy_entry,
                              address long_copy_entry, address checkcast_copy_entry) {
  assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
              int_copy_entry != nullptr && oop_copy_entry != nullptr &&
              long_copy_entry != nullptr && checkcast_copy_entry != nullptr);
  Label L_failed, L_failed_0, L_objArray;
  Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

  // Input registers
  const Register src = c_rarg0; // source array oop
  const Register src_pos = c_rarg1; // source position
  const Register dst = c_rarg2; // destination array oop
  const Register dst_pos = c_rarg3; // destination position
  const Register length = c_rarg4;

  // Registers used as temps
  const Register dst_klass = c_rarg5;

  __ align(CodeEntryAlignment);
  StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id;
  StubCodeMark mark(this, stub_id);
  address start = __ pc();
  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // bump this on entry, not on exit:
  inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

  //-----------------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the following conditions are met:
  //
  // (1) src and dst must not be null.
  // (2) src_pos must not be negative.
  // (3) dst_pos must not be negative.
  // (4) length must not be negative.
  // (5) src klass and dst klass should be the same and not null.
  // (6) src and dst should be arrays.
  // (7) src_pos + length must not exceed length of src.
  // (8) dst_pos + length must not exceed length of dst.
  //

  // if src is null then return -1
  __ beqz(src, L_failed);

  // if [src_pos < 0] then return -1
  __ sext(t0, src_pos, 32);
  __ bltz(t0, L_failed);

  // if dst is null then return -1
  __ beqz(dst, L_failed);

  // if [dst_pos < 0] then return -1
  __ sext(t0, dst_pos, 32);
  __ bltz(t0, L_failed);

  // registers used as temp
  const Register scratch_length = x28; // elements count to copy
  const Register scratch_src_klass = x29; // array klass
  const Register lh = x30; // layout helper

  // if [length < 0] then return -1
  __ sext(scratch_length, length, 32); // length (elements count, 32-bits value)
  __ bltz(scratch_length, L_failed);

  __ load_klass(scratch_src_klass, src);
#ifdef ASSERT
  {
    BLOCK_COMMENT("assert klasses not null {");
    Label L1, L2;
    __ bnez(scratch_src_klass, L2); // it is broken if klass is null
    __ bind(L1);
    __ stop("broken null klass");
    __ bind(L2);
    __ load_klass(t0, dst, t1);
    __ beqz(t0, L1); // this would be broken also
    BLOCK_COMMENT("} assert klasses not null done");
  }
#endif

  // Load layout helper (32-bits)
  //
  // |array_tag| | header_size | element_type | |log2_element_size|
  // 32 30 24 16 8 2 0
  //
  // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
  //
  const int lh_offset = in_bytes(Klass::layout_helper_offset());

  // Handle objArrays completely differently...
  const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
  // Sign-extending load, compared against the constant materialized by mv.
  __ lw(lh, Address(scratch_src_klass, lh_offset));
  __ mv(t0, objArray_lh);
  __ beq(lh, t0, L_objArray);

  // if [src->klass() != dst->klass()] then return -1
  __ load_klass(t1, dst);
  __ bne(t1, scratch_src_klass, L_failed);

  // if src is not an array then return -1
  // i.e. (lh >= 0): both array tags set the sign bit of the layout helper
  __ bgez(lh, L_failed);

  // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
  {
    BLOCK_COMMENT("assert primitive array {");
    Label L;
    __ mv(t1, (int32_t)(Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
    __ bge(lh, t1, L);
    __ stop("must be a primitive array");
    __ bind(L);
    BLOCK_COMMENT("} assert primitive array done");
  }
#endif

  arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                         t1, L_failed);

  // TypeArrayKlass
  //
  // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
  // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
  //

  const Register t0_offset = t0; // array offset
  const Register x30_elsize = lh; // element size

  // Get array_header_in_bytes()
  // The header-size field is isolated by shifting it to the top of the
  // register and then back down to bit 0.
  int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1);
  int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
  __ slli(t0_offset, lh, XLEN - lh_header_size_msb); // left shift to remove 24 ~ 32;
  __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset

  __ add(src, src, t0_offset); // src array offset
  __ add(dst, dst, t0_offset); // dst array offset
  BLOCK_COMMENT("choose copy loop based on element size");

  // next registers should be set before the jump to corresponding stub
  const Register from = c_rarg0; // source array address
  const Register to = c_rarg1; // destination array address
  const Register count = c_rarg2; // elements count

  // 'from', 'to', 'count' registers should be set in such order
  // since they are the same as 'src', 'src_pos', 'dst'.

  assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");

  // The possible values of elsize are 0-3, i.e. exact_log2(element
  // size in bytes). We do a simple bitwise binary search.
  __ BIND(L_copy_bytes);
  __ test_bit(t0, x30_elsize, 1);
  __ bnez(t0, L_copy_ints);
  __ test_bit(t0, x30_elsize, 0);
  __ bnez(t0, L_copy_shorts);
  __ add(from, src, src_pos); // src_addr
  __ add(to, dst, dst_pos); // dst_addr
  __ sext(count, scratch_length, 32); // length
  __ j(RuntimeAddress(byte_copy_entry));

  __ BIND(L_copy_shorts);
  __ shadd(from, src_pos, src, t0, 1); // src_addr
  __ shadd(to, dst_pos, dst, t0, 1); // dst_addr
  __ sext(count, scratch_length, 32); // length
  __ j(RuntimeAddress(short_copy_entry));

  __ BIND(L_copy_ints);
  __ test_bit(t0, x30_elsize, 0);
  __ bnez(t0, L_copy_longs);
  __ shadd(from, src_pos, src, t0, 2); // src_addr
  __ shadd(to, dst_pos, dst, t0, 2); // dst_addr
  __ sext(count, scratch_length, 32); // length
  __ j(RuntimeAddress(int_copy_entry));

  __ BIND(L_copy_longs);
#ifdef ASSERT
  {
    BLOCK_COMMENT("assert long copy {");
    Label L;
    __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x30_elsize
    __ sext(lh, lh, 32);
    __ mv(t0, LogBytesPerLong);
    __ beq(x30_elsize, t0, L);
    __ stop("must be long copy, but elsize is wrong");
    __ bind(L);
    BLOCK_COMMENT("} assert long copy done");
  }
#endif
  __ shadd(from, src_pos, src, t0, 3); // src_addr
  __ shadd(to, dst_pos, dst, t0, 3); // dst_addr
  __ sext(count, scratch_length, 32); // length
  __ j(RuntimeAddress(long_copy_entry));

  // ObjArrayKlass
  __ BIND(L_objArray);
  // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]

  Label L_plain_copy, L_checkcast_copy;
  // test array classes for subtyping
  __ load_klass(t2, dst);
  __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality

  // Identically typed arrays can be copied without element-wise checks.
  arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                         t1, L_failed);

  __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
  __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
  __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
  __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
  __ sext(count, scratch_length, 32); // length
  __ BIND(L_plain_copy);
  __ j(RuntimeAddress(oop_copy_entry));

  __ BIND(L_checkcast_copy);
  // live at this point: scratch_src_klass, scratch_length, t2 (dst_klass)
  {
    // Before looking at dst.length, make sure dst is also an objArray.
    // The layout helper must be loaded with a sign-extending lw, exactly as
    // for the source klass above: objArray_lh has its top (sign) bit set, so
    // a zero-extended lwu value could never equal the constant produced by
    // mv and every copy reaching this point would be rejected.
    __ lw(t0, Address(t2, lh_offset));
    __ mv(t1, objArray_lh);
    __ bne(t0, t1, L_failed);

    // It is safe to examine both src.length and dst.length.
    // (t2 is passed as the temp here, so the dst klass is reloaded below.)
    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           t2, L_failed);

    __ load_klass(dst_klass, dst); // reload

    // Marshal the base address arguments now, freeing registers.
    __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
    __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
    __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ sext(count, length, 32); // length (reloaded)
    const Register sco_temp = c_rarg3; // this register is free now
    assert_different_registers(from, to, count, sco_temp,
                               dst_klass, scratch_src_klass);

    // Generate the type check.
    const int sco_offset = in_bytes(Klass::super_check_offset_offset());
    __ lwu(sco_temp, Address(dst_klass, sco_offset));

    // Smashes t0, t1
    // If the source array klass is a subtype of the destination array klass
    // the plain oop copy is used; otherwise fall through to checkcast setup.
    generate_type_check(scratch_src_klass, sco_temp, dst_klass, noreg, noreg, noreg, L_plain_copy);

    // Fetch destination element klass from the ObjArrayKlass header.
    int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
    __ ld(dst_klass, Address(dst_klass, ek_offset));
    __ lwu(sco_temp, Address(dst_klass, sco_offset));

    // the checkcast_copy loop needs two extra arguments:
    assert(c_rarg3 == sco_temp, "#3 already in place");
    // Set up arguments for checkcast_copy_entry.
    __ mv(c_rarg4, dst_klass); // dst.klass.element_klass
    __ j(RuntimeAddress(checkcast_copy_entry));
  }

  __ BIND(L_failed);
  __ mv(x10, -1);
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret();

  return start;
}
//
// Generate stub for array fill. If "aligned" is true, the
// "to" address is assumed to be heapword aligned.
//
// Arguments for generated stub:
// to: c_rarg0
// value: c_rarg1
// count: c_rarg2 treated as signed
//
// Generates the array fill stub selected by stub_id and returns its start
// address. stub_id determines the element type (byte, short or int) and
// whether the "arrayof" (aligned) variant is generated, which skips the
// destination alignment prologue.
address generate_fill(StubGenStubId stub_id) {
BasicType t;
bool aligned;
switch (stub_id) {
case jbyte_fill_id:
t = T_BYTE;
aligned = false;
break;
case jshort_fill_id:
t = T_SHORT;
aligned = false;
break;
case jint_fill_id:
t = T_INT;
aligned = false;
break;
case arrayof_jbyte_fill_id:
t = T_BYTE;
aligned = true;
break;
case arrayof_jshort_fill_id:
t = T_SHORT;
aligned = true;
break;
case arrayof_jint_fill_id:
t = T_INT;
aligned = true;
break;
default:
ShouldNotReachHere();
};
__ align(CodeEntryAlignment);
StubCodeMark mark(this, stub_id);
address start = __ pc();
BLOCK_COMMENT("Entry:");
const Register to = c_rarg0; // source array address
const Register value = c_rarg1; // value
const Register count = c_rarg2; // elements count
// NOTE(review): bz_base is not referenced anywhere else in this function.
const Register bz_base = x28; // base for block_zero routine
const Register cnt_words = x29; // temp register
const Register tmp_reg = t1;
__ enter();
Label L_fill_elements;
// shift = log2(element size in bytes); (8 >> shift) is the number of
// elements in 8 bytes.
int shift = -1;
switch (t) {
case T_BYTE:
shift = 0;
// Short arrays (< 8 bytes) fill by element
__ mv(tmp_reg, 8 >> shift);
__ bltu(count, tmp_reg, L_fill_elements);
// Zero extend value
// 8 bit -> 16 bit
__ zext(value, value, 8);
__ slli(tmp_reg, value, 8);
__ orr(value, value, tmp_reg);
// 16 bit -> 32 bit
__ slli(tmp_reg, value, 16);
__ orr(value, value, tmp_reg);
break;
case T_SHORT:
shift = 1;
// Short arrays (< 8 bytes) fill by element
__ mv(tmp_reg, 8 >> shift);
__ bltu(count, tmp_reg, L_fill_elements);
// Zero extend value
// 16 bit -> 32 bit
__ zext(value, value, 16);
__ slli(tmp_reg, value, 16);
__ orr(value, value, tmp_reg);
break;
case T_INT:
shift = 2;
// Short arrays (< 8 bytes) fill by element
__ mv(tmp_reg, 8 >> shift);
__ bltu(count, tmp_reg, L_fill_elements);
break;
default: ShouldNotReachHere();
}
// Align source address at 8 bytes address boundary.
// The switch below runs at generation time and deliberately falls through,
// so a byte stub emits the 1-, 2- and 4-byte alignment steps, a short stub
// the 2- and 4-byte steps, and an int stub only the 4-byte step.
Label L_skip_align1, L_skip_align2, L_skip_align4;
if (!aligned) {
switch (t) {
case T_BYTE:
// One byte misalignment happens only for byte arrays.
__ test_bit(t0, to, 0);
__ beqz(t0, L_skip_align1);
__ sb(value, Address(to, 0));
__ addi(to, to, 1);
__ subiw(count, count, 1);
__ bind(L_skip_align1);
// Fallthrough
case T_SHORT:
// Two bytes misalignment happens only for byte and short (char) arrays.
__ test_bit(t0, to, 1);
__ beqz(t0, L_skip_align2);
__ sh(value, Address(to, 0));
__ addi(to, to, 2);
__ subiw(count, count, 2 >> shift);
__ bind(L_skip_align2);
// Fallthrough
case T_INT:
// Align to 8 bytes, we know we are 4 byte aligned to start.
__ test_bit(t0, to, 2);
__ beqz(t0, L_skip_align4);
__ sw(value, Address(to, 0));
__ addi(to, to, 4);
__ subiw(count, count, 4 >> shift);
__ bind(L_skip_align4);
break;
default: ShouldNotReachHere();
}
}
//
// Fill large chunks
//
__ srliw(cnt_words, count, 3 - shift); // number of words
// 32 bit -> 64 bit
__ zext(value, value, 32);
__ slli(tmp_reg, value, 32);
__ orr(value, value, tmp_reg);
// count -= cnt_words * (elements per word): elements left after the word fill.
__ slli(tmp_reg, cnt_words, 3 - shift);
__ subw(count, count, tmp_reg);
{
__ fill_words(to, cnt_words, value);
}
// Remaining count is less than 8 bytes and address is heapword aligned.
// Each bit of the remaining element count selects one store of the
// matching width.
Label L_fill_2, L_fill_4, L_exit1;
switch (t) {
case T_BYTE:
__ test_bit(t0, count, 0);
__ beqz(t0, L_fill_2);
__ sb(value, Address(to, 0));
__ addi(to, to, 1);
__ bind(L_fill_2);
__ test_bit(t0, count, 1);
__ beqz(t0, L_fill_4);
__ sh(value, Address(to, 0));
__ addi(to, to, 2);
__ bind(L_fill_4);
__ test_bit(t0, count, 2);
__ beqz(t0, L_exit1);
__ sw(value, Address(to, 0));
break;
case T_SHORT:
__ test_bit(t0, count, 0);
__ beqz(t0, L_fill_4);
__ sh(value, Address(to, 0));
__ addi(to, to, 2);
__ bind(L_fill_4);
__ test_bit(t0, count, 1);
__ beqz(t0, L_exit1);
__ sw(value, Address(to, 0));
break;
case T_INT:
__ beqz(count, L_exit1);
__ sw(value, Address(to, 0));
break;
default: ShouldNotReachHere();
}
__ bind(L_exit1);
__ leave();
__ ret();
// Handle copies less than 8 bytes.
// Reached before 'value' has been widened, so only element-sized stores
// are used here.
Label L_loop1, L_loop2, L_exit2;
__ bind(L_fill_elements);
__ beqz(count, L_exit2);
switch (t) {
case T_BYTE:
__ bind(L_loop1);
__ sb(value, Address(to, 0));
__ addi(to, to, 1);
__ subiw(count, count, 1);
__ bnez(count, L_loop1);
break;
case T_SHORT:
__ bind(L_loop2);
__ sh(value, Address(to, 0));
__ addi(to, to, 2);
__ subiw(count, count, 2 >> shift);
__ bnez(count, L_loop2);
break;
case T_INT:
// count is non-zero and below 2 here, so exactly one int is stored.
__ sw(value, Address(to, 0));
break;
default: ShouldNotReachHere();
}
__ bind(L_exit2);
__ leave();
__ ret();
return start;
}
// Generates all arraycopy and fill stubs and publishes their addresses in
// StubRoutines. The statement order matters: each generate_disjoint_copy
// call stores its post-push entry in 'entry', and the generate_conjoint_copy
// call that follows consumes it as the no-overlap target.
void generate_arraycopy_stubs() {
// Scratch slot passing each disjoint stub's entry to its conjoint partner.
address entry = nullptr;
// Conjoint entry points kept for the unsafe and generic copy stubs below.
address entry_jbyte_arraycopy = nullptr;
address entry_jshort_arraycopy = nullptr;
address entry_jint_arraycopy = nullptr;
address entry_oop_arraycopy = nullptr;
address entry_jlong_arraycopy = nullptr;
address entry_checkcast_arraycopy = nullptr;
// Shared helper stubs, generated before any of the copy stubs.
generate_copy_longs(StubGenStubId::copy_byte_f_id, copy_f, c_rarg0, c_rarg1, t1);
generate_copy_longs(StubGenStubId::copy_byte_b_id, copy_b, c_rarg0, c_rarg1, t1);
StubRoutines::riscv::_zero_blocks = generate_zero_blocks();
//*** jbyte
// Always need aligned and unaligned versions
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry);
StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr);
//*** jshort
// Always need aligned and unaligned versions
StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry);
StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr);
//*** jint
// Aligned versions
// (entry_jint_arraycopy is written here and overwritten by the unaligned
// conjoint stub generated just below.)
StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
// In 64 bit we need both aligned and unaligned versions of jint arraycopy.
// entry_jint_arraycopy always points to the unaligned version
StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry);
StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy);
//*** jlong
// It is always aligned
// so the non-arrayof jlong routines simply alias the arrayof stubs.
StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
//*** oops
StubRoutines::_arrayof_oop_disjoint_arraycopy
= generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_oop_arraycopy
= generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
// Aligned versions without pre-barriers
StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
= generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
StubRoutines::_arrayof_oop_arraycopy_uninit
= generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr);
// The non-arrayof oop routines alias the arrayof stubs generated above.
StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy);
StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr);
// The dispatching stubs jump to the conjoint entry points recorded above.
StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy,
entry_jshort_arraycopy,
entry_jint_arraycopy,
entry_jlong_arraycopy);
StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy,
entry_jshort_arraycopy,
entry_jint_arraycopy,
entry_oop_arraycopy,
entry_jlong_arraycopy,
entry_checkcast_arraycopy);
// Fill stubs: unaligned and arrayof (aligned) variants for byte, short, int.
StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id);
StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id);
StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id);
StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id);
StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id);
StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id);
}
void generate_aes_loadkeys(const Register &key, VectorRegister *working_vregs, int rounds) {
const int step = 16;
for (int i = 0; i < rounds; i++) {
__ vle32_v(working_vregs[i], key);
// The keys are stored in little-endian array, while we need
// to operate in big-endian.
// So performing an endian-swap here with vrev8.v instruction
__ vrev8_v(working_vregs[i], working_vregs[i]);
__ addi(key, key, step);
}
}
// Emits the encryption sequence for the block held in `res`, using the round
// keys in working_vregs[0 .. rounds-1]:
//   - XOR with working_vregs[0],
//   - vaesem_vv with working_vregs[1 .. rounds-2],
//   - vaesef_vv with working_vregs[rounds-1].
// The result is left in `res`.
void generate_aes_encrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
  assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");

  const int final_round = rounds - 1;

  __ vxor_vv(res, res, working_vregs[0]);

  int round = 1;
  while (round < final_round) {
    __ vaesem_vv(res, working_vregs[round]);
    round++;
  }

  __ vaesef_vv(res, working_vregs[final_round]);
}
// Stub that encrypts a single AES block.
//
// Inputs:
//   c_rarg0   - source byte array address
//   c_rarg1   - destination byte array address
//   c_rarg2   - K (key) in little endian int array
//
// c_rarg3 and t2 are used as scratch registers (key array length and the
// constant it is compared with). c_rarg0 is set to 0 before returning.
//
address generate_aescrypt_encryptBlock() {
  assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, StubGenStubId::aescrypt_encryptBlock_id);

  Label L_aes128, L_aes192;

  const Register from   = c_rarg0;  // source array address
  const Register to     = c_rarg1;  // destination array address
  const Register key    = c_rarg2;  // key array address
  const Register keylen = c_rarg3;  // scratch: length of the key array

  // v4..v18 receive the round keys, v19 carries the block being processed.
  VectorRegister working_vregs[] = {
    v4,  v5,  v6,  v7,  v8,  v9,  v10, v11,
    v12, v13, v14, v15, v16, v17, v18
  };
  const VectorRegister res = v19;

  // Emits the complete tail for one key size: load `rounds` round keys
  // (which also advances key by rounds*16), encrypt, store the block to the
  // destination, return 0.
  auto encrypt_store_and_return = [&](int rounds) {
    generate_aes_loadkeys(key, working_vregs, rounds);
    generate_aes_encrypt(res, working_vregs, rounds);
    __ vse32_v(res, to);
    __ mv(c_rarg0, 0);
    __ leave();
    __ ret();
  };

  address start = __ pc();
  __ enter();

  // key addresses the first array element, so the length field is found at a
  // (negative) header-relative offset from it.
  __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

  // Four 32-bit elements per vector operation.
  __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
  __ vle32_v(res, from);

  // Dispatch on the key array length: below 52 -> L_aes128, exactly 52 ->
  // L_aes192, anything else falls through to the biggest case
  // (256-bit key size).
  __ mv(t2, 52);
  __ blt(keylen, t2, L_aes128);
  __ beq(keylen, t2, L_aes192);

  encrypt_store_and_return(15);

  __ bind(L_aes192);
  encrypt_store_and_return(13);

  __ bind(L_aes128);
  encrypt_store_and_return(11);

  return start;
}
// Emits the decryption sequence for the block held in `res`. The round keys
// in working_vregs[0 .. rounds-1] are applied in reverse order:
//   - XOR with working_vregs[rounds-1],
//   - vaesdm_vv with working_vregs[rounds-2 .. 1],
//   - vaesdf_vv with working_vregs[0].
// The result is left in `res`.
void generate_aes_decrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
  assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");

  int round = rounds - 1;
  __ vxor_vv(res, res, working_vregs[round]);

  for (--round; round > 0; --round) {
    __ vaesdm_vv(res, working_vregs[round]);
  }

  __ vaesdf_vv(res, working_vregs[0]);
}
// Stub that decrypts a single AES block.
//
// Inputs:
//   c_rarg0   - source byte array address
//   c_rarg1   - destination byte array address
//   c_rarg2   - K (key) in little endian int array
//
// c_rarg3 and t2 are used as scratch registers (key array length and the
// constant it is compared with). c_rarg0 is set to 0 before returning.
//
address generate_aescrypt_decryptBlock() {
  assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, StubGenStubId::aescrypt_decryptBlock_id);

  Label L_aes128, L_aes192;

  const Register from   = c_rarg0;  // source array address
  const Register to     = c_rarg1;  // destination array address
  const Register key    = c_rarg2;  // key array address
  const Register keylen = c_rarg3;  // scratch: length of the key array

  // v4..v18 receive the round keys, v19 carries the block being processed.
  VectorRegister working_vregs[] = {
    v4,  v5,  v6,  v7,  v8,  v9,  v10, v11,
    v12, v13, v14, v15, v16, v17, v18
  };
  const VectorRegister res = v19;

  // Emits the complete tail for one key size: load `rounds` round keys
  // (which also advances key by rounds*16), decrypt, store the block to the
  // destination, return 0.
  auto decrypt_store_and_return = [&](int rounds) {
    generate_aes_loadkeys(key, working_vregs, rounds);
    generate_aes_decrypt(res, working_vregs, rounds);
    __ vse32_v(res, to);
    __ mv(c_rarg0, 0);
    __ leave();
    __ ret();
  };

  address start = __ pc();
  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // key addresses the first array element, so the length field is found at a
  // (negative) header-relative offset from it.
  __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

  // Four 32-bit elements per vector operation.
  __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
  __ vle32_v(res, from);

  // Dispatch on the key array length: below 52 -> L_aes128, exactly 52 ->
  // L_aes192, anything else falls through to the biggest case
  // (256-bit key size).
  __ mv(t2, 52);
  __ blt(keylen, t2, L_aes128);
  __ beq(keylen, t2, L_aes192);

  decrypt_store_and_return(15);

  __ bind(L_aes192);
  decrypt_store_and_return(13);

  __ bind(L_aes128);
  decrypt_store_and_return(11);

  return start;
}
// Emits code comparing 8 characters of a Latin1 string (strL) with
// 8 characters of a UTF-16 string (strU).
//
// One 8-byte word is loaded from strL and two 8-byte words from strU; the
// Latin1 word is inflated half by half into tmpL and compared against tmpU.
// Control branches to DIFF on the first mismatching half, with x30 holding
// the XOR of tmpU and tmpL. strL ends up advanced by 8 bytes; strU is
// advanced by 8 bytes per half that was loaded.
//
// x12 and x30 are used as temporaries (and t0 by the debug-only alignment
// check).
void compare_string_8_x_LU(Register tmpL, Register tmpU,
                           Register strL, Register strU, Label& DIFF) {
  const Register scratch     = x30;
  const Register latin1_word = x12;

  const int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
  assert((base_offset % (UseCompactObjectHeaders ? 4 :
                         (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be");

  // Granularity handed to load_long_misaligned for the UTF-16 loads: 8 when
  // the array base offset is a multiple of 8, otherwise 4.
  const int utf16_load_granularity = ((base_offset % 8) == 0) ? 8 : 4;

#ifdef ASSERT
  if (AvoidUnalignedAccesses) {
    // The 8-byte Latin1 load below requires strL to be 8-byte aligned.
    Label L_aligned;
    __ andi(t0, strL, 0x7);
    __ beqz(t0, L_aligned);
    __ stop("bad alignment");
    __ bind(L_aligned);
  }
#endif

  __ ld(latin1_word, Address(strL));
  __ addi(strL, strL, wordSize);

  // Characters 0..3: low half of the Latin1 word.
  __ load_long_misaligned(tmpU, Address(strU), scratch, utf16_load_granularity);
  __ addi(strU, strU, wordSize);
  __ inflate_lo32(tmpL, latin1_word);
  __ xorr(scratch, tmpU, tmpL);
  __ bnez(scratch, DIFF);

  // Characters 4..7: high half of the Latin1 word.
  __ load_long_misaligned(tmpU, Address(strU), scratch, utf16_load_granularity);
  __ addi(strU, strU, wordSize);
  __ inflate_hi32(tmpL, latin1_word);
  __ xorr(scratch, tmpU, tmpL);
  __ bnez(scratch, DIFF);
}
// Stub comparing two long strings that use different encodings (one Latin1,
// one UTF-16). stub_id selects the variant: compare_long_string_LU_id means
// str1 is Latin1 and str2 is UTF-16, compare_long_string_UL_id the reverse.
//
// Register usage:
// x10 = result
// x11 = str1
// x12 = cnt1 (not read here; reused as tmp4 and by compare_string_8_x_LU)
// x13 = str2
// x14 = cnt2 (number of characters still to compare)
// x28 = tmp1
// x29 = tmp2
// x30 = tmp3
//
// When a mismatch is found, result is set to the difference between the
// first pair of differing characters (the one from str1 minus the one from
// str2). When no mismatch is found the stub returns without writing result.
// NOTE(review): presumably the caller presets result for the "equal" case -
// confirm against the call site.
address generate_compare_long_string_different_encoding(StubGenStubId stub_id) {
bool isLU;
switch (stub_id) {
case compare_long_string_LU_id:
isLU = true;
break;
case compare_long_string_UL_id:
isLU = false;
break;
default:
ShouldNotReachHere();
};
__ align(CodeEntryAlignment);
StubCodeMark mark(this, stub_id);
address entry = __ pc();
Label SMALL_LOOP, TAIL, LOAD_LAST, DONE, CALCULATE_DIFFERENCE;
const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14,
tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12;
int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
assert((base_offset % (UseCompactObjectHeaders ? 4 :
(UseCompressedClassPointers ? 8 : 4))) == 0, "Must be");
// Map the Latin1 / UTF-16 roles onto str1/str2 and tmp1/tmp2. tmp1 always
// carries data belonging to str1 and tmp2 data belonging to str2, which is
// what CALCULATE_DIFFERENCE relies on.
Register strU = isLU ? str2 : str1,
strL = isLU ? str1 : str2,
tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison
tmpL = isLU ? tmp1 : tmp2; // where to keep L for comparison
if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
// Load 4 bytes from strL to make sure main loop is 8-byte aligned
// cnt2 is >= 68 here, no need to check it for >= 0
__ lwu(tmpL, Address(strL));
__ addi(strL, strL, wordSize / 2);
__ load_long_misaligned(tmpU, Address(strU), tmp4, (base_offset % 8) != 0 ? 4 : 8);
__ addi(strU, strU, wordSize);
// Inflate the 4 Latin1 bytes (via tmp3) and compare them with the 4 UTF-16
// characters just loaded.
__ inflate_lo32(tmp3, tmpL);
__ mv(tmpL, tmp3);
__ xorr(tmp3, tmpU, tmpL);
__ bnez(tmp3, CALCULATE_DIFFERENCE);
__ subi(cnt2, cnt2, wordSize / 2); // 4 characters consumed
}
// we are now 8-bytes aligned on strL when AvoidUnalignedAccesses is true
// Bias cnt2 by -16 so that the sign of cnt2 tells whether another full
// 16-character iteration is available.
__ subi(cnt2, cnt2, wordSize * 2);
__ bltz(cnt2, TAIL);
__ bind(SMALL_LOOP); // smaller loop
// Each iteration compares 16 characters (two groups of 8).
__ subi(cnt2, cnt2, wordSize * 2);
compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
__ bgez(cnt2, SMALL_LOOP);
// Undo the bias: t0 == 0 means nothing is left to compare.
__ addi(t0, cnt2, wordSize * 2);
__ beqz(t0, DONE);
__ bind(TAIL); // 1..15 characters left
// If at least 8 characters remain, compare one more group of 8 here; the
// final 1..7 characters are handled at LOAD_LAST.
__ addi(t0, cnt2, wordSize);
__ addi(cnt2, cnt2, wordSize * 2); // amount of characters left to process
__ bltz(t0, LOAD_LAST);
// at least 8 characters remain, so we can do one compare_string_8_x_LU
compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
__ subi(cnt2, cnt2, wordSize);
__ beqz(cnt2, DONE); // no character left
__ bind(LOAD_LAST); // cnt2 = 1..7 characters left
// Step both pointers back so that they address the last 8 characters of
// each string, then compare those 8 characters in two groups of 4. The
// leading characters of that window have been compared before.
__ subi(cnt2, cnt2, wordSize); // cnt2 is now an offset in strL which points to last 8 bytes
__ slli(t0, cnt2, 1); // t0 is now an offset in strU which points to last 16 bytes
__ add(strL, strL, cnt2); // Address of last 8 bytes in Latin1 string
__ add(strU, strU, t0); // Address of last 16 bytes in UTF-16 string
__ load_int_misaligned(tmpL, Address(strL), t0, false);
__ load_long_misaligned(tmpU, Address(strU), t0, 2);
__ inflate_lo32(tmp3, tmpL);
__ mv(tmpL, tmp3);
__ xorr(tmp3, tmpU, tmpL);
__ bnez(tmp3, CALCULATE_DIFFERENCE);
__ addi(strL, strL, wordSize / 2); // Address of last 4 bytes in Latin1 string
__ addi(strU, strU, wordSize); // Address of last 8 bytes in UTF-16 string
__ load_int_misaligned(tmpL, Address(strL), t0, false);
__ load_long_misaligned(tmpU, Address(strU), t0, 2);
__ inflate_lo32(tmp3, tmpL);
__ mv(tmpL, tmp3);
__ xorr(tmp3, tmpU, tmpL);
__ bnez(tmp3, CALCULATE_DIFFERENCE);
__ j(DONE); // no character left
// Find the first different characters in the longwords and
// compute their difference.
// On entry tmp3 holds the XOR of the two compared words, and tmp1/tmp2 hold
// the (inflated) words belonging to str1/str2 respectively.
__ bind(CALCULATE_DIFFERENCE);
// count bits of trailing zero chars
__ ctzc_bits(tmp4, tmp3);
// Shift the first differing character down to bit 0 and isolate its 16 bits.
__ srl(tmp1, tmp1, tmp4);
__ srl(tmp2, tmp2, tmp4);
__ zext(tmp1, tmp1, 16);
__ zext(tmp2, tmp2, 16);
__ sub(result, tmp1, tmp2);
__ bind(DONE);
__ ret();
return entry;
}
// Generates the nmethod entry barrier stub.
//
// The stub calls BarrierSetNMethod::nmethod_stub_entry_barrier, passing the
// address one word above sp as it is right after enter(). If the call
// returns zero the stub simply returns to its caller. Otherwise it takes the
// deoptimization exit, which reloads sp, fp, ra and a jump target from the
// four stack words reserved before the call.
// NOTE(review): those four words are presumably filled in by the runtime
// call when it requests deoptimization - confirm.
address generate_method_entry_barrier() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, StubGenStubId::method_entry_barrier_id);

  Label deoptimize_label;

  address start = __ pc();

  BarrierSetAssembler* const barrier_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  const bool patches_instructions_and_data =
    barrier_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch;

  if (patches_instructions_and_data) {
    BarrierSetNMethod* const barrier_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    // The per-thread copy of the epoch is kept 4 bytes after the thread's
    // disarmed guard value.
    const int thread_epoch_offset = in_bytes(barrier_nm->thread_disarmed_guard_value_offset()) + 4;
    Address thread_epoch_addr(xthread, thread_epoch_offset);

    // Copy the current global patching epoch into the thread.
    __ la(t1, ExternalAddress(barrier_asm->patching_epoch_addr()));
    __ lwu(t1, t1);
    __ sw(t1, thread_epoch_addr);

    // Two schemes are possible:
    //  - The writer performed a system-wide icache shootdown after updating
    //    the instruction stream. Then nothing needs to be done here.
    //  - The writer trusts us to bring our own icache in sync before
    //    entering. Then use the cmodx fence (fence.i, may change).
    if (UseCtxFencei) {
      __ cmodx_fence();
    }
    __ membar(__ LoadLoad);
  }

  __ set_last_Java_frame(sp, fp, ra);

  __ enter();
  // Argument for the runtime call: one word above the current sp.
  __ addi(t1, sp, wordSize);

  // Reserve the four words read back on the deoptimization exit.
  __ subi(sp, sp, 4 * wordSize);

  __ push_call_clobbered_registers();

  __ mv(c_rarg0, t1);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);

  __ reset_last_Java_frame(true);

  // Keep the call's return value in t0 across the register restore.
  __ mv(t0, x10);

  __ pop_call_clobbered_registers();

  __ bnez(t0, deoptimize_label);

  // Zero result: ordinary return.
  __ leave();
  __ ret();

  __ BIND(deoptimize_label);

  // sp addresses the four reserved words here: new sp, fp, ra, jump target.
  __ ld(t0, Address(sp, 0));
  __ ld(fp, Address(sp, wordSize));
  __ ld(ra, Address(sp, wordSize * 2));
  __ ld(t1, Address(sp, wordSize * 3));

  __ mv(sp, t0);
  __ jr(t1);

  return start;
}
// Stub comparing two long strings that share an encoding. stub_id selects
// the variant: compare_long_string_LL_id (both Latin1, 1 byte per character)
// or compare_long_string_UU_id (both UTF-16, 2 bytes per character).
//
// Register usage:
// x10 = result
// x11 = str1
// x12 = cnt1 (count not needed; reused as a data register)
// x13 = str2
// x14 = cnt2 (number of characters to compare)
// x28 = tmp1 (on entry: the already loaded first 8 bytes of str1)
// x29 = tmp2 (on entry: the already loaded first 8 bytes of str2)
// x30 = tmp3
// x7  = tmp4 (saved and restored by this stub)
// x31 = tmp5 (saved and restored by this stub)
//
// When a mismatch is found, result is set to the difference between the
// first pair of differing characters (str1's minus str2's). When the compared
// data is equal, result is returned as it was on entry.
// NOTE(review): the LENGTH_DIFF label suggests the caller presets result to
// the length difference - confirm against the call site.
address generate_compare_long_string_same_encoding(StubGenStubId stub_id) {
bool isLL;
switch (stub_id) {
case compare_long_string_LL_id:
isLL = true;
break;
case compare_long_string_UU_id:
isLL = false;
break;
default:
ShouldNotReachHere();
};
__ align(CodeEntryAlignment);
StubCodeMark mark(this, stub_id);
address entry = __ pc();
Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL,
LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF;
const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
RegSet spilled_regs = RegSet::of(tmp4, tmp5);
// cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
// update cnt2 counter with already loaded 8 bytes
__ subi(cnt2, cnt2, wordSize / (isLL ? 1 : 2));
// update pointers, because of previous read
__ addi(str1, str1, wordSize);
__ addi(str2, str2, wordSize);
// less than 16 bytes left?
__ subi(cnt2, cnt2, isLL ? 16 : 8);
__ push_reg(spilled_regs, sp);
__ bltz(cnt2, TAIL);
__ bind(SMALL_LOOP);
// compare 16 bytes of strings with same encoding
// The loop is pipelined: tmp1/tmp2 hold the pair of words loaded earlier,
// which is compared while the next pair is loaded into tmp5/cnt1; that pair
// is in turn compared while tmp1/tmp2 are reloaded.
__ ld(tmp5, Address(str1));
__ addi(str1, str1, 8);
__ xorr(tmp4, tmp1, tmp2);
__ ld(cnt1, Address(str2));
__ addi(str2, str2, 8);
__ bnez(tmp4, DIFF); // mismatch within tmp1/tmp2
__ ld(tmp1, Address(str1));
__ addi(str1, str1, 8);
__ xorr(tmp4, tmp5, cnt1);
__ ld(tmp2, Address(str2));
__ addi(str2, str2, 8);
__ bnez(tmp4, DIFF2); // mismatch within tmp5/cnt1
__ subi(cnt2, cnt2, isLL ? 16 : 8);
__ bgez(cnt2, SMALL_LOOP);
__ bind(TAIL);
// Undo the 16-byte bias; cnt2 is again the number of characters that remain
// beyond the pair of words currently held in tmp1/tmp2.
__ addi(cnt2, cnt2, isLL ? 16 : 8);
__ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF);
__ subi(cnt2, cnt2, isLL ? 8 : 4);
__ blez(cnt2, CHECK_LAST);
// More than 8 bytes remain: check the pending pair and load one more pair.
__ xorr(tmp4, tmp1, tmp2);
__ bnez(tmp4, DIFF);
__ ld(tmp1, Address(str1));
__ addi(str1, str1, 8);
__ ld(tmp2, Address(str2));
__ addi(str2, str2, 8);
__ subi(cnt2, cnt2, isLL ? 8 : 4);
__ bind(CHECK_LAST);
// cnt2 is <= 0 here and is used as an offset that moves both pointers back
// so that the final 8-byte loads end exactly at the end of the data.
if (!isLL) {
__ add(cnt2, cnt2, cnt2); // now in bytes
}
__ xorr(tmp4, tmp1, tmp2);
__ bnez(tmp4, DIFF);
__ add(str1, str1, cnt2);
__ load_long_misaligned(tmp5, Address(str1), tmp3, isLL ? 1 : 2);
__ add(str2, str2, cnt2);
__ load_long_misaligned(cnt1, Address(str2), tmp3, isLL ? 1 : 2);
__ xorr(tmp4, tmp5, cnt1);
__ beqz(tmp4, LENGTH_DIFF);
// Find the first different characters in the longwords and
// compute their difference.
// DIFF2: the differing words are in tmp5 (str1) and cnt1 (str2), tmp4 holds
// their XOR.
__ bind(DIFF2);
// count bits of trailing zero chars
__ ctzc_bits(tmp3, tmp4, isLL);
__ srl(tmp5, tmp5, tmp3);
__ srl(cnt1, cnt1, tmp3);
if (isLL) {
__ zext(tmp5, tmp5, 8);
__ zext(cnt1, cnt1, 8);
} else {
__ zext(tmp5, tmp5, 16);
__ zext(cnt1, cnt1, 16);
}
__ sub(result, tmp5, cnt1);
__ j(LENGTH_DIFF);
// DIFF: the differing words are in tmp1 (str1) and tmp2 (str2), tmp4 holds
// their XOR.
__ bind(DIFF);
// count bits of trailing zero chars
__ ctzc_bits(tmp3, tmp4, isLL);
__ srl(tmp1, tmp1, tmp3);
__ srl(tmp2, tmp2, tmp3);
if (isLL) {
__ zext(tmp1, tmp1, 8);
__ zext(tmp2, tmp2, 8);
} else {
__ zext(tmp1, tmp1, 16);
__ zext(tmp2, tmp2, 16);
}
__ sub(result, tmp1, tmp2);
__ j(LENGTH_DIFF);
// Nothing remains beyond tmp1/tmp2: compare that last pair of words.
__ bind(LAST_CHECK_AND_LENGTH_DIFF);
__ xorr(tmp4, tmp1, tmp2);
__ bnez(tmp4, DIFF);
// Common exit: restore tmp4/tmp5 and return.
__ bind(LENGTH_DIFF);
__ pop_reg(spilled_regs, sp);
__ ret();
return entry;
}
// Generates the four long-string comparison stubs (LL, UU, LU, UL - in that
// order) and publishes their entry points in StubRoutines::riscv.
void generate_compare_long_strings() {
  // Same encoding on both sides.
  StubRoutines::riscv::_compare_long_string_LL
    = generate_compare_long_string_same_encoding(StubGenStubId::compare_long_string_LL_id);
  StubRoutines::riscv::_compare_long_string_UU
    = generate_compare_long_string_same_encoding(StubGenStubId::compare_long_string_UU_id);

  // Mixed encodings.
  StubRoutines::riscv::_compare_long_string_LU
    = generate_compare_long_string_different_encoding(StubGenStubId::compare_long_string_LU_id);
  StubRoutines::riscv::_compare_long_string_UL
    = generate_compare_long_string_different_encoding(StubGenStubId::compare_long_string_UL_id);
}
// string_indexof_linear stub. stub_id selects the encodings:
//   string_indexof_linear_ll_id - needle Latin1, haystack Latin1
//   string_indexof_linear_ul_id - needle Latin1, haystack UTF-16
//   string_indexof_linear_uu_id - needle UTF-16, haystack UTF-16
//
// x10 result
// x11 src (haystack)
// x12 src count (haystack_len)
// x13 pattern (needle)
// x14 pattern count (needle_len)
//
// x20-x25 and x28-x29 are saved on entry and restored before returning.
// When no match is found result is set to -1. Otherwise result is the value
// produced by the index arithmetic below, which only ever adjusts result
// relative to its incoming value.
// NOTE(review): presumably the caller passes result == 0 - confirm.
address generate_string_indexof_linear(StubGenStubId stub_id)
{
bool needle_isL;
bool haystack_isL;
switch (stub_id) {
case string_indexof_linear_ll_id:
needle_isL = true;
haystack_isL = true;
break;
case string_indexof_linear_ul_id:
needle_isL = true;
haystack_isL = false;
break;
case string_indexof_linear_uu_id:
needle_isL = false;
haystack_isL = false;
break;
default:
ShouldNotReachHere();
};
__ align(CodeEntryAlignment);
StubCodeMark mark(this, stub_id);
address entry = __ pc();
// Character sizes in bytes and the matching log2 shift amounts.
int needle_chr_size = needle_isL ? 1 : 2;
int haystack_chr_size = haystack_isL ? 1 : 2;
int needle_chr_shift = needle_isL ? 0 : 1;
int haystack_chr_shift = haystack_isL ? 0 : 1;
bool isL = needle_isL && haystack_isL;
// parameters
Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14;
// temporary registers
Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25;
// redefinitions
Register ch1 = x28, ch2 = x29;
RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29);
__ push_reg(spilled_regs, sp);
Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
// Load the first 8 bytes of the needle and of the haystack.
__ ld(ch1, Address(needle));
__ ld(ch2, Address(haystack));
// src.length - pattern.length
__ sub(haystack_len, haystack_len, needle_len);
// first is needle[0]
__ zext(first, ch1, needle_isL ? 8 : 16);
uint64_t mask0101 = UCONST64(0x0101010101010101);
uint64_t mask0001 = UCONST64(0x0001000100010001);
__ mv(mask1, haystack_isL ? mask0101 : mask0001);
// Replicate needle[0] into every character position of a 64-bit word.
__ mul(first, first, mask1);
uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
__ mv(mask2, haystack_isL ? mask7f7f : mask7fff);
if (needle_isL != haystack_isL) {
// Keep the raw Latin1 needle word; it is inflated into ch1 further down.
__ mv(tmp, ch1);
}
__ subi(haystack_len, haystack_len, wordSize / haystack_chr_size - 1);
__ blez(haystack_len, L_SMALL);
if (needle_isL != haystack_isL) {
__ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
}
// xorr, sub, orr, notr, andr
// compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i]
// eg:
// first: aa aa aa aa aa aa aa aa
// ch2: aa aa li nx jd ka aa aa
// match_mask: 80 80 00 00 00 00 80 80
__ compute_match_mask(ch2, first, match_mask, mask1, mask2);
// search first char of needle, if success, goto L_HAS_ZERO;
__ bnez(match_mask, L_HAS_ZERO);
// No candidate in the first word: move on to the next 8 haystack bytes.
__ subi(haystack_len, haystack_len, wordSize / haystack_chr_size);
__ addi(result, result, wordSize / haystack_chr_size);
__ addi(haystack, haystack, wordSize);
__ bltz(haystack_len, L_POST_LOOP);
// Main loop: scan the haystack 8 bytes at a time for needle[0].
__ bind(L_LOOP);
__ ld(ch2, Address(haystack));
__ compute_match_mask(ch2, first, match_mask, mask1, mask2);
__ bnez(match_mask, L_HAS_ZERO);
__ bind(L_LOOP_PROCEED);
__ subi(haystack_len, haystack_len, wordSize / haystack_chr_size);
__ addi(haystack, haystack, wordSize);
__ addi(result, result, wordSize / haystack_chr_size);
__ bgez(haystack_len, L_LOOP);
__ bind(L_POST_LOOP);
__ mv(ch2, -wordSize / haystack_chr_size);
__ ble(haystack_len, ch2, NOMATCH); // no extra characters to check
__ ld(ch2, Address(haystack));
// Turn the (negative) remaining count into a bit count used below to mask
// off the character positions that must not be considered.
__ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
__ neg(haystack_len, haystack_len);
// First steps of the match-mask computation (xorr, sub, orr); the notr/andr
// steps are shared with the small case at L_SMALL_PROCEED.
__ xorr(ch2, first, ch2);
__ sub(match_mask, ch2, mask1);
__ orr(ch2, ch2, mask2);
__ mv(trailing_zeros, -1); // all bits set
__ j(L_SMALL_PROCEED);
__ align(OptoLoopAlignment);
// Small case: reached when haystack_len is <= 0 after the adjustments above,
// so only (part of) the first loaded haystack word is searched.
__ bind(L_SMALL);
__ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
__ neg(haystack_len, haystack_len);
if (needle_isL != haystack_isL) {
__ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
}
__ xorr(ch2, first, ch2);
__ sub(match_mask, ch2, mask1);
__ orr(ch2, ch2, mask2);
__ mv(trailing_zeros, -1); // all bits set
__ bind(L_SMALL_PROCEED);
__ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits.
__ notr(ch2, ch2);
__ andr(match_mask, match_mask, ch2);
__ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check
__ beqz(match_mask, NOMATCH);
// Try each candidate position recorded in match_mask in turn.
__ bind(L_SMALL_HAS_ZERO_LOOP);
// count bits of trailing zero chars
__ ctzc_bits(trailing_zeros, match_mask, haystack_isL, ch2, tmp);
__ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
__ mv(ch2, wordSize / haystack_chr_size);
// A needle that fits into one word is decided by a single word compare.
__ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2);
__ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
__ mv(trailing_zeros, wordSize / haystack_chr_size);
__ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
// Compare the remaining needle characters one at a time; trailing_zeros is
// reused as the character index.
__ bind(L_SMALL_CMP_LOOP);
__ shadd(first, trailing_zeros, needle, first, needle_chr_shift);
__ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first));
haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
__ addi(trailing_zeros, trailing_zeros, 1);
__ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP);
__ beq(first, ch2, L_SMALL_CMP_LOOP);
__ bind(L_SMALL_CMP_LOOP_NOMATCH);
__ beqz(match_mask, NOMATCH);
// count bits of trailing zero chars
__ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
__ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
__ addi(result, result, 1);
__ addi(haystack, haystack, haystack_chr_size);
__ j(L_SMALL_HAS_ZERO_LOOP);
__ align(OptoLoopAlignment);
__ bind(L_SMALL_CMP_LOOP_LAST_CMP);
__ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH);
__ j(DONE);
__ align(OptoLoopAlignment);
__ bind(L_SMALL_CMP_LOOP_LAST_CMP2);
__ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
__ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
__ j(DONE);
__ align(OptoLoopAlignment);
// Main-loop case: at least one candidate position in the current word.
__ bind(L_HAS_ZERO);
// count bits of trailing zero chars
__ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
__ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
// Pack needle_len into the upper 32 bits of haystack_len so that the
// needle_len register can be used as a temporary below.
__ slli(needle_len, needle_len, BitsPerByte * wordSize / 2);
__ orr(haystack_len, haystack_len, needle_len); // restore needle_len(32bits)
__ subi(result, result, 1); // array index from 0, so result -= 1
__ bind(L_HAS_ZERO_LOOP);
__ mv(needle_len, wordSize / haystack_chr_size);
__ srli(ch2, haystack_len, BitsPerByte * wordSize / 2); // ch2 = needle length
__ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2);
// load next 8 bytes from haystack, and increase result index
__ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
__ addi(result, result, 1);
__ mv(trailing_zeros, wordSize / haystack_chr_size);
__ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
// compare one char
__ bind(L_CMP_LOOP);
__ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift);
needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len));
__ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
__ addi(trailing_zeros, trailing_zeros, 1); // next char index
__ srli(tmp, haystack_len, BitsPerByte * wordSize / 2); // tmp = needle length
__ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP);
__ beq(needle_len, ch2, L_CMP_LOOP);
__ bind(L_CMP_LOOP_NOMATCH);
__ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH);
// count bits of trailing zero chars
__ ctzc_bits(trailing_zeros, match_mask, haystack_isL, needle_len, ch2);
__ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
__ addi(haystack, haystack, haystack_chr_size);
__ j(L_HAS_ZERO_LOOP);
__ align(OptoLoopAlignment);
__ bind(L_CMP_LOOP_LAST_CMP);
__ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH);
__ j(DONE);
__ align(OptoLoopAlignment);
__ bind(L_CMP_LOOP_LAST_CMP2);
__ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
__ addi(result, result, 1);
__ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
__ j(DONE);
__ align(OptoLoopAlignment);
__ bind(L_HAS_ZERO_LOOP_NOMATCH);
// 1) Restore "result" index. Index was wordSize/haystack_chr_size * N until
// L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
// so, result was increased at max by wordSize/haystack_chr_size - 1, so,
// respective high bit wasn't changed. L_LOOP_PROCEED will increase
// result by analyzed characters value, so, we can just reset lower bits
// in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
// 2) restore needle_len and haystack_len values from "compressed" haystack_len
// 3) advance haystack value to represent next haystack octet. result & 7/3 is
// index of last analyzed substring inside current octet. So, haystack is at
// respective start address. We need to advance it to next octet
__ andi(match_mask, result, wordSize / haystack_chr_size - 1);
__ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2);
__ andi(result, result, haystack_isL ? -8 : -4);
__ slli(tmp, match_mask, haystack_chr_shift);
__ sub(haystack, haystack, tmp);
__ sext(haystack_len, haystack_len, 32);
__ j(L_LOOP_PROCEED);
__ align(OptoLoopAlignment);
__ bind(NOMATCH);
__ mv(result, -1);
// Common exit: restore the spilled registers and return.
__ bind(DONE);
__ pop_reg(spilled_regs, sp);
__ ret();
return entry;
}
// Generates the string_indexof_linear stubs (ll, uu, ul - in that order) and
// publishes their entry points in StubRoutines::riscv.
void generate_string_indexof_stubs() {
  StubRoutines::riscv::_string_indexof_linear_ll
    = generate_string_indexof_linear(StubGenStubId::string_indexof_linear_ll_id);
  StubRoutines::riscv::_string_indexof_linear_uu
    = generate_string_indexof_linear(StubGenStubId::string_indexof_linear_uu_id);
  StubRoutines::riscv::_string_indexof_linear_ul
    = generate_string_indexof_linear(StubGenStubId::string_indexof_linear_ul_id);
}
#ifdef COMPILER2
// Generates one lookup stub per secondary-supers table slot
// (Klass::SECONDARY_SUPERS_TABLE_SIZE of them, all inside a single
// StubCodeMark) and records each entry point in
// StubRoutines::_lookup_secondary_supers_table_stubs[slot].
//
// Registers handed to lookup_secondary_supers_table_const:
//   x10 - r_super_klass
//   x11 - r_array_base
//   x12 - r_array_length
//   x13 - r_array_index
//   x14 - r_sub_klass
//   x15 - result
//   x16 - r_bitmap
void generate_lookup_secondary_supers_table_stub() {
  StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id;
  StubCodeMark mark(this, stub_id);

  const Register
    r_super_klass  = x10,
    r_array_base   = x11,
    r_array_length = x12,
    r_array_index  = x13,
    r_sub_klass    = x14,
    result         = x15,
    r_bitmap       = x16;

  for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
    // Entry point of the stub specialized for this slot.
    StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
    __ enter();
    __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, result,
                                           r_array_base, r_array_length, r_array_index,
                                           r_bitmap, slot, /*stub_is_near*/true);
    __ leave();
    __ ret();
  }
}
// Slow path implementation for UseSecondarySupersTable.
//
// The stub body is a single call to
// MacroAssembler::lookup_secondary_supers_table_slow_path followed by a
// return. Returns the stub's entry address.
address generate_lookup_secondary_supers_table_slow_path_stub() {
  StubCodeMark mark(this, StubGenStubId::lookup_secondary_supers_table_slow_path_id);

  const Register r_super_klass = x10;  // argument
  const Register r_array_base  = x11;  // argument
  const Register temp1         = x12;  // tmp
  const Register r_array_index = x13;  // argument
  const Register result        = x15;  // argument
  const Register r_bitmap      = x16;  // argument

  address start = __ pc();
  __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
  __ ret();

  return start;
}
// Generates the mulAdd stub: a frame set-up around MacroAssembler::mul_add.
//
// Register assignment passed to mul_add:
//   x10 - out
//   x11 - in
//   x12 - offset
//   x13 - len
//   x14 - k
//   x28 - temporary
address generate_mulAdd() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, StubGenStubId::mulAdd_id);

  const Register out    = x10;
  const Register in     = x11;
  const Register offset = x12;
  const Register len    = x13;
  const Register k      = x14;
  const Register tmp    = x28;

  address entry = __ pc();

  BLOCK_COMMENT("Entry:");
  __ enter();
  __ mul_add(out, in, offset, len, k, tmp);
  __ leave();
  __ ret();

  return entry;
}
/**
 * Generates the multiplyToLen stub: a frame set-up around
 * MacroAssembler::multiply_to_len.
 *
 * Arguments:
 *
 * Input:
 *   c_rarg0   - x address
 *   c_rarg1   - x length
 *   c_rarg2   - y address
 *   c_rarg3   - y length
 *   c_rarg4   - z address
 *
 * x15-x17, x7 and x28-x31 are handed to multiply_to_len as temporaries.
 */
address generate_multiplyToLen() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, StubGenStubId::multiplyToLen_id);

  // Operands.
  const Register x    = x10;
  const Register xlen = x11;
  const Register y    = x12;
  const Register ylen = x13;
  const Register z    = x14;

  // Temporaries.
  const Register tmp0 = x15;
  const Register tmp1 = x16;
  const Register tmp2 = x17;
  const Register tmp3 = x7;
  const Register tmp4 = x28;
  const Register tmp5 = x29;
  const Register tmp6 = x30;
  const Register tmp7 = x31;

  address entry = __ pc();

  BLOCK_COMMENT("Entry:");
  __ enter(); // required for proper stackwalking of RuntimeStub frame
  __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret();

  return entry;
}
// Generates the squareToLen stub. Squaring is implemented by copying the
// x operand (address and length) into the y operand registers and then
// running MacroAssembler::multiply_to_len on x and y.
//
// Register assignment:
//   x10 - x address
//   x11 - x length
//   x12 - z address
//   x13 - zlen, unused as an input; handed to multiply_to_len as a temporary
//   x14 - y (copy of x)
//   x15 - ylen (copy of xlen)
//   x16, x17, x7, x28-x31 - further temporaries
address generate_squareToLen() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, StubGenStubId::squareToLen_id);

  // Operands.
  const Register x    = x10;
  const Register xlen = x11;
  const Register z    = x12;
  const Register y    = x14;  // == x
  const Register ylen = x15;  // == xlen

  // Temporaries.
  const Register tmp0 = x13;  // zlen, unused
  const Register tmp1 = x16;
  const Register tmp2 = x17;
  const Register tmp3 = x7;
  const Register tmp4 = x28;
  const Register tmp5 = x29;
  const Register tmp6 = x30;
  const Register tmp7 = x31;

  address entry = __ pc();

  BLOCK_COMMENT("Entry:");
  __ enter();
  // Second operand := first operand.
  __ mv(y, x);
  __ mv(ylen, xlen);
  __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  __ leave();
  __ ret();

  return entry;
}
// Generates the vectorized BigInteger left-shift worker stub.
//
// For each of the numIter 32-bit elements, the emitted loop stores
//   (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount))
// to the destination, walking both arrays upwards one vector at a time.
//
// Arguments:
//
// Input:
//   c_rarg0   - newArr address
//   c_rarg1   - oldArr address
//   c_rarg2   - newIdx
//   c_rarg3   - shiftCount
//   c_rarg4   - numIter
//
// c_rarg5, t0, t1 and the vector register groups starting at v0 and v4 are
// used as temporaries.
//
address generate_bigIntegerLeftShift() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, StubGenStubId::bigIntegerLeftShiftWorker_id);
  address entry = __ pc();

  Label L_loop, L_done;

  const Register newArr        = c_rarg0;
  const Register oldArr        = c_rarg1;
  const Register newIdx        = c_rarg2;
  const Register shiftCount    = c_rarg3;
  const Register numIter       = c_rarg4;

  const Register shiftRevCount = c_rarg5;
  const Register oldArrNext    = t1;

  // Nothing to do for an empty range.
  __ beqz(numIter, L_done);

  // Destination starts newIdx 4-byte elements into newArr.
  __ shadd(newArr, newIdx, newArr, t0, 2);

  // shiftRevCount = 32 - shiftCount
  __ mv(shiftRevCount, 32);
  __ sub(shiftRevCount, shiftRevCount, shiftCount);

  __ bind(L_loop);
  // oldArrNext addresses the element following oldArr's current one.
  __ addi(oldArrNext, oldArr, 4);
  // t0 = number of elements handled in this pass.
  __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4);
  __ vle32_v(v0, oldArr);
  __ vle32_v(v4, oldArrNext);
  __ vsll_vx(v0, v0, shiftCount);
  __ vsrl_vx(v4, v4, shiftRevCount);
  __ vor_vv(v0, v0, v4);
  __ vse32_v(v0, newArr);
  // Account for the elements just processed and advance both arrays by
  // t0 elements (t1 is free to serve as scratch here; oldArrNext is
  // recomputed at the top of the loop).
  __ sub(numIter, numIter, t0);
  __ shadd(oldArr, t0, oldArr, t1, 2);
  __ shadd(newArr, t0, newArr, t1, 2);
  __ bnez(numIter, L_loop);

  __ bind(L_done);
  __ ret();

  return entry;
}
// Generates the vectorized BigInteger right-shift worker stub.
//
// For each of the numIter 32-bit elements, the emitted loop stores
//   (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount))
// to the destination. Elements are processed one vector at a time from the
// highest index downwards: the remaining count doubles as the start index of
// the next chunk.
//
// Arguments:
//
// Input:
//   c_rarg0   - newArr address
//   c_rarg1   - oldArr address
//   c_rarg2   - newIdx
//   c_rarg3   - shiftCount
//   c_rarg4   - numIter
//
// c_rarg5, c_rarg6, t0, t1 and the vector register groups starting at v0 and
// v4 are used as temporaries.
//
address generate_bigIntegerRightShift() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, StubGenStubId::bigIntegerRightShiftWorker_id);
  address entry = __ pc();

  Label L_loop, L_done;

  const Register newArr        = c_rarg0;
  const Register oldArr        = c_rarg1;
  const Register newIdx        = c_rarg2;
  const Register shiftCount    = c_rarg3;
  const Register numIter       = c_rarg4;
  const Register idx           = numIter;  // remaining count / chunk start index

  const Register shiftRevCount = c_rarg5;
  const Register oldArrNext    = c_rarg6;
  const Register newArrCur     = t0;
  const Register oldArrCur     = t1;

  // Nothing to do for an empty range.
  __ beqz(idx, L_done);

  // Destination starts newIdx 4-byte elements into newArr.
  __ shadd(newArr, newIdx, newArr, t0, 2);

  // shiftRevCount = 32 - shiftCount
  __ mv(shiftRevCount, 32);
  __ sub(shiftRevCount, shiftRevCount, shiftCount);

  __ bind(L_loop);
  // t0 = number of elements handled in this pass; idx then becomes the index
  // of the first element of this chunk.
  __ vsetvli(t0, idx, Assembler::e32, Assembler::m4);
  __ sub(idx, idx, t0);

  // Chunk addresses (t0 is overwritten here, after its last use as the
  // element count).
  __ shadd(oldArrNext, idx, oldArr, t1, 2);
  __ shadd(newArrCur, idx, newArr, t1, 2);
  __ addi(oldArrCur, oldArrNext, 4);

  __ vle32_v(v0, oldArrCur);
  __ vle32_v(v4, oldArrNext);
  __ vsrl_vx(v0, v0, shiftCount);
  __ vsll_vx(v4, v4, shiftRevCount);
  __ vor_vv(v0, v0, v4);
  __ vse32_v(v0, newArrCur);
  __ bnez(idx, L_loop);

  __ bind(L_done);
  __ ret();

  return entry;
}
#endif
#ifdef COMPILER2
class MontgomeryMultiplyGenerator : public MacroAssembler {
Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj;
RegSet _toSave;
bool _squaring;
public:
// Binds every symbolic register used by the Montgomery stubs to a physical
// register. `as` supplies the code buffer that this generator emits into.
// `squaring` selects the squaring argument layout: b then aliases a and does
// not consume an argument register of its own, so all later registers shift
// down by one.
MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
: MacroAssembler(as->code()), _squaring(squaring) {
// Register allocation
// Registers are handed out in order from the range x10..x26.
RegSetIterator<Register> regs = RegSet::range(x10, x26).begin();
Pa_base = *regs; // Argument registers
if (squaring) {
Pb_base = Pa_base;
} else {
Pb_base = *++regs;
}
Pn_base = *++regs;
Rlen= *++regs;
inv = *++regs;
Pm_base = *++regs;
// Working registers:
Ra = *++regs; // The current digit of a, b, n, and m.
Rb = *++regs;
Rm = *++regs;
Rn = *++regs;
Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m.
Pb = *++regs;
Pm = *++regs;
Pn = *++regs;
tmp0 = *++regs; // Three registers which form a
tmp1 = *++regs; // triple-precision accumulator.
tmp2 = *++regs;
Ri = x6; // Inner and outer loop indexes.
Rj = x7;
Rhi_ab = x28; // Product registers: low and high parts
Rlo_ab = x29; // of a*b and m*n.
Rhi_mn = x30;
Rlo_mn = x31;
// x18 and up are callee-saved.
// *regs is tmp2 at this point, the last register handed out above. Pm_base
// is saved as well because the generated code repoints it at a scratch
// buffer and needs the caller's value back at the end.
_toSave = RegSet::range(x18, *regs) + Pm_base;
}
private:
// Emits a push of the registers in _toSave onto the stack addressed by sp.
void save_regs() {
push_reg(_toSave, sp);
}
// Emits a pop of the registers in _toSave from the stack addressed by sp;
// the counterpart of save_regs().
void restore_regs() {
pop_reg(_toSave, sp);
}
// Emits a loop that runs `block` exactly `count` times at run time, with the
// body unrolled twice.
//
// `block` is a pointer to a no-argument member function of this generator; it
// is invoked twice at generation time to emit the two copies of the body.
// A zero count skips the loop entirely. An odd count enters at the second
// copy, so the first pass executes the body only once.
//
// Clobbers t0 and counts `count` down (it ends at 0 or -1).
template <typename T>
void unroll_2(Register count, T block) {
Label loop, end, odd;
beqz(count, end);
// Bit 0 of count set => odd trip count => start at the second copy.
test_bit(t0, count, 0);
bnez(t0, odd);
align(16);
bind(loop);
(this->*block)();
bind(odd);
(this->*block)();
subi(count, count, 2);
bgtz(count, loop);
bind(end);
}
// Variant of unroll_2 for bodies that take three register arguments.
//
// Emits a loop that runs `block(d, s, tmp)` exactly `count` times at run
// time, unrolled twice. A zero count skips the loop; an odd count enters at
// the second copy of the body.
//
// `tmp` is used for the parity test before being handed to the body, so t0 is
// not touched by this helper itself. `count` is counted down (ending at 0 or
// -1).
template <typename T>
void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
Label loop, end, odd;
beqz(count, end);
// Bit 0 of count set => odd trip count => start at the second copy.
test_bit(tmp, count, 0);
bnez(tmp, odd);
align(16);
bind(loop);
(this->*block)(d, s, tmp);
bind(odd);
(this->*block)(d, s, tmp);
subi(count, count, 2);
bgtz(count, loop);
bind(end);
}
// Emits the set-up for one iteration of the first outer loop
// (0 <= i < len): points Pa/Pm at the start of a/m and Pb/Pn at digit i of
// b/n, preloads the first digit from each pointer and clears the pending m*n
// product.
//
// `i` is the outer loop index, given either in a register or as a constant.
// It is scaled to a byte offset in t0, which is therefore clobbered.
void pre1(RegisterOrConstant i) {
block_comment("pre1");
// Pa = Pa_base;
// Pb = Pb_base + i;
// Pm = Pm_base;
// Pn = Pn_base + i;
// Ra = *Pa;
// Rb = *Pb;
// Rm = *Pm;
// Rn = *Pn;
// t0 = i * BytesPerWord
if (i.is_register()) {
slli(t0, i.as_register(), LogBytesPerWord);
} else {
mv(t0, i.as_constant());
slli(t0, t0, LogBytesPerWord);
}
mv(Pa, Pa_base);
add(Pb, Pb_base, t0);
mv(Pm, Pm_base);
add(Pn, Pn_base, t0);
ld(Ra, Address(Pa));
ld(Rb, Address(Pb));
ld(Rm, Address(Pm));
ld(Rn, Address(Pn));
// Zero the m*n result.
mv(Rhi_mn, zr);
mv(Rlo_mn, zr);
}
// The core multiply-accumulate step of a Montgomery
// multiplication. The idea is to schedule operations as a
// pipeline so that instructions with long latencies (loads and
// multiplies) have time to complete before their results are
// used. This most benefits in-order implementations of the
// architecture but out-of-order ones also benefit.
//
// Each product is accumulated one half-step after it is computed: the m*n
// product left pending by the previous step is added while a*b is in flight,
// and a*b is added at the end while the new m*n is in flight. The new m*n
// product is left pending in Rhi_mn:Rlo_mn for whatever is emitted next.
// Clobbers t0 (carry scratch of acc()).
void step() {
block_comment("step");
// MACC(Ra, Rb, tmp0, tmp1, tmp2);
// Ra = *++Pa;
// Rb = *--Pb;
mulhu(Rhi_ab, Ra, Rb);
mul(Rlo_ab, Ra, Rb);
addi(Pa, Pa, wordSize);
ld(Ra, Address(Pa));
subi(Pb, Pb, wordSize);
ld(Rb, Address(Pb));
acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the
// previous iteration.
// MACC(Rm, Rn, tmp0, tmp1, tmp2);
// Rm = *++Pm;
// Rn = *--Pn;
mulhu(Rhi_mn, Rm, Rn);
mul(Rlo_mn, Rm, Rn);
addi(Pm, Pm, wordSize);
ld(Rm, Address(Pm));
subi(Pn, Pn, wordSize);
ld(Rn, Address(Pn));
acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
}
// Emits the tail of one iteration of the first outer loop of the multiply
// stub: accumulates the final a*b product and the pending m*n product,
// computes and stores the new digit m[i] = tmp0 * inv at Pm, accumulates
// m[i] * n[0], and finally shifts the triple-precision accumulator down by one
// digit (tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0). Clobbers t0.
void post1() {
block_comment("post1");
// MACC(Ra, Rb, tmp0, tmp1, tmp2);
// (Only the multiply-accumulate is emitted here; unlike step(), no further
// digits of a and b are loaded.)
mulhu(Rhi_ab, Ra, Rb);
mul(Rlo_ab, Ra, Rb);
acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n
acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
// *Pm = Rm = tmp0 * inv;
mul(Rm, tmp0, inv);
sd(Rm, Address(Pm));
// MACC(Rm, Rn, tmp0, tmp1, tmp2);
// tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
mulhu(Rhi_mn, Rm, Rn);
#ifndef PRODUCT
// assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
{
mul(Rlo_mn, Rm, Rn);
add(Rlo_mn, tmp0, Rlo_mn);
Label ok;
beqz(Rlo_mn, ok);
stop("broken Montgomery multiply");
bind(ok);
}
#endif
// We have very carefully set things up so that
// m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
// the lower half of Rm * Rn because we know the result already:
// it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff
// tmp0 != 0. So, rather than do a mul and a cad we just set
// the carry flag iff tmp0 is nonzero.
//
// mul(Rlo_mn, Rm, Rn);
// cad(zr, tmp0, Rlo_mn);
// t0 = (tmp0 - 1 <u tmp0), which is 1 exactly when tmp0 != 0.
subi(t0, tmp0, 1);
sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
cadc(tmp0, tmp1, Rhi_mn, t0);
adc(tmp1, tmp2, zr, t0);
mv(tmp2, zr);
}
// Emits the set-up for one iteration of the second outer loop
// (len <= i < 2*len): positions Pa/Pm at digit (i - len) of a/m and Pb/Pn at
// digit len of b/n, then performs the pre-increment / pre-decrement loads of
// the first digits and clears the pending m*n product.
//
// `i` is the outer loop index and `len` the length in 64-bit words.
// Rj is overwritten with i - len; Ra is used as a temporary for the byte
// offsets before it receives its loaded digit.
void pre2(Register i, Register len) {
block_comment("pre2");
// Pa = Pa_base + i-len;
// Pb = Pb_base + len;
// Pm = Pm_base + i-len;
// Pn = Pn_base + len;
sub(Rj, i, len);
// Rj == i-len
// Ra as temp register
slli(Ra, Rj, LogBytesPerWord);
add(Pa, Pa_base, Ra);
add(Pm, Pm_base, Ra);
slli(Ra, len, LogBytesPerWord);
add(Pb, Pb_base, Ra);
add(Pn, Pn_base, Ra);
// Ra = *++Pa;
// Rb = *--Pb;
// Rm = *++Pm;
// Rn = *--Pn;
addi(Pa, Pa, wordSize);
ld(Ra, Address(Pa));
subi(Pb, Pb, wordSize);
ld(Rb, Address(Pb));
addi(Pm, Pm, wordSize);
ld(Rm, Address(Pm));
subi(Pn, Pn, wordSize);
ld(Rn, Address(Pn));
// Zero the pending m*n result.
mv(Rhi_mn, zr);
mv(Rlo_mn, zr);
}
// Emits the tail of one iteration of the second outer loop: folds the pending
// m*n product into the accumulator, stores the now-final least significant
// accumulator digit to Pm_base[i - len], and shifts the accumulator down by
// one digit (tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0).
//
// `i` is the outer loop index and `len` the length in 64-bit words.
// Clobbers Rj (address temporary) and t0 (carry).
void post2(Register i, Register len) {
block_comment("post2");
sub(Rj, i, len);
cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part
// As soon as we know the least significant digit of our result,
// store it.
// Pm_base[i-len] = tmp0;
// Rj as temp register
slli(Rj, Rj, LogBytesPerWord);
add(Rj, Pm_base, Rj);
sd(tmp0, Address(Rj));
// tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part
adc(tmp1, tmp2, zr, t0);
mv(tmp2, zr);
}
// A carry in tmp0 after Montgomery multiplication means that we
// should subtract multiples of n from our result in m. We'll
// keep doing that until there is no carry.
//
// `len` is the length of m and n in 64-bit words. Each pass of the outer
// loop subtracts n from m in place, one word at a time, computing
// m[i] + ~n[i] + carry with the carry kept in t0 (initially 1, i.e. no
// borrow). After a pass tmp0 becomes tmp0 - 1 + carry-out.
//
// Clobbers Rm, Rn, t0, tmp1 (word counter) and tmp2 (word index).
void normalize(Register len) {
block_comment("normalize");
// while (tmp0)
// tmp0 = sub(Pm_base, Pn_base, tmp0, len);
Label loop, post, again;
Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now
beqz(tmp0, post); {
bind(again); {
mv(i, zr);
mv(cnt, len);
// Preload m[0] and n[0].
slli(Rn, i, LogBytesPerWord);
add(Rm, Pm_base, Rn);
ld(Rm, Address(Rm));
add(Rn, Pn_base, Rn);
ld(Rn, Address(Rn));
mv(t0, 1); // set carry flag, i.e. no borrow
align(16);
bind(loop); {
// Rm = m[i] + carry + ~n[i]; the new carry is (Rm <u ~n[i]).
notr(Rn, Rn);
add(Rm, Rm, t0);
add(Rm, Rm, Rn);
sltu(t0, Rm, Rn);
slli(Rn, i, LogBytesPerWord); // Rn as temp register
add(Rn, Pm_base, Rn);
sd(Rm, Address(Rn));
addi(i, i, 1);
// Load m[i] and n[i] for the next iteration. (On the final iteration this
// reads the word at index len of each array.)
slli(Rn, i, LogBytesPerWord);
add(Rm, Pm_base, Rn);
ld(Rm, Address(Rm));
add(Rn, Pn_base, Rn);
ld(Rn, Address(Rn));
subi(cnt, cnt, 1);
} bnez(cnt, loop);
// tmp0 = tmp0 - 1 + carry, i.e. tmp0 is decremented when the pass borrowed.
subi(tmp0, tmp0, 1);
add(tmp0, tmp0, t0);
} bnez(tmp0, again);
} bind(post);
}
// Move memory at s to d, reversing words.
// Increments d to end of copied memory
// Destroys tmp1, tmp2 (and t0, which reverse1 passes to ror as scratch)
// Preserves len
// Leaves s pointing to the address which was in d at start
//
// `len` is a count of 64-bit words. The copy walks s backwards from
// s + len words while d walks forwards, and each 64-bit word additionally has
// its two 32-bit halves swapped (see reverse1).
void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
// The asserts reject x28 and above as temporaries (x28..x31 are the product
// registers of this generator).
assert(tmp1->encoding() < x28->encoding(), "register corruption");
assert(tmp2->encoding() < x28->encoding(), "register corruption");
// s += len words: start from the end of the source.
shadd(s, len, s, tmp1, LogBytesPerWord);
// Count down a copy of len so that len itself is preserved.
mv(tmp1, len);
unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
// s = d - len words, i.e. the value d had on entry.
slli(tmp1, len, LogBytesPerWord);
sub(s, d, tmp1);
}
// Copies one 64-bit word for reverse(): pre-decrements s by one word, loads
// the word, rotates it by 32 bits so that its two 32-bit halves swap places
// ([63...0] -> [31...0][63...32]), stores it at d and post-increments d by one
// word. Clobbers tmp and t0 (scratch for ror).
void reverse1(Register d, Register s, Register tmp) {
subi(s, s, wordSize);
ld(tmp, Address(s));
ror(tmp, tmp, 32, t0);
sd(tmp, Address(d));
addi(d, d, wordSize);
}
// Inner-loop step used by the squaring stub: a regular step() followed by a
// second accumulation of the same a*b product, so that product is added twice.
void step_squaring() {
// An extra ACC
step();
acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
}
// Emits a run-time conditional multiply-accumulate for the squaring stub:
// when `i` (the outer loop index) is even, Ra * Rb is added to the
// accumulator once; when `i` is odd nothing is done. Clobbers t0.
void last_squaring(Register i) {
Label dont;
// if ((i & 1) == 0) {
test_bit(t0, i, 0);
bnez(t0, dont); {
// MACC(Ra, Rb, tmp0, tmp1, tmp2);
// (Only the multiply-accumulate is emitted; no further digits are loaded.)
mulhu(Rhi_ab, Ra, Rb);
mul(Rlo_ab, Ra, Rb);
acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
} bind(dont);
}
// The m*n half of step(), used by the squaring stub for the iterations in
// which no a*b product is needed: accumulates the pending m*n product, starts
// the next one, and advances Pm / retreats Pn while loading the next digits.
// Clobbers t0 (carry scratch of acc()).
void extra_step_squaring() {
acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n
// MACC(Rm, Rn, tmp0, tmp1, tmp2);
// Rm = *++Pm;
// Rn = *--Pn;
mulhu(Rhi_mn, Rm, Rn);
mul(Rlo_mn, Rm, Rn);
addi(Pm, Pm, wordSize);
ld(Rm, Address(Pm));
subi(Pn, Pn, wordSize);
ld(Rn, Address(Pn));
}
// Squaring counterpart of post1(): accumulates the pending m*n product,
// computes and stores the new digit m[i] = tmp0 * inv at Pm, accumulates
// m[i] * n[0], and shifts the triple-precision accumulator down by one digit
// (tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0). Unlike post1() it emits no a*b
// multiply-accumulate. Clobbers t0.
void post1_squaring() {
acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n
// *Pm = Rm = tmp0 * inv;
mul(Rm, tmp0, inv);
sd(Rm, Address(Pm));
// MACC(Rm, Rn, tmp0, tmp1, tmp2);
// tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
mulhu(Rhi_mn, Rm, Rn);
#ifndef PRODUCT
// assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
{
mul(Rlo_mn, Rm, Rn);
add(Rlo_mn, tmp0, Rlo_mn);
Label ok;
beqz(Rlo_mn, ok); {
stop("broken Montgomery multiply");
} bind(ok);
}
#endif
// We have very carefully set things up so that
// m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
// the lower half of Rm * Rn because we know the result already:
// it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff
// tmp0 != 0. So, rather than do a mul and a cad we just set
// the carry flag iff tmp0 is nonzero.
//
// mul(Rlo_mn, Rm, Rn);
// cad(zr, tmp0, Rlo_mn);
// t0 = (tmp0 - 1 <u tmp0), which is 1 exactly when tmp0 != 0.
subi(t0, tmp0, 1);
sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
cadc(tmp0, tmp1, Rhi_mn, t0);
adc(tmp1, tmp2, zr, t0);
mv(tmp2, zr);
}
// Adds the double-word value Rhi:Rlo into the triple-word accumulator
// tmp2:tmp1:tmp0, propagating the carry from word to word.
// use t0 as carry
// Note that the tmp0/tmp1/tmp2 parameters shadow the members of the same
// name; every caller in this class passes those members.
void acc(Register Rhi, Register Rlo,
Register tmp0, Register tmp1, Register tmp2) {
cad(tmp0, tmp0, Rlo, t0);
cadc(tmp1, tmp1, Rhi, t0);
adc(tmp2, tmp2, zr, t0);
}
public:
/**
* Fast Montgomery multiplication. The derivation of the
* algorithm is in A Cryptographic Library for the Motorola
* DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
*
* Arguments:
*
* Inputs for multiplication:
* c_rarg0 - int array elements a
* c_rarg1 - int array elements b
* c_rarg2 - int array elements n (the modulus)
* c_rarg3 - int length
* c_rarg4 - int inv
* c_rarg5 - int array elements m (the result)
*
* Inputs for squaring:
* c_rarg0 - int array elements a
* c_rarg1 - int array elements n (the modulus)
* c_rarg2 - int length
* c_rarg3 - int inv
* c_rarg4 - int array elements m (the result)
*
* Returns the entry address of the generated stub. The generated code
* returns immediately when the length is zero and stops with an error
* message when the length exceeds 512 ints.
*
* The stub works on word-reversed copies of the inputs that it places in
* a scratch area carved out of the stack, computes the result into that
* area, and copies it back (reversed again) into the caller's m.
*/
address generate_multiply() {
Label argh, nothing;
// Out-of-line failure path, emitted before the entry point.
bind(argh);
stop("MontgomeryMultiply total_allocation must be <= 8192");
align(CodeEntryAlignment);
address entry = pc();
beqz(Rlen, nothing);
enter();
// Make room.
// Rlen is still counted in ints here; 16 bytes of scratch are reserved per
// int, so the 512-int limit keeps the allocation within 8192 bytes.
mv(Ra, 512);
bgt(Rlen, Ra, argh);
slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
sub(Ra, sp, Ra);
// Ra keeps the unaligned start of the scratch area; sp is rounded down to a
// 2-word boundary.
andi(sp, Ra, -2 * wordSize);
srliw(Rlen, Rlen, 1); // length in longwords = len/2
{
// Copy input args, reversing as we go. We use Ra as a
// temporary variable.
// Each reverse() advances Ra past the copy it wrote and leaves the
// corresponding *_base register pointing at the start of that copy.
reverse(Ra, Pa_base, Rlen, Ri, Rj);
if (!_squaring)
reverse(Ra, Pb_base, Rlen, Ri, Rj);
reverse(Ra, Pn_base, Rlen, Ri, Rj);
}
// Push all call-saved registers and also Pm_base which we'll need
// at the end.
save_regs();
#ifndef PRODUCT
// assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
{
ld(Rn, Address(Pn_base));
mul(Rlo_mn, Rn, inv);
mv(t0, -1);
Label ok;
beq(Rlo_mn, t0, ok);
stop("broken inverse in Montgomery multiply");
bind(ok);
}
#endif
// From here on Pm_base addresses the scratch result buffer that follows the
// copied inputs; the caller's m pointer was saved by save_regs().
mv(Pm_base, Ra);
// Clear the triple-precision accumulator.
mv(tmp0, zr);
mv(tmp1, zr);
mv(tmp2, zr);
block_comment("for (int i = 0; i < len; i++) {");
mv(Ri, zr); {
Label loop, end;
bge(Ri, Rlen, end);
bind(loop);
pre1(Ri);
block_comment(" for (j = i; j; j--) {"); {
mv(Rj, Ri);
unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
} block_comment(" } // j");
post1();
addiw(Ri, Ri, 1);
blt(Ri, Rlen, loop);
bind(end);
block_comment("} // i");
}
block_comment("for (int i = len; i < 2*len; i++) {");
mv(Ri, Rlen); {
Label loop, end;
// t0 = 2*len; it is recomputed before the back-edge because the loop body
// clobbers t0.
slli(t0, Rlen, 1);
bge(Ri, t0, end);
bind(loop);
pre2(Ri, Rlen);
block_comment(" for (j = len*2-i-1; j; j--) {"); {
slliw(Rj, Rlen, 1);
subw(Rj, Rj, Ri);
subiw(Rj, Rj, 1);
unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
} block_comment(" } // j");
post2(Ri, Rlen);
addiw(Ri, Ri, 1);
slli(t0, Rlen, 1);
blt(Ri, t0, loop);
bind(end);
}
block_comment("} // i");
normalize(Rlen);
mv(Ra, Pm_base); // Save Pm_base in Ra
restore_regs(); // Restore caller's Pm_base
// Copy our result into caller's Pm_base
reverse(Pm_base, Ra, Rlen, Ri, Rj);
// leave() also discards the scratch area allocated below the frame.
leave();
bind(nothing);
ret();
return entry;
}
/**
* Fast Montgomery squaring: the squaring counterpart of
* generate_multiply(). It emits each distinct a*b product once and
* accumulates it twice (step_squaring), plus a conditional extra product
* on even outer indexes (last_squaring).
*
* Arguments:
*
* Inputs:
* c_rarg0 - int array elements a
* c_rarg1 - int array elements n (the modulus)
* c_rarg2 - int length
* c_rarg3 - int inv
* c_rarg4 - int array elements m (the result)
*
* Returns the entry address of the generated stub. The generated code
* stops with an error message when the length exceeds 512 ints. Unlike
* generate_multiply() there is no early exit for a zero length; the loop
* conditions simply skip all work in that case.
*/
address generate_square() {
Label argh;
// Out-of-line failure path, emitted before the entry point.
bind(argh);
stop("MontgomeryMultiply total_allocation must be <= 8192");
align(CodeEntryAlignment);
address entry = pc();
enter();
// Make room.
// Rlen is still counted in ints here; 16 bytes of scratch are reserved per
// int, so the 512-int limit keeps the allocation within 8192 bytes.
mv(Ra, 512);
bgt(Rlen, Ra, argh);
slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
sub(Ra, sp, Ra);
// Ra keeps the unaligned start of the scratch area; sp is rounded down to a
// 2-word boundary.
andi(sp, Ra, -2 * wordSize);
srliw(Rlen, Rlen, 1); // length in longwords = len/2
{
// Copy input args, reversing as we go. We use Ra as a
// temporary variable.
reverse(Ra, Pa_base, Rlen, Ri, Rj);
reverse(Ra, Pn_base, Rlen, Ri, Rj);
}
// Push all call-saved registers and also Pm_base which we'll need
// at the end.
save_regs();
// From here on Pm_base addresses the scratch result buffer that follows the
// copied inputs; the caller's m pointer was saved by save_regs().
mv(Pm_base, Ra);
// Clear the triple-precision accumulator.
mv(tmp0, zr);
mv(tmp1, zr);
mv(tmp2, zr);
block_comment("for (int i = 0; i < len; i++) {");
mv(Ri, zr); {
Label loop, end;
bind(loop);
bge(Ri, Rlen, end);
pre1(Ri);
block_comment("for (j = (i+1)/2; j; j--) {"); {
addi(Rj, Ri, 1);
srliw(Rj, Rj, 1);
unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
} block_comment(" } // j");
last_squaring(Ri);
block_comment(" for (j = i/2; j; j--) {"); {
srliw(Rj, Ri, 1);
unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
} block_comment(" } // j");
post1_squaring();
addi(Ri, Ri, 1);
blt(Ri, Rlen, loop);
bind(end);
block_comment("} // i");
}
block_comment("for (int i = len; i < 2*len; i++) {");
mv(Ri, Rlen); {
Label loop, end;
bind(loop);
// t0 = 2*len; it is recomputed before the back-edge because the loop body
// clobbers t0.
slli(t0, Rlen, 1);
bge(Ri, t0, end);
pre2(Ri, Rlen);
block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
slli(Rj, Rlen, 1);
sub(Rj, Rj, Ri);
subi(Rj, Rj, 1);
srliw(Rj, Rj, 1);
unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
} block_comment(" } // j");
last_squaring(Ri);
block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
slli(Rj, Rlen, 1);
sub(Rj, Rj, Ri);
srliw(Rj, Rj, 1);
unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
} block_comment(" } // j");
post2(Ri, Rlen);
addi(Ri, Ri, 1);
slli(t0, Rlen, 1);
blt(Ri, t0, loop);
bind(end);
block_comment("} // i");
}
normalize(Rlen);
mv(Ra, Pm_base); // Save Pm_base in Ra
restore_regs(); // Restore caller's Pm_base
// Copy our result into caller's Pm_base
reverse(Pm_base, Ra, Rlen, Ri, Rj);
// leave() also discards the scratch area allocated below the frame.
leave();
ret();
return entry;
}
};
#endif // COMPILER2
// Emits the body shared by the continuation thaw stubs and returns its start
// address. `kind` selects the variant:
//  - for the return-barrier kinds, sp is first reloaded from the thread's
//    continuation entry and the return value registers x10/f10 are preserved
//    around the two runtime calls;
//  - for the return-barrier-exception kind, the stub finishes by looking up
//    the exception handler for the thawed frame's return address and jumping
//    to it instead of returning.
// The emitted code calls Continuation::prepare_thaw to obtain the size of the
// frames to thaw (0 leads to a jump to the StackOverflowError entry), makes
// room on the stack, calls the thaw entry and then switches to the thawed
// frame.
address generate_cont_thaw(Continuation::thaw_kind kind) {
bool return_barrier = Continuation::is_thaw_return_barrier(kind);
bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
address start = __ pc();
if (return_barrier) {
__ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
}
#ifndef PRODUCT
// Debug check: sp must equal the thread's continuation entry.
{
Label OK;
__ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
__ beq(sp, t0, OK);
__ stop("incorrect sp");
__ bind(OK);
}
#endif
if (return_barrier) {
// preserve possible return value from a method returning to the return barrier
__ subi(sp, sp, 2 * wordSize);
__ fsd(f10, Address(sp, 0 * wordSize));
__ sd(x10, Address(sp, 1 * wordSize));
}
__ mv(c_rarg1, (return_barrier ? 1 : 0));
__ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), xthread, c_rarg1);
__ mv(t1, x10); // x10 contains the size of the frames to thaw, 0 if overflow or no more frames
if (return_barrier) {
// restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
__ ld(x10, Address(sp, 1 * wordSize));
__ fld(f10, Address(sp, 0 * wordSize));
__ addi(sp, sp, 2 * wordSize);
}
#ifndef PRODUCT
// Debug check: sp must again equal the thread's continuation entry.
{
Label OK;
__ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
__ beq(sp, t0, OK);
__ stop("incorrect sp");
__ bind(OK);
}
#endif
Label thaw_success;
// t1 contains the size of the frames to thaw, 0 if overflow or no more frames
__ bnez(t1, thaw_success);
__ j(RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
__ bind(thaw_success);
// make room for the thawed frames
__ sub(t0, sp, t1);
__ andi(sp, t0, -16); // align
if (return_barrier) {
// save original return value -- again
__ subi(sp, sp, 2 * wordSize);
__ fsd(f10, Address(sp, 0 * wordSize));
__ sd(x10, Address(sp, 1 * wordSize));
}
// If we want, we can templatize thaw by kind, and have three different entries
__ mv(c_rarg1, kind);
__ call_VM_leaf(Continuation::thaw_entry(), xthread, c_rarg1);
__ mv(t1, x10); // x10 is the sp of the yielding frame
if (return_barrier) {
// restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
__ ld(x10, Address(sp, 1 * wordSize));
__ fld(f10, Address(sp, 0 * wordSize));
__ addi(sp, sp, 2 * wordSize);
} else {
__ mv(x10, zr); // return 0 (success) from doYield
}
// we're now on the yield frame (which is in an address above us b/c sp has been pushed down)
__ mv(fp, t1);
__ subi(sp, t1, 2 * wordSize); // now pointing to fp spill
if (return_barrier_exception) {
__ ld(c_rarg1, Address(fp, -1 * wordSize)); // return address
__ verify_oop(x10);
__ mv(x9, x10); // save return value containing the exception oop in callee-saved x9
__ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), xthread, c_rarg1);
// see OptoRuntime::generate_exception_blob: x10 -- exception oop, x13 -- exception pc
__ mv(x11, x10); // the exception handler
__ mv(x10, x9); // restore return value containing the exception oop
__ verify_oop(x10);
__ leave();
__ mv(x13, ra);
__ jr(x11); // the exception handler
} else {
// We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
__ leave();
__ ret();
}
return start;
}
// Generates the stub that thaws the top frames of a continuation
// (Continuation::thaw_top). Returns the stub's start address, or nullptr
// when continuations are disabled.
address generate_cont_thaw() {
  if (Continuations::enabled()) {
    StubCodeMark mark(this, StubGenStubId::cont_thaw_id);
    const address stub_start = __ pc();
    generate_cont_thaw(Continuation::thaw_top);
    return stub_start;
  }
  return nullptr;
}
// Generates the continuation return-barrier stub
// (Continuation::thaw_return_barrier). Returns the stub's start address, or
// nullptr when continuations are disabled.
address generate_cont_returnBarrier() {
  if (Continuations::enabled()) {
    // TODO: will probably need multiple return barriers depending on return type
    StubCodeMark mark(this, StubGenStubId::cont_returnBarrier_id);
    const address stub_start = __ pc();
    generate_cont_thaw(Continuation::thaw_return_barrier);
    return stub_start;
  }
  return nullptr;
}
// Generates the exception flavour of the continuation return-barrier stub
// (Continuation::thaw_return_barrier_exception). Returns the stub's start
// address, or nullptr when continuations are disabled.
address generate_cont_returnBarrier_exception() {
  if (Continuations::enabled()) {
    StubCodeMark mark(this, StubGenStubId::cont_returnBarrierExc_id);
    const address stub_start = __ pc();
    generate_cont_thaw(Continuation::thaw_return_barrier_exception);
    return stub_start;
  }
  return nullptr;
}
// Generates the continuation preemption stub and returns its start address,
// or nullptr when continuations are disabled.
//
// The emitted code resets the last Java frame, resets sp to the thread's
// continuation entry and then checks the thread's preemption_cancelled flag:
//  - flag clear: clean up the enterSpecial frame and return;
//  - flag set: clear the flag, set fp relative to the continuation entry and
//    jump to the address stored at ContinuationEntry::thaw_call_pc_address().
address generate_cont_preempt_stub() {
if (!Continuations::enabled()) return nullptr;
StubGenStubId stub_id = StubGenStubId::cont_preempt_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
__ reset_last_Java_frame(true);
// Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
__ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
Label preemption_cancelled;
__ lbu(t0, Address(xthread, JavaThread::preemption_cancelled_offset()));
__ bnez(t0, preemption_cancelled);
// Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
SharedRuntime::continuation_enter_cleanup(_masm);
__ leave();
__ ret();
// We acquired the monitor after freezing the frames so call thaw to continue execution.
__ bind(preemption_cancelled);
__ sb(zr, Address(xthread, JavaThread::preemption_cancelled_offset()));
// fp = sp + size of the continuation entry + 2 words.
__ la(fp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size() + 2 * wordSize)));
// Load the thaw call pc indirectly and jump to it.
__ la(t1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
__ ld(t1, Address(t1));
__ jr(t1);
return start;
}
#if COMPILER2_OR_JVMCI
#undef __
#define __ this->
class Sha2Generator : public MacroAssembler {
StubCodeGenerator* _cgen;
public:
// Creates a generator that emits into the code buffer of `masm`; `cgen` is
// the stub code generator that the StubCodeMark of each generated stub is
// registered with.
Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {}
// Generates the SHA-256 compression stub identified by `stub_id` by running
// the shared SHA-2 generator with 32-bit elements. Returns the stub's start
// address.
address generate_sha256_implCompress(StubGenStubId stub_id) {
  const Assembler::SEW element_width = Assembler::e32;
  return generate_sha2_implCompress(element_width, stub_id);
}
// Generates the SHA-512 compression stub identified by `stub_id` by running
// the shared SHA-2 generator with 64-bit elements. Returns the stub's start
// address.
address generate_sha512_implCompress(StubGenStubId stub_id) {
  const Assembler::SEW element_width = Assembler::e64;
  return generate_sha2_implCompress(element_width, stub_id);
}
private:
// Emits a vector load of `vr` from the address held in `sr`, choosing the
// 32-bit element form when `vset_sew` is e32 and the 64-bit element form for
// any other element width.
void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
  const bool use_64bit_elements = (vset_sew != Assembler::e32);
  if (use_64bit_elements) {
    __ vle64_v(vr, sr);
    return;
  }
  __ vle32_v(vr, sr);
}
// Emits a vector store of `vr` to the address held in `sr`, choosing the
// 32-bit element form when `vset_sew` is e32 and the 64-bit element form for
// any other element width.
void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
  const bool use_64bit_elements = (vset_sew != Assembler::e32);
  if (use_64bit_elements) {
    __ vse64_v(vr, sr);
    return;
  }
  __ vse32_v(vr, sr);
}
// Overview of the logic in each "quad round".
//
// The code below repeats 16/20 times the logic implementing four rounds
// of the SHA-256/512 core loop as documented by NIST. 16/20 "quad rounds"
// implement the 64/80 single rounds.
//
// // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0])
// // Output:
// // vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
// vl1reXX.v vTmp1, ofs
//
// // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b)
// addi ofs, ofs, 16/32
//
// // Add constants to message schedule words:
// // Input
// // vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
// // vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0];
// // Output
// // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
// vadd.vv vTmp0, vTmp1, vW0
//
// // 2 rounds of working variables updates.
// // vState1[t+4] <- vState1[t], vState0[t], vTmp0[t]
// // Input:
// // vState1 = {c[t],d[t],g[t],h[t]} " = vState1[t] "
// // vState0 = {a[t],b[t],e[t],f[t]}
// // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
// // Output:
// // vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = vState0[t+2] "
// // = {h[t+4],g[t+4],d[t+4],c[t+4]} " = vState1[t+4] "
// vsha2cl.vv vState1, vState0, vTmp0
//
// // 2 rounds of working variables updates.
// // vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t]
// // Input
// // vState0 = {a[t],b[t],e[t],f[t]} " = vState0[t] "
// // = {h[t+2],g[t+2],d[t+2],c[t+2]} " = vState1[t+2] "
// // vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = vState0[t+2] "
// // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
// // Output:
// // vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]} " = vState0[t+4] "
// vsha2ch.vv vState0, vState1, vTmp0
//
// // Combine 2QW into 1QW
// //
// // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs
// // vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3]
// // and it can only take 3 vectors as inputs. Hence we need to combine
// // vW1[0] and vW2[1..3] in a single vector.
// //
// // vmerge Vt4, Vt1, Vt2, V0
// // Input
// // V0 = mask // first word from vW2, 1..3 words from vW1
// // vW2 = {Wt-8, Wt-7, Wt-6, Wt-5}
// // vW1 = {Wt-12, Wt-11, Wt-10, Wt-9}
// // Output
// // Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5}
// vmerge.vvm vTmp0, vW2, vW1, v0
//
// // Generate next Four Message Schedule Words (hence allowing for 4 more rounds)
// // Input
// // vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]} W[ 3: 0]
// // vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]} W[15:12]
// // vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]} W[11: 9,4]
// // Output (next four message schedule words)
// // vW0 = {W[t+19], W[t+18], W[t+17], W[t+16]} W[19:16]
// vsha2ms.vv vW0, vTmp0, vW3
//
// BEFORE
// vW0 - vW3 hold the message schedule words (initially the block words)
// vW0 = W[ 3: 0] "oldest"
// vW1 = W[ 7: 4]
// vW2 = W[11: 8]
// vW3 = W[15:12] "newest"
//
// vState0 - vState1 hold the working state variables
// vState0 = {a[t],b[t],e[t],f[t]} // initially {H5,H4,H1,H0}
// vState1 = {c[t],d[t],g[t],h[t]} // initially {H7,H6,H3,H2}
//
// AFTER
// vW0 - vW3 hold the message schedule words (initially the block words)
// vW1 = W[ 7: 4] "oldest"
// vW2 = W[11: 8]
// vW3 = W[15:12]
// vW0 = W[19:16] "newest"
//
// vState0 and vState1 hold the working state variables
// vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]}
// vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]}
//
// The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round,
// hence the uses of those vectors rotate in each round, and we get back to the
// initial configuration every 4 quad-rounds. We could avoid those changes at
// the cost of moving those vectors at the end of each quad-rounds.
//
// Parameters:
//   vset_sew      - element width (e32 for SHA-256, otherwise 64-bit elements)
//   rot1 .. rot4  - the four message schedule vectors in their current
//                   rotation (rot1 plays the role of vW0 above, rot4 of vW3)
//   scalarconst   - address of the round constants for this quad round;
//                   advanced by 16/32 bytes when step_const is true
//   vtemp, vtemp2 - temporaries (vTmp1 and vTmp0 above)
//   v_abef/v_cdgh - the working state vectors (vState0 and vState1 above)
//   gen_words     - when false, the vmerge/vsha2ms pair that produces the next
//                   four message schedule words is not emitted
//   step_const    - when false, scalarconst is left unchanged
void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4,
Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh,
bool gen_words = true, bool step_const = true) {
// vtemp = K[t+3 : t]
__ vleXX_v(vset_sew, vtemp, scalarconst);
if (step_const) {
__ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32);
}
// vtemp2 = W + K, then two pairs of rounds.
__ vadd_vv(vtemp2, vtemp, rot1);
__ vsha2cl_vv(v_cdgh, v_abef, vtemp2);
__ vsha2ch_vv(v_abef, v_cdgh, vtemp2);
if (gen_words) {
// The merge mask is taken from v0, which the caller sets up.
__ vmerge_vvm(vtemp2, rot3, rot2);
__ vsha2ms_vv(rot1, vtemp2, rot4);
}
}
// Generates a SHA-256 or SHA-512 compression stub and returns its start
// address.
//
// `vset_sew` selects the hash: e32 for SHA-256, e64 for SHA-512.
// `stub_id` selects the single-block or the multi-block (MB) variant and must
// agree with `vset_sew` (asserted below).
//
// Arguments:
//
// Inputs:
// c_rarg0 - byte[] source+offset
// c_rarg1 - int[] SHA.state
// c_rarg2 - int offset
// c_rarg3 - int limit
//
// The multi-block variant keeps processing blocks while the advanced offset
// is <= limit and returns the final offset in c_rarg0.
address generate_sha2_implCompress(Assembler::SEW vset_sew, StubGenStubId stub_id) {
// SHA-256 round constants K[0..63].
alignas(64) static const uint32_t round_consts_256[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};
// SHA-512 round constants K[0..79].
alignas(64) static const uint64_t round_consts_512[80] = {
0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl,
0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l,
0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l,
0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l,
0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l,
0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l,
0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l,
0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l,
0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl,
0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l,
0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl,
0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl,
0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l,
0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l,
0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l,
0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l,
0x19a4c116b8d2d0c8l, 0x1e376c085141ab53l, 0x2748774cdf8eeb99l,
0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl,
0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl,
0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl,
0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l,
0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l,
0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal,
0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl,
0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl,
0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al,
0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l
};
// Size in bytes of four elements (one vector's worth): 16 for SHA-256, 32 for SHA-512.
const int const_add = vset_sew == Assembler::e32 ? 16 : 32;
bool multi_block;
switch (stub_id) {
case sha256_implCompress_id:
assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
multi_block = false;
break;
case sha256_implCompressMB_id:
assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
multi_block = true;
break;
case sha512_implCompress_id:
assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
multi_block = false;
break;
case sha512_implCompressMB_id:
assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
multi_block = true;
break;
default:
ShouldNotReachHere();
};
__ align(CodeEntryAlignment);
StubCodeMark mark(_cgen, stub_id);
address start = __ pc();
Register buf = c_rarg0;
Register state = c_rarg1;
Register ofs = c_rarg2;
Register limit = c_rarg3;
Register consts = t2; // caller saved
Register state_c = x28; // caller saved
VectorRegister vindex = v2;
VectorRegister vW0 = v4;
VectorRegister vW1 = v6;
VectorRegister vW2 = v8;
VectorRegister vW3 = v10;
VectorRegister vState0 = v12;
VectorRegister vState1 = v14;
VectorRegister vHash0 = v16;
VectorRegister vHash1 = v18;
VectorRegister vTmp0 = v20;
VectorRegister vTmp1 = v22;
Label multi_block_loop;
__ enter();
address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512;
la(consts, ExternalAddress(constant_table));
// Register use in this function:
//
// VECTORS
// vW0 - vW3 (512/1024-bits / 4*128/256 bits / 4*4*32/64 bits), hold the message
// schedule words (Wt). They start with the message block
// content (W0 to W15), then further words in the message
// schedule generated via vsha2ms from previous Wt.
// Initially:
// vW0 = W[ 3:0] = { W3, W2, W1, W0}
// vW1 = W[ 7:4] = { W7, W6, W5, W4}
// vW2 = W[ 11:8] = {W11, W10, W9, W8}
// vW3 = W[15:12] = {W15, W14, W13, W12}
//
// vState0 - vState1 hold the working state variables (a, b, ..., h)
// vState0 = {f[t],e[t],b[t],a[t]}
// vState1 = {h[t],g[t],d[t],c[t]}
// Initially:
// vState0 = {H5i-1, H4i-1, H1i-1 , H0i-1}
// vState1 = {H7i-1, H6i-1, H3i-1 , H2i-1}
//
// v0 = masks for vrgather/vmerge. Single value during the 16 rounds.
//
// vTmp0 = temporary, Wt+Kt
// vTmp1 = temporary, Kt
//
// vHash0/vHash1 = hold the initial values of the hash, byte-swapped.
//
// During most of the function the vector state is configured so that each
// vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits).
// vsha2ch/vsha2cl uses EGW of 4*SEW.
// SHA256 SEW = e32, EGW = 128-bits
// SHA512 SEW = e64, EGW = 256-bits
//
// VLEN is required to be at least 128.
// For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256)
//
// m1: LMUL=1/2
// ta: tail agnostic (don't care about those lanes)
// ma: mask agnostic (don't care about those lanes)
// x0 is not written, we know the number of vector elements.
if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128
__ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta);
} else {
__ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta);
}
// Byte offsets for the indexed load/store of the state, packed one per byte
// of the constant (least significant byte first): SHA-256 {20, 16, 4, 0},
// SHA-512 {40, 32, 8, 0}.
int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul;
__ li(t0, indexes);
__ vmv_v_x(vindex, t0);
// Step-over a,b, so we are pointing to c.
// const_add is equal to 4x state variable, div by 2 is thus 2, a,b
__ addi(state_c, state, const_add/2);
// Use index-load to get {f,e,b,a},{h,g,d,c}
__ vluxei8_v(vState0, state, vindex);
__ vluxei8_v(vState1, state_c, vindex);
__ bind(multi_block_loop);
// Capture the initial H values in vHash0 and vHash1 to allow for computing
// the resulting H', since H' = H+{a',b',c',...,h'}.
__ vmv_v_v(vHash0, vState0);
__ vmv_v_v(vHash1, vState1);
// Load the 512/1024-bits of the message block in vW0-vW3 and perform
// an endian swap on each 4/8 bytes element.
//
// If Zvkb is not implemented one can use vrgather
// with an index sequence to byte-swap.
// sequence = [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12]
// <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate
// this sequence. 'vid' gives us the N.
__ vleXX_v(vset_sew, vW0, buf);
__ vrev8_v(vW0, vW0);
__ addi(buf, buf, const_add);
__ vleXX_v(vset_sew, vW1, buf);
__ vrev8_v(vW1, vW1);
__ addi(buf, buf, const_add);
__ vleXX_v(vset_sew, vW2, buf);
__ vrev8_v(vW2, vW2);
__ addi(buf, buf, const_add);
__ vleXX_v(vset_sew, vW3, buf);
__ vrev8_v(vW3, vW3);
__ addi(buf, buf, const_add);
// Set v0 up for the vmerge that replaces the first word (idx==0)
__ vid_v(v0);
__ vmseq_vi(v0, v0, 0x0); // v0.mask[i] = (i == 0 ? 1 : 0)
VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3};
int rot_pos = 0;
// Quad-rounds that also generate new message schedule words:
// SHA-256: #0 (+0, vW0->vW1->vW2->vW3) ... #11 (+3, vW3->vW0->vW1->vW2)
// SHA-512: #0 ... #15
const int qr_end = vset_sew == Assembler::e32 ? 12 : 16;
for (int i = 0; i < qr_end; i++) {
sha2_quad_round(vset_sew,
rotation_regs[(rot_pos + 0) & 0x3],
rotation_regs[(rot_pos + 1) & 0x3],
rotation_regs[(rot_pos + 2) & 0x3],
rotation_regs[(rot_pos + 3) & 0x3],
consts,
vTmp1, vTmp0, vState0, vState1);
++rot_pos;
}
// The final four quad-rounds: #12 ... #15 for SHA-256, #16 ... #19 for SHA-512.
// Note that we stop generating new message schedule words (Wt, vW0-vW3)
// as we already generated all the words we end up consuming (i.e., W[63:60]
// for SHA-256). The very last quad-round also leaves `consts` unadvanced.
const int qr_c_end = qr_end + 4;
for (int i = qr_end; i < qr_c_end; i++) {
sha2_quad_round(vset_sew,
rotation_regs[(rot_pos + 0) & 0x3],
rotation_regs[(rot_pos + 1) & 0x3],
rotation_regs[(rot_pos + 2) & 0x3],
rotation_regs[(rot_pos + 3) & 0x3],
consts,
vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1));
++rot_pos;
}
//--------------------------------------------------------------------------------
// Compute the updated hash value H'
// H' = H + {h',g',...,b',a'}
// = {h,g,...,b,a} + {h',g',...,b',a'}
// = {h+h',g+g',...,b+b',a+a'}
// H' = H+{a',b',c',...,h'}
__ vadd_vv(vState0, vHash0, vState0);
__ vadd_vv(vState1, vHash1, vState1);
if (multi_block) {
// Rewind `consts` to the start of the table: it was advanced by const_add
// in every quad-round except the last, i.e. 15 * 16 = 240 bytes for SHA-256
// and 19 * 32 = 608 bytes for SHA-512.
int total_adds = vset_sew == Assembler::e32 ? 240 : 608;
__ subi(consts, consts, total_adds);
// Advance the offset by one block (64/128 bytes) and loop while ofs <= limit.
__ addi(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128);
__ ble(ofs, limit, multi_block_loop);
__ mv(c_rarg0, ofs); // return ofs
}
// Store H[0..7] = {a,b,c,d,e,f,g,h} from
// vState0 = {f,e,b,a}
// vState1 = {h,g,d,c}
__ vsuxei8_v(vState0, state, vindex);
__ vsuxei8_v(vState1, state_c, vindex);
__ leave();
__ ret();
return start;
}
};
#undef __
#define __ _masm->

// A fixed bank of L general-purpose registers that shadows a contiguous
// memory region. Every register is filled with one 64-bit load, so it
// normally carries two consecutive 32-bit integers: the even-indexed one in
// its low half and the odd-indexed one in its high half.
template <uint L>
class RegCache {
 private:
  MacroAssembler *_masm;
  Register _regs[L];

 public:
  // Takes the L registers of 'rs', in the set's iteration order, as the
  // cache slots.
  RegCache(MacroAssembler *masm, RegSet rs): _masm(masm) {
    assert(rs.size() == L, "%u registers are used to cache %u 4-byte data", rs.size(), 2 * L);
    uint slot = 0;
    for (auto it = rs.begin(); slot < L; ++it, ++slot) {
      _regs[slot] = *it;
    }
  }

  // Emit the 64-bit load that fills slot i from base + 8 * i.
  void gen_load(uint i, Register base) {
    assert(i < L, "invalid i: %u", i);
    const uint byte_offset = 8 * i;
    __ ld(_regs[i], Address(base, byte_offset));
  }

  // Emit dest += (i'th cached 32-bit integer), as a 32-bit (addw) addition.
  // rtmp is written only when i is odd.
  void add_u32(const Register dest, uint i, const Register rtmp = t0) {
    assert(i < 2 * L, "invalid i: %u", i);
    const Register packed = _regs[i / 2];
    if (!is_even(i)) {
      // Odd index: the value sits in the upper half, bring it down first.
      __ srli(rtmp, packed, 32);
      __ addw(dest, dest, rtmp);
      return;
    }
    // Even index: the value sits in the lower half. addw only consumes the
    // low 32 bits, so the upper half needs no masking.
    __ addw(dest, dest, packed);
  }
};

typedef RegCache<8> BufRegCache;
// Common tail shared by the four MD5 step helpers (md5_FF/GG/HH/II).
// Emits code for:
// a += value + x + ac;
// a = Integer.rotateLeft(a, s) + b;
// where x is the k'th 32-bit word held in reg_cache and ac is the constant t.
//
// reg_cache - register cache holding the 32-bit words of the current block
// a         - accumulator, updated in place
// b         - register added after the rotation
// c, d      - not used by this helper; only part of the common signature
// k         - index of the cached 32-bit word to add (x)
// s         - left-rotation amount
// t         - additive constant (ac)
// value     - register holding the already computed round-function result
void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache,
Register a, Register b, Register c, Register d,
int k, int s, int t,
Register value) {
// a += ac
// NOTE(review): t1 is passed as the extra register argument, presumably as
// scratch for materializing the constant - confirm against addw's declaration.
__ addw(a, a, t, t1);
// a += x;
// No rtmp is passed, so add_u32 falls back to its default temp (t0).
reg_cache.add_u32(a, k);
// a += value;
__ addw(a, a, value);
// a = Integer.rotateLeft(a, s) + b;
__ rolw(a, a, s);
__ addw(a, a, b);
}
// MD5 step using the round-1 function F. Emits code for:
// a += ((b & c) | ((~b) & d)) + x + ac;
// a = Integer.rotateLeft(a, s) + b;
// x is the k'th 32-bit word held in reg_cache, ac is the constant t.
// rtmp1 and rtmp2 are overwritten; rtmp1 carries F(b, c, d) into the
// shared epilogue.
void md5_FF(BufRegCache& reg_cache,
Register a, Register b, Register c, Register d,
int k, int s, int t,
Register rtmp1, Register rtmp2) {
// rtmp1 = b & c
__ andr(rtmp1, b, c);
// rtmp2 = (~b) & d
__ andn(rtmp2, d, b);
// rtmp1 = (b & c) | ((~b) & d)
__ orr(rtmp1, rtmp1, rtmp2);
// add x, ac and F, then rotate and add b
m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
}
// MD5 step using the round-2 function G. Emits code for:
// a += ((b & d) | (c & (~d))) + x + ac;
// a = Integer.rotateLeft(a, s) + b;
// x is the k'th 32-bit word held in reg_cache, ac is the constant t.
// rtmp1 and rtmp2 are overwritten; rtmp1 carries G(b, c, d) into the
// shared epilogue.
void md5_GG(BufRegCache& reg_cache,
Register a, Register b, Register c, Register d,
int k, int s, int t,
Register rtmp1, Register rtmp2) {
// rtmp1 = b & d
__ andr(rtmp1, b, d);
// rtmp2 = c & (~d)
__ andn(rtmp2, c, d);
// rtmp1 = (b & d) | (c & (~d))
__ orr(rtmp1, rtmp1, rtmp2);
// add x, ac and G, then rotate and add b
m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
}
// MD5 step using the round-3 function H. Emits code for:
// a += ((b ^ c) ^ d) + x + ac;
// a = Integer.rotateLeft(a, s) + b;
// x is the k'th 32-bit word held in reg_cache, ac is the constant t.
// rtmp1 and rtmp2 are overwritten; rtmp1 carries H(b, c, d) into the
// shared epilogue.
void md5_HH(BufRegCache& reg_cache,
Register a, Register b, Register c, Register d,
int k, int s, int t,
Register rtmp1, Register rtmp2) {
// rtmp1 = (b ^ c) ^ d
// rtmp2 holds the intermediate b ^ c
__ xorr(rtmp2, b, c);
__ xorr(rtmp1, rtmp2, d);
// add x, ac and H, then rotate and add b
m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
}
// MD5 step using the round-4 function I. Emits code for:
// a += (c ^ (b | (~d))) + x + ac;
// a = Integer.rotateLeft(a, s) + b;
// x is the k'th 32-bit word held in reg_cache, ac is the constant t.
// rtmp1 and rtmp2 are overwritten; rtmp1 carries I(b, c, d) into the
// shared epilogue.
void md5_II(BufRegCache& reg_cache,
Register a, Register b, Register c, Register d,
int k, int s, int t,
Register rtmp1, Register rtmp2) {
// rtmp1 = c ^ (b | (~d))
// rtmp2 holds the intermediate b | (~d)
__ orn(rtmp2, b, d);
__ xorr(rtmp1, c, rtmp2);
// add x, ac and I, then rotate and add b
m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
}
// Generates the MD5 compression stub, either the single-block variant
// (md5_implCompress_id) or the multi-block variant (md5_implCompressMB_id).
// Returns the entry address of the generated code.
//
// Arguments:
//
// Inputs:
// c_rarg0 - byte[] source+offset
// c_rarg1 - int[] SHA.state
// c_rarg2 - int offset (multi_block == True)
// c_rarg3 - int limit (multi_block == True)
//
// Outputs:
// c_rarg0 - updated offset (multi_block == True)
//
// Registers:
// x0 zero (zero)
// x1 ra (return address)
// x2 sp (stack pointer)
// x3 gp (global pointer)
// x4 tp (thread pointer)
// x5 t0 (tmp register)
// x6 t1 (tmp register)
// x7 t2 state0
// x8 fp/s0 (frame pointer)
// x9 s1
// x10 a0 rtmp1 / c_rarg0
// x11 a1 rtmp2 / c_rarg1
// x12 a2 a / c_rarg2
// x13 a3 b / c_rarg3
// x14 a4 c
// x15 a5 d
// x16 a6 buf
// x17 a7 state
// x18 s2 ofs [saved-reg] (multi_block == True)
// x19 s3 limit [saved-reg] (multi_block == True)
// x20 s4 state1 [saved-reg]
// x21 s5 state2 [saved-reg]
// x22 s6 state3 [saved-reg]
// x23 s7
// x24 s8 buf0 [saved-reg]
// x25 s9 buf1 [saved-reg]
// x26 s10 buf2 [saved-reg]
// x27 s11 buf3 [saved-reg]
// x28 t3 buf4
// x29 t4 buf5
// x30 t5 buf6
// x31 t6 buf7
address generate_md5_implCompress(StubGenStubId stub_id) {
__ align(CodeEntryAlignment);
// The stub id selects between the single-block and multi-block flavour.
bool multi_block;
switch (stub_id) {
case md5_implCompress_id:
multi_block = false;
break;
case md5_implCompressMB_id:
multi_block = true;
break;
default:
ShouldNotReachHere();
};
StubCodeMark mark(this, stub_id);
address start = __ pc();
// rotation constants
// Sij is the rotation amount of the j'th step position in round i.
const int S11 = 7;
const int S12 = 12;
const int S13 = 17;
const int S14 = 22;
const int S21 = 5;
const int S22 = 9;
const int S23 = 14;
const int S24 = 20;
const int S31 = 4;
const int S32 = 11;
const int S33 = 16;
const int S34 = 23;
const int S41 = 6;
const int S42 = 10;
const int S43 = 15;
const int S44 = 21;
// used at write-back to clear the upper halves of state0/state2
const int64_t mask32 = 0xffffffff;
Register buf_arg = c_rarg0; // a0
Register state_arg = c_rarg1; // a1
Register ofs_arg = c_rarg2; // a2
Register limit_arg = c_rarg3; // a3
// we'll copy the args to these registers to free up a0-a3
// to use for other values manipulated by instructions
// that can be compressed
Register buf = x16; // a6
Register state = x17; // a7
Register ofs = x18; // s2
Register limit = x19; // s3
// using x12->15 to allow compressed instructions
Register a = x12; // a2
Register b = x13; // a3
Register c = x14; // a4
Register d = x15; // a5
Register state0 = x7; // t2
Register state1 = x20; // s4
Register state2 = x21; // s5
Register state3 = x22; // s6
// using x10->x11 to allow compressed instructions
Register rtmp1 = x10; // a0
Register rtmp2 = x11; // a1
// The 8-register cache for the 64-byte message block: four callee-saved
// registers (which must be spilled) plus the four temporaries t3-t6.
RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11
RegSet reg_cache_regs;
reg_cache_regs += reg_cache_saved_regs;
reg_cache_regs += RegSet::of(t3, t4, t5, t6);
BufRegCache reg_cache(_masm, reg_cache_regs);
// Registers spilled on entry and restored before returning;
// ofs/limit are only used, and therefore only saved, for multi_block.
RegSet saved_regs;
if (multi_block) {
saved_regs += RegSet::of(ofs, limit);
}
saved_regs += RegSet::of(state1, state2, state3);
saved_regs += reg_cache_saved_regs;
__ push_reg(saved_regs, sp);
// Move the incoming arguments out of a0-a3 before those get reused
// as rtmp1/rtmp2/a/b.
__ mv(buf, buf_arg);
__ mv(state, state_arg);
if (multi_block) {
__ mv(ofs, ofs_arg);
__ mv(limit, limit_arg);
}
// to minimize the number of memory operations:
// read the 4 state 4-byte values in pairs, with a single ld,
// and split them into 2 registers.
//
// And, as the core algorithm of md5 works on 32-bits words, so
// in the following code, it does not care about the content of
// higher 32-bits in state[x]. Based on this observation,
// we can apply further optimization, which is to just ignore the
// higher 32-bits in state0/state2, rather than set the higher
// 32-bits of state0/state2 to zero explicitly with extra instructions.
__ ld(state0, Address(state));
__ srli(state1, state0, 32);
__ ld(state2, Address(state, 8));
__ srli(state3, state2, 32);
// Per-block loop head; only branched back to in the multi_block variant.
Label md5_loop;
__ BIND(md5_loop);
// Working copies of the state for this block.
__ mv(a, state0);
__ mv(b, state1);
__ mv(c, state2);
__ mv(d, state3);
// Round 1
// The message words are loaded lazily: each gen_load(i) pulls in the
// 64-bit pair holding words 2i and 2i+1 right before their first use.
reg_cache.gen_load(0, buf);
md5_FF(reg_cache, a, b, c, d, 0, S11, 0xd76aa478, rtmp1, rtmp2);
md5_FF(reg_cache, d, a, b, c, 1, S12, 0xe8c7b756, rtmp1, rtmp2);
reg_cache.gen_load(1, buf);
md5_FF(reg_cache, c, d, a, b, 2, S13, 0x242070db, rtmp1, rtmp2);
md5_FF(reg_cache, b, c, d, a, 3, S14, 0xc1bdceee, rtmp1, rtmp2);
reg_cache.gen_load(2, buf);
md5_FF(reg_cache, a, b, c, d, 4, S11, 0xf57c0faf, rtmp1, rtmp2);
md5_FF(reg_cache, d, a, b, c, 5, S12, 0x4787c62a, rtmp1, rtmp2);
reg_cache.gen_load(3, buf);
md5_FF(reg_cache, c, d, a, b, 6, S13, 0xa8304613, rtmp1, rtmp2);
md5_FF(reg_cache, b, c, d, a, 7, S14, 0xfd469501, rtmp1, rtmp2);
reg_cache.gen_load(4, buf);
md5_FF(reg_cache, a, b, c, d, 8, S11, 0x698098d8, rtmp1, rtmp2);
md5_FF(reg_cache, d, a, b, c, 9, S12, 0x8b44f7af, rtmp1, rtmp2);
reg_cache.gen_load(5, buf);
md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2);
md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2);
reg_cache.gen_load(6, buf);
md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2);
md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2);
reg_cache.gen_load(7, buf);
md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2);
md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2);
// Round 2
// From here on all 16 message words are already cached in registers.
md5_GG(reg_cache, a, b, c, d, 1, S21, 0xf61e2562, rtmp1, rtmp2);
md5_GG(reg_cache, d, a, b, c, 6, S22, 0xc040b340, rtmp1, rtmp2);
md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2);
md5_GG(reg_cache, b, c, d, a, 0, S24, 0xe9b6c7aa, rtmp1, rtmp2);
md5_GG(reg_cache, a, b, c, d, 5, S21, 0xd62f105d, rtmp1, rtmp2);
md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2);
md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2);
md5_GG(reg_cache, b, c, d, a, 4, S24, 0xe7d3fbc8, rtmp1, rtmp2);
md5_GG(reg_cache, a, b, c, d, 9, S21, 0x21e1cde6, rtmp1, rtmp2);
md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2);
md5_GG(reg_cache, c, d, a, b, 3, S23, 0xf4d50d87, rtmp1, rtmp2);
md5_GG(reg_cache, b, c, d, a, 8, S24, 0x455a14ed, rtmp1, rtmp2);
md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2);
md5_GG(reg_cache, d, a, b, c, 2, S22, 0xfcefa3f8, rtmp1, rtmp2);
md5_GG(reg_cache, c, d, a, b, 7, S23, 0x676f02d9, rtmp1, rtmp2);
md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2);
// Round 3
md5_HH(reg_cache, a, b, c, d, 5, S31, 0xfffa3942, rtmp1, rtmp2);
md5_HH(reg_cache, d, a, b, c, 8, S32, 0x8771f681, rtmp1, rtmp2);
md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2);
md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2);
md5_HH(reg_cache, a, b, c, d, 1, S31, 0xa4beea44, rtmp1, rtmp2);
md5_HH(reg_cache, d, a, b, c, 4, S32, 0x4bdecfa9, rtmp1, rtmp2);
md5_HH(reg_cache, c, d, a, b, 7, S33, 0xf6bb4b60, rtmp1, rtmp2);
md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2);
md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2);
md5_HH(reg_cache, d, a, b, c, 0, S32, 0xeaa127fa, rtmp1, rtmp2);
md5_HH(reg_cache, c, d, a, b, 3, S33, 0xd4ef3085, rtmp1, rtmp2);
md5_HH(reg_cache, b, c, d, a, 6, S34, 0x04881d05, rtmp1, rtmp2);
md5_HH(reg_cache, a, b, c, d, 9, S31, 0xd9d4d039, rtmp1, rtmp2);
md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2);
md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2);
md5_HH(reg_cache, b, c, d, a, 2, S34, 0xc4ac5665, rtmp1, rtmp2);
// Round 4
md5_II(reg_cache, a, b, c, d, 0, S41, 0xf4292244, rtmp1, rtmp2);
md5_II(reg_cache, d, a, b, c, 7, S42, 0x432aff97, rtmp1, rtmp2);
md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2);
md5_II(reg_cache, b, c, d, a, 5, S44, 0xfc93a039, rtmp1, rtmp2);
md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2);
md5_II(reg_cache, d, a, b, c, 3, S42, 0x8f0ccc92, rtmp1, rtmp2);
md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2);
md5_II(reg_cache, b, c, d, a, 1, S44, 0x85845dd1, rtmp1, rtmp2);
md5_II(reg_cache, a, b, c, d, 8, S41, 0x6fa87e4f, rtmp1, rtmp2);
md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2);
md5_II(reg_cache, c, d, a, b, 6, S43, 0xa3014314, rtmp1, rtmp2);
md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2);
md5_II(reg_cache, a, b, c, d, 4, S41, 0xf7537e82, rtmp1, rtmp2);
md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2);
md5_II(reg_cache, c, d, a, b, 2, S43, 0x2ad7d2bb, rtmp1, rtmp2);
md5_II(reg_cache, b, c, d, a, 9, S44, 0xeb86d391, rtmp1, rtmp2);
// Fold the block result back into the running state.
__ addw(state0, state0, a);
__ addw(state1, state1, b);
__ addw(state2, state2, c);
__ addw(state3, state3, d);
if (multi_block) {
// advance to the next 64-byte block
__ addi(buf, buf, 64);
__ addi(ofs, ofs, 64);
// if (ofs <= limit) goto md5_loop
__ bge(limit, ofs, md5_loop);
__ mv(c_rarg0, ofs); // return ofs
}
// to minimize the number of memory operations:
// write back the 4 state 4-byte values in pairs, with a single sd
// Each pair is packed as (odd word << 32) | (even word & mask32).
__ mv(t0, mask32);
__ andr(state0, state0, t0);
__ slli(state1, state1, 32);
__ orr(state0, state0, state1);
__ sd(state0, Address(state));
__ andr(state2, state2, t0);
__ slli(state3, state3, 32);
__ orr(state2, state2, state3);
__ sd(state2, Address(state, 8));
__ pop_reg(saved_regs, sp);
__ ret();
return (address) start;
}
/**
 * Emit one ChaCha20 quarter round over four vector registers. Each register
 * carries one of the four quarter-round inputs, and all vector lanes are
 * processed at the same time.
 *
 * @param aVec the SIMD register containing only the "a" values
 * @param bVec the SIMD register containing only the "b" values
 * @param cVec the SIMD register containing only the "c" values
 * @param dVec the SIMD register containing only the "d" values
 * @param tmp_vr temporary vector register handed to the rotate helper
 */
void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
                            VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
  // One add/xor/rotate step:
  //   acc += addend, mixed ^= acc, mixed <<<= rot
  auto arx_step = [&](VectorRegister acc, VectorRegister addend,
                      VectorRegister mixed, uint32_t rot) {
    __ vadd_vv(acc, acc, addend);
    __ vxor_vv(mixed, mixed, acc);
    __ vrole32_vi(mixed, rot, tmp_vr);
  };

  arx_step(aVec, bVec, dVec, 16);  // a += b, d ^= a, d <<<= 16
  arx_step(cVec, dVec, bVec, 12);  // c += d, b ^= c, b <<<= 12
  arx_step(aVec, bVec, dVec, 8);   // a += b, d ^= a, d <<<= 8
  arx_step(cVec, dVec, bVec, 7);   // c += d, b ^= c, b <<<= 7
}
/**
* int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
*
* Input arguments:
* c_rarg0 - state, the starting state
* c_rarg1 - key_stream, the array that will hold the result of the ChaCha20 block function
*
* Output:
* c_rarg0 - number of key stream bytes produced (vector length * 64)
*
* Implementation Note:
* Parallelization is achieved by loading individual state elements into vectors for N blocks.
* N depends on single vector register length.
*
* Clobbers t0, t1, t2 and v0-v17; key_stream (c_rarg1) is advanced while storing.
*/
address generate_chacha20Block() {
Label L_Rounds;
__ align(CodeEntryAlignment);
StubGenStubId stub_id = StubGenStubId::chacha20Block_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
__ enter();
// number of 32-bit words in a ChaCha20 state
const int states_len = 16;
// size in bytes of one state word
const int step = 4;
const Register state = c_rarg0;
const Register key_stream = c_rarg1;
const Register tmp_addr = t0;
const Register length = t1;
// Organize vector registers in an array that facilitates
// putting repetitive opcodes into loop structures below.
const VectorRegister work_vrs[16] = {
v0, v1, v2, v3, v4, v5, v6, v7,
v8, v9, v10, v11, v12, v13, v14, v15
};
const VectorRegister tmp_vr = v16;
const VectorRegister counter_vr = v17;
{
// Put 16 here, as com.sun.crypto.provider.ChaCha20Cipher.KS_MAX_LEN is 1024
// in java level.
// 'length' is the destination register of vsetivli, i.e. it receives the
// vector length actually in effect (the number of blocks computed in parallel).
__ vsetivli(length, 16, Assembler::e32, Assembler::m1);
}
// Load from source state.
// Every element in source state is duplicated to all elements in the corresponding vector.
// (a strided load with stride register zr reads the same word for every lane)
__ mv(tmp_addr, state);
for (int i = 0; i < states_len; i += 1) {
__ vlse32_v(work_vrs[i], tmp_addr, zr);
__ addi(tmp_addr, tmp_addr, step);
}
// Adjust counter for every individual block.
// vid gives lane i the value i, so lane i works on block counter + i.
__ vid_v(counter_vr);
__ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
// Perform 10 iterations of the 8 quarter round set
{
const Register loop = t2; // share t2 with other non-overlapping usages.
__ mv(loop, 10);
__ BIND(L_Rounds);
// column rounds
chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8], work_vrs[12], tmp_vr);
chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9], work_vrs[13], tmp_vr);
chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr);
chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr);
// diagonal rounds
chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr);
chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr);
chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8], work_vrs[13], tmp_vr);
chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9], work_vrs[14], tmp_vr);
__ subi(loop, loop, 1);
__ bnez(loop, L_Rounds);
}
// Add the original state into the end working state.
// We do this by first duplicating every element in source state array to the corresponding
// vector, then adding it to the post-loop working state.
__ mv(tmp_addr, state);
for (int i = 0; i < states_len; i += 1) {
__ vlse32_v(tmp_vr, tmp_addr, zr);
__ addi(tmp_addr, tmp_addr, step);
__ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr);
}
// Add the counter overlay onto work_vrs[12] at the end.
// (the reloaded state above carries the base counter only, not the per-lane offset)
__ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
// Store result to key stream.
{
const Register stride = t2; // share t2 with other non-overlapping usages.
// Every block occupies 64 bytes, so we use 64 as stride of the vector store.
// Word i of lane j therefore lands at key_stream + 64 * j + 4 * i.
__ mv(stride, 64);
for (int i = 0; i < states_len; i += 1) {
__ vsse32_v(work_vrs[i], key_stream, stride);
__ addi(key_stream, key_stream, step);
}
}
// Return length of output key_stream
// length blocks of 64 bytes each: length << 6
__ slli(c_rarg0, length, 6);
__ leave();
__ ret();
return (address) start;
}
// ------------------------ SHA-1 intrinsic ------------------------

// Materialize the SHA-1 round constant K't into cur_k.
//
// K't =
//    5a827999, 0  <= t <= 19
//    6ed9eba1, 20 <= t <= 39
//    8f1bbcdc, 40 <= t <= 59
//    ca62c1d6, 60 <= t <= 79
//
// The constant only changes every 20 rounds, so code is emitted solely for
// rounds 0, 20, 40 and 60; in between cur_k is expected to stay untouched.
void sha1_prepare_k(Register cur_k, int round) {
  assert(round >= 0 && round < 80, "must be");

  static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};

  if ((round % 20) != 0) {
    // Still inside the same group of 20 rounds: nothing to emit.
    return;
  }
  const int group = round / 20;
  __ mv(cur_k, ks[group]);
}
// Emit code that leaves the message schedule word W't for 'round' in cur_w.
//
// W't =
// M't, 0 <= t <= 15
// ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
//
// cur_w - receives W't (for odd rounds below 16 it is a copy of the whole
//         64-bit register, of which the low 32 bits are W't)
// ws    - the 9 registers holding the sliding window of the last 16 W values,
//         two per register; NOTE: the array itself is permuted at code
//         generation time on every odd round >= 16, so the caller's array
//         changes
// buf   - base address of the current 64-byte message block
// round - round number t, 0 <= t < 80
//
// Clobbers t0 and t1 for rounds >= 16.
void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) {
assert(round >= 0 && round < 80, "must be");
if (round < 16) {
// in the first 16 rounds, in ws[], every register contains 2 W't, e.g.
// in ws[0], high part contains W't-0, low part contains W't-1,
// in ws[1], high part contains W't-2, low part contains W't-3,
// ...
// in ws[7], high part contains W't-14, low part contains W't-15.
if ((round % 2) == 0) {
// even round: load two message words at once and hand out the first
__ ld(ws[round/2], Address(buf, (round/2) * 8));
// reverse bytes, as SHA-1 is defined in big-endian.
__ revb(ws[round/2], ws[round/2]);
__ srli(cur_w, ws[round/2], 32);
} else {
// odd round: the word was loaded by the preceding even round and sits
// in the low half of the same register
__ mv(cur_w, ws[round/2]);
}
return;
}
if ((round % 2) == 0) {
// Even round >= 16: the window is aligned, i.e. W't-16 is in the high
// half of ws[0]; idx is the window-relative position of the new word.
int idx = 16;
// W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
__ srli(t1, ws[(idx-8)/2], 32);
__ xorr(t0, ws[(idx-3)/2], t1);
__ srli(t1, ws[(idx-14)/2], 32);
__ srli(cur_w, ws[(idx-16)/2], 32);
__ xorr(cur_w, cur_w, t1);
__ xorr(cur_w, cur_w, t0);
__ rolw(cur_w, cur_w, 1, t0);
// copy the cur_w value to ws[8].
// now, valid w't values are at:
// w0: ws[0]'s lower 32 bits
// w1 ~ w14: ws[1] ~ ws[7]
// w15: ws[8]'s higher 32 bits
__ slli(ws[idx/2], cur_w, 32);
return;
}
// Odd round >= 16: the window is shifted by one word (see the layout
// described at the end of the even case above).
int idx = 17;
// W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
__ srli(t1, ws[(idx-3)/2], 32);
__ xorr(t0, t1, ws[(idx-8)/2]);
__ xorr(cur_w, ws[(idx-16)/2], ws[(idx-14)/2]);
__ xorr(cur_w, cur_w, t0);
__ rolw(cur_w, cur_w, 1, t0);
// copy the cur_w value to ws[8]
// (into the low half; the high half was filled by the preceding even round)
__ zext(cur_w, cur_w, 32);
__ orr(ws[idx/2], ws[idx/2], cur_w);
// shift the w't registers, so they start from ws[0] again.
// now, valid w't values are at:
// w0 ~ w15: ws[0] ~ ws[7]
// This only renames registers in the generator; no instruction is emitted.
Register ws_0 = ws[0];
for (int i = 0; i < 16/2; i++) {
ws[i] = ws[i+1];
}
ws[8] = ws_0;
}
// Emit the SHA-1 round function for 'round' into dst.
//
// f't(x, y, z) =
//    Ch(x, y, z)     = (x & y) ^ (~x & z)            , 0  <= t <= 19
//    Parity(x, y, z) = x ^ y ^ z                     , 20 <= t <= 39
//    Maj(x, y, z)    = (x & y) ^ (x & z) ^ (y & z)   , 40 <= t <= 59
//    Parity(x, y, z) = x ^ y ^ z                     , 60 <= t <= 79
//
// t0 is used as scratch by Ch and Maj, t1 additionally by Maj.
void sha1_f(Register dst, Register x, Register y, Register z, int round) {
  assert(round >= 0 && round < 80, "must be");
  assert_different_registers(dst, x, y, z, t0, t1);

  const bool use_ch  = round < 20;
  const bool use_maj = !use_ch && round >= 40 && round < 60;

  if (!use_ch && !use_maj) {
    // Parity: x ^ y ^ z
    __ xorr(dst, x, y);
    __ xorr(dst, dst, z);
    return;
  }

  if (use_maj) {
    // Maj: (x & y) ^ (x & z) ^ (y & z)
    __ andr(t0, x, y);
    __ andr(t1, x, z);
    __ andr(dst, y, z);
    __ xorr(dst, dst, t0);
    __ xorr(dst, dst, t1);
    return;
  }

  // Ch: (x & y) ^ (~x & z)
  __ andr(t0, x, y);
  __ andn(dst, z, x);
  __ xorr(dst, dst, t0);
}
// Emit one SHA-1 round:
//
// T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
// e = d
// d = c
// c = ROTL'30(b)
// b = a
// a = T
//
// a..e  - working variables, updated in place
// cur_k - K't for this round (read only)
// cur_w - W't for this round; destroyed, as it doubles as a temporary
// tmp   - scratch register receiving f't(b, c, d)
// round - round number t, selects f't
//
// t0 is used as scratch for the rotate by 5, and sha1_f may clobber t0/t1.
// The sums use full-width add; presumably only the low 32 bits are
// meaningful, matching the "ignore the higher 32-bits" convention noted in
// generate_sha1_implCompress.
void sha1_process_round(Register a, Register b, Register c, Register d, Register e,
Register cur_k, Register cur_w, Register tmp, int round) {
assert(round >= 0 && round < 80, "must be");
assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0);
// T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
// cur_w will be recalculated at the beginning of each round,
// so, we can reuse it as a temp register here.
Register tmp2 = cur_w;
// reuse e as a temporary register, as we will mv new value into it later
Register tmp3 = e;
// tmp2 = K't + W't
__ add(tmp2, cur_k, tmp2);
// tmp3 = e + K't + W't
__ add(tmp3, tmp3, tmp2);
// tmp2 = ROTL'5(a)
__ rolw(tmp2, a, 5, t0);
// tmp = f't(b, c, d)
sha1_f(tmp, b, c, d, round);
// tmp2 = T
__ add(tmp2, tmp2, tmp);
__ add(tmp2, tmp2, tmp3);
// e = d
// d = c
// c = ROTL'30(b)
// b = a
// a = T
__ mv(e, d);
__ mv(d, c);
__ rolw(c, b, 30);
__ mv(b, a);
__ mv(a, tmp2);
}
// Emit the intermediate hash update at the end of a block:
//
// H(i)0 = a + H(i-1)0
// H(i)1 = b + H(i-1)1
// H(i)2 = c + H(i-1)2
// H(i)3 = d + H(i-1)3
// H(i)4 = e + H(i-1)4
//
// prev_ab packs H(i-1)0 in its low and H(i-1)1 in its high 32 bits, prev_cd
// likewise packs H(i-1)2 / H(i-1)3 (the layout produced by
// sha1_preserve_prev_abcde); prev_e holds H(i-1)4.
// prev_ab and prev_cd are destroyed (shifted right by 32 in place).
void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e,
Register prev_ab, Register prev_cd, Register prev_e) {
assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e);
// low half of prev_ab goes into a; the high half is added too but only
// affects a's upper 32 bits
__ add(a, a, prev_ab);
// bring the high half down for b
__ srli(prev_ab, prev_ab, 32);
__ add(b, b, prev_ab);
// same scheme for c/d
__ add(c, c, prev_cd);
__ srli(prev_cd, prev_cd, 32);
__ add(d, d, prev_cd);
__ add(e, e, prev_e);
}
// Emit code that snapshots the working variables a..e before a block is
// processed, so they can be added back afterwards:
//   prev_ab = (b << 32) | zero-extended low 32 bits of a
//   prev_cd = (d << 32) | zero-extended low 32 bits of c
//   prev_e  = e
// a..e themselves are left unchanged. Clobbers t0.
void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e,
Register prev_ab, Register prev_cd, Register prev_e) {
assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0);
// pack a (low) and b (high)
__ slli(t0, b, 32);
__ zext(prev_ab, a, 32);
__ orr(prev_ab, prev_ab, t0);
// pack c (low) and d (high)
__ slli(t0, d, 32);
__ zext(prev_cd, c, 32);
__ orr(prev_cd, prev_cd, t0);
__ mv(prev_e, e);
}
// Generates the SHA-1 compression stub, either the single-block variant
// (sha1_implCompress_id) or the multi-block variant (sha1_implCompressMB_id).
// Returns the entry address of the generated code.
//
// Intrinsic for:
// void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs)
// void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit)
//
// Arguments:
//
// Inputs:
// c_rarg0: byte[] src array + offset
// c_rarg1: int[] SHA.state
// - - - - - - below are only for implCompressMultiBlock0 - - - - - -
// c_rarg2: int offset
// c_rarg3: int limit
//
// Outputs:
// - - - - - - below are only for implCompressMultiBlock0 - - - - - -
// c_rarg0: int offset, when (multi_block == true)
//
address generate_sha1_implCompress(StubGenStubId stub_id) {
// The stub id selects between the single-block and multi-block flavour.
bool multi_block;
switch (stub_id) {
case sha1_implCompress_id:
multi_block = false;
break;
case sha1_implCompressMB_id:
multi_block = true;
break;
default:
ShouldNotReachHere();
};
__ align(CodeEntryAlignment);
StubCodeMark mark(this, stub_id);
address start = __ pc();
__ enter();
// x18-x27 are used below (ws[], cur_k, cur_w, prev_ab, prev_cd) and are
// spilled here and restored before returning.
RegSet saved_regs = RegSet::range(x18, x27);
if (multi_block) {
// use x9 as src below.
saved_regs += RegSet::of(x9);
}
__ push_reg(saved_regs, sp);
// c_rarg0 - c_rarg3: x10 - x13
Register buf = c_rarg0;
Register state = c_rarg1;
Register offset = c_rarg2;
Register limit = c_rarg3;
// use src to contain the original start point of the array.
Register src = x9;
if (multi_block) {
// Turn limit into an address comparable with buf (buf + limit - offset)
// and remember the array start (buf - offset) so that the final offset
// can be recomputed as buf - src. After this, 'offset' is free for reuse.
__ sub(limit, limit, offset);
__ add(limit, limit, buf);
__ sub(src, buf, offset);
}
// [args-reg]: x14 - x17
// [temp-reg]: x28 - x31
// [saved-reg]: x18 - x27
// h0/1/2/3/4
const Register a = x14, b = x15, c = x16, d = x17, e = x28;
// w0, w1, ... w15
// put two adjacent w's in one register:
// one at high word part, another at low word part
// at different round (even or odd), w't value reside in different items in ws[].
// w0 ~ w15, either reside in
// ws[0] ~ ws[7], where
// w0 at higher 32 bits of ws[0],
// w1 at lower 32 bits of ws[0],
// ...
// w14 at higher 32 bits of ws[7],
// w15 at lower 32 bits of ws[7].
// or, reside in
// w0: ws[0]'s lower 32 bits
// w1 ~ w14: ws[1] ~ ws[7]
// w15: ws[8]'s higher 32 bits
Register ws[9] = {x29, x30, x31, x18,
x19, x20, x21, x22,
x23}; // auxiliary register for calculating w's value
// current k't's value
const Register cur_k = x24;
// current w't's value
const Register cur_w = x25;
// values of a, b, c, d, e at the start of the current block
// (i.e. the previous intermediate hash)
const Register prev_ab = x26, prev_cd = x27;
const Register prev_e = offset; // reuse offset/c_rarg2
// load 5 words state into a, b, c, d, e.
//
// To minimize the number of memory operations, we apply following
// optimization: read the states (a/b/c/d) of 4-byte values in pairs,
// with a single ld, and split them into 2 registers.
//
// And, as the core algorithm of SHA-1 works on 32-bits words, so
// in the following code, it does not care about the content of
// higher 32-bits in a/b/c/d/e. Based on this observation,
// we can apply further optimization, which is to just ignore the
// higher 32-bits in a/c/e, rather than set the higher
// 32-bits of a/c/e to zero explicitly with extra instructions.
__ ld(a, Address(state, 0));
__ srli(b, a, 32);
__ ld(c, Address(state, 8));
__ srli(d, c, 32);
__ lw(e, Address(state, 16));
// Per-block loop head; only bound (and branched to) in the multi_block variant.
Label L_sha1_loop;
if (multi_block) {
__ BIND(L_sha1_loop);
}
sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e);
// The 80 rounds are fully unrolled at code generation time.
for (int round = 0; round < 80; round++) {
// prepare K't value
sha1_prepare_k(cur_k, round);
// prepare W't value
sha1_prepare_w(cur_w, ws, buf, round);
// one round process
sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round);
}
// compute the intermediate hash value
sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e);
if (multi_block) {
int64_t block_bytes = 16 * 4;
__ addi(buf, buf, block_bytes);
// loop while buf <= limit
// NOTE(review): the trailing 'true' presumably requests a far branch,
// needed because the unrolled loop body is large - confirm.
__ bge(limit, buf, L_sha1_loop, true);
}
// store back the state.
// a/b and c/d are re-packed into one 64-bit value each (odd word in the
// high half), e is stored as a single 32-bit word.
__ zext(a, a, 32);
__ slli(b, b, 32);
__ orr(a, a, b);
__ sd(a, Address(state, 0));
__ zext(c, c, 32);
__ slli(d, d, 32);
__ orr(c, c, d);
__ sd(c, Address(state, 8));
__ sw(e, Address(state, 16));
// return offset
if (multi_block) {
__ sub(c_rarg0, buf, src);
}
__ pop_reg(saved_regs, sp);
__ leave();
__ ret();
return (address) start;
}
/**
* Emit one vectorized Base64 encode round: load 3 * vl source bytes, map them
* to 4 * vl table indexes, look the indexes up in the codec table and store
* the 4 * vl encoded bytes. src and dst are advanced by stepSrc and stepDst.
*
* src      - source address, advanced by stepSrc
* dst      - destination address, advanced by stepDst
* codec    - base address of the 64-entry encoding table
* size     - requested vector length (number of 8-bit elements)
* stepSrc  - number of source bytes consumed per round
* stepDst  - number of destination bytes produced per round
* lmul     - register group multiplier used for this round
*
* vector registers:
* input VectorRegister's: inputV1-V3, for m2 they could be v2, v4, v6, for m1 they could be v1, v2, v3
* index VectorRegister's: idxV1-V4, for m2 they could be v8, v10, v12, v14, for m1 they could be v4, v5, v6, v7
* output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v8, v9, v10, v11
*
* NOTE: each field will occupy a vector register group
* NOTE(review): only inputV1 / outputV1 are passed to the segment load/store,
* so inputV2-V3 and outputV2-V4 presumably must be the register groups that
* directly follow them - confirm against the segment instruction semantics.
*/
void base64_vector_encode_round(Register src, Register dst, Register codec,
Register size, Register stepSrc, Register stepDst,
VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3,
VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, VectorRegister outputV4,
Assembler::LMUL lmul) {
// set vector register type/len
__ vsetvli(x0, size, Assembler::e8, lmul);
// segmented load src into v registers: mem(src) => vr(3)
__ vlseg3e8_v(inputV1, src);
// src = src + register_group_len_bytes * 3
__ add(src, src, stepSrc);
// encoding
// 1. compute index into lookup table: vr(3) => vr(4)
// All shifts operate on 8-bit elements, so bits shifted out are dropped.
// idx1 = byte0[7:2]
__ vsrl_vi(idxV1, inputV1, 2);
// idx2 = byte0[1:0] : byte1[7:4]
__ vsrl_vi(idxV2, inputV2, 2);
__ vsll_vi(inputV1, inputV1, 6);
__ vor_vv(idxV2, idxV2, inputV1);
__ vsrl_vi(idxV2, idxV2, 2);
// idx3 = byte1[3:0] : byte2[7:6]
__ vsrl_vi(idxV3, inputV3, 4);
__ vsll_vi(inputV2, inputV2, 4);
__ vor_vv(idxV3, inputV2, idxV3);
__ vsrl_vi(idxV3, idxV3, 2);
// idx4 = byte2[5:0]
__ vsll_vi(idxV4, inputV3, 2);
__ vsrl_vi(idxV4, idxV4, 2);
// 2. indexed load: vr(4) => vr(4)
__ vluxei8_v(outputV1, codec, idxV1);
__ vluxei8_v(outputV2, codec, idxV2);
__ vluxei8_v(outputV3, codec, idxV3);
__ vluxei8_v(outputV4, codec, idxV4);
// segmented store encoded data in v registers back to dst: vr(4) => mem(dst)
__ vsseg4e8_v(outputV1, dst);
// dst = dst + register_group_len_bytes * 4
__ add(dst, dst, stepDst);
}
/**
* void j.u.Base64.Encoder.encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
*
* Generates the Base64 encode stub and returns its entry address.
*
* Input arguments:
* c_rarg0 - src, source array
* c_rarg1 - sp, src start offset
* c_rarg2 - sl, src end offset
* c_rarg3 - dst, dest array
* c_rarg4 - dp, dst start offset
* c_rarg5 - isURL, Base64 or URL character set
*
* The generated code expects (sl - sp) to be a multiple of 3; the scalar
* loop only terminates when the remaining length reaches exactly zero.
*/
address generate_base64_encodeBlock() {
// standard Base64 alphabet
alignas(64) static const char toBase64[64] = {
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
// URL-safe alphabet: differs only in the last two entries ('-' and '_')
alignas(64) static const char toBase64URL[64] = {
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
};
__ align(CodeEntryAlignment);
StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
__ enter();
Register src = c_rarg0;
Register soff = c_rarg1;
Register send = c_rarg2;
Register dst = c_rarg3;
Register doff = c_rarg4;
Register isURL = c_rarg5;
Register codec = c_rarg6;
Register length = c_rarg7; // total length of src data in bytes
Label ProcessData, Exit;
// length should be multiple of 3
__ sub(length, send, soff);
// real src/dst to process data
__ add(src, src, soff);
__ add(dst, dst, doff);
// load the codec base address
// (start with the standard table, switch to the URL table if isURL != 0)
__ la(codec, ExternalAddress((address) toBase64));
__ beqz(isURL, ProcessData);
__ la(codec, ExternalAddress((address) toBase64URL));
__ BIND(ProcessData);
// From here on soff, send, doff and isURL have been consumed and their
// registers are reused under other names.
// vector version
if (UseRVV) {
Label ProcessM2, ProcessM1, ProcessScalar;
Register size = soff;
Register stepSrcM1 = send;
Register stepSrcM2 = doff;
Register stepDst = isURL;
// LMUL=2 round: MaxVectorSize * 2 elements, i.e. MaxVectorSize * 6 source
// bytes in and MaxVectorSize * 8 encoded bytes out.
__ mv(size, MaxVectorSize * 2);
__ mv(stepSrcM1, MaxVectorSize * 3);
__ slli(stepSrcM2, stepSrcM1, 1);
__ mv(stepDst, MaxVectorSize * 2 * 4);
// not enough data for a full LMUL=2 round: try a single LMUL=1 round
__ blt(length, stepSrcM2, ProcessM1);
__ BIND(ProcessM2);
base64_vector_encode_round(src, dst, codec,
size, stepSrcM2, stepDst,
v2, v4, v6, // inputs
v8, v10, v12, v14, // indexes
v16, v18, v20, v22, // outputs
Assembler::m2);
__ sub(length, length, stepSrcM2);
__ bge(length, stepSrcM2, ProcessM2);
__ BIND(ProcessM1);
__ blt(length, stepSrcM1, ProcessScalar);
// At most one LMUL=1 round is executed (there is no branch back to
// ProcessM1); it works on half the element count and output step.
__ srli(size, size, 1);
__ srli(stepDst, stepDst, 1);
base64_vector_encode_round(src, dst, codec,
size, stepSrcM1, stepDst,
v1, v2, v3, // inputs
v4, v5, v6, v7, // indexes
v8, v9, v10, v11, // outputs
Assembler::m1);
__ sub(length, length, stepSrcM1);
__ BIND(ProcessScalar);
}
// scalar version
// Handles whatever the vector code left over (or everything if !UseRVV),
// 3 source bytes per iteration.
{
Register byte1 = soff, byte0 = send, byte2 = doff;
Register combined24Bits = isURL;
__ beqz(length, Exit);
Label ScalarLoop;
__ BIND(ScalarLoop);
{
// plain: [byte0[7:0] : byte1[7:0] : byte2[7:0]] =>
// encoded: [byte0[7:2] : byte0[1:0]+byte1[7:4] : byte1[3:0]+byte2[7:6] : byte2[5:0]]
// load 3 bytes src data
__ lbu(byte0, Address(src, 0));
__ lbu(byte1, Address(src, 1));
__ lbu(byte2, Address(src, 2));
__ addi(src, src, 3);
// construct 24 bits from 3 bytes
__ slliw(byte0, byte0, 16);
__ slliw(byte1, byte1, 8);
__ orr(combined24Bits, byte0, byte1);
__ orr(combined24Bits, combined24Bits, byte2);
// get codec index and encode(ie. load from codec by index)
// Each 6-bit field is isolated by shifting it to the top of the 32-bit
// word and then logically shifting it down by 26.
// bits [23:18]
__ slliw(byte0, combined24Bits, 8);
__ srliw(byte0, byte0, 26);
__ add(byte0, codec, byte0);
__ lbu(byte0, byte0);
// bits [17:12]
__ slliw(byte1, combined24Bits, 14);
__ srliw(byte1, byte1, 26);
__ add(byte1, codec, byte1);
__ lbu(byte1, byte1);
// bits [11:6]
__ slliw(byte2, combined24Bits, 20);
__ srliw(byte2, byte2, 26);
__ add(byte2, codec, byte2);
__ lbu(byte2, byte2);
// bits [5:0]
__ andi(combined24Bits, combined24Bits, 0x3f);
__ add(combined24Bits, codec, combined24Bits);
__ lbu(combined24Bits, combined24Bits);
// store 4 bytes encoded data
__ sb(byte0, Address(dst, 0));
__ sb(byte1, Address(dst, 1));
__ sb(byte2, Address(dst, 2));
__ sb(combined24Bits, Address(dst, 3));
__ subi(length, length, 3);
__ addi(dst, dst, 4);
// loop back
__ bnez(length, ScalarLoop);
}
}
__ BIND(Exit);
__ leave();
__ ret();
return (address) start;
}
/**
 * Emits one round of RVV Base64 decoding: loads four interleaved streams of
 * encoded characters from src, translates them through the codec table, checks
 * the translated values for the invalid marker and stores three interleaved
 * streams of decoded bytes to dst.
 *
 * general registers:
 * src - source pointer, always advanced by stepSrc
 * dst - destination pointer, advanced by stepDst unless element 0 is invalid
 * codec - base address of the decode table used by the indexed loads
 * size - requested vector length (in elements) for this round
 * stepSrc - number of bytes added to src
 * stepDst - number of bytes added to dst; overwritten with failedIdx * 3 when an
 * invalid element is found at an index > 0
 * failedIdx - output of vfirst.m: index of the first invalid element, negative when none
 *
 * vector registers:
 * input VectorRegister's: inputV1-V4, for m2 they could be v2, v4, v6, v8, for m1 they could be v1, v2, v3, v4
 * index VectorRegister's: idxV1-V4, for m2 they could be v10, v12, v14, v16, for m1 they could be v5, v6, v7, v8
 * output VectorRegister's: outputV1-V3, for m2 they could be v18, v20, v22, for m1 they could be v9, v10, v11
 * v0 is clobbered: it receives the mask produced by the invalid-data check
 *
 * NOTE: each field will occupy a single vector register group
 */
void base64_vector_decode_round(Register src, Register dst, Register codec,
Register size, Register stepSrc, Register stepDst, Register failedIdx,
VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4,
VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3,
Assembler::LMUL lmul) {
// set vector register type/len
__ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta);
// segmented load src into v registers: mem(src) => vr(4)
// (inputV1 names the first of the four register groups filled by the segment load)
__ vlseg4e8_v(inputV1, src);
// src = src + stepSrc (the caller passes register_group_len_bytes * 4)
__ add(src, src, stepSrc);
// decoding
// 1. indexed load: vr(4) => vr(4)
// each encoded character is used as a byte offset into the codec table
__ vluxei8_v(idxV1, codec, inputV1);
__ vluxei8_v(idxV2, codec, inputV2);
__ vluxei8_v(idxV3, codec, inputV3);
__ vluxei8_v(idxV4, codec, inputV4);
// 2. check wrong data
// OR the four translated values of every element together and compare the
// result with -1 (0xff as an e8 element).
// NOTE(review): this relies on the codec tables using 255 for invalid characters
// and values <= 63 for valid ones, so the OR is 0xff only if a lookup failed.
__ vor_vv(outputV1, idxV1, idxV2);
__ vor_vv(outputV2, idxV3, idxV4);
__ vor_vv(outputV1, outputV1, outputV2);
__ vmseq_vi(v0, outputV1, -1);
__ vfirst_m(failedIdx, v0);
Label NoFailure, FailureAtIdx0;
// valid value can only be -1 when < 0
__ bltz(failedIdx, NoFailure);
// when the first data (at index 0) fails, no need to process data anymore
__ beqz(failedIdx, FailureAtIdx0);
// failure at index > 0: shrink vl to the number of valid leading elements and
// only advance dst by the bytes they decode to: stepDst = failedIdx * 3
__ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu);
__ slli(stepDst, failedIdx, 1);
__ add(stepDst, failedIdx, stepDst);
__ BIND(NoFailure);
// 3. compute the decoded data: vr(4) => vr(3)
// out1 = (idx1 << 2) | (idx2 >> 4)
__ vsll_vi(idxV1, idxV1, 2);
__ vsrl_vi(outputV1, idxV2, 4);
__ vor_vv(outputV1, outputV1, idxV1);
// out2 = (idx2 << 4) | (idx3 >> 2)
__ vsll_vi(idxV2, idxV2, 4);
__ vsrl_vi(outputV2, idxV3, 2);
__ vor_vv(outputV2, outputV2, idxV2);
// out3 = (idx3 << 6) | idx4
__ vsll_vi(idxV3, idxV3, 6);
__ vor_vv(outputV3, idxV4, idxV3);
// segmented store decoded data in v registers back to dst: vr(3) => mem(dst)
__ vsseg3e8_v(outputV1, dst);
// dst = dst + stepDst (register_group_len_bytes * 3, or failedIdx * 3 after a failure)
__ add(dst, dst, stepDst);
__ BIND(FailureAtIdx0);
}
/**
 * int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
 *
 * Generates the Base64 decodeBlock stub and returns its entry address.
 *
 * Input arguments:
 * c_rarg0 - src, source array
 * c_rarg1 - sp, src start offset
 * c_rarg2 - sl, src end offset
 * c_rarg3 - dst, dest array
 * c_rarg4 - dp, dst start offset
 * c_rarg5 - isURL, Base64 or URL character set
 * c_rarg6 - isMIME, Decoding MIME block
 *
 * Output:
 * c_rarg0 - number of bytes written to dst (final dst pointer minus the initial one)
 */
address generate_base64_decodeBlock() {
// Decode tables, indexed by the encoded character value. Entries are the 6-bit
// value of the character, or 255 for characters outside the alphabet.
// Standard alphabet: '+' (43) => 62, '/' (47) => 63.
static const uint8_t fromBase64[256] = {
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
};
// URL-safe alphabet: '-' (45) => 62, '_' (95) => 63.
static const uint8_t fromBase64URL[256] = {
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
};
__ align(CodeEntryAlignment);
StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
__ enter();
// Register aliases. The argument registers soff/send/doff/isURL are reused as
// scratch registers once their original values have been consumed below.
Register src = c_rarg0;
Register soff = c_rarg1;
Register send = c_rarg2;
Register dst = c_rarg3;
Register doff = c_rarg4;
Register isURL = c_rarg5;
Register isMIME = c_rarg6;
Register codec = c_rarg7;
Register dstBackup = t6;
Register length = t3; // total length of src data in bytes
Label ProcessData, Exit;
Label ProcessScalar, ScalarLoop;
// passed in length (send - soff) is guaranteed to be > 4,
// and in this intrinsic we only process data of length in multiple of 4,
// it's not guaranteed to be multiple of 4 by java level, so do it explicitly
__ sub(length, send, soff);
__ andi(length, length, -4);
// real src/dst to process data
__ add(src, src, soff);
__ add(dst, dst, doff);
// backup of dst, used to calculate the return value at exit
__ mv(dstBackup, dst);
// load the codec base address
__ la(codec, ExternalAddress((address) fromBase64));
__ beqz(isURL, ProcessData);
__ la(codec, ExternalAddress((address) fromBase64URL));
__ BIND(ProcessData);
// vector version
if (UseRVV) {
// for MIME case, it has a default length limit of 76 which could be
// different(smaller) from (send - soff), so in MIME case, we go through
// the scalar code path directly.
// NOTE(review): this jump bypasses the `length == 0` check at ProcessScalar, so it
// relies on the length guarantee stated above.
__ bnez(isMIME, ScalarLoop);
Label ProcessM1, ProcessM2;
Register failedIdx = soff;
Register stepSrcM1 = send;
Register stepSrcM2 = doff;
Register stepDst = isURL;
Register size = t4;
// m2 round: vl = MaxVectorSize * 2 elements, consuming 4 * vl source bytes
// and producing 3 * vl destination bytes
__ mv(size, MaxVectorSize * 2);
__ mv(stepSrcM1, MaxVectorSize * 4);
__ slli(stepSrcM2, stepSrcM1, 1);
__ mv(stepDst, MaxVectorSize * 2 * 3);
__ blt(length, stepSrcM2, ProcessM1);
// Assembler::m2
__ BIND(ProcessM2);
base64_vector_decode_round(src, dst, codec,
size, stepSrcM2, stepDst, failedIdx,
v2, v4, v6, v8, // inputs
v10, v12, v14, v16, // indexes
v18, v20, v22, // outputs
Assembler::m2);
__ sub(length, length, stepSrcM2);
// error check
// valid value of failedIdx can only be -1 when < 0
__ bgez(failedIdx, Exit);
__ bge(length, stepSrcM2, ProcessM2);
// Assembler::m1
__ BIND(ProcessM1);
__ blt(length, stepSrcM1, ProcessScalar);
// at most one m1 round: halve the element count and the destination step
__ srli(size, size, 1);
__ srli(stepDst, stepDst, 1);
base64_vector_decode_round(src, dst, codec,
size, stepSrcM1, stepDst, failedIdx,
v1, v2, v3, v4, // inputs
v5, v6, v7, v8, // indexes
v9, v10, v11, // outputs
Assembler::m1);
__ sub(length, length, stepSrcM1);
// error check
// valid value of failedIdx can only be -1 when < 0
__ bgez(failedIdx, Exit);
__ BIND(ProcessScalar);
__ beqz(length, Exit);
}
// scalar version
{
Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL;
Register combined32Bits = t4;
// encoded: [byte0[5:0] : byte1[5:0] : byte2[5:0] : byte3[5:0]] =>
// plain: [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]]
__ BIND(ScalarLoop);
// load 4 bytes encoded src data
__ lbu(byte0, Address(src, 0));
__ lbu(byte1, Address(src, 1));
__ lbu(byte2, Address(src, 2));
__ lbu(byte3, Address(src, 3));
__ addi(src, src, 4);
// get codec index and decode (ie. load from codec by index)
// the table entries are loaded with a sign-extending lb, so the invalid marker
// 255 becomes -1 and makes the combined value negative
__ add(byte0, codec, byte0);
__ add(byte1, codec, byte1);
__ lb(byte0, Address(byte0, 0));
__ lb(byte1, Address(byte1, 0));
__ add(byte2, codec, byte2);
__ add(byte3, codec, byte3);
__ lb(byte2, Address(byte2, 0));
__ lb(byte3, Address(byte3, 0));
// combined = byte0 << 18 | byte1 << 12 | byte2 << 6 | byte3
__ slliw(byte0, byte0, 18);
__ slliw(byte1, byte1, 12);
__ orr(byte0, byte0, byte1);
__ orr(byte0, byte0, byte3);
__ slliw(byte2, byte2, 6);
// For performance consideration, `combined32Bits` is constructed for 2 purposes at the same time,
// 1. error check below
// 2. decode below
__ orr(combined32Bits, byte0, byte2);
// error check
__ bltz(combined32Bits, Exit);
// store 3 bytes decoded data
// (sb stores the low 8 bits: bits [23:16], [15:8] and [7:0] of the combined value)
__ sraiw(byte0, combined32Bits, 16);
__ sraiw(byte1, combined32Bits, 8);
__ sb(byte0, Address(dst, 0));
__ sb(byte1, Address(dst, 1));
__ sb(combined32Bits, Address(dst, 2));
__ subi(length, length, 4);
__ addi(dst, dst, 3);
// loop back
__ bnez(length, ScalarLoop);
}
__ BIND(Exit);
// return value: number of bytes written to dst
__ sub(c_rarg0, dst, dstBackup);
__ leave();
__ ret();
return (address) start;
}
// Emits code that folds `step` bytes starting at buff into the running Adler32
// sums s1 and s2 using RVV instructions.
//
// buff - data pointer, advanced by step
// s1, s2 - running sums, updated in place (no modulo reduction is done here)
// vtable - weights { step, step - 1, ..., 1 } as e8 elements
// vzero - supplies the initial value of both reductions (the caller fills it with zeros)
// vbytes, vs1acc, vs2acc, vtemp1 - vector scratch register groups
// temp0, temp1 - clobbered
// temp3 - clobbered; holds `step` when the emitted code finishes
// temp2, vtemp2 - not used by this function
// step, lmul - bytes per call and the matching LMUL (checked by the assert below)
void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
Register temp0, Register temp1, Register temp2, Register temp3,
VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) {
assert((lmul == Assembler::m4 && step == 64) ||
(lmul == Assembler::m2 && step == 32) ||
(lmul == Assembler::m1 && step == 16),
"LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16");
// Below is function for calculating Adler32 checksum with 64-, 32- or 16-byte step. LMUL=m4, m2 or m1 is used.
// Intermediate results are kept in the vs1acc and vs2acc register groups
// (v12.. and v16..v23 at the call sites in generate_updateBytesAdler32).
// Example below is for 64-byte step case.
// We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration.
// In non-vectorized code, we update s1 and s2 as:
// s1 <- s1 + b1
// s2 <- s2 + s1
// s1 <- s1 + b2
// s2 <- s2 + s1
// ...
// s1 <- s1 + b64
// s2 <- s2 + s1
// Putting above assignments together, we have:
// s1_new = s1 + b1 + b2 + ... + b64
// s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) =
// = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) =
// = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1)
__ mv(temp3, step);
// Load data
__ vsetvli(temp0, temp3, Assembler::e8, lmul);
__ vle8_v(vbytes, buff);
__ addi(buff, buff, step);
// Upper bound reduction sum for s1_new:
// 0xFF * 64 = 0x3FC0, so:
// 1. Need to do vector-widening reduction sum
// 2. It is safe to perform sign-extension during vmv.x.s with 16-bits elements
__ vwredsumu_vs(vs1acc, vbytes, vzero);
// Multiplication for s2_new
__ vwmulu_vv(vs2acc, vtable, vbytes);
// s2 = s2 + s1 * step (step is a power of two, so shift left by log2(step));
// this must use the old s1, which is only updated further below
__ slli(temp1, s1, exact_log2(step));
__ add(s2, s2, temp1);
// Summing up calculated results for s2_new
if (MaxVectorSize > 16) {
__ vsetvli(temp0, temp3, Assembler::e16, lmul);
} else {
// Half of vector-widening multiplication result is in successor of vs2acc
// group for vlen == 16, in which case we need to double vector register
// group width in order to reduction sum all of them
Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 :
(lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8;
__ vsetvli(temp0, temp3, Assembler::e16, lmulx2);
}
// Upper bound for reduction sum:
// 0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for whole register group, so:
// 1. Need to do vector-widening reduction sum
// 2. It is safe to perform sign-extension during vmv.x.s with 32-bits elements
__ vwredsumu_vs(vtemp1, vs2acc, vzero);
// Extracting results for:
// s1_new
// (SEW is still e16 here, matching the 16-bit result of the first widening reduction)
__ vmv_x_s(temp0, vs1acc);
__ add(s1, s1, temp0);
// s2_new
// (switch to e32 to read the 32-bit result of the second widening reduction)
__ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1);
__ vmv_x_s(temp1, vtemp1);
__ add(s2, s2, temp1);
}
/**
 * int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len)
 *
 * Generates the RVV Adler32 update stub and returns its entry address.
 *
 * Arguments:
 *
 * Inputs:
 * c_rarg0 - int adler
 * c_rarg1 - byte* buff (b + off)
 * c_rarg2 - int len
 *
 * Output:
 * c_rarg0 - int adler result
 */
address generate_updateBytesAdler32() {
__ align(CodeEntryAlignment);
StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop,
L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1;
// Aliases
// (adler and s1 share c_rarg0: s1 is extracted from adler in place and the
// combined result is produced in the same register)
Register adler = c_rarg0;
Register s1 = c_rarg0;
Register s2 = c_rarg3;
Register buff = c_rarg1;
Register len = c_rarg2;
Register nmax = c_rarg4;
Register base = c_rarg5;
Register count = c_rarg6;
Register temp0 = t3;
Register temp1 = t4;
Register temp2 = t5;
Register temp3 = t6;
VectorRegister vzero = v31;
VectorRegister vbytes = v8; // group: v8, v9, v10, v11
VectorRegister vs1acc = v12; // group: v12, v13, v14, v15
VectorRegister vs2acc = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23
VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27
VectorRegister vtable_32 = v4; // group: v4, v5
VectorRegister vtable_16 = v30;
VectorRegister vtemp1 = v28;
VectorRegister vtemp2 = v29;
// Max number of bytes we can process before having to take the mod
// 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
const uint64_t BASE = 0xfff1;
const uint64_t NMAX = 0x15B0;
// Loop steps (bytes consumed per iteration)
int step_64 = 64;
int step_32 = 32;
int step_16 = 16;
int step_1 = 1;
__ enter(); // Required for proper stackwalking of RuntimeStub frame
__ mv(temp1, 64);
__ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4);
// Generating accumulation coefficients for further calculations
// (vid.v yields { 0, 1, 2, ... }; vrsub.vx computes temp1 - element)
// vtable_64:
__ vid_v(vtemp1);
__ vrsub_vx(vtable_64, vtemp1, temp1);
// vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 }
// vtable_32:
__ mv(temp1, 32);
__ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2);
__ vid_v(vtemp1);
__ vrsub_vx(vtable_32, vtemp1, temp1);
// vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 }
__ vsetivli(temp0, 16, Assembler::e8, Assembler::m1);
// vtable_16:
__ mv(temp1, 16);
__ vid_v(vtemp1);
__ vrsub_vx(vtable_16, vtemp1, temp1);
// vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 }
__ vmv_v_i(vzero, 0);
__ mv(base, BASE);
__ mv(nmax, NMAX);
// s1 is initialized to the lower 16 bits of adler
// s2 is initialized to the upper 16 bits of adler
__ srliw(s2, adler, 16); // s2 = ((adler >> 16) & 0xffff)
__ zext(s1, adler, 16); // s1 = (adler & 0xffff)
// The pipelined loop needs at least 16 elements for 1 iteration
// It does check this, but it is more effective to skip to the cleanup loop
__ mv(temp0, step_16);
__ bgeu(len, temp0, L_nmax);
// fewer than 16 bytes: nothing to do for len == 0, otherwise go byte by byte
__ beqz(len, L_combine);
// Jumping to L_by1_loop
// (that loop runs while len >= 0, hence the bias of -1)
__ subi(len, len, step_1);
__ j(L_by1_loop);
__ bind(L_nmax);
// len -= NMAX; a negative result means less than one full NMAX chunk is left
__ sub(len, len, nmax);
__ subi(count, nmax, 16);
__ bltz(len, L_by16);
// Align L_nmax loop by 64
// count = NMAX - 48: the 64-byte loop handles NMAX - 48 bytes (a multiple of 64)
// and the remaining 48 bytes are handled by one 32-byte and one 16-byte step
__ bind(L_nmax_loop_entry);
__ subi(count, count, 32);
__ bind(L_nmax_loop);
adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
vtemp1, vtemp2, step_64, Assembler::m4);
__ subi(count, count, step_64);
__ bgtz(count, L_nmax_loop);
// There are three iterations left to do
// (three 16-byte units: one 32-byte step followed by one 16-byte step)
adler32_process_bytes(buff, s1, s2, vtable_32, vzero,
vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
vtemp1, vtemp2, step_32, Assembler::m2);
adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
vtemp1, vtemp2, step_16, Assembler::m1);
// s1 = s1 % BASE
__ remuw(s1, s1, base);
// s2 = s2 % BASE
__ remuw(s2, s2, base);
// try the next NMAX chunk
__ sub(len, len, nmax);
__ subi(count, nmax, 16);
__ bgez(len, L_nmax_loop_entry);
__ bind(L_by16);
// len was biased by -NMAX above and count is NMAX - 16, so len becomes
// (remaining bytes - 16); negative means fewer than 16 bytes are left
__ add(len, len, count);
__ bltz(len, L_by1);
// Trying to unroll
__ mv(temp3, step_64);
__ blt(len, temp3, L_by16_loop);
__ bind(L_by16_loop_unroll);
adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
vtemp1, vtemp2, step_64, Assembler::m4);
__ subi(len, len, step_64);
// By now the temp3 should still be 64
// (adler32_process_bytes loads its step, 64 here, into temp3)
__ bge(len, temp3, L_by16_loop_unroll);
__ bind(L_by16_loop);
adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
vtemp1, vtemp2, step_16, Assembler::m1);
__ subi(len, len, step_16);
__ bgez(len, L_by16_loop);
__ bind(L_by1);
// undo the -16 bias and apply the -1 bias expected by the byte loop
__ addi(len, len, 15);
__ bltz(len, L_do_mod);
__ bind(L_by1_loop);
__ lbu(temp0, Address(buff, 0));
__ addi(buff, buff, step_1);
__ add(s1, temp0, s1);
__ add(s2, s2, s1);
__ subi(len, len, step_1);
__ bgez(len, L_by1_loop);
__ bind(L_do_mod);
// s1 = s1 % BASE
__ remuw(s1, s1, base);
// s2 = s2 % BASE
__ remuw(s2, s2, base);
// Combine lower bits and higher bits
// adler = s1 | (s2 << 16)
__ bind(L_combine);
__ slli(s2, s2, 16);
__ orr(s1, s1, s2);
__ leave(); // Required for proper stackwalking of RuntimeStub frame
__ ret();
return start;
}
#endif // COMPILER2_OR_JVMCI
// Generates the float16 -> float conversion stub and returns its entry address.
// x10 = input (float16)
// f10 = result (float)
// t0, t1 = temporary registers
address generate_float16ToFloat() {
__ align(CodeEntryAlignment);
StubGenStubId stub_id = StubGenStubId::hf2f_id;
StubCodeMark mark(this, stub_id);
address entry = __ pc();
BLOCK_COMMENT("float16ToFloat:");
FloatRegister dst = f10;
Register src = x10;
Label NaN_SLOW;
assert(VM_Version::supports_float16_float_conversion(), "must");
// On riscv, NaN needs a special process as fcvt does not work in that case.
// On riscv, Inf does not need a special process as fcvt can handle it correctly.
// but we consider to get the slow path to process NaN and Inf at the same time,
// as both of them are rare cases, and if we try to get the slow path to handle
// only NaN case it would sacrifice the performance for normal cases,
// i.e. non-NaN and non-Inf cases.
// check whether it's a NaN or +/- Inf.
// (0x7c00 is the float16 exponent mask; all exponent bits set means NaN or Inf)
__ mv(t0, 0x7c00);
__ andr(t1, src, t0);
// jump to stub processing NaN and Inf cases.
__ beq(t0, t1, NaN_SLOW);
// non-NaN or non-Inf cases, just use built-in instructions.
__ fmv_h_x(dst, src);
__ fcvt_s_h(dst, dst);
__ ret();
__ bind(NaN_SLOW);
// following instructions mainly focus on NaN, as riscv does not handle
// NaN well with fcvt, but the code also works for Inf at the same time.
// construct a NaN in 32 bits from the NaN in 16 bits,
// we need the payloads of non-canonical NaNs to be preserved.
// (0x7f800000 is the float exponent mask)
__ mv(t1, 0x7f800000);
// sign-bit was already set via sign-extension if necessary.
// shifting left by 13 moves the 10-bit float16 fraction to the top of the
// 23-bit float fraction
__ slli(t0, src, 13);
__ orr(t1, t0, t1);
__ fmv_w_x(dst, t1);
__ ret();
return entry;
}
// Generates the float -> float16 conversion stub and returns its entry address.
// f10 = input (float)
// x10 = result (float16)
// f11 = temporary float register
// t0, t1 = temporary registers
address generate_floatToFloat16() {
__ align(CodeEntryAlignment);
StubGenStubId stub_id = StubGenStubId::f2hf_id;
StubCodeMark mark(this, stub_id);
address entry = __ pc();
BLOCK_COMMENT("floatToFloat16:");
Register dst = x10;
FloatRegister src = f10, ftmp = f11;
Label NaN_SLOW;
assert(VM_Version::supports_float16_float_conversion(), "must");
// On riscv, NaN needs a special process as fcvt does not work in that case.
// check whether it's a NaN.
// replace fclass with feq as performance optimization.
// (feq.s of a value with itself yields 0 only when the value is a NaN)
__ feq_s(t0, src, src);
// jump to stub processing NaN cases.
__ beqz(t0, NaN_SLOW);
// non-NaN cases, just use built-in instructions.
__ fcvt_h_s(ftmp, src);
__ fmv_x_h(dst, ftmp);
__ ret();
__ bind(NaN_SLOW);
__ fmv_x_w(dst, src);
// preserve the payloads of non-canonical NaNs.
// (arithmetic shift: the top 10 fraction bits land in dst[9:0], the sign is replicated upwards)
__ srai(dst, dst, 13);
// preserve the sign bit.
// build a mask in t1: sign bits from bit 15 upwards, ones in the float16
// exponent field [14:10] and ones in the fraction field [9:0]
__ srai(t1, dst, 13);
__ slli(t1, t1, 10);
__ mv(t0, 0x3ff);
__ orr(t1, t1, t0);
// get the result by merging sign bit and payloads of preserved non-canonical NaNs.
__ andr(dst, dst, t1);
__ ret();
return entry;
}
#ifdef COMPILER2
// Masks built with right_n_bits(2) and right_n_bits(3); the Poly1305 code below
// uses them with andi to keep only the lowest 2 or 3 bits of a register.
static const int64_t right_2_bits = right_n_bits(2);
static const int64_t right_3_bits = right_n_bits(3);
// In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
// are represented as long[5], with BITS_PER_LIMB = 26.
// Pack five 26-bit limbs into three 64-bit registers.
//
// dest0, dest1 - receive bits 0-63 and 64-127 of the packed value
// dest2 - receives the remaining high bits; may be noreg, in which case
// ASSERT builds emit a check that those bits are zero
// src - address of the long[5] limbs
// tmp1, tmp2 - clobbered
void poly1305_pack_26(Register dest0, Register dest1, Register dest2, Register src, Register tmp1, Register tmp2) {
assert_different_registers(dest0, dest1, dest2, src, tmp1, tmp2);
// The goal is to have 128-bit value in dest2:dest1:dest0
__ ld(dest0, Address(src, 0)); // 26 bits in dest0
__ ld(tmp1, Address(src, sizeof(jlong)));
__ slli(tmp1, tmp1, 26);
__ add(dest0, dest0, tmp1); // 52 bits in dest0
// the third limb straddles dest0 and dest1: its low 12 bits complete dest0,
// its high 14 bits start dest1
__ ld(tmp2, Address(src, 2 * sizeof(jlong)));
__ slli(tmp1, tmp2, 52);
__ add(dest0, dest0, tmp1); // dest0 is full
__ srli(dest1, tmp2, 12); // 14-bit in dest1
__ ld(tmp1, Address(src, 3 * sizeof(jlong)));
__ slli(tmp1, tmp1, 14);
__ add(dest1, dest1, tmp1); // 40-bit in dest1
// the fifth limb straddles dest1 and dest2: its low 24 bits complete dest1
__ ld(tmp1, Address(src, 4 * sizeof(jlong)));
__ slli(tmp2, tmp1, 40);
__ add(dest1, dest1, tmp2); // dest1 is full
if (dest2->is_valid()) {
__ srli(tmp1, tmp1, 24);
__ mv(dest2, tmp1); // 2 bits in dest2
} else {
#ifdef ASSERT
// no third destination: the bits above 128 are expected to be zero
Label OK;
__ srli(tmp1, tmp1, 24);
__ beq(zr, tmp1, OK); // 2 bits
__ stop("high bits of Poly1305 integer should be zero");
__ should_not_reach_here();
__ bind(OK);
#endif
}
}
// Two-register variant: packs the five 26-bit limbs at src into a 128-bit
// integer held in dest1:dest0. No register is supplied for the bits above 128,
// so the three-register overload is invoked with noreg as its third destination.
void poly1305_pack_26(Register dest0, Register dest1, Register src, Register tmp1, Register tmp2) {
  const Register no_dest2 = noreg;
  poly1305_pack_26(dest0, dest1, no_dest2, src, tmp1, tmp2);
}
// U_2:U_1:U_0: += (U_2 >> 2) * 5
//
// Emits a partial reduction of the 3-register value U_2:U_1:U_0: everything in
// U_2 above its lowest two bits is removed and added back, multiplied by 5,
// at the bottom of the value. The multiplication by 5 is done as x + (x << 2).
// tmp1 and tmp2 are clobbered.
void poly1305_reduce(Register U_2, Register U_1, Register U_0, Register tmp1, Register tmp2) {
assert_different_registers(U_2, U_1, U_0, tmp1, tmp2);
// First, U_2:U_1:U_0 += (U_2 >> 2)
__ srli(tmp1, U_2, 2);
__ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
__ andi(U_2, U_2, right_2_bits); // Clear U_2 except for the lowest two bits
__ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
__ add(U_2, U_2, tmp2);
// Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
// (tmp1 still holds the original U_2 >> 2 computed above)
__ slli(tmp1, tmp1, 2);
__ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
__ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
__ add(U_2, U_2, tmp2);
}
// Poly1305, RFC 7539
// void com.sun.crypto.provider.Poly1305.processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs)
//
// Generates the stub that folds every complete 16-byte block of the input into
// the accumulator and returns the stub's entry address. If fewer than 16 bytes
// are supplied, the accumulator in memory is left untouched.
//
// Arguments:
// c_rarg0: input_start -- where the input is stored
// c_rarg1: length
// c_rarg2: acc_start -- where the output will be stored
// c_rarg3: r_start -- where the randomly generated 128-bit key is stored
// See https://loup-vaillant.fr/tutorials/poly1305-design for a
// description of the tricks used to simplify and accelerate this
// computation.
address generate_poly1305_processBlocks() {
  __ align(CodeEntryAlignment);
  StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id;
  StubCodeMark mark(this, stub_id);
  address start = __ pc();
  __ enter();

  // x18-x21 are among the registers handed out by `regs` below, so they are
  // pushed here and popped again before returning.
  RegSet saved_regs = RegSet::range(x18, x21);
  RegSetIterator<Register> regs = (RegSet::range(x14, x31) - RegSet::range(x22, x27)).begin();
  __ push_reg(saved_regs, sp);

  // Arguments
  const Register input_start = c_rarg0, length = c_rarg1, acc_start = c_rarg2, r_start = c_rarg3;

  // R_n is the 128-bit randomly-generated key, packed into two
  // registers. The caller passes this key to us as long[5], with
  // BITS_PER_LIMB = 26.
  const Register R_0 = *regs, R_1 = *++regs;
  poly1305_pack_26(R_0, R_1, r_start, t1, t2);

  // RR_n is (R_n >> 2) * 5
  // (shadd with shift 2 computes (t1 << 2) + t1)
  const Register RR_0 = *++regs, RR_1 = *++regs;
  __ srli(t1, R_0, 2);
  __ shadd(RR_0, t1, t1, t2, 2);
  __ srli(t1, R_1, 2);
  __ shadd(RR_1, t1, t1, t2, 2);

  // U_n is the current checksum
  const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
  poly1305_pack_26(U_0, U_1, U_2, acc_start, t1, t2);

  static constexpr int BLOCK_LENGTH = 16;
  Label DONE, LOOP;

  // Nothing to do unless at least one complete block is available
  __ mv(t1, BLOCK_LENGTH);
  __ blt(length, t1, DONE); {
    __ bind(LOOP);

    // S_n is to be the sum of U_n and the next block of data
    const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
    __ ld(S_0, Address(input_start, 0));
    __ ld(S_1, Address(input_start, wordSize));

    __ cad(S_0, S_0, U_0, t1); // Add U_0 to S_0 with carry output to t1
    __ cadc(S_1, S_1, U_1, t1); // Add U_1 with carry to S_1 with carry output to t1
    __ add(S_2, U_2, t1);

    // Add 1 to the third word, i.e. 2^128 to the block value
    __ addi(S_2, S_2, 1);

    const Register U_0HI = *++regs, U_1HI = *++regs;

    // NB: this logic depends on some of the special properties of
    // Poly1305 keys. In particular, because we know that the top
    // four bits of R_0 and R_1 are zero, we can add together
    // partial products without any risk of needing to propagate a
    // carry out.
    __ wide_mul(U_0, U_0HI, S_0, R_0);
    __ wide_madd(U_0, U_0HI, S_1, RR_1, t1, t2);
    __ wide_madd(U_0, U_0HI, S_2, RR_0, t1, t2);

    __ wide_mul(U_1, U_1HI, S_0, R_1);
    __ wide_madd(U_1, U_1HI, S_1, R_0, t1, t2);
    __ wide_madd(U_1, U_1HI, S_2, RR_1, t1, t2);

    __ andi(U_2, R_0, right_2_bits);
    __ mul(U_2, S_2, U_2);

    // Partial reduction mod 2**130 - 5
    __ cad(U_1, U_1, U_0HI, t1); // Add U_0HI to U_1 with carry output to t1
    __ adc(U_2, U_2, U_1HI, t1);
    // Sum is now in U_2:U_1:U_0.

    // U_2:U_1:U_0: += (U_2 >> 2) * 5
    poly1305_reduce(U_2, U_1, U_0, t1, t2);

    // Advance to the next block and loop while a complete one remains
    __ subi(length, length, BLOCK_LENGTH);
    __ addi(input_start, input_start, BLOCK_LENGTH);
    __ mv(t1, BLOCK_LENGTH);
    __ bge(length, t1, LOOP);
  }

  // Further reduce modulo 2^130 - 5
  poly1305_reduce(U_2, U_1, U_0, t1, t2);

  // Unpack the sum into five 26-bit limbs and write to memory.
  // Each limb is isolated with a left shift that drops the bits above it,
  // followed by a logical right shift by 38 (= 64 - 26).

  // First 26 bits is the first limb
  __ slli(t1, U_0, 38); // Take lowest 26 bits
  __ srli(t1, t1, 38);
  __ sd(t1, Address(acc_start)); // First 26-bit limb

  // 27-52 bits of U_0 is the second limb
  __ slli(t1, U_0, 12); // Take next 27-52 bits
  __ srli(t1, t1, 38);
  __ sd(t1, Address(acc_start, sizeof (jlong))); // Second 26-bit limb

  // Getting 53-64 bits of U_0 and 1-14 bits of U_1 in one register
  __ srli(t1, U_0, 52);
  __ slli(t2, U_1, 50);
  __ srli(t2, t2, 38);
  __ add(t1, t1, t2);
  __ sd(t1, Address(acc_start, 2 * sizeof (jlong))); // Third 26-bit limb

  // Storing 15-40 bits of U_1
  __ slli(t1, U_1, 24); // Already used up 14 bits
  __ srli(t1, t1, 38); // Clear all other bits from t1
  __ sd(t1, Address(acc_start, 3 * sizeof (jlong))); // Fourth 26-bit limb

  // Storing 41-64 bits of U_1 and first three bits from U_2 in one register
  __ srli(t1, U_1, 40);
  __ andi(t2, U_2, right_3_bits);
  __ slli(t2, t2, 24);
  __ add(t1, t1, t2);
  __ sd(t1, Address(acc_start, 4 * sizeof (jlong))); // Fifth 26-bit limb

  __ bind(DONE);
  __ pop_reg(saved_regs, sp);
  __ leave(); // Required for proper stackwalking
  __ ret();

  return start;
}
#endif // COMPILER2
/**
 * Generates the CRC32 update stub and returns its entry address. The stub is
 * a frame set-up around MacroAssembler::kernel_crc32.
 *
 * Arguments:
 *
 * Inputs:
 * c_rarg0 - int crc
 * c_rarg1 - byte* buf
 * c_rarg2 - int length
 *
 * Output:
 * c_rarg0 - int crc result
 */
address generate_updateBytesCRC32() {
  assert(UseCRC32Intrinsics, "what are we doing here?");

  __ align(CodeEntryAlignment);
  const StubGenStubId crc32_stub_id = StubGenStubId::updateBytesCRC32_id;
  StubCodeMark mark(this, crc32_stub_id);
  const address entry = __ pc();

  // Incoming arguments
  const Register crc_in_out = c_rarg0; // running crc value
  const Register data       = c_rarg1; // address of the java byte array data
  const Register nbytes     = c_rarg2; // number of bytes

  BLOCK_COMMENT("Entry:");
  // enter/leave are required for proper stackwalking of the RuntimeStub frame
  __ enter();

  __ kernel_crc32(crc_in_out, data, nbytes,
                  // temporaries holding the table addresses
                  c_rarg3, c_rarg4, c_rarg5, c_rarg6,
                  // miscellaneous temporaries
                  c_rarg7, t2, t3, t4, t5, t6);

  __ leave();
  __ ret();

  return entry;
}
// Exception handler for upcall stubs.
// Generates a stub that hands the exception oop in a0 (x10) to
// UpcallLinker::handle_uncaught_exception; control is not expected to return.
address generate_upcall_stub_exception_handler() {
  const StubGenStubId handler_id = StubGenStubId::upcall_stub_exception_handler_id;
  StubCodeMark mark(this, handler_id);
  const address entry = __ pc();

  // The native caller cannot deal with Java exceptions, so the only option is
  // to crash here; catching exceptions is the callee's responsibility.
  __ verify_oop(x10); // return a exception oop in a0
  const address uncaught_handler = CAST_FROM_FN_PTR(address, UpcallLinker::handle_uncaught_exception);
  __ rt_call(uncaught_handler);
  __ should_not_reach_here();

  return entry;
}
// Loads the Method* target of a MethodHandle.
// j_rarg0 = jobject receiver
// xmethod = Method* result
// t0 and t1 are used as temporaries. The result is also stored into the
// callee_target slot of the current thread.
address generate_upcall_stub_load_target() {
  const StubGenStubId load_target_id = StubGenStubId::upcall_stub_load_target_id;
  StubCodeMark mark(this, load_target_id);
  const address entry = __ pc();

  const Register receiver = j_rarg0;
  const Register scratch1 = t0;
  const Register scratch2 = t1;

  __ resolve_global_jobject(receiver, scratch1, scratch2);

  // Follow MethodHandle.form -> LambdaForm.vmentry -> MemberName.method
  // -> ResolvedMethodName.vmtarget, reusing xmethod for every intermediate value.
  __ load_heap_oop(xmethod,
                   Address(receiver, java_lang_invoke_MethodHandle::form_offset()),
                   scratch1, scratch2);
  __ load_heap_oop(xmethod,
                   Address(xmethod, java_lang_invoke_LambdaForm::vmentry_offset()),
                   scratch1, scratch2);
  __ load_heap_oop(xmethod,
                   Address(xmethod, java_lang_invoke_MemberName::method_offset()),
                   scratch1, scratch2);
  __ access_load_at(T_ADDRESS, IN_HEAP, xmethod,
                    Address(xmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
                    noreg, noreg);

  // just in case callee is deoptimized
  __ sd(xmethod, Address(xthread, JavaThread::callee_target_offset()));

  __ ret();

  return entry;
}
#undef __
// Initialization
//
// Generates the initial stubs and records their entry points in StubRoutines.
void generate_initial_stubs() {
  // These entry points exist on every platform, so this code could in
  // principle be shared between platforms. The benefit seems to be smaller
  // than the disadvantage of a much more complicated generator structure.
  // See also the comment in stubRoutines.hpp.

  StubRoutines::_forward_exception_entry = generate_forward_exception();

  // Create the UnsafeMemoryAccess table if it does not exist yet:
  // 8 entries for copyMemory plus 4 for setMemory.
  if (UnsafeMemoryAccess::_table == nullptr) {
    const int copy_memory_entries = 8;
    const int set_memory_entries  = 4;
    UnsafeMemoryAccess::create_table(copy_memory_entries + set_memory_entries);
  }

  StubRoutines::_call_stub_entry =
    generate_call_stub(StubRoutines::_call_stub_return_address);

  // is referenced by megamorphic call
  StubRoutines::_catch_exception_entry = generate_catch_exception();

  if (UseCRC32Intrinsics) {
    // The table address has to be set before generating the stub that uses it.
    StubRoutines::_crc_table_adr = (address)StubRoutines::riscv::_crc_table;
    StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
  }

  // The float16 conversion stubs are generated as a pair, and only when both
  // intrinsics are available.
  const bool float16_conversions_available =
    vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
    vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16);
  if (float16_conversions_available) {
    StubRoutines::_hf2f = generate_float16ToFloat();
    StubRoutines::_f2hf = generate_floatToFloat16();
  }
}
// Generates the continuation stubs and records each entry point in the
// corresponding StubRoutines field. The stubs are generated in the order
// listed below.
void generate_continuation_stubs() {
// Continuation stubs:
StubRoutines::_cont_thaw = generate_cont_thaw();
StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
}
// Generates the stubs of the final blob and records their entry points
// in StubRoutines. The last step calls StubRoutines::riscv::set_completed().
void generate_final_stubs() {
// support for verify_oop (must happen after universe_init)
if (VerifyOops) {
StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
}
// arraycopy stubs used by compilers
generate_arraycopy_stubs();
StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
// The secondary-supers lookup stubs are only built when COMPILER2 is
// defined and UseSecondarySupersTable is set.
#ifdef COMPILER2
if (UseSecondarySupersTable) {
StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
// The table stub itself is generated only when the test is not inlined;
// no entry point is assigned here for it.
if (!InlineSecondarySupersTest) {
generate_lookup_secondary_supers_table_stub();
}
}
#endif // COMPILER2
StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
// NOTE(review): presumably flags that RISC-V stub generation has
// finished - confirm against StubRoutines::riscv.
StubRoutines::riscv::set_completed();
}
// Generates the stubs of the compiler blob.
//
// The whole body is guarded by COMPILER2; without it this function does
// nothing. Each group of stubs is generated only when its controlling
// flag is set, and the returned entry points are stored in the matching
// StubRoutines fields. The string-compare and string-indexof stubs at the
// end are generated unconditionally (within COMPILER2).
void generate_compiler_stubs() {
#ifdef COMPILER2
if (UseMulAddIntrinsic) {
StubRoutines::_mulAdd = generate_mulAdd();
}
if (UseMultiplyToLenIntrinsic) {
StubRoutines::_multiplyToLen = generate_multiplyToLen();
}
if (UseSquareToLenIntrinsic) {
StubRoutines::_squareToLen = generate_squareToLen();
}
// The Montgomery stubs use a dedicated generator object. A StubCodeMark
// for the stub id is created first and lives until the end of the
// enclosing scope, i.e. for the duration of the generate call.
if (UseMontgomeryMultiplyIntrinsic) {
StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
StubCodeMark mark(this, stub_id);
MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
StubRoutines::_montgomeryMultiply = g.generate_multiply();
}
if (UseMontgomerySquareIntrinsic) {
StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
StubCodeMark mark(this, stub_id);
MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
StubRoutines::_montgomerySquare = g.generate_square();
}
if (UseAESIntrinsics) {
StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
}
if (UsePoly1305Intrinsics) {
StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
}
// The BigInteger shift workers are gated on UseRVV rather than on a
// dedicated Use*Intrinsic flag.
if (UseRVV) {
StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
}
// For the SHA-2, MD5 and SHA-1 groups below, the implCompress and
// implCompressMB entries come from the same generator function and are
// distinguished only by the stub id passed in.
if (UseSHA256Intrinsics) {
Sha2Generator sha2(_masm, this);
StubRoutines::_sha256_implCompress = sha2.generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
}
if (UseSHA512Intrinsics) {
Sha2Generator sha2(_masm, this);
StubRoutines::_sha512_implCompress = sha2.generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
}
if (UseMD5Intrinsics) {
StubRoutines::_md5_implCompress = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
}
if (UseChaCha20Intrinsics) {
StubRoutines::_chacha20Block = generate_chacha20Block();
}
if (UseSHA1Intrinsics) {
StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
}
if (UseBASE64Intrinsics) {
StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
}
if (UseAdler32Intrinsics) {
StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
}
// No entry points are assigned here for the string stubs; the callees
// are invoked for their effects only.
generate_compare_long_strings();
generate_string_indexof_stubs();
#endif // COMPILER2
}
public:
// Constructs the generator on top of the given code buffer and, as part
// of construction, emits all stubs that belong to the requested blob.
//
//   code    - buffer handed on to StubCodeGenerator
//   blob_id - selects which generate_*_stubs() routine is run; an id
//             without a matching routine is reported through fatal()
StubGenerator(CodeBuffer* code, StubGenBlobId blob_id)
  : StubCodeGenerator(code, blob_id) {
  switch (blob_id) {
    case initial_id: {
      generate_initial_stubs();
      break;
    }
    case continuation_id: {
      generate_continuation_stubs();
      break;
    }
    case compiler_id: {
      generate_compiler_stubs();
      break;
    }
    case final_id: {
      generate_final_stubs();
      break;
    }
    default: {
      // Last label of the switch, so no break is needed after it.
      fatal("unexpected blob id: %d", blob_id);
    }
  }
}
}; // end class declaration
// Entry point for stub generation: all the work happens in the
// StubGenerator constructor, which emits the stubs of the blob selected
// by blob_id into the given code buffer. The generator object itself is
// discarded when this function returns.
void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
  StubGenerator generator(code, blob_id);
}