/*
 * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2025 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/macroAssembler.inline.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/barrierSetNMethod.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_ppc.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/powerOfTwo.hpp"
#if INCLUDE_ZGC
#include "gc/z/zBarrierSetAssembler.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp.

#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#if defined(ABI_ELFv2)
#define STUB_ENTRY(name) StubRoutines::name
#else
#define STUB_ENTRY(name) ((FunctionDescriptor*)StubRoutines::name)->entry()
#endif

class StubGenerator: public StubCodeGenerator {
private:

// Call stubs are used to call Java from C
//
// Arguments:
//
// R3 - call wrapper address : address
// R4 - result : intptr_t*
// R5 - result type : BasicType
// R6 - method : Method
// R7 - frame mgr entry point : address
// R8 - parameter block : intptr_t*
// R9 - parameter count in words : int
// R10 - thread : Thread*
//
address generate_call_stub(address& return_address) {
|
|
// Set up a new C frame, copy Java arguments, call the template interpreter or
// native_entry, and process the result.
|
|
|
|
StubId stub_id = StubId::stubgen_call_stub_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
address start = __ function_entry();
|
|
|
|
int save_nonvolatile_registers_size = __ save_nonvolatile_registers_size(true, SuperwordUseVSX);
|
|
|
|
// some sanity checks
|
|
STATIC_ASSERT(StackAlignmentInBytes == 16);
|
|
assert((sizeof(frame::native_abi_minframe) % 16) == 0, "unaligned");
|
|
assert((sizeof(frame::native_abi_reg_args) % 16) == 0, "unaligned");
|
|
assert((save_nonvolatile_registers_size % 16) == 0, "unaligned");
|
|
assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
|
|
assert((sizeof(frame::entry_frame_locals) % 16) == 0, "unaligned");
|
|
|
|
Register r_arg_call_wrapper_addr = R3;
|
|
Register r_arg_result_addr = R4;
|
|
Register r_arg_result_type = R5;
|
|
Register r_arg_method = R6;
|
|
Register r_arg_entry = R7;
|
|
Register r_arg_argument_addr = R8;
|
|
Register r_arg_argument_count = R9;
|
|
Register r_arg_thread = R10;
|
|
|
|
Register r_entryframe_fp = R2; // volatile
|
|
Register r_argument_size = R11_scratch1; // volatile
|
|
Register r_top_of_arguments_addr = R21_tmp1;
|
|
|
|
{
|
|
// Stack on entry to call_stub:
|
|
//
|
|
// F1 [C_FRAME]
|
|
// ...
|
|
Register r_frame_size = R12_scratch2; // volatile
|
|
Label arguments_copied;
|
|
|
|
// Save LR/CR to caller's C_FRAME.
|
|
__ save_LR_CR(R0);
|
|
|
|
// Keep copy of our frame pointer (caller's SP).
|
|
__ mr(r_entryframe_fp, R1_SP);
|
|
|
|
// calculate frame size
|
|
STATIC_ASSERT(Interpreter::logStackElementSize == 3);
|
|
|
|
// space for arguments aligned up: ((arg_count + 1) * 8) &~ 15
|
|
__ addi(r_frame_size, r_arg_argument_count, 1);
|
|
__ rldicr(r_frame_size, r_frame_size, 3, 63 - 4);
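// (rldicr rotates left by 3, i.e. multiplies by 8, and clears the low 4 bits,
// implementing the ((arg_count + 1) * 8) &~ 15 computation described above.)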
|
|
|
|
// this is the pure space for arguments (excluding alignment padding)
|
|
__ sldi(r_argument_size, r_arg_argument_count, 3);
|
|
|
|
__ addi(r_frame_size, r_frame_size,
|
|
save_nonvolatile_registers_size + frame::entry_frame_locals_size + frame::top_ijava_frame_abi_size);
|
|
|
|
// push ENTRY_FRAME
|
|
__ push_frame(r_frame_size, R0);
|
|
|
|
// Save non-volatile registers to ENTRY_FRAME.
|
|
__ save_nonvolatile_registers(r_entryframe_fp, -(frame::entry_frame_locals_size + save_nonvolatile_registers_size),
|
|
true, SuperwordUseVSX);
|
|
|
|
BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
|
|
// Push ENTRY_FRAME including arguments:
|
|
//
|
|
// F0 [TOP_IJAVA_FRAME_ABI]
|
|
// alignment (optional)
|
|
// [outgoing Java arguments]
|
|
// [non-volatiles]
|
|
// [ENTRY_FRAME_LOCALS]
|
|
// F1 [C_FRAME]
|
|
// ...
|
|
|
|
// initialize call_stub locals (step 1)
|
|
__ std(r_arg_call_wrapper_addr, _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
|
|
__ std(r_arg_result_addr, _entry_frame_locals_neg(result_address), r_entryframe_fp);
|
|
__ std(r_arg_result_type, _entry_frame_locals_neg(result_type), r_entryframe_fp);
|
|
// we will save arguments_tos_address later
|
|
|
|
BLOCK_COMMENT("Copy Java arguments");
|
|
// copy Java arguments
|
|
|
|
// Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later.
|
|
__ addi(r_top_of_arguments_addr, r_entryframe_fp,
|
|
-(save_nonvolatile_registers_size + frame::entry_frame_locals_size));
|
|
__ sub(r_top_of_arguments_addr, r_top_of_arguments_addr, r_argument_size);
|
|
|
|
// any arguments to copy?
|
|
__ cmpdi(CR0, r_arg_argument_count, 0);
|
|
__ beq(CR0, arguments_copied);
|
|
|
|
// prepare loop and copy arguments in reverse order
|
|
{
|
|
Register r_argument_addr = R22_tmp2;
|
|
Register r_argumentcopy_addr = R23_tmp3;
|
|
// init CTR with arg_argument_count
|
|
__ mtctr(r_arg_argument_count);
|
|
|
|
// let r_argumentcopy_addr point to last outgoing Java argument
|
|
__ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
|
|
|
|
// let r_argument_addr point to last incoming java argument
|
|
__ add(r_argument_addr, r_arg_argument_addr, r_argument_size);
|
|
__ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
|
|
|
|
// now loop while CTR > 0 and copy arguments
|
|
{
|
|
Label next_argument;
|
|
__ bind(next_argument);
|
|
|
|
__ ld(R0, 0, r_argument_addr);
|
|
// argument_addr--;
|
|
__ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
|
|
__ std(R0, 0, r_argumentcopy_addr);
|
|
// argumentcopy_addr++;
|
|
__ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
|
|
|
|
__ bdnz(next_argument);
|
|
}
|
|
}
|
|
|
|
// Arguments copied, continue.
|
|
__ bind(arguments_copied);
|
|
}
|
|
|
|
{
|
|
BLOCK_COMMENT("Call template interpreter or native entry.");
|
|
assert_different_registers(r_arg_entry, r_top_of_arguments_addr, r_arg_method, r_arg_thread);
|
|
|
|
// Register state on entry to template interpreter / native entry:
|
|
//
|
|
// tos - intptr_t* sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
|
|
// R19_method - Method
|
|
// R16_thread - JavaThread*
|
|
|
|
// Tos must point to last argument - element_size.
|
|
const Register tos = R15_esp;
|
|
|
|
__ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
|
|
|
|
// initialize call_stub locals (step 2)
|
|
// now save tos as arguments_tos_address
|
|
__ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
|
|
|
|
// load argument registers for call
|
|
__ mr(R19_method, r_arg_method);
|
|
__ mr(R16_thread, r_arg_thread);
|
|
assert(tos != r_arg_method, "trashed r_arg_method");
|
|
assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
|
|
|
|
// Set R15_prev_state to 0 for simplifying checks in callee.
|
|
__ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R0);
|
|
// Stack on entry to template interpreter / native entry:
|
|
//
|
|
// F0 [TOP_IJAVA_FRAME_ABI]
|
|
// alignment (optional)
|
|
// [outgoing Java arguments]
|
|
// [non-volatiles]
|
|
// [ENTRY_FRAME_LOCALS]
|
|
// F1 [C_FRAME]
|
|
// ...
|
|
//
|
|
|
|
// global toc register
|
|
__ load_const_optimized(R29_TOC, MacroAssembler::global_toc(), R0);
|
|
// Remember the senderSP so the interpreter can pop c2i arguments off of the stack
// when called via a c2i.
|
|
|
|
// Pass initial_caller_sp to framemanager.
|
|
__ mr(R21_sender_SP, R1_SP);
|
|
|
|
// Do a light-weight C-call here, r_arg_entry holds the address
|
|
// of the interpreter entry point (template interpreter or native entry)
|
|
// and save runtime-value of LR in return_address.
|
|
assert(r_arg_entry != tos && r_arg_entry != R19_method && r_arg_entry != R16_thread,
|
|
"trashed r_arg_entry");
|
|
return_address = __ call_stub(r_arg_entry);
|
|
}
|
|
|
|
{
|
|
BLOCK_COMMENT("Returned from template interpreter or native entry.");
|
|
// Now pop frame, process result, and return to caller.
|
|
|
|
// Stack on exit from template interpreter / native entry:
|
|
//
|
|
// F0 [ABI]
|
|
// ...
|
|
// [non-volatiles]
|
|
// [ENTRY_FRAME_LOCALS]
|
|
// F1 [C_FRAME]
|
|
// ...
|
|
//
|
|
// Just pop the topmost frame ...
|
|
//
|
|
|
|
Label ret_is_object;
|
|
Label ret_is_long;
|
|
Label ret_is_float;
|
|
Label ret_is_double;
|
|
|
|
Register r_lr = R11_scratch1;
|
|
Register r_cr = R12_scratch2;
|
|
|
|
// Reload some volatile registers which we've spilled before the call
|
|
// to template interpreter / native entry.
|
|
// Access all locals via frame pointer, because we know nothing about
|
|
// the topmost frame's size.
|
|
__ ld(r_entryframe_fp, _abi0(callers_sp), R1_SP); // restore after call
|
|
assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
|
|
__ ld(r_arg_result_addr, _entry_frame_locals_neg(result_address), r_entryframe_fp);
|
|
__ ld(r_arg_result_type, _entry_frame_locals_neg(result_type), r_entryframe_fp);
|
|
__ ld(r_cr, _abi0(cr), r_entryframe_fp);
|
|
__ ld(r_lr, _abi0(lr), r_entryframe_fp);
|
|
__ mtcr(r_cr); // restore CR
|
|
__ mtlr(r_lr); // restore LR
|
|
|
|
// Store result depending on type. Everything that is not
|
|
// T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
|
|
// Using volatile CRs.
|
|
__ cmpwi(CR1, r_arg_result_type, T_OBJECT);
|
|
__ cmpwi(CR5, r_arg_result_type, T_LONG);
|
|
__ cmpwi(CR6, r_arg_result_type, T_FLOAT);
|
|
__ cmpwi(CR7, r_arg_result_type, T_DOUBLE);
|
|
|
|
__ pop_cont_fastpath(); // kills CR0, uses R16_thread
|
|
|
|
// restore non-volatile registers
|
|
__ restore_nonvolatile_registers(r_entryframe_fp, -(frame::entry_frame_locals_size + save_nonvolatile_registers_size),
|
|
true, SuperwordUseVSX);
|
|
|
|
// pop frame
|
|
__ mr(R1_SP, r_entryframe_fp);
|
|
|
|
// Stack on exit from call_stub:
|
|
//
|
|
// 0 [C_FRAME]
|
|
// ...
|
|
//
|
|
// no call_stub frames left.
|
|
|
|
__ beq(CR1, ret_is_object);
|
|
__ beq(CR5, ret_is_long);
|
|
__ beq(CR6, ret_is_float);
|
|
__ beq(CR7, ret_is_double);
|
|
|
|
// default:
|
|
__ stw(R3_RET, 0, r_arg_result_addr);
|
|
__ blr(); // return to caller
|
|
|
|
// case T_OBJECT:
|
|
// case T_LONG:
|
|
__ bind(ret_is_object);
|
|
__ bind(ret_is_long);
|
|
__ std(R3_RET, 0, r_arg_result_addr);
|
|
__ blr(); // return to caller
|
|
|
|
// case T_FLOAT:
|
|
__ bind(ret_is_float);
|
|
__ stfs(F1_RET, 0, r_arg_result_addr);
|
|
__ blr(); // return to caller
|
|
|
|
// case T_DOUBLE:
|
|
__ bind(ret_is_double);
|
|
__ stfd(F1_RET, 0, r_arg_result_addr);
|
|
__ blr(); // return to caller
|
|
}
|
|
|
|
return start;
|
|
}
|
|
|
|
// Return point for a Java call if there's an exception thrown in
|
|
// Java code. The exception is caught and transformed into a
|
|
// pending exception stored in JavaThread that can be tested from
|
|
// within the VM.
|
|
//
|
|
address generate_catch_exception() {
|
|
StubId stub_id = StubId::stubgen_catch_exception_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
address start = __ pc();
|
|
|
|
// Registers alive
|
|
//
|
|
// R16_thread
|
|
// R3_ARG1 - address of pending exception
|
|
// R4_ARG2 - return address in call stub
|
|
|
|
const Register exception_file = R21_tmp1;
|
|
const Register exception_line = R22_tmp2;
|
|
|
|
__ load_const(exception_file, (void*)__FILE__);
|
|
__ load_const(exception_line, (void*)__LINE__);
|
|
|
|
__ std(R3_ARG1, in_bytes(JavaThread::pending_exception_offset()), R16_thread);
|
|
// store into `char *'
|
|
__ std(exception_file, in_bytes(JavaThread::exception_file_offset()), R16_thread);
|
|
// store into `int'
|
|
__ stw(exception_line, in_bytes(JavaThread::exception_line_offset()), R16_thread);
|
|
|
|
// complete return to VM
|
|
assert(StubRoutines::_call_stub_return_address != nullptr, "must have been generated before");
|
|
|
|
__ mtlr(R4_ARG2);
|
|
// continue in call stub
|
|
__ blr();
|
|
|
|
return start;
|
|
}
|
|
|
|
// Continuation point for runtime calls returning with a pending
|
|
// exception. The pending exception check happened in the runtime
|
|
// or native call stub. The pending exception in Thread is
|
|
// converted into a Java-level exception.
|
|
//
|
|
// Read:
|
|
//
|
|
// LR: The pc the runtime library callee wants to return to.
|
|
// Since the exception occurred in the callee, the return pc
|
|
// from the point of view of Java is the exception pc.
|
|
// thread: Needed for method handles.
|
|
//
|
|
// Invalidate:
|
|
//
|
|
// volatile registers (except below).
|
|
//
|
|
// Update:
|
|
//
|
|
// R4_ARG2: exception
|
|
//
|
|
// (LR is unchanged and is live out).
|
|
//
|
|
address generate_forward_exception() {
|
|
StubId stub_id = StubId::stubgen_forward_exception_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
if (VerifyOops) {
|
|
// Get pending exception oop.
|
|
__ ld(R3_ARG1,
|
|
in_bytes(Thread::pending_exception_offset()),
|
|
R16_thread);
|
|
// Make sure that this code is only executed if there is a pending exception.
|
|
{
|
|
Label L;
|
|
__ cmpdi(CR0, R3_ARG1, 0);
|
|
__ bne(CR0, L);
|
|
__ stop("StubRoutines::forward exception: no pending exception (1)");
|
|
__ bind(L);
|
|
}
|
|
__ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
|
|
}
|
|
|
|
// Save LR/CR and copy exception pc (LR) into R4_ARG2.
|
|
__ save_LR(R4_ARG2);
|
|
__ push_frame_reg_args(0, R0);
|
|
// Find exception handler.
|
|
__ call_VM_leaf(CAST_FROM_FN_PTR(address,
|
|
SharedRuntime::exception_handler_for_return_address),
|
|
R16_thread,
|
|
R4_ARG2);
|
|
// Copy handler's address.
|
|
__ mtctr(R3_RET);
|
|
__ pop_frame();
|
|
__ restore_LR(R0);
|
|
|
|
// Set up the arguments for the exception handler:
|
|
// - R3_ARG1: exception oop
|
|
// - R4_ARG2: exception pc.
|
|
|
|
// Load pending exception oop.
|
|
__ ld(R3_ARG1,
|
|
in_bytes(Thread::pending_exception_offset()),
|
|
R16_thread);
|
|
|
|
// The exception pc is the return address in the caller.
|
|
// Must load it into R4_ARG2.
|
|
__ mflr(R4_ARG2);
|
|
|
|
#ifdef ASSERT
|
|
// Make sure exception is set.
|
|
{
|
|
Label L;
|
|
__ cmpdi(CR0, R3_ARG1, 0);
|
|
__ bne(CR0, L);
|
|
__ stop("StubRoutines::forward exception: no pending exception (2)");
|
|
__ bind(L);
|
|
}
|
|
#endif
|
|
|
|
// Clear the pending exception.
|
|
__ li(R0, 0);
|
|
__ std(R0,
|
|
in_bytes(Thread::pending_exception_offset()),
|
|
R16_thread);
|
|
// Jump to exception handler.
|
|
__ bctr();
|
|
|
|
return start;
|
|
}
|
|
|
|
#undef __
|
|
#define __ _masm->
|
|
|
|
#if !defined(PRODUCT)
|
|
// Wrapper which calls oopDesc::is_oop_or_null()
|
|
// Only called by MacroAssembler::verify_oop
|
|
static void verify_oop_helper(const char* message, oopDesc* o) {
|
|
if (!oopDesc::is_oop_or_null(o)) {
|
|
fatal("%s. oop: " PTR_FORMAT, message, p2i(o));
|
|
}
|
|
++ StubRoutines::_verify_oop_count;
|
|
}
|
|
#endif
|
|
|
|
// Return address of code to be called from code generated by
|
|
// MacroAssembler::verify_oop.
|
|
//
|
|
// Don't generate, rather use C++ code.
|
|
address generate_verify_oop() {
|
|
// this is actually a `FunctionDescriptor*'.
|
|
address start = nullptr;
|
|
|
|
#if !defined(PRODUCT)
|
|
start = CAST_FROM_FN_PTR(address, verify_oop_helper);
|
|
#endif
|
|
|
|
return start;
|
|
}
|
|
|
|
// Computes the Galois/Counter Mode (GCM) product and reduction.
|
|
//
|
|
// This function performs polynomial multiplication of the subkey H with
|
|
// the current GHASH state using vectorized polynomial multiplication (`vpmsumd`).
|
|
// The subkey H is divided into lower, middle, and higher halves.
|
|
// The multiplication results are reduced using `vConstC2` to stay within GF(2^128).
|
|
// The final computed value is stored back into `vState`.
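//
// Roughly, in field notation (an illustrative sketch, not generated code):
//   X        = data block ^ vState                     (the initial vxor)
//   products = vLowerH*X, vSwappedH*X, vHigherH*X      (three vpmsumd)
//   vState   = reduce(products) in GF(2^128), folding with vConstC2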
|
|
static void computeGCMProduct(MacroAssembler* _masm,
|
|
VectorRegister vLowerH, VectorRegister vH, VectorRegister vHigherH,
|
|
VectorRegister vConstC2, VectorRegister vZero, VectorRegister vState,
|
|
VectorRegister vLowProduct, VectorRegister vMidProduct, VectorRegister vHighProduct,
|
|
VectorRegister vReducedLow, VectorRegister vTmp8, VectorRegister vTmp9,
|
|
VectorRegister vCombinedResult, VectorRegister vSwappedH) {
|
|
__ vxor(vH, vH, vState);
|
|
__ vpmsumd(vLowProduct, vLowerH, vH); // L : Lower Half of subkey H
|
|
__ vpmsumd(vMidProduct, vSwappedH, vH); // M : Combined halves of subkey H
|
|
__ vpmsumd(vHighProduct, vHigherH, vH); // H : Higher Half of subkey H
|
|
__ vpmsumd(vReducedLow, vLowProduct, vConstC2); // Reduction
|
|
__ vsldoi(vTmp8, vMidProduct, vZero, 8); // mL : Extract the lower 64 bits of M
|
|
__ vsldoi(vTmp9, vZero, vMidProduct, 8); // mH : Extract the higher 64 bits of M
|
|
__ vxor(vLowProduct, vLowProduct, vTmp8); // LL + mL : Partial result for lower half
|
|
__ vxor(vHighProduct, vHighProduct, vTmp9); // HH + mH : Partial result for upper half
|
|
__ vsldoi(vLowProduct, vLowProduct, vLowProduct, 8); // Swap
|
|
__ vxor(vLowProduct, vLowProduct, vReducedLow);
|
|
__ vsldoi(vCombinedResult, vLowProduct, vLowProduct, 8); // Swap
|
|
__ vpmsumd(vLowProduct, vLowProduct, vConstC2); // Reduction using constant
|
|
__ vxor(vCombinedResult, vCombinedResult, vHighProduct); // Combine reduced Low & High products
|
|
__ vxor(vState, vLowProduct, vCombinedResult);
|
|
}
|
|
|
|
// Generate stub for ghash process blocks.
|
|
//
|
|
// Arguments for generated stub:
|
|
// state: R3_ARG1 (long[] state)
|
|
// subkeyH: R4_ARG2 (long[] subH)
|
|
// data: R5_ARG3 (byte[] data)
|
|
// blocks: R6_ARG4 (number of 16-byte blocks to process)
|
|
//
|
|
// The polynomials are processed in bit-reflected order for efficiency reasons.
|
|
// This optimization leverages the structure of the Galois field arithmetic
|
|
// to minimize the number of bit manipulations required during multiplication.
|
|
// For an explanation of how this works, refer to:
|
|
// Vinodh Gopal, Erdinc Ozturk, Wajdi Feghali, Jim Guilford, Gil Wolrich,
|
|
// Martin Dixon. "Optimized Galois-Counter-Mode Implementation on Intel®
|
|
// Architecture Processor"
|
|
// http://web.archive.org/web/20130609111954/http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/communications-ia-galois-counter-mode-paper.pdf
|
|
//
|
|
//
|
|
address generate_ghash_processBlocks() {
|
|
StubCodeMark mark(this, "StubRoutines", "ghash");
|
|
address start = __ function_entry();
|
|
|
|
// Registers for parameters
|
|
Register state = R3_ARG1; // long[] state
|
|
Register subkeyH = R4_ARG2; // long[] subH
|
|
Register data = R5_ARG3; // byte[] data
|
|
Register blocks = R6_ARG4;
|
|
Register temp1 = R8;
|
|
// Vector Registers
|
|
VectorRegister vZero = VR0;
|
|
VectorRegister vH = VR1;
|
|
VectorRegister vLowerH = VR2;
|
|
VectorRegister vHigherH = VR3;
|
|
VectorRegister vLowProduct = VR4;
|
|
VectorRegister vMidProduct = VR5;
|
|
VectorRegister vHighProduct = VR6;
|
|
VectorRegister vReducedLow = VR7;
|
|
VectorRegister vTmp8 = VR8;
|
|
VectorRegister vTmp9 = VR9;
|
|
VectorRegister vTmp10 = VR10;
|
|
VectorRegister vSwappedH = VR11;
|
|
VectorRegister vTmp12 = VR12;
|
|
VectorRegister loadOrder = VR13;
|
|
VectorRegister vHigh = VR14;
|
|
VectorRegister vLow = VR15;
|
|
VectorRegister vState = VR16;
|
|
VectorRegister vPerm = VR17;
|
|
VectorRegister vCombinedResult = VR18;
|
|
VectorRegister vConstC2 = VR19;
|
|
|
|
__ li(temp1, 0xc2);
|
|
__ sldi(temp1, temp1, 56);
|
|
__ vspltisb(vZero, 0);
|
|
__ mtvrd(vConstC2, temp1);
|
|
__ lxvd2x(vH->to_vsr(), subkeyH);
|
|
__ lxvd2x(vState->to_vsr(), state);
|
|
// Operations to obtain lower and higher bytes of subkey H.
|
|
__ vspltisb(vReducedLow, 1);
|
|
__ vspltisb(vTmp10, 7);
|
|
__ vsldoi(vTmp8, vZero, vReducedLow, 1); // 0x1
|
|
__ vor(vTmp8, vConstC2, vTmp8); // 0xC2...1
|
|
__ vsplt(vTmp9, 0, vH); // MSB of H
|
|
__ vsl(vH, vH, vReducedLow); // Carry = H<<7
|
|
__ vsrab(vTmp9, vTmp9, vTmp10);
|
|
__ vand(vTmp9, vTmp9, vTmp8); // Carry
|
|
__ vxor(vTmp10, vH, vTmp9);
|
|
__ vsldoi(vConstC2, vZero, vConstC2, 8);
|
|
__ vsldoi(vSwappedH, vTmp10, vTmp10, 8); // swap Lower and Higher Halves of subkey H
|
|
__ vsldoi(vLowerH, vZero, vSwappedH, 8); // H.L
|
|
__ vsldoi(vHigherH, vSwappedH, vZero, 8); // H.H
|
|
#ifdef ASSERT
|
|
__ cmpwi(CR0, blocks, 0); // Compare 'blocks' (R6_ARG4) with zero
|
|
__ asm_assert_ne("blocks should NOT be zero");
|
|
#endif
|
|
__ clrldi(blocks, blocks, 32);
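// blocks arrives as a 32-bit int; clear the upper half before using it as the loop count.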
|
|
__ mtctr(blocks);
|
|
__ lvsl(loadOrder, temp1);
|
|
#ifdef VM_LITTLE_ENDIAN
|
|
__ vspltisb(vTmp12, 0xf);
|
|
__ vxor(loadOrder, loadOrder, vTmp12);
|
|
#define LE_swap_bytes(x) __ vec_perm(x, x, x, loadOrder)
|
|
#else
|
|
#define LE_swap_bytes(x)
|
|
#endif
|
|
|
|
// This code performs Karatsuba multiplication in Galois fields to compute the GHASH operation.
|
|
//
|
|
// The Karatsuba method breaks the multiplication of two 128-bit numbers into smaller parts,
|
|
// performing three 128-bit multiplications and combining the results efficiently.
|
|
//
|
|
// (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
|
|
// (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
|
|
//
|
|
// Inputs:
|
|
// - vH: The data vector (state), containing both B0 (lower half) and B1 (higher half).
|
|
// - vLowerH: Lower half of the subkey H (A0).
|
|
// - vHigherH: Higher half of the subkey H (A1).
|
|
// - vConstC2: Constant used for reduction (for final processing).
|
|
//
|
|
// References:
|
|
// Shay Gueron, Michael E. Kounavis.
|
|
// "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode"
|
|
// https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918
|
|
//
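// Per 16-byte input block X, the loops below effectively compute (sketch):
//   state = (state ^ X) * H  in GF(2^128), reduced modulo x^128 + x^7 + x^2 + x + 1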
|
|
Label L_aligned_loop, L_store, L_unaligned_loop, L_initialize_unaligned_loop;
|
|
__ andi(temp1, data, 15);
|
|
__ cmpwi(CR0, temp1, 0);
|
|
__ bne(CR0, L_initialize_unaligned_loop);
|
|
|
|
__ bind(L_aligned_loop);
|
|
__ lvx(vH, temp1, data);
|
|
LE_swap_bytes(vH);
|
|
computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
|
|
vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
|
|
__ addi(data, data, 16);
|
|
__ bdnz(L_aligned_loop);
|
|
__ b(L_store);
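// Unaligned path below: lvx only loads 16-byte-aligned quadwords, so each block
// is assembled from the two aligned quadwords that straddle it and merged with
// vec_perm, using the permute control that lvsl derives from the data address.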
|
|
|
|
__ bind(L_initialize_unaligned_loop);
|
|
__ li(temp1, 0);
|
|
__ lvsl(vPerm, temp1, data);
|
|
__ lvx(vHigh, temp1, data);
|
|
#ifdef VM_LITTLE_ENDIAN
|
|
__ vspltisb(vTmp12, -1);
|
|
__ vxor(vPerm, vPerm, vTmp12);
|
|
#endif
|
|
__ bind(L_unaligned_loop);
|
|
__ addi(data, data, 16);
|
|
__ lvx(vLow, temp1, data);
|
|
__ vec_perm(vH, vHigh, vLow, vPerm);
|
|
computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
|
|
vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
|
|
__ vmr(vHigh, vLow);
|
|
__ bdnz(L_unaligned_loop);
|
|
|
|
__ bind(L_store);
|
|
__ stxvd2x(vState->to_vsr(), state);
|
|
__ blr();
|
|
|
|
return start;
|
|
}
|
|
// -XX:+OptimizeFill : convert fill/copy loops into intrinsic
|
|
//
|
|
// The code is implemented (ported from SPARC) as we believe it benefits JVM98; however,
// tracing (-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all!
|
|
//
|
|
// Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition
|
|
// for turning on loop predication optimization, and hence the behavior of "array range check"
|
|
// and "loop invariant check" could be influenced, which potentially boosted JVM98.
|
|
//
|
|
// Generate stub for disjoint short fill. If "aligned" is true, the
|
|
// "to" address is assumed to be heapword aligned.
|
|
//
|
|
// Arguments for generated stub:
|
|
// to: R3_ARG1
|
|
// value: R4_ARG2
|
|
// count: R5_ARG3 treated as signed
|
|
//
|
|
address generate_fill(StubId stub_id) {
|
|
BasicType t;
|
|
bool aligned;
|
|
|
|
switch (stub_id) {
|
|
case StubId::stubgen_jbyte_fill_id:
|
|
t = T_BYTE;
|
|
aligned = false;
|
|
break;
|
|
case StubId::stubgen_jshort_fill_id:
|
|
t = T_SHORT;
|
|
aligned = false;
|
|
break;
|
|
case StubId::stubgen_jint_fill_id:
|
|
t = T_INT;
|
|
aligned = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_jbyte_fill_id:
|
|
t = T_BYTE;
|
|
aligned = true;
|
|
break;
|
|
case StubId::stubgen_arrayof_jshort_fill_id:
|
|
t = T_SHORT;
|
|
aligned = true;
|
|
break;
|
|
case StubId::stubgen_arrayof_jint_fill_id:
|
|
t = T_INT;
|
|
aligned = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ function_entry();
|
|
|
|
const Register to = R3_ARG1; // source array address
|
|
const Register value = R4_ARG2; // fill value
|
|
const Register count = R5_ARG3; // elements count
|
|
const Register temp = R6_ARG4; // temp register
|
|
|
|
//assert_clean_int(count, O3); // Make sure 'count' is clean int.
|
|
|
|
Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
|
|
Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
|
|
|
|
int shift = -1;
|
|
switch (t) {
|
|
case T_BYTE:
|
|
shift = 2;
|
|
// Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
|
|
__ rldimi(value, value, 8, 48); // 8 bit -> 16 bit
|
|
__ cmpdi(CR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
|
|
__ blt(CR0, L_fill_elements);
|
|
__ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
|
|
break;
|
|
case T_SHORT:
|
|
shift = 1;
|
|
// Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
|
|
__ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
|
|
__ cmpdi(CR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
|
|
__ blt(CR0, L_fill_elements);
|
|
break;
|
|
case T_INT:
|
|
shift = 0;
|
|
__ cmpdi(CR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
|
|
__ blt(CR0, L_fill_4_bytes);
|
|
break;
|
|
default: ShouldNotReachHere();
|
|
}
|
|
|
|
if (!aligned && (t == T_BYTE || t == T_SHORT)) {
|
|
// Align source address at 4 bytes address boundary.
|
|
if (t == T_BYTE) {
|
|
// One byte misalignment happens only for byte arrays.
|
|
__ andi_(temp, to, 1);
|
|
__ beq(CR0, L_skip_align1);
|
|
__ stb(value, 0, to);
|
|
__ addi(to, to, 1);
|
|
__ addi(count, count, -1);
|
|
__ bind(L_skip_align1);
|
|
}
|
|
// Two bytes misalignment happens only for byte and short (char) arrays.
|
|
__ andi_(temp, to, 2);
|
|
__ beq(CR0, L_skip_align2);
|
|
__ sth(value, 0, to);
|
|
__ addi(to, to, 2);
|
|
__ addi(count, count, -(1 << (shift - 1)));
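// (count is in elements, so the 2-byte store above consumes 1 << (shift - 1)
// elements: 2 for byte arrays, 1 for short arrays.)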
|
|
__ bind(L_skip_align2);
|
|
}
|
|
|
|
if (!aligned) {
|
|
// Align to 8 bytes, we know we are 4 byte aligned to start.
|
|
__ andi_(temp, to, 7);
|
|
__ beq(CR0, L_fill_32_bytes);
|
|
__ stw(value, 0, to);
|
|
__ addi(to, to, 4);
|
|
__ addi(count, count, -(1 << shift));
|
|
__ bind(L_fill_32_bytes);
|
|
}
|
|
|
|
__ li(temp, 8<<shift); // Prepare for 32 byte loop.
|
|
// Clone bytes int->long as above.
|
|
__ rldimi(value, value, 32, 0); // 32 bit -> 64 bit
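// Example (illustrative only): a byte fill with value 0xAB has been widened by
// the rldimi steps to 0xABAB, then 0xABABABAB, and now 0xABABABABABABABAB, so
// each 8-byte std below stores eight copies of the fill value.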
|
|
|
|
Label L_check_fill_8_bytes;
|
|
// Fill 32-byte chunks.
|
|
__ subf_(count, temp, count);
|
|
__ blt(CR0, L_check_fill_8_bytes);
|
|
|
|
Label L_fill_32_bytes_loop;
|
|
__ align(32);
|
|
__ bind(L_fill_32_bytes_loop);
|
|
|
|
__ std(value, 0, to);
|
|
__ std(value, 8, to);
|
|
__ subf_(count, temp, count); // Update count.
|
|
__ std(value, 16, to);
|
|
__ std(value, 24, to);
|
|
|
|
__ addi(to, to, 32);
|
|
__ bge(CR0, L_fill_32_bytes_loop);
|
|
|
|
__ bind(L_check_fill_8_bytes);
|
|
__ add_(count, temp, count);
|
|
__ beq(CR0, L_exit);
|
|
__ addic_(count, count, -(2 << shift));
|
|
__ blt(CR0, L_fill_4_bytes);
|
|
|
|
//
|
|
// Length is too short, just fill 8 bytes at a time.
|
|
//
|
|
Label L_fill_8_bytes_loop;
|
|
__ bind(L_fill_8_bytes_loop);
|
|
__ std(value, 0, to);
|
|
__ addic_(count, count, -(2 << shift));
|
|
__ addi(to, to, 8);
|
|
__ bge(CR0, L_fill_8_bytes_loop);
|
|
|
|
// Fill trailing 4 bytes.
|
|
__ bind(L_fill_4_bytes);
|
|
__ andi_(temp, count, 1<<shift);
|
|
__ beq(CR0, L_fill_2_bytes);
|
|
|
|
__ stw(value, 0, to);
|
|
if (t == T_BYTE || t == T_SHORT) {
|
|
__ addi(to, to, 4);
|
|
// Fill trailing 2 bytes.
|
|
__ bind(L_fill_2_bytes);
|
|
__ andi_(temp, count, 1<<(shift-1));
|
|
__ beq(CR0, L_fill_byte);
|
|
__ sth(value, 0, to);
|
|
if (t == T_BYTE) {
|
|
__ addi(to, to, 2);
|
|
// Fill trailing byte.
|
|
__ bind(L_fill_byte);
|
|
__ andi_(count, count, 1);
|
|
__ beq(CR0, L_exit);
|
|
__ stb(value, 0, to);
|
|
} else {
|
|
__ bind(L_fill_byte);
|
|
}
|
|
} else {
|
|
__ bind(L_fill_2_bytes);
|
|
}
|
|
__ bind(L_exit);
|
|
__ blr();
|
|
|
|
// Handle fills of less than 8 bytes. Int is handled elsewhere.
|
|
if (t == T_BYTE) {
|
|
__ bind(L_fill_elements);
|
|
Label L_fill_2, L_fill_4;
|
|
__ andi_(temp, count, 1);
|
|
__ beq(CR0, L_fill_2);
|
|
__ stb(value, 0, to);
|
|
__ addi(to, to, 1);
|
|
__ bind(L_fill_2);
|
|
__ andi_(temp, count, 2);
|
|
__ beq(CR0, L_fill_4);
|
|
__ stb(value, 0, to);
|
|
__ stb(value, 1, to);
|
|
__ addi(to, to, 2);
|
|
__ bind(L_fill_4);
|
|
__ andi_(temp, count, 4);
|
|
__ beq(CR0, L_exit);
|
|
__ stb(value, 0, to);
|
|
__ stb(value, 1, to);
|
|
__ stb(value, 2, to);
|
|
__ stb(value, 3, to);
|
|
__ blr();
|
|
}
|
|
|
|
if (t == T_SHORT) {
|
|
Label L_fill_2;
|
|
__ bind(L_fill_elements);
|
|
__ andi_(temp, count, 1);
|
|
__ beq(CR0, L_fill_2);
|
|
__ sth(value, 0, to);
|
|
__ addi(to, to, 2);
|
|
__ bind(L_fill_2);
|
|
__ andi_(temp, count, 2);
|
|
__ beq(CR0, L_exit);
|
|
__ sth(value, 0, to);
|
|
__ sth(value, 2, to);
|
|
__ blr();
|
|
}
|
|
return start;
|
|
}
|
|
|
|
inline void assert_positive_int(Register count) {
|
|
#ifdef ASSERT
|
|
__ srdi_(R0, count, 31);
|
|
__ asm_assert_eq("missing zero extend");
|
|
#endif
|
|
}
|
|
|
|
// Generate overlap test for array copy stubs.
|
|
//
|
|
// Input:
|
|
// R3_ARG1 - from
|
|
// R4_ARG2 - to
|
|
// R5_ARG3 - element count
|
|
//
|
|
void array_overlap_test(address no_overlap_target, int log2_elem_size) {
|
|
Register tmp1 = R6_ARG4;
|
|
Register tmp2 = R7_ARG5;
|
|
|
|
assert_positive_int(R5_ARG3);
|
|
|
|
__ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
|
|
__ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
|
|
__ cmpld(CR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
|
|
__ cmpld(CR1, tmp1, tmp2);
|
|
__ crnand(CR0, Assembler::less, CR1, Assembler::less);
|
|
// Overlap occurs if src lies before dst and the distance is smaller than the copy size.
// Branch to the forward copy routine otherwise (within range of 32kB).
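// In C-like terms (illustrative sketch): the forward routine is taken unless
//   (R3_ARG1 <u R4_ARG2) && ((R4_ARG2 - R3_ARG1) <u (R5_ARG3 << log2_elem_size))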
|
|
__ bc(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::less), no_overlap_target);
|
|
|
|
// need to copy backwards
|
|
}
|
|
|
|
// This is the common error exit stub for UnsafeMemoryAccess.
|
|
address generate_unsafecopy_common_error_exit() {
|
|
address start_pc = __ pc();
|
|
Register tmp1 = R6_ARG4;
|
|
// The copy stub has probably changed the DSCR value; reset it.
|
|
if (VM_Version::has_mfdscr()) {
|
|
__ load_const_optimized(tmp1, VM_Version::_dscr_val);
|
|
__ mtdscr(tmp1);
|
|
}
|
|
__ li(R3_RET, 0); // return 0
|
|
__ blr();
|
|
return start_pc;
|
|
}
|
|
|
|
// The guideline in the implementations of generate_disjoint_xxx_copy
|
|
// (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
|
|
// single instructions, but to avoid alignment interrupts (see subsequent
|
|
// comment). Furthermore, we try to minimize misaligned accesses, even
// though they cause no alignment interrupt.
|
|
//
|
|
// In Big-Endian mode, the PowerPC architecture requires implementations to
|
|
// handle automatically misaligned integer halfword and word accesses,
|
|
// word-aligned integer doubleword accesses, and word-aligned floating-point
|
|
// accesses. Other accesses may or may not generate an Alignment interrupt
|
|
// depending on the implementation.
|
|
// Alignment interrupt handling may require on the order of hundreds of cycles,
|
|
// so every effort should be made to avoid misaligned memory values.
|
|
//
|
|
//
|
|
// Generate stub for disjoint byte copy. If "aligned" is true, the
|
|
// "from" and "to" addresses are assumed to be heapword aligned.
|
|
//
|
|
// Arguments for generated stub:
|
|
// from: R3_ARG1
|
|
// to: R4_ARG2
|
|
// count: R5_ARG3 treated as signed
|
|
//
|
|
address generate_disjoint_byte_copy(StubId stub_id) {
|
|
bool aligned;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_jbyte_disjoint_arraycopy_id:
|
|
aligned = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
|
|
aligned = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ function_entry();
|
|
assert_positive_int(R5_ARG3);
|
|
|
|
Register tmp1 = R6_ARG4;
|
|
Register tmp2 = R7_ARG5;
|
|
Register tmp3 = R8_ARG6;
|
|
Register tmp4 = R9_ARG7;
|
|
|
|
VectorSRegister tmp_vsr1 = VSR1;
|
|
VectorSRegister tmp_vsr2 = VSR2;
|
|
|
|
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
|
|
{
|
|
// UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
|
|
UnsafeMemoryAccessMark umam(this, !aligned, false);
|
|
|
|
// Don't try anything fancy if arrays don't have many elements.
|
|
__ li(tmp3, 0);
|
|
__ cmpwi(CR0, R5_ARG3, 17);
|
|
__ ble(CR0, l_6); // copy 4 at a time
|
|
|
|
if (!aligned) {
|
|
__ xorr(tmp1, R3_ARG1, R4_ARG2);
|
|
__ andi_(tmp1, tmp1, 3);
|
|
__ bne(CR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
|
|
|
|
// Copy elements if necessary to align to 4 bytes.
|
|
__ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
|
|
__ andi_(tmp1, tmp1, 3);
|
|
__ beq(CR0, l_2);
|
|
|
|
__ subf(R5_ARG3, tmp1, R5_ARG3);
|
|
__ bind(l_9);
|
|
__ lbz(tmp2, 0, R3_ARG1);
|
|
__ addic_(tmp1, tmp1, -1);
|
|
__ stb(tmp2, 0, R4_ARG2);
|
|
__ addi(R3_ARG1, R3_ARG1, 1);
|
|
__ addi(R4_ARG2, R4_ARG2, 1);
|
|
__ bne(CR0, l_9);
|
|
|
|
__ bind(l_2);
|
|
}
|
|
|
|
// copy 8 elements at a time
|
|
__ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
|
|
__ andi_(tmp1, tmp2, 7);
|
|
__ bne(CR0, l_7); // not same alignment -> to or from is aligned -> copy 8
|
|
|
|
// copy a 2-element word if necessary to align to 8 bytes
|
|
__ andi_(R0, R3_ARG1, 7);
|
|
__ beq(CR0, l_7);
|
|
|
|
__ lwzx(tmp2, R3_ARG1, tmp3);
|
|
__ addi(R5_ARG3, R5_ARG3, -4);
|
|
__ stwx(tmp2, R4_ARG2, tmp3);
|
|
{ // FasterArrayCopy
|
|
__ addi(R3_ARG1, R3_ARG1, 4);
|
|
__ addi(R4_ARG2, R4_ARG2, 4);
|
|
}
|
|
__ bind(l_7);
|
|
|
|
{ // FasterArrayCopy
|
|
__ cmpwi(CR0, R5_ARG3, 31);
|
|
__ ble(CR0, l_6); // copy 2 at a time if less than 32 elements remain
|
|
|
|
__ srdi(tmp1, R5_ARG3, 5);
|
|
__ andi_(R5_ARG3, R5_ARG3, 31);
|
|
__ mtctr(tmp1);
|
|
|
|
|
|
// Prefetch the data into the L2 cache.
|
|
__ dcbt(R3_ARG1, 0);
|
|
|
|
// If supported set DSCR pre-fetch to deepest.
|
|
if (VM_Version::has_mfdscr()) {
|
|
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
|
|
__ mtdscr(tmp2);
|
|
}
|
|
__ li(tmp1, 16);
|
|
|
|
// Backbranch target aligned to 32 bytes. Not aligned to 16 bytes, as the
// loop contains < 8 instructions that fit inside a single
// i-cache sector.
|
|
__ align(32);
|
|
|
|
__ bind(l_10);
|
|
// Use loop with VSX load/store instructions to
|
|
// copy 32 elements at a time.
|
|
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
|
|
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
|
|
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
|
|
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
|
|
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
|
|
__ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
|
|
__ bdnz(l_10); // Dec CTR and loop if not zero.
|
|
|
|
// Restore DSCR pre-fetch value.
|
|
if (VM_Version::has_mfdscr()) {
|
|
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
|
|
__ mtdscr(tmp2);
|
|
}
|
|
|
|
} // FasterArrayCopy
|
|
|
|
__ bind(l_6);
|
|
|
|
// copy 4 elements at a time
|
|
__ cmpwi(CR0, R5_ARG3, 4);
|
|
__ blt(CR0, l_1);
|
|
__ srdi(tmp1, R5_ARG3, 2);
|
|
__ mtctr(tmp1); // is > 0
|
|
__ andi_(R5_ARG3, R5_ARG3, 3);
|
|
|
|
{ // FasterArrayCopy
|
|
__ addi(R3_ARG1, R3_ARG1, -4);
|
|
__ addi(R4_ARG2, R4_ARG2, -4);
|
|
__ bind(l_3);
|
|
__ lwzu(tmp2, 4, R3_ARG1);
|
|
__ stwu(tmp2, 4, R4_ARG2);
|
|
__ bdnz(l_3);
|
|
__ addi(R3_ARG1, R3_ARG1, 4);
|
|
__ addi(R4_ARG2, R4_ARG2, 4);
|
|
}
|
|
|
|
// do single element copy
|
|
__ bind(l_1);
|
|
__ cmpwi(CR0, R5_ARG3, 0);
|
|
__ beq(CR0, l_4);
|
|
|
|
{ // FasterArrayCopy
|
|
__ mtctr(R5_ARG3);
|
|
__ addi(R3_ARG1, R3_ARG1, -1);
|
|
__ addi(R4_ARG2, R4_ARG2, -1);
|
|
|
|
__ bind(l_5);
|
|
__ lbzu(tmp2, 1, R3_ARG1);
|
|
__ stbu(tmp2, 1, R4_ARG2);
|
|
__ bdnz(l_5);
|
|
}
|
|
}
|
|
|
|
__ bind(l_4);
|
|
__ li(R3_RET, 0); // return 0
|
|
__ blr();
|
|
|
|
return start;
|
|
}
|
|
|
|
// Generate stub for conjoint byte copy. If "aligned" is true, the
|
|
// "from" and "to" addresses are assumed to be heapword aligned.
|
|
//
|
|
// Arguments for generated stub:
|
|
// from: R3_ARG1
|
|
// to: R4_ARG2
|
|
// count: R5_ARG3 treated as signed
|
|
//
|
|
address generate_conjoint_byte_copy(StubId stub_id) {
|
|
bool aligned;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_jbyte_arraycopy_id:
|
|
aligned = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_jbyte_arraycopy_id:
|
|
aligned = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ function_entry();
|
|
assert_positive_int(R5_ARG3);
|
|
|
|
Register tmp1 = R6_ARG4;
|
|
Register tmp2 = R7_ARG5;
|
|
Register tmp3 = R8_ARG6;
|
|
|
|
address nooverlap_target = aligned ?
|
|
STUB_ENTRY(arrayof_jbyte_disjoint_arraycopy()) :
|
|
STUB_ENTRY(jbyte_disjoint_arraycopy());
|
|
|
|
array_overlap_test(nooverlap_target, 0);
|
|
// Do reverse copy. We assume the case of actual overlap is rare enough
|
|
// that we don't have to optimize it.
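// Scalar equivalent of the loop below (sketch):
//   while (--count >= 0) { to[count] = from[count]; }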
|
|
Label l_1, l_2;
|
|
{
|
|
// UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
|
|
UnsafeMemoryAccessMark umam(this, !aligned, false);
|
|
__ b(l_2);
|
|
__ bind(l_1);
|
|
__ stbx(tmp1, R4_ARG2, R5_ARG3);
|
|
__ bind(l_2);
|
|
__ addic_(R5_ARG3, R5_ARG3, -1);
|
|
__ lbzx(tmp1, R3_ARG1, R5_ARG3);
|
|
__ bge(CR0, l_1);
|
|
}
|
|
__ li(R3_RET, 0); // return 0
|
|
__ blr();
|
|
|
|
return start;
|
|
}
|
|
|
|
// Generate stub for disjoint short copy. If "aligned" is true, the
|
|
// "from" and "to" addresses are assumed to be heapword aligned.
|
|
//
|
|
// Arguments for generated stub:
|
|
// from: R3_ARG1
|
|
// to: R4_ARG2
|
|
// elm.count: R5_ARG3 treated as signed
|
|
//
|
|
// Strategy for aligned==true:
|
|
//
|
|
// If length <= 9:
|
|
// 1. copy 2 elements at a time (l_6)
|
|
// 2. copy last element if original element count was odd (l_1)
|
|
//
|
|
// If length > 9:
|
|
// 1. copy 4 elements at a time until less than 4 elements are left (l_7)
|
|
// 2. copy 2 elements at a time until less than 2 elements are left (l_6)
|
|
// 3. copy last element if one was left in step 2. (l_1)
|
|
//
|
|
//
|
|
// Strategy for aligned==false:
|
|
//
|
|
// If length <= 9: same as aligned==true case, but NOTE: load/stores
|
|
// can be unaligned (see comment below)
|
|
//
|
|
// If length > 9:
|
|
// 1. continue with step 6. if the alignment of from and to mod 4
|
|
// is different.
|
|
// 2. align from and to to 4 bytes by copying 1 element if necessary
|
|
// 3. at l_2 from and to are 4 byte aligned; continue with
|
|
// 5. if they cannot be aligned to 8 bytes because they have
|
|
// got different alignment mod 8.
|
|
// 4. at this point we know that both, from and to, have the same
|
|
// alignment mod 8, now copy one element if necessary to get
|
|
// 8 byte alignment of from and to.
|
|
// 5. copy 4 elements at a time until less than 4 elements are
|
|
// left; depending on step 3. all load/stores are aligned or
|
|
// either all loads or all stores are unaligned.
|
|
// 6. copy 2 elements at a time until less than 2 elements are
|
|
// left (l_6); arriving here from step 1., there is a chance
|
|
// that all accesses are unaligned.
|
|
// 7. copy last element if one was left in step 6. (l_1)
|
|
//
|
|
// There are unaligned data accesses using integer load/store
|
|
// instructions in this stub. POWER allows such accesses.
|
|
//
|
|
// According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
|
|
// Chapter 2: Effect of Operand Placement on Performance) unaligned
|
|
// integer load/stores have good performance. Only unaligned
|
|
// floating point load/stores can have poor performance.
|
|
//
|
|
// TODO:
|
|
//
|
|
// 1. check if aligning the backbranch target of loops is beneficial
|
|
//
|
|
address generate_disjoint_short_copy(StubId stub_id) {
|
|
bool aligned;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_jshort_disjoint_arraycopy_id:
|
|
aligned = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
|
|
aligned = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
Register tmp1 = R6_ARG4;
|
|
Register tmp2 = R7_ARG5;
|
|
Register tmp3 = R8_ARG6;
|
|
Register tmp4 = R9_ARG7;
|
|
|
|
VectorSRegister tmp_vsr1 = VSR1;
|
|
VectorSRegister tmp_vsr2 = VSR2;
|
|
|
|
address start = __ function_entry();
|
|
assert_positive_int(R5_ARG3);
|
|
|
|
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
|
|
{
|
|
// UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
|
|
UnsafeMemoryAccessMark umam(this, !aligned, false);
|
|
// don't try anything fancy if arrays don't have many elements
|
|
__ li(tmp3, 0);
|
|
__ cmpwi(CR0, R5_ARG3, 9);
|
|
__ ble(CR0, l_6); // copy 2 at a time
|
|
|
|
if (!aligned) {
|
|
__ xorr(tmp1, R3_ARG1, R4_ARG2);
|
|
__ andi_(tmp1, tmp1, 3);
|
|
__ bne(CR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
|
|
|
|
// At this point it is guaranteed that both, from and to have the same alignment mod 4.
|
|
|
|
// Copy 1 element if necessary to align to 4 bytes.
|
|
__ andi_(tmp1, R3_ARG1, 3);
|
|
__ beq(CR0, l_2);
|
|
|
|
__ lhz(tmp2, 0, R3_ARG1);
|
|
__ addi(R3_ARG1, R3_ARG1, 2);
|
|
__ sth(tmp2, 0, R4_ARG2);
|
|
__ addi(R4_ARG2, R4_ARG2, 2);
|
|
__ addi(R5_ARG3, R5_ARG3, -1);
|
|
__ bind(l_2);
|
|
|
|
// At this point the positions of both, from and to, are at least 4 byte aligned.
|
|
|
|
// Copy 4 elements at a time.
|
|
// Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
|
|
__ xorr(tmp2, R3_ARG1, R4_ARG2);
|
|
__ andi_(tmp1, tmp2, 7);
|
|
__ bne(CR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
|
|
|
|
// Copy a 2-element word if necessary to align to 8 bytes.
|
|
__ andi_(R0, R3_ARG1, 7);
|
|
__ beq(CR0, l_7);
|
|
|
|
__ lwzx(tmp2, R3_ARG1, tmp3);
|
|
__ addi(R5_ARG3, R5_ARG3, -2);
|
|
__ stwx(tmp2, R4_ARG2, tmp3);
|
|
{ // FasterArrayCopy
|
|
__ addi(R3_ARG1, R3_ARG1, 4);
|
|
__ addi(R4_ARG2, R4_ARG2, 4);
|
|
}
|
|
}
|
|
|
|
__ bind(l_7);
|
|
|
|
// Copy 4 elements at a time; either the loads or the stores can
|
|
// be unaligned if aligned == false.
|
|
|
|
{ // FasterArrayCopy
|
|
__ cmpwi(CR0, R5_ARG3, 15);
|
|
__ ble(CR0, l_6); // copy 2 at a time if less than 16 elements remain
|
|
|
|
__ srdi(tmp1, R5_ARG3, 4);
|
|
__ andi_(R5_ARG3, R5_ARG3, 15);
|
|
__ mtctr(tmp1);
|
|
|
|
|
|
// Processor supports VSX, so use it to mass copy.
|
|
|
|
// Prefetch src data into L2 cache.
|
|
__ dcbt(R3_ARG1, 0);
|
|
|
|
// If supported set DSCR pre-fetch to deepest.
|
|
if (VM_Version::has_mfdscr()) {
|
|
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
|
|
__ mtdscr(tmp2);
|
|
}
|
|
__ li(tmp1, 16);
|
|
|
|
// Backbranch target aligned to 32 bytes. It's not aligned to 16 bytes,
// as the loop contains < 8 instructions that fit inside a single
// i-cache sector.
|
|
__ align(32);
|
|
|
|
__ bind(l_9);
|
|
// Use loop with VSX load/store instructions to
|
|
// copy 16 elements at a time.
|
|
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load from src.
|
|
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst.
|
|
__ lxvd2x(tmp_vsr2, R3_ARG1, tmp1); // Load from src + 16.
|
|
__ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
|
|
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32.
|
|
__ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32.
|
|
__ bdnz(l_9); // Dec CTR and loop if not zero.
|
|
|
|
// Restore DSCR pre-fetch value.
|
|
if (VM_Version::has_mfdscr()) {
|
|
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
|
|
__ mtdscr(tmp2);
|
|
}
|
|
|
|
} // FasterArrayCopy
|
|
__ bind(l_6);
|
|
|
|
// copy 2 elements at a time
|
|
{ // FasterArrayCopy
|
|
__ cmpwi(CR0, R5_ARG3, 2);
|
|
__ blt(CR0, l_1);
|
|
__ srdi(tmp1, R5_ARG3, 1);
|
|
__ andi_(R5_ARG3, R5_ARG3, 1);
|
|
|
|
__ addi(R3_ARG1, R3_ARG1, -4);
|
|
__ addi(R4_ARG2, R4_ARG2, -4);
|
|
__ mtctr(tmp1);
|
|
|
|
__ bind(l_3);
|
|
__ lwzu(tmp2, 4, R3_ARG1);
|
|
__ stwu(tmp2, 4, R4_ARG2);
|
|
__ bdnz(l_3);
|
|
|
|
__ addi(R3_ARG1, R3_ARG1, 4);
|
|
__ addi(R4_ARG2, R4_ARG2, 4);
|
|
}
|
|
|
|
// do single element copy
|
|
__ bind(l_1);
|
|
__ cmpwi(CR0, R5_ARG3, 0);
|
|
__ beq(CR0, l_4);
|
|
|
|
{ // FasterArrayCopy
|
|
__ mtctr(R5_ARG3);
|
|
__ addi(R3_ARG1, R3_ARG1, -2);
|
|
__ addi(R4_ARG2, R4_ARG2, -2);
|
|
|
|
__ bind(l_5);
|
|
__ lhzu(tmp2, 2, R3_ARG1);
|
|
__ sthu(tmp2, 2, R4_ARG2);
|
|
__ bdnz(l_5);
|
|
}
|
|
}
|
|
|
|
__ bind(l_4);
|
|
__ li(R3_RET, 0); // return 0
|
|
__ blr();
|
|
|
|
return start;
|
|
}
|
|
|
|
// Generate stub for conjoint short copy. If "aligned" is true, the
|
|
// "from" and "to" addresses are assumed to be heapword aligned.
|
|
//
|
|
// Arguments for generated stub:
|
|
// from: R3_ARG1
|
|
// to: R4_ARG2
|
|
// count: R5_ARG3 treated as signed
|
|
//
|
|
address generate_conjoint_short_copy(StubId stub_id) {
|
|
bool aligned;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_jshort_arraycopy_id:
|
|
aligned = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_jshort_arraycopy_id:
|
|
aligned = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ function_entry();
|
|
assert_positive_int(R5_ARG3);
|
|
|
|
Register tmp1 = R6_ARG4;
|
|
Register tmp2 = R7_ARG5;
|
|
Register tmp3 = R8_ARG6;
|
|
|
|
address nooverlap_target = aligned ?
|
|
STUB_ENTRY(arrayof_jshort_disjoint_arraycopy()) :
|
|
STUB_ENTRY(jshort_disjoint_arraycopy());
|
|
|
|
array_overlap_test(nooverlap_target, 1);
|
|
|
|
Label l_1, l_2;
|
|
{
|
|
// UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
|
|
UnsafeMemoryAccessMark umam(this, !aligned, false);
|
|
__ sldi(tmp1, R5_ARG3, 1);
|
|
__ b(l_2);
|
|
__ bind(l_1);
|
|
__ sthx(tmp2, R4_ARG2, tmp1);
|
|
__ bind(l_2);
|
|
__ addic_(tmp1, tmp1, -2);
|
|
__ lhzx(tmp2, R3_ARG1, tmp1);
|
|
__ bge(CR0, l_1);
|
|
}
|
|
__ li(R3_RET, 0); // return 0
|
|
__ blr();
|
|
|
|
return start;
|
|
}
|
|
|
|
// Generate core code for disjoint int copy (and oop copy on 32-bit). If "aligned"
|
|
// is true, the "from" and "to" addresses are assumed to be heapword aligned.
|
|
//
|
|
// Arguments:
|
|
// from: R3_ARG1
|
|
// to: R4_ARG2
|
|
// count: R5_ARG3 treated as signed
|
|
//
|
|
void generate_disjoint_int_copy_core(bool aligned) {
|
|
Register tmp1 = R6_ARG4;
|
|
Register tmp2 = R7_ARG5;
|
|
Register tmp3 = R8_ARG6;
|
|
Register tmp4 = R0;
|
|
|
|
VectorSRegister tmp_vsr1 = VSR1;
|
|
VectorSRegister tmp_vsr2 = VSR2;
|
|
|
|
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
|
|
|
|
// for short arrays, just do single element copy
|
|
__ li(tmp3, 0);
|
|
__ cmpwi(CR0, R5_ARG3, 5);
|
|
__ ble(CR0, l_2);
|
|
|
|
if (!aligned) {
|
|
// check if arrays have same alignment mod 8.
|
|
__ xorr(tmp1, R3_ARG1, R4_ARG2);
|
|
__ andi_(R0, tmp1, 7);
|
|
// Not the same alignment, but ld and std just need to be 4 byte aligned.
|
|
__ bne(CR0, l_4); // to OR from is 8 byte aligned -> copy 2 at a time
|
|
|
|
// copy 1 element to align to and from on an 8 byte boundary
|
|
__ andi_(R0, R3_ARG1, 7);
|
|
__ beq(CR0, l_4);
|
|
|
|
__ lwzx(tmp2, R3_ARG1, tmp3);
|
|
__ addi(R5_ARG3, R5_ARG3, -1);
|
|
__ stwx(tmp2, R4_ARG2, tmp3);
|
|
{ // FasterArrayCopy
|
|
__ addi(R3_ARG1, R3_ARG1, 4);
|
|
__ addi(R4_ARG2, R4_ARG2, 4);
|
|
}
|
|
__ bind(l_4);
|
|
}
|
|
|
|
{ // FasterArrayCopy
|
|
__ cmpwi(CR0, R5_ARG3, 7);
|
|
__ ble(CR0, l_2); // copy 1 at a time if less than 8 elements remain
|
|
|
|
__ srdi(tmp1, R5_ARG3, 3);
|
|
__ andi_(R5_ARG3, R5_ARG3, 7);
|
|
__ mtctr(tmp1);
|
|
|
|
// Processor supports VSX, so use it to mass copy.
|
|
|
|
// Prefetch the data into the L2 cache.
|
|
__ dcbt(R3_ARG1, 0);
|
|
|
|
// Set DSCR pre-fetch to deepest.
|
|
if (VM_Version::has_mfdscr()) {
|
|
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
|
|
__ mtdscr(tmp2);
|
|
}
|
|
__ li(tmp1, 16);
|
|
|
|
// Backbranch target aligned to 32 bytes. Not aligned to 16 bytes, as the
// loop contains < 8 instructions that fit inside a single
// i-cache sector.
|
|
__ align(32);
|
|
|
|
__ bind(l_7);
|
|
// Use loop with VSX load/store instructions to
|
|
// copy 8 elements at a time.
|
|
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
|
|
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
|
|
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
|
|
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
|
|
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
|
|
__ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
|
|
__ bdnz(l_7); // Dec CTR and loop if not zero.
|
|
|
|
// Restore DSCR pre-fetch value.
|
|
if (VM_Version::has_mfdscr()) {
|
|
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
|
|
__ mtdscr(tmp2);
|
|
}
|
|
|
|
} // FasterArrayCopy
|
|
|
|
// copy 1 element at a time
|
|
__ bind(l_2);
|
|
__ cmpwi(CR0, R5_ARG3, 0);
|
|
__ beq(CR0, l_1);
|
|
|
|
{ // FasterArrayCopy
|
|
__ mtctr(R5_ARG3);
|
|
__ addi(R3_ARG1, R3_ARG1, -4);
|
|
__ addi(R4_ARG2, R4_ARG2, -4);
|
|
|
|
__ bind(l_3);
|
|
__ lwzu(tmp2, 4, R3_ARG1);
|
|
__ stwu(tmp2, 4, R4_ARG2);
|
|
__ bdnz(l_3);
|
|
}
|
|
|
|
__ bind(l_1);
|
|
return;
|
|
}
|
|
|
|
// Generate stub for disjoint int copy. If "aligned" is true, the
|
|
// "from" and "to" addresses are assumed to be heapword aligned.
|
|
//
|
|
// Arguments for generated stub:
|
|
// from: R3_ARG1
|
|
// to: R4_ARG2
|
|
// count: R5_ARG3 treated as signed
|
|
//
|
|
address generate_disjoint_int_copy(StubId stub_id) {
|
|
bool aligned;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_jint_disjoint_arraycopy_id:
|
|
aligned = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
|
|
aligned = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ function_entry();
|
|
assert_positive_int(R5_ARG3);
|
|
{
|
|
// UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
|
|
UnsafeMemoryAccessMark umam(this, !aligned, false);
|
|
generate_disjoint_int_copy_core(aligned);
|
|
}
|
|
__ li(R3_RET, 0); // return 0
|
|
__ blr();
|
|
return start;
|
|
}
|
|
|
|
// Generate core code for conjoint int copy (and oop copy on
|
|
// 32-bit). If "aligned" is true, the "from" and "to" addresses
|
|
// are assumed to be heapword aligned.
|
|
//
|
|
// Arguments:
|
|
// from: R3_ARG1
|
|
// to: R4_ARG2
|
|
// count: R5_ARG3 treated as signed
|
|
//
|
|
void generate_conjoint_int_copy_core(bool aligned) {
|
|
// Do reverse copy. We assume the case of actual overlap is rare enough
|
|
// that we don't have to optimize it.
|
|
|
|
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
|
|
|
|
Register tmp1 = R6_ARG4;
|
|
Register tmp2 = R7_ARG5;
|
|
Register tmp3 = R8_ARG6;
|
|
Register tmp4 = R0;
|
|
|
|
VectorSRegister tmp_vsr1 = VSR1;
|
|
VectorSRegister tmp_vsr2 = VSR2;
|
|
|
|
{ // FasterArrayCopy
|
|
__ cmpwi(CR0, R5_ARG3, 0);
|
|
__ beq(CR0, l_6);
|
|
|
|
__ sldi(R5_ARG3, R5_ARG3, 2);
|
|
__ add(R3_ARG1, R3_ARG1, R5_ARG3);
|
|
__ add(R4_ARG2, R4_ARG2, R5_ARG3);
|
|
__ srdi(R5_ARG3, R5_ARG3, 2);
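// from/to (R3_ARG1/R4_ARG2) now point just past the last element: count was
// scaled to bytes, added to both pointers, then scaled back, so the loops
// below can walk backwards through the arrays.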
|
|
|
|
if (!aligned) {
|
|
// check if arrays have same alignment mod 8.
|
|
__ xorr(tmp1, R3_ARG1, R4_ARG2);
|
|
__ andi_(R0, tmp1, 7);
|
|
// Not the same alignment, but ld and std just need to be 4 byte aligned.
|
|
__ bne(CR0, l_7); // to OR from is 8 byte aligned -> copy 2 at a time
|
|
|
|
// copy 1 element to align to and from on an 8 byte boundary
|
|
__ andi_(R0, R3_ARG1, 7);
|
|
__ beq(CR0, l_7);
|
|
|
|
__ addi(R3_ARG1, R3_ARG1, -4);
|
|
__ addi(R4_ARG2, R4_ARG2, -4);
|
|
__ addi(R5_ARG3, R5_ARG3, -1);
|
|
__ lwzx(tmp2, R3_ARG1);
|
|
__ stwx(tmp2, R4_ARG2);
|
|
__ bind(l_7);
|
|
}
|
|
|
|
__ cmpwi(CR0, R5_ARG3, 7);
|
|
__ ble(CR0, l_5); // copy 1 at a time if less than 8 elements remain
|
|
|
|
__ srdi(tmp1, R5_ARG3, 3);
|
|
__ andi(R5_ARG3, R5_ARG3, 7);
|
|
__ mtctr(tmp1);
|
|
|
|
// Processor supports VSX, so use it to mass copy.
|
|
// Prefetch the data into the L2 cache.
|
|
__ dcbt(R3_ARG1, 0);
|
|
|
|
// Set DSCR pre-fetch to deepest.
|
|
if (VM_Version::has_mfdscr()) {
|
|
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
|
|
__ mtdscr(tmp2);
|
|
}
|
|
__ li(tmp1, 16);
|
|
|
|
// Backbranch target aligned to 32 bytes. Not aligned to 16 bytes, as the
// loop contains < 8 instructions that fit inside a single
// i-cache sector.
|
|
__ align(32);
|
|
|
|
__ bind(l_4);
|
|
// Use loop with VSX load/store instructions to
|
|
// copy 8 elements at a time.
|
|
__ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
|
|
__ addi(R4_ARG2, R4_ARG2, -32); // Update dst-=32
|
|
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16
|
|
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
|
|
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
|
|
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
|
|
__ bdnz(l_4);
|
|
|
|
// Restore DSCR pre-fetch value.
|
|
if (VM_Version::has_mfdscr()) {
|
|
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
|
|
__ mtdscr(tmp2);
|
|
}
|
|
|
|
__ cmpwi(CR0, R5_ARG3, 0);
|
|
__ beq(CR0, l_6);
|
|
|
|
__ bind(l_5);
|
|
__ mtctr(R5_ARG3);
|
|
__ bind(l_3);
|
|
__ lwz(R0, -4, R3_ARG1);
|
|
__ stw(R0, -4, R4_ARG2);
|
|
__ addi(R3_ARG1, R3_ARG1, -4);
|
|
__ addi(R4_ARG2, R4_ARG2, -4);
|
|
__ bdnz(l_3);
|
|
|
|
__ bind(l_6);
|
|
}
|
|
}
|
|
|
|
// Generate stub for conjoint int copy. If "aligned" is true, the
|
|
// "from" and "to" addresses are assumed to be heapword aligned.
|
|
//
|
|
// Arguments for generated stub:
|
|
// from: R3_ARG1
|
|
// to: R4_ARG2
|
|
// count: R5_ARG3 treated as signed
|
|
//
|
|
address generate_conjoint_int_copy(StubId stub_id) {
|
|
bool aligned;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_jint_arraycopy_id:
|
|
aligned = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_jint_arraycopy_id:
|
|
aligned = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ function_entry();
|
|
assert_positive_int(R5_ARG3);
|
|
address nooverlap_target = aligned ?
|
|
STUB_ENTRY(arrayof_jint_disjoint_arraycopy()) :
|
|
STUB_ENTRY(jint_disjoint_arraycopy());
|
|
|
|
array_overlap_test(nooverlap_target, 2);
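// The second argument of array_overlap_test is log2(element size): 2 for jint.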
{
|
|
// UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
|
|
UnsafeMemoryAccessMark umam(this, !aligned, false);
|
|
generate_conjoint_int_copy_core(aligned);
|
|
}
|
|
|
|
__ li(R3_RET, 0); // return 0
|
|
__ blr();
|
|
|
|
return start;
|
|
}
|
|
|
|
// Generate core code for disjoint long copy (and oop copy on
|
|
// 64-bit). If "aligned" is true, the "from" and "to" addresses
|
|
// are assumed to be heapword aligned.
|
|
//
|
|
// Arguments:
|
|
// from: R3_ARG1
|
|
// to: R4_ARG2
|
|
// count: R5_ARG3 treated as signed
|
|
//
|
|
void generate_disjoint_long_copy_core(bool aligned) {
|
|
Register tmp1 = R6_ARG4;
|
|
Register tmp2 = R7_ARG5;
|
|
Register tmp3 = R8_ARG6;
|
|
Register tmp4 = R0;
|
|
|
|
Label l_1, l_2, l_3, l_4, l_5;
|
|
|
|
VectorSRegister tmp_vsr1 = VSR1;
|
|
VectorSRegister tmp_vsr2 = VSR2;
|
|
|
|
{ // FasterArrayCopy
|
|
__ cmpwi(CR0, R5_ARG3, 3);
|
|
__ ble(CR0, l_3); // copy 1 at a time if less than 4 elements remain
|
|
|
|
__ srdi(tmp1, R5_ARG3, 2);
|
|
__ andi_(R5_ARG3, R5_ARG3, 3);
|
|
__ mtctr(tmp1);
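// CTR now holds count / 4 (each iteration copies 4 longs = 32 bytes);
// R5_ARG3 keeps count % 4 for the element-wise tail loop below.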
// Processor supports VSX, so use it to mass copy.
|
|
|
|
// Prefetch the data into the L2 cache.
|
|
__ dcbt(R3_ARG1, 0);
|
|
|
|
// Set DSCR pre-fetch to deepest.
|
|
if (VM_Version::has_mfdscr()) {
|
|
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
|
|
__ mtdscr(tmp2);
|
|
}
|
|
__ li(tmp1, 16);
|
|
|
|
// Backbranch target aligned to 32 bytes, rather than 16, so that the whole
// loop (< 8 instructions) fits within a single i-cache sector.
__ align(32);

__ bind(l_5);
// Use a loop with VSX load/store instructions to
// copy 4 elements at a time.
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
__ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
__ bdnz(l_5); // Dec CTR and loop if not zero.
|
|
|
|
// Restore DSCR pre-fetch value.
|
|
if (VM_Version::has_mfdscr()) {
|
|
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
|
|
__ mtdscr(tmp2);
|
|
}
|
|
|
|
} // FasterArrayCopy
|
|
|
|
// copy 1 element at a time
|
|
__ bind(l_3);
|
|
__ cmpwi(CR0, R5_ARG3, 0);
|
|
__ beq(CR0, l_1);
|
|
|
|
{ // FasterArrayCopy
|
|
__ mtctr(R5_ARG3);
|
|
__ addi(R3_ARG1, R3_ARG1, -8);
|
|
__ addi(R4_ARG2, R4_ARG2, -8);
|
|
|
|
__ bind(l_2);
|
|
__ ldu(R0, 8, R3_ARG1);
|
|
__ stdu(R0, 8, R4_ARG2);
|
|
__ bdnz(l_2);
|
|
|
|
}
|
|
__ bind(l_1);
|
|
}
|
|
|
|
// Generate stub for disjoint long copy. If "aligned" is true, the
|
|
// "from" and "to" addresses are assumed to be heapword aligned.
|
|
//
|
|
// Arguments for generated stub:
|
|
// from: R3_ARG1
|
|
// to: R4_ARG2
|
|
// count: R5_ARG3 treated as signed
|
|
//
|
|
address generate_disjoint_long_copy(StubId stub_id) {
|
|
bool aligned;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_jlong_disjoint_arraycopy_id:
|
|
aligned = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
|
|
aligned = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ function_entry();
|
|
assert_positive_int(R5_ARG3);
|
|
{
|
|
// UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
|
|
UnsafeMemoryAccessMark umam(this, !aligned, false);
|
|
generate_disjoint_long_copy_core(aligned);
|
|
}
|
|
__ li(R3_RET, 0); // return 0
|
|
__ blr();
|
|
|
|
return start;
|
|
}
|
|
|
|
// Generate core code for conjoint long copy (and oop copy on
|
|
// 64-bit). If "aligned" is true, the "from" and "to" addresses
|
|
// are assumed to be heapword aligned.
|
|
//
|
|
// Arguments:
|
|
// from: R3_ARG1
|
|
// to: R4_ARG2
|
|
// count: R5_ARG3 treated as signed
|
|
//
|
|
void generate_conjoint_long_copy_core(bool aligned) {
|
|
Register tmp1 = R6_ARG4;
|
|
Register tmp2 = R7_ARG5;
|
|
Register tmp3 = R8_ARG6;
|
|
Register tmp4 = R0;
|
|
|
|
VectorSRegister tmp_vsr1 = VSR1;
|
|
VectorSRegister tmp_vsr2 = VSR2;
|
|
|
|
Label l_1, l_2, l_3, l_4, l_5;
|
|
|
|
__ cmpwi(CR0, R5_ARG3, 0);
|
|
__ beq(CR0, l_1);
|
|
|
|
{ // FasterArrayCopy
|
|
__ sldi(R5_ARG3, R5_ARG3, 3);
|
|
__ add(R3_ARG1, R3_ARG1, R5_ARG3);
|
|
__ add(R4_ARG2, R4_ARG2, R5_ARG3);
|
|
__ srdi(R5_ARG3, R5_ARG3, 3);
|
|
|
|
__ cmpwi(CR0, R5_ARG3, 3);
|
|
__ ble(CR0, l_5); // copy 1 at a time if less than 4 elements remain
|
|
|
|
__ srdi(tmp1, R5_ARG3, 2);
|
|
__ andi(R5_ARG3, R5_ARG3, 3);
|
|
__ mtctr(tmp1);
|
|
|
|
// Processor supports VSX, so use it to mass copy.
|
|
// Prefetch the data into the L2 cache.
|
|
__ dcbt(R3_ARG1, 0);
|
|
|
|
// Set DSCR pre-fetch to deepest.
|
|
if (VM_Version::has_mfdscr()) {
|
|
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
|
|
__ mtdscr(tmp2);
|
|
}
|
|
__ li(tmp1, 16);
|
|
|
|
// Backbranch target aligned to 32 bytes, rather than 16, so that the whole
// loop (< 8 instructions) fits within a single i-cache sector.
__ align(32);

__ bind(l_4);
// Use a loop with VSX load/store instructions to
// copy 4 elements at a time.
__ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
__ addi(R4_ARG2, R4_ARG2, -32); // Update dst-=32
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
__ bdnz(l_4);
|
|
|
|
// Restore DSCR pre-fetch value.
|
|
if (VM_Version::has_mfdscr()) {
|
|
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
|
|
__ mtdscr(tmp2);
|
|
}
|
|
|
|
__ cmpwi(CR0, R5_ARG3, 0);
|
|
__ beq(CR0, l_1);
|
|
|
|
__ bind(l_5);
|
|
__ mtctr(R5_ARG3);
|
|
__ bind(l_3);
|
|
__ ld(R0, -8, R3_ARG1);
|
|
__ std(R0, -8, R4_ARG2);
|
|
__ addi(R3_ARG1, R3_ARG1, -8);
|
|
__ addi(R4_ARG2, R4_ARG2, -8);
|
|
__ bdnz(l_3);
|
|
|
|
}
|
|
__ bind(l_1);
|
|
}
|
|
|
|
// Generate stub for conjoint long copy. If "aligned" is true, the
|
|
// "from" and "to" addresses are assumed to be heapword aligned.
|
|
//
|
|
// Arguments for generated stub:
|
|
// from: R3_ARG1
|
|
// to: R4_ARG2
|
|
// count: R5_ARG3 treated as signed
|
|
//
|
|
address generate_conjoint_long_copy(StubId stub_id) {
|
|
bool aligned;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_jlong_arraycopy_id:
|
|
aligned = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_jlong_arraycopy_id:
|
|
aligned = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ function_entry();
|
|
assert_positive_int(R5_ARG3);
|
|
address nooverlap_target = aligned ?
|
|
STUB_ENTRY(arrayof_jlong_disjoint_arraycopy()) :
|
|
STUB_ENTRY(jlong_disjoint_arraycopy());
|
|
|
|
array_overlap_test(nooverlap_target, 3);
|
|
{
|
|
// UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
|
|
UnsafeMemoryAccessMark umam(this, !aligned, false);
|
|
generate_conjoint_long_copy_core(aligned);
|
|
}
|
|
__ li(R3_RET, 0); // return 0
|
|
__ blr();
|
|
|
|
return start;
|
|
}
|
|
|
|
// Generate stub for conjoint oop copy. If "aligned" is true, the
|
|
// "from" and "to" addresses are assumed to be heapword aligned.
|
|
//
|
|
// Arguments for generated stub:
|
|
// from: R3_ARG1
|
|
// to: R4_ARG2
|
|
// count: R5_ARG3 treated as signed
|
|
// dest_uninitialized: G1 support
|
|
//
|
|
address generate_conjoint_oop_copy(StubId stub_id) {
|
|
bool aligned;
|
|
bool dest_uninitialized;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_oop_arraycopy_id:
|
|
aligned = false;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_oop_arraycopy_id:
|
|
aligned = true;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_oop_arraycopy_uninit_id:
|
|
aligned = false;
|
|
dest_uninitialized = true;
|
|
break;
|
|
case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
|
|
aligned = true;
|
|
dest_uninitialized = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ function_entry();
|
|
assert_positive_int(R5_ARG3);
|
|
address nooverlap_target = aligned ?
|
|
STUB_ENTRY(arrayof_oop_disjoint_arraycopy(dest_uninitialized)) :
|
|
STUB_ENTRY(oop_disjoint_arraycopy(dest_uninitialized));
|
|
|
|
array_overlap_test(nooverlap_target, UseCompressedOops ? 2 : 3);
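// Element size is 4 bytes (log2 == 2) with compressed oops, otherwise 8 bytes (log2 == 3).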
DecoratorSet decorators = IN_HEAP | IS_ARRAY;
|
|
if (dest_uninitialized) {
|
|
decorators |= IS_DEST_UNINITIALIZED;
|
|
}
|
|
if (aligned) {
|
|
decorators |= ARRAYCOPY_ALIGNED;
|
|
}
|
|
|
|
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
|
|
bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
|
|
|
|
if (UseCompressedOops) {
|
|
generate_conjoint_int_copy_core(aligned);
|
|
} else {
|
|
#if INCLUDE_ZGC
|
|
if (UseZGC) {
|
|
ZBarrierSetAssembler *zbs = (ZBarrierSetAssembler*)bs;
|
|
zbs->generate_conjoint_oop_copy(_masm, dest_uninitialized);
|
|
} else
|
|
#endif
|
|
generate_conjoint_long_copy_core(aligned);
|
|
}
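// Compressed oops are copied as 32-bit values, uncompressed oops as 64-bit values.
// ZGC uses its own copy loop above so that its barriers are applied per element.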
bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
|
|
__ li(R3_RET, 0); // return 0
|
|
__ blr();
|
|
return start;
|
|
}
|
|
|
|
// Generate stub for disjoint oop copy. If "aligned" is true, the
|
|
// "from" and "to" addresses are assumed to be heapword aligned.
|
|
//
|
|
// Arguments for generated stub:
|
|
// from: R3_ARG1
|
|
// to: R4_ARG2
|
|
// count: R5_ARG3 treated as signed
|
|
// dest_uninitialized: G1 support
|
|
//
|
|
address generate_disjoint_oop_copy(StubId stub_id) {
|
|
bool aligned;
|
|
bool dest_uninitialized;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_oop_disjoint_arraycopy_id:
|
|
aligned = false;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
|
|
aligned = true;
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
|
|
aligned = false;
|
|
dest_uninitialized = true;
|
|
break;
|
|
case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
|
|
aligned = true;
|
|
dest_uninitialized = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ function_entry();
|
|
assert_positive_int(R5_ARG3);
|
|
|
|
DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
|
|
if (dest_uninitialized) {
|
|
decorators |= IS_DEST_UNINITIALIZED;
|
|
}
|
|
if (aligned) {
|
|
decorators |= ARRAYCOPY_ALIGNED;
|
|
}
|
|
|
|
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
|
|
bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
|
|
|
|
if (UseCompressedOops) {
|
|
generate_disjoint_int_copy_core(aligned);
|
|
} else {
|
|
#if INCLUDE_ZGC
|
|
if (UseZGC) {
|
|
ZBarrierSetAssembler *zbs = (ZBarrierSetAssembler*)bs;
|
|
zbs->generate_disjoint_oop_copy(_masm, dest_uninitialized);
|
|
} else
|
|
#endif
|
|
generate_disjoint_long_copy_core(aligned);
|
|
}
|
|
|
|
bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
|
|
__ li(R3_RET, 0); // return 0
|
|
__ blr();
|
|
|
|
return start;
|
|
}
|
|
|
|
|
|
// Helper for generating a dynamic type check.
|
|
// Smashes only the given temp registers.
|
|
void generate_type_check(Register sub_klass,
|
|
Register super_check_offset,
|
|
Register super_klass,
|
|
Register temp1,
|
|
Register temp2,
|
|
Label& L_success) {
|
|
assert_different_registers(sub_klass, super_check_offset, super_klass);
|
|
|
|
BLOCK_COMMENT("type_check:");
|
|
|
|
Label L_miss;
|
|
|
|
__ check_klass_subtype_fast_path(sub_klass, super_klass, temp1, temp2, &L_success, &L_miss, nullptr,
|
|
super_check_offset);
|
|
__ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success);
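// The fast path handles the cached superclass check (via super_check_offset);
// the slow path searches the secondary supers of sub_klass.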
// Fall through on failure!
|
|
__ bind(L_miss);
|
|
}
|
|
|
|
|
|
// Generate stub for checked oop copy.
|
|
//
|
|
// Arguments for generated stub:
|
|
// from: R3
|
|
// to: R4
|
|
// count: R5 treated as signed
|
|
// ckoff: R6 (super_check_offset)
|
|
// ckval: R7 (super_klass)
|
|
// ret: R3 zero for success; (-1^K) where K is partial transfer count
|
|
//
|
|
address generate_checkcast_copy(StubId stub_id) {
|
|
const Register R3_from = R3_ARG1; // source array address
|
|
const Register R4_to = R4_ARG2; // destination array address
|
|
const Register R5_count = R5_ARG3; // elements count
|
|
const Register R6_ckoff = R6_ARG4; // super_check_offset
|
|
const Register R7_ckval = R7_ARG5; // super_klass
|
|
|
|
const Register R8_offset = R8_ARG6; // loop var, with stride wordSize
|
|
const Register R9_remain = R9_ARG7; // loop var, with stride -1
|
|
const Register R10_oop = R10_ARG8; // actual oop copied
|
|
const Register R11_klass = R11_scratch1; // oop._klass
|
|
const Register R12_tmp = R12_scratch2;
|
|
const Register R2_tmp = R2;
|
|
|
|
bool dest_uninitialized;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_checkcast_arraycopy_id:
|
|
dest_uninitialized = false;
|
|
break;
|
|
case StubId::stubgen_checkcast_arraycopy_uninit_id:
|
|
dest_uninitialized = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
//__ align(CodeEntryAlignment);
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ function_entry();
|
|
|
|
// Assert that int is 64 bit sign extended and arrays are not conjoint.
|
|
#ifdef ASSERT
|
|
{
|
|
assert_positive_int(R5_ARG3);
|
|
const Register tmp1 = R11_scratch1, tmp2 = R12_scratch2;
|
|
Label no_overlap;
|
|
__ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
|
|
__ sldi(tmp2, R5_ARG3, LogBytesPerHeapOop); // size in bytes
|
|
__ cmpld(CR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
|
|
__ cmpld(CR1, tmp1, tmp2);
|
|
__ crnand(CR0, Assembler::less, CR1, Assembler::less);
|
|
// The arrays overlap if src lies before dst and the distance is smaller than the size.
// Otherwise branch to the forward copy code below.
__ blt(CR0, no_overlap);
|
|
__ stop("overlap in checkcast_copy");
|
|
__ bind(no_overlap);
|
|
}
|
|
#endif
|
|
|
|
DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
|
|
if (dest_uninitialized) {
|
|
decorators |= IS_DEST_UNINITIALIZED;
|
|
}
|
|
|
|
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
|
|
bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_from, R4_to, R5_count, /* preserve: */ R6_ckoff, R7_ckval);
|
|
|
|
//inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R12_tmp, R3_RET);
|
|
|
|
Label load_element, store_element, store_null, success, do_epilogue;
|
|
__ or_(R9_remain, R5_count, R5_count); // Initialize loop index, and test it.
|
|
__ li(R8_offset, 0); // Offset from start of arrays.
|
|
__ bne(CR0, load_element);
|
|
|
|
// Empty array: Nothing to do.
|
|
__ li(R3_RET, 0); // Return 0 on (trivial) success.
|
|
__ blr();
|
|
|
|
// ======== begin loop ========
|
|
// (Entry is load_element.)
|
|
__ align(OptoLoopAlignment);
|
|
__ bind(store_element);
|
|
if (UseCompressedOops) {
|
|
__ encode_heap_oop_not_null(R10_oop);
|
|
__ bind(store_null);
|
|
__ stw(R10_oop, R8_offset, R4_to);
|
|
} else {
|
|
__ bind(store_null);
|
|
#if INCLUDE_ZGC
|
|
if (UseZGC) {
|
|
__ store_heap_oop(R10_oop, R8_offset, R4_to, R11_scratch1, R12_tmp, noreg,
|
|
MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
|
|
dest_uninitialized ? IS_DEST_UNINITIALIZED : 0);
|
|
} else
|
|
#endif
|
|
__ std(R10_oop, R8_offset, R4_to);
|
|
}
|
|
|
|
__ addi(R8_offset, R8_offset, heapOopSize); // Step to next offset.
|
|
__ addic_(R9_remain, R9_remain, -1); // Decrement the count.
|
|
__ beq(CR0, success);
|
|
|
|
// ======== loop entry is here ========
|
|
__ bind(load_element);
|
|
#if INCLUDE_ZGC
|
|
if (UseZGC) {
|
|
__ load_heap_oop(R10_oop, R8_offset, R3_from,
|
|
R11_scratch1, R12_tmp,
|
|
MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
|
|
0, &store_null);
|
|
} else
|
|
#endif
|
|
__ load_heap_oop(R10_oop, R8_offset, R3_from,
|
|
R11_scratch1, R12_tmp,
|
|
MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
|
|
AS_RAW, &store_null);
|
|
|
|
__ load_klass(R11_klass, R10_oop); // Query the object klass.
|
|
|
|
generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp, R2_tmp,
|
|
// Branch to this on success:
|
|
store_element);
|
|
// ======== end loop ========
|
|
|
|
// It was a real error; we must depend on the caller to finish the job.
|
|
// Register R9_remain has number of *remaining* oops, R5_count number of *total* oops.
|
|
// Emit GC store barriers for the oops we have copied (R5_count minus R9_remain),
|
|
// and report their number to the caller.
|
|
__ subf_(R5_count, R9_remain, R5_count);
|
|
__ nand(R3_RET, R5_count, R5_count); // report (-1^K) to caller
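// nand(x, x) == ~x, so R3_RET = ~K == -1 ^ K, where K (in R5_count) is the
// number of oops copied successfully before the type check failed.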
__ bne(CR0, do_epilogue);
|
|
__ blr();
|
|
|
|
__ bind(success);
|
|
__ li(R3_RET, 0);
|
|
|
|
__ bind(do_epilogue);
|
|
bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_to, R5_count, /* preserve */ R3_RET);
|
|
|
|
__ blr();
|
|
return start;
|
|
}
|
|
|
|
|
|
// Generate 'unsafe' array copy stub.
|
|
// Though just as safe as the other stubs, it takes an unscaled
|
|
// size_t argument instead of an element count.
|
|
//
|
|
// Arguments for generated stub:
|
|
// from: R3
|
|
// to: R4
|
|
// count: R5 byte count, treated as ssize_t, can be zero
|
|
//
|
|
// Examines the alignment of the operands and dispatches
|
|
// to a long, int, short, or byte copy loop.
|
|
//
|
|
address generate_unsafe_copy(address byte_copy_entry,
|
|
address short_copy_entry,
|
|
address int_copy_entry,
|
|
address long_copy_entry) {
|
|
|
|
const Register R3_from = R3_ARG1; // source array address
|
|
const Register R4_to = R4_ARG2; // destination array address
|
|
const Register R5_count = R5_ARG3; // elements count (as long on PPC64)
|
|
|
|
const Register R6_bits = R6_ARG4; // test copy of low bits
|
|
const Register R7_tmp = R7_ARG5;
|
|
|
|
//__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ function_entry();
|
|
|
|
// Bump this on entry, not on exit:
|
|
//inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R6_bits, R7_tmp);
|
|
|
|
Label short_copy, int_copy, long_copy;
|
|
|
|
__ orr(R6_bits, R3_from, R4_to);
|
|
__ orr(R6_bits, R6_bits, R5_count);
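// R6_bits now ORs together the source address, the destination address and the
// byte count; its low bits determine the widest element size all three are aligned to.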
__ andi_(R0, R6_bits, (BytesPerLong-1));
|
|
__ beq(CR0, long_copy);
|
|
|
|
__ andi_(R0, R6_bits, (BytesPerInt-1));
|
|
__ beq(CR0, int_copy);
|
|
|
|
__ andi_(R0, R6_bits, (BytesPerShort-1));
|
|
__ beq(CR0, short_copy);
|
|
|
|
// byte_copy:
|
|
__ b(byte_copy_entry);
|
|
|
|
__ bind(short_copy);
|
|
__ srwi(R5_count, R5_count, LogBytesPerShort);
|
|
__ b(short_copy_entry);
|
|
|
|
__ bind(int_copy);
|
|
__ srwi(R5_count, R5_count, LogBytesPerInt);
|
|
__ b(int_copy_entry);
|
|
|
|
__ bind(long_copy);
|
|
__ srwi(R5_count, R5_count, LogBytesPerLong);
|
|
__ b(long_copy_entry);
|
|
|
|
return start;
|
|
}
|
|
|
|
|
|
// Perform range checks on the proposed arraycopy.
|
|
// Kills the two temps, but nothing else.
|
|
// Also, clean the sign bits of src_pos and dst_pos.
|
|
void arraycopy_range_checks(Register src, // source array oop
|
|
Register src_pos, // source position
|
|
Register dst, // destination array oop
|
|
Register dst_pos, // destination position
|
|
Register length, // length of copy
|
|
Register temp1, Register temp2,
|
|
Label& L_failed) {
|
|
BLOCK_COMMENT("arraycopy_range_checks:");
|
|
|
|
const Register array_length = temp1; // scratch
|
|
const Register end_pos = temp2; // scratch
|
|
|
|
// if (src_pos + length > arrayOop(src)->length() ) FAIL;
|
|
__ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), src);
|
|
__ add(end_pos, src_pos, length); // src_pos + length
|
|
__ cmpd(CR0, end_pos, array_length);
|
|
__ bgt(CR0, L_failed);
|
|
|
|
// if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
|
|
__ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), dst);
|
|
__ add(end_pos, dst_pos, length); // dst_pos + length
|
|
__ cmpd(CR0, end_pos, array_length);
|
|
__ bgt(CR0, L_failed);
|
|
|
|
BLOCK_COMMENT("arraycopy_range_checks done");
|
|
}
|
|
|
|
|
|
// Helper for generate_unsafe_setmemory
|
|
//
|
|
// Atomically fill an array of memory using 1-, 2-, 4-, or 8-byte chunks and return.
|
|
static void do_setmemory_atomic_loop(int elem_size, Register dest, Register size, Register byteVal,
|
|
MacroAssembler *_masm) {
|
|
|
|
Label L_Loop, L_Tail; // 2x unrolled loop
|
|
|
|
// Propagate byte to required width
|
|
if (elem_size > 1) __ rldimi(byteVal, byteVal, 8, 64 - 2 * 8);
|
|
if (elem_size > 2) __ rldimi(byteVal, byteVal, 16, 64 - 2 * 16);
|
|
if (elem_size > 4) __ rldimi(byteVal, byteVal, 32, 64 - 2 * 32);
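// Each rldimi doubles the replicated width (byte -> halfword -> word -> doubleword),
// so byteVal now holds the fill byte replicated across elem_size bytes.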
__ srwi_(R0, size, exact_log2(2 * elem_size)); // size is a 32 bit value
|
|
__ beq(CR0, L_Tail);
|
|
__ mtctr(R0);
|
|
|
|
__ align(32); // loop alignment
|
|
__ bind(L_Loop);
|
|
__ store_sized_value(byteVal, 0, dest, elem_size);
|
|
__ store_sized_value(byteVal, elem_size, dest, elem_size);
|
|
__ addi(dest, dest, 2 * elem_size);
|
|
__ bdnz(L_Loop);
|
|
|
|
__ bind(L_Tail);
|
|
__ andi_(R0, size, elem_size);
|
|
__ bclr(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::equal), Assembler::bhintbhBCLRisReturn);
|
|
__ store_sized_value(byteVal, 0, dest, elem_size);
|
|
__ blr();
|
|
}
|
|
|
|
//
|
|
// Generate 'unsafe' set memory stub
|
|
// Though just as safe as the other stubs, it takes an unscaled
|
|
// size_t (# bytes) argument instead of an element count.
|
|
//
|
|
// Input:
|
|
// R3_ARG1 - destination array address
|
|
// R4_ARG2 - byte count (size_t)
|
|
// R5_ARG3 - byte value
|
|
//
|
|
address generate_unsafe_setmemory(address unsafe_byte_fill) {
|
|
__ align(CodeEntryAlignment);
|
|
StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
|
|
address start = __ function_entry();
|
|
|
|
// bump this on entry, not on exit:
|
|
// inc_counter_np(SharedRuntime::_unsafe_set_memory_ctr);
|
|
|
|
{
|
|
Label L_fill8Bytes, L_fill4Bytes, L_fillBytes;
|
|
|
|
const Register dest = R3_ARG1;
|
|
const Register size = R4_ARG2;
|
|
const Register byteVal = R5_ARG3;
|
|
const Register rScratch1 = R6;
|
|
|
|
// fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)
|
|
|
|
// Check for pointer & size alignment
|
|
__ orr(rScratch1, dest, size);
|
|
|
|
__ andi_(R0, rScratch1, 7);
|
|
__ beq(CR0, L_fill8Bytes);
|
|
|
|
__ andi_(R0, rScratch1, 3);
|
|
__ beq(CR0, L_fill4Bytes);
|
|
|
|
__ andi_(R0, rScratch1, 1);
|
|
__ bne(CR0, L_fillBytes);
|
|
|
|
// Mark remaining code as such which performs Unsafe accesses.
|
|
UnsafeMemoryAccessMark umam(this, true, false);
|
|
|
|
// At this point the low bit of both dest and size is clear, i.e. they are multiples of 2.
do_setmemory_atomic_loop(2, dest, size, byteVal, _masm);
|
|
|
|
__ align(32);
|
|
__ bind(L_fill8Bytes);
|
|
// At this point the low 3 bits of both dest and size are clear, i.e. they are multiples of 8.
do_setmemory_atomic_loop(8, dest, size, byteVal, _masm);
|
|
|
|
__ align(32);
|
|
__ bind(L_fill4Bytes);
|
|
// At this point the low 2 bits of both dest and size are clear, i.e. they are multiples of 4.
do_setmemory_atomic_loop(4, dest, size, byteVal, _masm);
|
|
|
|
__ align(32);
|
|
__ bind(L_fillBytes);
|
|
do_setmemory_atomic_loop(1, dest, size, byteVal, _masm);
|
|
}
|
|
|
|
return start;
|
|
}
|
|
|
|
|
|
//
|
|
// Generate generic array copy stubs
|
|
//
|
|
// Input:
|
|
// R3 - src oop
|
|
// R4 - src_pos
|
|
// R5 - dst oop
|
|
// R6 - dst_pos
|
|
// R7 - element count
|
|
//
|
|
// Output:
|
|
// R3 == 0 - success
|
|
// R3 == -1 - need to call System.arraycopy
|
|
//
|
|
address generate_generic_copy(address entry_jbyte_arraycopy,
|
|
address entry_jshort_arraycopy,
|
|
address entry_jint_arraycopy,
|
|
address entry_oop_arraycopy,
|
|
address entry_disjoint_oop_arraycopy,
|
|
address entry_jlong_arraycopy,
|
|
address entry_checkcast_arraycopy) {
|
|
Label L_failed, L_objArray;
|
|
|
|
// Input registers
|
|
const Register src = R3_ARG1; // source array oop
|
|
const Register src_pos = R4_ARG2; // source position
|
|
const Register dst = R5_ARG3; // destination array oop
|
|
const Register dst_pos = R6_ARG4; // destination position
|
|
const Register length = R7_ARG5; // elements count
|
|
|
|
// registers used as temp
|
|
const Register src_klass = R8_ARG6; // source array klass
|
|
const Register dst_klass = R9_ARG7; // destination array klass
|
|
const Register lh = R10_ARG8; // layout helper
|
|
const Register temp = R2;
|
|
|
|
//__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_generic_arraycopy_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ function_entry();
|
|
|
|
// Bump this on entry, not on exit:
|
|
//inc_counter_np(SharedRuntime::_generic_array_copy_ctr, lh, temp);
|
|
|
|
// In principle, the int arguments could be dirty.
|
|
|
|
//-----------------------------------------------------------------------
|
|
// Assembler stubs will be used for this call to arraycopy
|
|
// if the following conditions are met:
|
|
//
|
|
// (1) src and dst must not be null.
|
|
// (2) src_pos must not be negative.
|
|
// (3) dst_pos must not be negative.
|
|
// (4) length must not be negative.
|
|
// (5) src klass and dst klass should be the same and not null.
|
|
// (6) src and dst should be arrays.
|
|
// (7) src_pos + length must not exceed length of src.
|
|
// (8) dst_pos + length must not exceed length of dst.
|
|
BLOCK_COMMENT("arraycopy initial argument checks");
|
|
|
|
__ cmpdi(CR1, src, 0); // if (src == nullptr) return -1;
|
|
__ extsw_(src_pos, src_pos); // if (src_pos < 0) return -1;
|
|
__ cmpdi(CR5, dst, 0); // if (dst == nullptr) return -1;
|
|
__ cror(CR1, Assembler::equal, CR0, Assembler::less);
|
|
__ extsw_(dst_pos, dst_pos); // if (dst_pos < 0) return -1;
|
|
__ cror(CR5, Assembler::equal, CR0, Assembler::less);
|
|
__ extsw_(length, length); // if (length < 0) return -1;
|
|
__ cror(CR1, Assembler::equal, CR5, Assembler::equal);
|
|
__ cror(CR1, Assembler::equal, CR0, Assembler::less);
|
|
__ beq(CR1, L_failed);
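// The cror sequence above folds the null checks and the sign checks into CR1.eq,
// so a single branch rejects any invalid argument combination.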
BLOCK_COMMENT("arraycopy argument klass checks");
|
|
__ load_klass(src_klass, src);
|
|
__ load_klass(dst_klass, dst);
|
|
|
|
// Load layout helper
//
//  |array_tag|     | header_size | element_type |     |log2_element_size|
// 32        30    24            16              8     2                 0
//
//   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
//
|
|
|
|
int lh_offset = in_bytes(Klass::layout_helper_offset());
|
|
|
|
// Load the 32-bit signed layout helper value.
|
|
__ lwz(lh, lh_offset, src_klass);
|
|
|
|
// Handle objArrays completely differently...
|
|
jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
|
|
__ load_const_optimized(temp, objArray_lh, R0);
|
|
__ cmpw(CR0, lh, temp);
|
|
__ beq(CR0, L_objArray);
|
|
|
|
__ cmpd(CR5, src_klass, dst_klass); // if (src->klass() != dst->klass()) return -1;
|
|
__ cmpwi(CR6, lh, Klass::_lh_neutral_value); // if (!src->is_Array()) return -1;
|
|
|
|
__ crnand(CR5, Assembler::equal, CR6, Assembler::less);
|
|
__ beq(CR5, L_failed);
|
|
|
|
// At this point, it is known to be a typeArray (array_tag 0x3).
|
|
#ifdef ASSERT
|
|
{ Label L;
|
|
jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
|
|
__ load_const_optimized(temp, lh_prim_tag_in_place, R0);
|
|
__ cmpw(CR0, lh, temp);
|
|
__ bge(CR0, L);
|
|
__ stop("must be a primitive array");
|
|
__ bind(L);
|
|
}
|
|
#endif
|
|
|
|
arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
|
|
temp, dst_klass, L_failed);
|
|
|
|
// TypeArrayKlass
|
|
//
|
|
// src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
|
|
// dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
|
|
//
|
|
|
|
const Register offset = dst_klass; // array offset
|
|
const Register elsize = src_klass; // log2 element size
|
|
|
|
__ rldicl(offset, lh, 64 - Klass::_lh_header_size_shift, 64 - exact_log2(Klass::_lh_header_size_mask + 1));
|
|
__ andi(elsize, lh, Klass::_lh_log2_element_size_mask);
|
|
__ add(src, offset, src); // src array offset
|
|
__ add(dst, offset, dst); // dst array offset
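// 'offset' is the array header size in bytes taken from the layout helper;
// 'elsize' is log2 of the element size. src and dst now point at element 0.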
// Next registers should be set before the jump to corresponding stub.
|
|
const Register from = R3_ARG1; // source array address
|
|
const Register to = R4_ARG2; // destination array address
|
|
const Register count = R5_ARG3; // elements count
|
|
|
|
// 'from', 'to', 'count' registers should be set in this order
|
|
// since they are the same as 'src', 'src_pos', 'dst'.
|
|
|
|
BLOCK_COMMENT("scale indexes to element size");
|
|
__ sld(src_pos, src_pos, elsize);
|
|
__ sld(dst_pos, dst_pos, elsize);
|
|
__ add(from, src_pos, src); // src_addr
|
|
__ add(to, dst_pos, dst); // dst_addr
|
|
__ mr(count, length); // length
|
|
|
|
BLOCK_COMMENT("choose copy loop based on element size");
|
|
// Using conditional branches with range 32kB.
|
|
const int bo = Assembler::bcondCRbiIs1, bi = Assembler::bi0(CR0, Assembler::equal);
|
|
__ cmpwi(CR0, elsize, 0);
|
|
__ bc(bo, bi, entry_jbyte_arraycopy);
|
|
__ cmpwi(CR0, elsize, LogBytesPerShort);
|
|
__ bc(bo, bi, entry_jshort_arraycopy);
|
|
__ cmpwi(CR0, elsize, LogBytesPerInt);
|
|
__ bc(bo, bi, entry_jint_arraycopy);
|
|
#ifdef ASSERT
|
|
{ Label L;
|
|
__ cmpwi(CR0, elsize, LogBytesPerLong);
|
|
__ beq(CR0, L);
|
|
__ stop("must be long copy, but elsize is wrong");
|
|
__ bind(L);
|
|
}
|
|
#endif
|
|
__ b(entry_jlong_arraycopy);
|
|
|
|
// ObjArrayKlass
|
|
__ bind(L_objArray);
|
|
// live at this point: src_klass, dst_klass, src[_pos], dst[_pos], length
|
|
|
|
Label L_disjoint_plain_copy, L_checkcast_copy;
|
|
// test array classes for subtyping
|
|
__ cmpd(CR0, src_klass, dst_klass); // usual case is exact equality
|
|
__ bne(CR0, L_checkcast_copy);
|
|
|
|
// Identically typed arrays can be copied without element-wise checks.
|
|
arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
|
|
temp, lh, L_failed);
|
|
|
|
__ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
|
|
__ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
|
|
__ sldi(src_pos, src_pos, LogBytesPerHeapOop);
|
|
__ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
|
|
__ add(from, src_pos, src); // src_addr
|
|
__ add(to, dst_pos, dst); // dst_addr
|
|
__ mr(count, length); // length
|
|
__ b(entry_oop_arraycopy);
|
|
|
|
__ bind(L_checkcast_copy);
|
|
// live at this point: src_klass, dst_klass
|
|
{
|
|
// Before looking at dst.length, make sure dst is also an objArray.
|
|
__ lwz(temp, lh_offset, dst_klass);
|
|
__ cmpw(CR0, lh, temp);
|
|
__ bne(CR0, L_failed);
|
|
|
|
// It is safe to examine both src.length and dst.length.
|
|
arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
|
|
temp, lh, L_failed);
|
|
|
|
// Marshal the base address arguments now, freeing registers.
|
|
__ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
|
|
__ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
|
|
__ sldi(src_pos, src_pos, LogBytesPerHeapOop);
|
|
__ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
|
|
__ add(from, src_pos, src); // src_addr
|
|
__ add(to, dst_pos, dst); // dst_addr
|
|
__ mr(count, length); // length
|
|
|
|
Register sco_temp = R6_ARG4; // This register is free now.
|
|
assert_different_registers(from, to, count, sco_temp,
|
|
dst_klass, src_klass);
|
|
|
|
// Generate the type check.
|
|
int sco_offset = in_bytes(Klass::super_check_offset_offset());
|
|
__ lwz(sco_temp, sco_offset, dst_klass);
|
|
generate_type_check(src_klass, sco_temp, dst_klass,
|
|
temp, /* temp */ R10_ARG8, L_disjoint_plain_copy);
|
|
|
|
// Fetch destination element klass from the ObjArrayKlass header.
|
|
int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
|
|
|
|
// The checkcast_copy loop needs two extra arguments:
|
|
__ ld(R7_ARG5, ek_offset, dst_klass); // dest elem klass
|
|
__ lwz(R6_ARG4, sco_offset, R7_ARG5); // sco of elem klass
|
|
__ b(entry_checkcast_arraycopy);
|
|
}
|
|
|
|
__ bind(L_disjoint_plain_copy);
|
|
__ b(entry_disjoint_oop_arraycopy);
|
|
|
|
__ bind(L_failed);
|
|
__ li(R3_RET, -1); // return -1
|
|
__ blr();
|
|
return start;
|
|
}
|
|
|
|
// Arguments for generated stub:
|
|
// R3_ARG1 - source byte array address
|
|
// R4_ARG2 - destination byte array address
|
|
// R5_ARG3 - round key array
|
|
address generate_aescrypt_encryptBlock() {
|
|
assert(UseAES, "need AES instructions");
|
|
StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
address start = __ function_entry();
|
|
|
|
Label L_doLast, L_error;
|
|
|
|
Register from = R3_ARG1; // source array address
|
|
Register to = R4_ARG2; // destination array address
|
|
Register key = R5_ARG3; // round key array
|
|
|
|
Register keylen = R8;
|
|
Register temp = R9;
|
|
Register keypos = R10;
|
|
Register fifteen = R12;
|
|
|
|
VectorRegister vRet = VR0;
|
|
|
|
VectorRegister vKey1 = VR1;
|
|
VectorRegister vKey2 = VR2;
|
|
VectorRegister vKey3 = VR3;
|
|
VectorRegister vKey4 = VR4;
|
|
|
|
VectorRegister fromPerm = VR5;
|
|
VectorRegister keyPerm = VR6;
|
|
VectorRegister toPerm = VR7;
|
|
VectorRegister fSplt = VR8;
|
|
|
|
VectorRegister vTmp1 = VR9;
|
|
VectorRegister vTmp2 = VR10;
|
|
VectorRegister vTmp3 = VR11;
|
|
VectorRegister vTmp4 = VR12;
|
|
|
|
__ li (fifteen, 15);
|
|
|
|
// load unaligned from[0-15] to vRet
|
|
__ lvx (vRet, from);
|
|
__ lvx (vTmp1, fifteen, from);
|
|
__ lvsl (fromPerm, from);
|
|
#ifdef VM_LITTLE_ENDIAN
|
|
__ vspltisb (fSplt, 0x0f);
|
|
__ vxor (fromPerm, fromPerm, fSplt);
|
|
#endif
|
|
__ vperm (vRet, vRet, vTmp1, fromPerm);
|
|
|
|
// load keylen (44 or 52 or 60)
|
|
__ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
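// keylen is the length of the int[] round key array: 44, 52 or 60 ints for
// AES-128, AES-192 and AES-256 respectively (4 * (rounds + 1)).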
// to load keys
|
|
__ load_perm (keyPerm, key);
|
|
#ifdef VM_LITTLE_ENDIAN
|
|
__ vspltisb (vTmp2, -16);
|
|
__ vrld (keyPerm, keyPerm, vTmp2);
|
|
__ vrld (keyPerm, keyPerm, vTmp2);
|
|
__ vsldoi (keyPerm, keyPerm, keyPerm, 8);
|
|
#endif
|
|
|
|
// load the 1st round key to vTmp1
|
|
__ lvx (vTmp1, key);
|
|
__ li (keypos, 16);
|
|
__ lvx (vKey1, keypos, key);
|
|
__ vec_perm (vTmp1, vKey1, keyPerm);
|
|
|
|
// 1st round
|
|
__ vxor (vRet, vRet, vTmp1);
|
|
|
|
// load the 2nd round key to vKey1
|
|
__ li (keypos, 32);
|
|
__ lvx (vKey2, keypos, key);
|
|
__ vec_perm (vKey1, vKey2, keyPerm);
|
|
|
|
// load the 3rd round key to vKey2
|
|
__ li (keypos, 48);
|
|
__ lvx (vKey3, keypos, key);
|
|
__ vec_perm (vKey2, vKey3, keyPerm);
|
|
|
|
// load the 4th round key to vKey3
|
|
__ li (keypos, 64);
|
|
__ lvx (vKey4, keypos, key);
|
|
__ vec_perm (vKey3, vKey4, keyPerm);
|
|
|
|
// load the 5th round key to vKey4
|
|
__ li (keypos, 80);
|
|
__ lvx (vTmp1, keypos, key);
|
|
__ vec_perm (vKey4, vTmp1, keyPerm);
|
|
|
|
// 2nd - 5th rounds
|
|
__ vcipher (vRet, vRet, vKey1);
|
|
__ vcipher (vRet, vRet, vKey2);
|
|
__ vcipher (vRet, vRet, vKey3);
|
|
__ vcipher (vRet, vRet, vKey4);
|
|
|
|
// load the 6th round key to vKey1
|
|
__ li (keypos, 96);
|
|
__ lvx (vKey2, keypos, key);
|
|
__ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
|
|
|
|
// load the 7th round key to vKey2
|
|
__ li (keypos, 112);
|
|
__ lvx (vKey3, keypos, key);
|
|
__ vec_perm (vKey2, vKey3, keyPerm);
|
|
|
|
// load the 8th round key to vKey3
|
|
__ li (keypos, 128);
|
|
__ lvx (vKey4, keypos, key);
|
|
__ vec_perm (vKey3, vKey4, keyPerm);
|
|
|
|
// load the 9th round key to vKey4
|
|
__ li (keypos, 144);
|
|
__ lvx (vTmp1, keypos, key);
|
|
__ vec_perm (vKey4, vTmp1, keyPerm);
|
|
|
|
// 6th - 9th rounds
|
|
__ vcipher (vRet, vRet, vKey1);
|
|
__ vcipher (vRet, vRet, vKey2);
|
|
__ vcipher (vRet, vRet, vKey3);
|
|
__ vcipher (vRet, vRet, vKey4);
|
|
|
|
// load the 10th round key to vKey1
|
|
__ li (keypos, 160);
|
|
__ lvx (vKey2, keypos, key);
|
|
__ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
|
|
|
|
// load the 11th round key to vKey2
|
|
__ li (keypos, 176);
|
|
__ lvx (vTmp1, keypos, key);
|
|
__ vec_perm (vKey2, vTmp1, keyPerm);
|
|
|
|
// if all round keys are loaded, skip next 4 rounds
|
|
__ cmpwi (CR0, keylen, 44);
|
|
__ beq (CR0, L_doLast);
|
|
|
|
// 10th - 11th rounds
|
|
__ vcipher (vRet, vRet, vKey1);
|
|
__ vcipher (vRet, vRet, vKey2);
|
|
|
|
// load the 12th round key to vKey1
|
|
__ li (keypos, 192);
|
|
__ lvx (vKey2, keypos, key);
|
|
__ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
|
|
|
|
// load the 13th round key to vKey2
|
|
__ li (keypos, 208);
|
|
__ lvx (vTmp1, keypos, key);
|
|
__ vec_perm (vKey2, vTmp1, keyPerm);
|
|
|
|
// if all round keys are loaded, skip next 2 rounds
|
|
__ cmpwi (CR0, keylen, 52);
|
|
__ beq (CR0, L_doLast);
|
|
|
|
#ifdef ASSERT
|
|
__ cmpwi (CR0, keylen, 60);
|
|
__ bne (CR0, L_error);
|
|
#endif
|
|
|
|
// 12th - 13th rounds
|
|
__ vcipher (vRet, vRet, vKey1);
|
|
__ vcipher (vRet, vRet, vKey2);
|
|
|
|
// load the 14th round key to vKey1
|
|
__ li (keypos, 224);
|
|
__ lvx (vKey2, keypos, key);
|
|
__ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
|
|
|
|
// load the 15th round key to vKey2
|
|
__ li (keypos, 240);
|
|
__ lvx (vTmp1, keypos, key);
|
|
__ vec_perm (vKey2, vTmp1, keyPerm);
|
|
|
|
__ bind(L_doLast);
|
|
|
|
// last two rounds
|
|
__ vcipher (vRet, vRet, vKey1);
|
|
__ vcipherlast (vRet, vRet, vKey2);
|
|
|
|
#ifdef VM_LITTLE_ENDIAN
|
|
// toPerm = 0x0F0E0D0C0B0A09080706050403020100
|
|
__ lvsl (toPerm, keypos); // keypos is a multiple of 16
|
|
__ vxor (toPerm, toPerm, fSplt);
|
|
|
|
// Swap Bytes
|
|
__ vperm (vRet, vRet, vRet, toPerm);
|
|
#endif
|
|
|
|
// store result (unaligned)
|
|
// Note: We can't use a read-modify-write sequence which touches additional Bytes.
|
|
Register lo = temp, hi = fifteen; // Reuse
|
|
__ vsldoi (vTmp1, vRet, vRet, 8);
|
|
__ mfvrd (hi, vRet);
|
|
__ mfvrd (lo, vTmp1);
|
|
__ std (hi, 0 LITTLE_ENDIAN_ONLY(+ 8), to);
|
|
__ std (lo, 0 BIG_ENDIAN_ONLY(+ 8), to);
|
|
|
|
__ blr();
|
|
|
|
#ifdef ASSERT
|
|
__ bind(L_error);
|
|
__ stop("aescrypt_encryptBlock: invalid key length");
|
|
#endif
|
|
return start;
|
|
}
|
|
|
|
// Arguments for generated stub:
|
|
// R3_ARG1 - source byte array address
|
|
// R4_ARG2 - destination byte array address
|
|
// R5_ARG3 - K (key) in little endian int array
|
|
address generate_aescrypt_decryptBlock() {
|
|
assert(UseAES, "need AES instructions");
|
|
StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
address start = __ function_entry();
|
|
|
|
Label L_doLast, L_do44, L_do52, L_error;
|
|
|
|
Register from = R3_ARG1; // source array address
|
|
Register to = R4_ARG2; // destination array address
|
|
Register key = R5_ARG3; // round key array
|
|
|
|
Register keylen = R8;
|
|
Register temp = R9;
|
|
Register keypos = R10;
|
|
Register fifteen = R12;
|
|
|
|
VectorRegister vRet = VR0;
|
|
|
|
VectorRegister vKey1 = VR1;
|
|
VectorRegister vKey2 = VR2;
|
|
VectorRegister vKey3 = VR3;
|
|
VectorRegister vKey4 = VR4;
|
|
VectorRegister vKey5 = VR5;
|
|
|
|
VectorRegister fromPerm = VR6;
|
|
VectorRegister keyPerm = VR7;
|
|
VectorRegister toPerm = VR8;
|
|
VectorRegister fSplt = VR9;
|
|
|
|
VectorRegister vTmp1 = VR10;
|
|
VectorRegister vTmp2 = VR11;
|
|
VectorRegister vTmp3 = VR12;
|
|
VectorRegister vTmp4 = VR13;
|
|
|
|
__ li (fifteen, 15);
|
|
|
|
// load unaligned from[0-15] to vRet
|
|
__ lvx (vRet, from);
|
|
__ lvx (vTmp1, fifteen, from);
|
|
__ lvsl (fromPerm, from);
|
|
#ifdef VM_LITTLE_ENDIAN
|
|
__ vspltisb (fSplt, 0x0f);
|
|
__ vxor (fromPerm, fromPerm, fSplt);
|
|
#endif
|
|
__ vperm (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
|
|
|
|
// load keylen (44 or 52 or 60)
|
|
__ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
|
|
|
|
// to load keys
|
|
__ load_perm (keyPerm, key);
|
|
#ifdef VM_LITTLE_ENDIAN
|
|
__ vxor (vTmp2, vTmp2, vTmp2);
|
|
__ vspltisb (vTmp2, -16);
|
|
__ vrld (keyPerm, keyPerm, vTmp2);
|
|
__ vrld (keyPerm, keyPerm, vTmp2);
|
|
__ vsldoi (keyPerm, keyPerm, keyPerm, 8);
|
|
#endif
|
|
|
|
__ cmpwi (CR0, keylen, 44);
|
|
__ beq (CR0, L_do44);
|
|
|
|
__ cmpwi (CR0, keylen, 52);
|
|
__ beq (CR0, L_do52);
|
|
|
|
#ifdef ASSERT
|
|
__ cmpwi (CR0, keylen, 60);
|
|
__ bne (CR0, L_error);
|
|
#endif
|
|
|
|
// load the 15th round key to vKey1
|
|
__ li (keypos, 240);
|
|
__ lvx (vKey1, keypos, key);
|
|
__ li (keypos, 224);
|
|
__ lvx (vKey2, keypos, key);
|
|
__ vec_perm (vKey1, vKey2, vKey1, keyPerm);
|
|
|
|
// load the 14th round key to vKey2
|
|
__ li (keypos, 208);
|
|
__ lvx (vKey3, keypos, key);
|
|
__ vec_perm (vKey2, vKey3, vKey2, keyPerm);
|
|
|
|
// load the 13th round key to vKey3
|
|
__ li (keypos, 192);
|
|
__ lvx (vKey4, keypos, key);
|
|
__ vec_perm (vKey3, vKey4, vKey3, keyPerm);
|
|
|
|
// load the 12th round key to vKey4
|
|
__ li (keypos, 176);
|
|
__ lvx (vKey5, keypos, key);
|
|
__ vec_perm (vKey4, vKey5, vKey4, keyPerm);
|
|
|
|
// load the 11th round key to vKey5
|
|
__ li (keypos, 160);
|
|
__ lvx (vTmp1, keypos, key);
|
|
__ vec_perm (vKey5, vTmp1, vKey5, keyPerm);
|
|
|
|
// 1st - 5th rounds
|
|
__ vxor (vRet, vRet, vKey1);
|
|
__ vncipher (vRet, vRet, vKey2);
|
|
__ vncipher (vRet, vRet, vKey3);
|
|
__ vncipher (vRet, vRet, vKey4);
|
|
__ vncipher (vRet, vRet, vKey5);
|
|
|
|
__ b (L_doLast);
|
|
|
|
__ align(32);
|
|
__ bind (L_do52);
|
|
|
|
// load the 13th round key to vKey1
|
|
__ li (keypos, 208);
|
|
__ lvx (vKey1, keypos, key);
|
|
__ li (keypos, 192);
|
|
__ lvx (vKey2, keypos, key);
|
|
__ vec_perm (vKey1, vKey2, vKey1, keyPerm);
|
|
|
|
// load the 12th round key to vKey2
|
|
__ li (keypos, 176);
|
|
__ lvx (vKey3, keypos, key);
|
|
__ vec_perm (vKey2, vKey3, vKey2, keyPerm);
|
|
|
|
// load the 11th round key to vKey3
|
|
__ li (keypos, 160);
|
|
__ lvx (vTmp1, keypos, key);
|
|
__ vec_perm (vKey3, vTmp1, vKey3, keyPerm);
|
|
|
|
// 1st - 3rd rounds
|
|
__ vxor (vRet, vRet, vKey1);
|
|
__ vncipher (vRet, vRet, vKey2);
|
|
__ vncipher (vRet, vRet, vKey3);
|
|
|
|
__ b (L_doLast);
|
|
|
|
__ align(32);
|
|
__ bind (L_do44);
|
|
|
|
// load the 11th round key to vKey1
|
|
__ li (keypos, 176);
|
|
__ lvx (vKey1, keypos, key);
|
|
__ li (keypos, 160);
|
|
__ lvx (vTmp1, keypos, key);
|
|
__ vec_perm (vKey1, vTmp1, vKey1, keyPerm);
|
|
|
|
// 1st round
|
|
__ vxor (vRet, vRet, vKey1);
|
|
|
|
__ bind (L_doLast);
|
|
|
|
// load the 10th round key to vKey1
|
|
__ li (keypos, 144);
|
|
__ lvx (vKey2, keypos, key);
|
|
__ vec_perm (vKey1, vKey2, vTmp1, keyPerm);
|
|
|
|
// load the 9th round key to vKey2
|
|
__ li (keypos, 128);
|
|
__ lvx (vKey3, keypos, key);
|
|
__ vec_perm (vKey2, vKey3, vKey2, keyPerm);
|
|
|
|
// load the 8th round key to vKey3
|
|
__ li (keypos, 112);
|
|
__ lvx (vKey4, keypos, key);
|
|
__ vec_perm (vKey3, vKey4, vKey3, keyPerm);
|
|
|
|
// load the 7th round key to vKey4
|
|
__ li (keypos, 96);
|
|
__ lvx (vKey5, keypos, key);
|
|
__ vec_perm (vKey4, vKey5, vKey4, keyPerm);
|
|
|
|
// load the 6th round key to vKey5
|
|
__ li (keypos, 80);
|
|
__ lvx (vTmp1, keypos, key);
|
|
__ vec_perm (vKey5, vTmp1, vKey5, keyPerm);
|
|
|
|
// last 10th - 6th rounds
|
|
__ vncipher (vRet, vRet, vKey1);
|
|
__ vncipher (vRet, vRet, vKey2);
|
|
__ vncipher (vRet, vRet, vKey3);
|
|
__ vncipher (vRet, vRet, vKey4);
|
|
__ vncipher (vRet, vRet, vKey5);
|
|
|
|
// load the 5th round key to vKey1
|
|
__ li (keypos, 64);
|
|
__ lvx (vKey2, keypos, key);
|
|
__ vec_perm (vKey1, vKey2, vTmp1, keyPerm);
|
|
|
|
// load the 4th round key to vKey2
|
|
__ li (keypos, 48);
|
|
__ lvx (vKey3, keypos, key);
|
|
__ vec_perm (vKey2, vKey3, vKey2, keyPerm);
|
|
|
|
// load the 3rd round key to vKey3
|
|
__ li (keypos, 32);
|
|
__ lvx (vKey4, keypos, key);
|
|
__ vec_perm (vKey3, vKey4, vKey3, keyPerm);
|
|
|
|
// load the 2nd round key to vKey4
|
|
__ li (keypos, 16);
|
|
__ lvx (vKey5, keypos, key);
|
|
__ vec_perm (vKey4, vKey5, vKey4, keyPerm);
|
|
|
|
// load the 1st round key to vKey5
|
|
__ lvx (vTmp1, key);
|
|
__ vec_perm (vKey5, vTmp1, vKey5, keyPerm);
|
|
|
|
// last 5th - 1st rounds
|
|
__ vncipher (vRet, vRet, vKey1);
|
|
__ vncipher (vRet, vRet, vKey2);
|
|
__ vncipher (vRet, vRet, vKey3);
|
|
__ vncipher (vRet, vRet, vKey4);
|
|
__ vncipherlast (vRet, vRet, vKey5);
|
|
|
|
#ifdef VM_LITTLE_ENDIAN
|
|
// toPerm = 0x0F0E0D0C0B0A09080706050403020100
|
|
__ lvsl (toPerm, keypos); // keypos is a multiple of 16
|
|
__ vxor (toPerm, toPerm, fSplt);
|
|
|
|
// Swap Bytes
|
|
__ vperm (vRet, vRet, vRet, toPerm);
|
|
#endif
|
|
|
|
// store result (unaligned)
|
|
// Note: We can't use a read-modify-write sequence which touches additional Bytes.
|
|
Register lo = temp, hi = fifteen; // Reuse
|
|
__ vsldoi (vTmp1, vRet, vRet, 8);
|
|
__ mfvrd (hi, vRet);
|
|
__ mfvrd (lo, vTmp1);
|
|
__ std (hi, 0 LITTLE_ENDIAN_ONLY(+ 8), to);
|
|
__ std (lo, 0 BIG_ENDIAN_ONLY(+ 8), to);
|
|
|
|
__ blr();
|
|
|
|
#ifdef ASSERT
|
|
__ bind(L_error);
|
|
__ stop("aescrypt_decryptBlock: invalid key length");
|
|
#endif
|
|
return start;
|
|
}
|
|
|
|
address generate_sha256_implCompress(StubId stub_id) {
|
|
assert(UseSHA, "need SHA instructions");
|
|
bool multi_block;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_sha256_implCompress_id:
|
|
multi_block = false;
|
|
break;
|
|
case StubId::stubgen_sha256_implCompressMB_id:
|
|
multi_block = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ function_entry();
|
|
|
|
__ sha256 (multi_block);
|
|
__ blr();
|
|
|
|
return start;
|
|
}
|
|
|
|
address generate_sha512_implCompress(StubId stub_id) {
|
|
assert(UseSHA, "need SHA instructions");
|
|
bool multi_block;
|
|
switch (stub_id) {
|
|
case StubId::stubgen_sha512_implCompress_id:
|
|
multi_block = false;
|
|
break;
|
|
case StubId::stubgen_sha512_implCompressMB_id:
|
|
multi_block = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ function_entry();
|
|
|
|
__ sha512 (multi_block);
|
|
__ blr();
|
|
|
|
return start;
|
|
}
|
|
|
|
address generate_data_cache_writeback() {
|
|
const Register cacheline = R3_ARG1;
|
|
StubId stub_id = StubId::stubgen_data_cache_writeback_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
__ cache_wb(Address(cacheline));
|
|
__ blr();
|
|
|
|
return start;
|
|
}
|
|
|
|
address generate_data_cache_writeback_sync() {
|
|
const Register is_presync = R3_ARG1;
|
|
Register temp = R4;
|
|
Label SKIP;
|
|
StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
__ andi_(temp, is_presync, 1);
|
|
__ bne(CR0, SKIP);
|
|
__ cache_wbsync(false); // post sync => emit 'sync'
|
|
__ bind(SKIP); // pre sync => emit nothing
|
|
__ blr();
|
|
|
|
return start;
|
|
}
|
|
|
|
void generate_arraycopy_stubs() {
|
|
// generate the common exit first so later stubs can rely on it if
|
|
// they want an UnsafeMemoryAccess exit non-local to the stub
|
|
StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
|
|
// register the stub as the default exit with class UnsafeMemoryAccess
|
|
UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
|
|
|
|
// Note: the disjoint stubs must be generated first, some of
|
|
// the conjoint stubs use them.
|
|
|
|
// non-aligned disjoint versions
|
|
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id);
|
|
StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(StubId::stubgen_jshort_disjoint_arraycopy_id);
|
|
StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(StubId::stubgen_jint_disjoint_arraycopy_id);
|
|
StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_copy(StubId::stubgen_jlong_disjoint_arraycopy_id);
|
|
StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id);
|
|
StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id);
|
|
|
|
// aligned disjoint versions
|
|
StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id);
|
|
StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id);
|
|
StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id);
|
|
StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id);
|
|
StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id);
|
|
StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id);
|
|
|
|
// non-aligned conjoint versions
|
|
StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(StubId::stubgen_jbyte_arraycopy_id);
|
|
StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(StubId::stubgen_jshort_arraycopy_id);
|
|
StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(StubId::stubgen_jint_arraycopy_id);
|
|
StubRoutines::_jlong_arraycopy = generate_conjoint_long_copy(StubId::stubgen_jlong_arraycopy_id);
|
|
StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(StubId::stubgen_oop_arraycopy_id);
|
|
StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id);
|
|
|
|
// aligned conjoint versions
|
|
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id);
|
|
StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(StubId::stubgen_arrayof_jshort_arraycopy_id);
|
|
StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(StubId::stubgen_arrayof_jint_arraycopy_id);
|
|
StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(StubId::stubgen_arrayof_jlong_arraycopy_id);
|
|
StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(StubId::stubgen_arrayof_oop_arraycopy_id);
|
|
StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(StubId::stubgen_arrayof_oop_arraycopy_id);
|
|
|
|
// special/generic versions
|
|
StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id);
|
|
StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id);
|
|
|
|
StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(STUB_ENTRY(jbyte_arraycopy()),
|
|
STUB_ENTRY(jshort_arraycopy()),
|
|
STUB_ENTRY(jint_arraycopy()),
|
|
STUB_ENTRY(jlong_arraycopy()));
|
|
StubRoutines::_generic_arraycopy = generate_generic_copy(STUB_ENTRY(jbyte_arraycopy()),
|
|
STUB_ENTRY(jshort_arraycopy()),
|
|
STUB_ENTRY(jint_arraycopy()),
|
|
STUB_ENTRY(oop_arraycopy()),
|
|
STUB_ENTRY(oop_disjoint_arraycopy()),
|
|
STUB_ENTRY(jlong_arraycopy()),
|
|
STUB_ENTRY(checkcast_arraycopy()));
|
|
|
|
// fill routines
|
|
#ifdef COMPILER2
|
|
if (OptimizeFill) {
|
|
StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
|
|
StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
|
|
StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
|
|
StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
|
|
StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
|
|
StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
|
|
}
|
|
StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(StubRoutines::_jbyte_fill);
|
|
#endif
|
|
}
|
|
|
|
// Stub for BigInteger::multiplyToLen()
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Input:
|
|
// R3 - x address
|
|
// R4 - x length
|
|
// R5 - y address
|
|
// R6 - y length
|
|
// R7 - z address
|
|
//
|
|
address generate_multiplyToLen() {
|
|
|
|
StubId stub_id = StubId::stubgen_multiplyToLen_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
address start = __ function_entry();
|
|
|
|
const Register x = R3;
|
|
const Register xlen = R4;
|
|
const Register y = R5;
|
|
const Register ylen = R6;
|
|
const Register z = R7;
|
|
|
|
const Register tmp1 = R2; // TOC not used.
|
|
const Register tmp2 = R9;
|
|
const Register tmp3 = R10;
|
|
const Register tmp4 = R11;
|
|
const Register tmp5 = R12;
|
|
|
|
// non-volatile regs
|
|
const Register tmp6 = R31;
|
|
const Register tmp7 = R30;
|
|
const Register tmp8 = R29;
|
|
const Register tmp9 = R28;
|
|
const Register tmp10 = R27;
|
|
const Register tmp11 = R26;
|
|
const Register tmp12 = R25;
|
|
const Register tmp13 = R24;
|
|
|
|
BLOCK_COMMENT("Entry:");
|
|
|
|
// C2 does not guarantee int-to-long conversion for stub call arguments,
// so clear the upper 32 bits explicitly.
__ clrldi(xlen, xlen, 32);
__ clrldi(ylen, ylen, 32);
|
|
|
|
// Save non-volatile regs (frameless).
|
|
int current_offs = 8;
|
|
__ std(R24, -current_offs, R1_SP); current_offs += 8;
|
|
__ std(R25, -current_offs, R1_SP); current_offs += 8;
|
|
__ std(R26, -current_offs, R1_SP); current_offs += 8;
|
|
__ std(R27, -current_offs, R1_SP); current_offs += 8;
|
|
__ std(R28, -current_offs, R1_SP); current_offs += 8;
|
|
__ std(R29, -current_offs, R1_SP); current_offs += 8;
|
|
__ std(R30, -current_offs, R1_SP); current_offs += 8;
|
|
__ std(R31, -current_offs, R1_SP);
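// These saves use negative offsets from R1_SP without allocating a new frame;
// this relies on the ABI-protected area below the stack pointer.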
__ multiply_to_len(x, xlen, y, ylen, z, tmp1, tmp2, tmp3, tmp4, tmp5,
|
|
tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13);
|
|
|
|
// Restore non-volatile regs.
|
|
current_offs = 8;
|
|
__ ld(R24, -current_offs, R1_SP); current_offs += 8;
|
|
__ ld(R25, -current_offs, R1_SP); current_offs += 8;
|
|
__ ld(R26, -current_offs, R1_SP); current_offs += 8;
|
|
__ ld(R27, -current_offs, R1_SP); current_offs += 8;
|
|
__ ld(R28, -current_offs, R1_SP); current_offs += 8;
|
|
__ ld(R29, -current_offs, R1_SP); current_offs += 8;
|
|
__ ld(R30, -current_offs, R1_SP); current_offs += 8;
|
|
__ ld(R31, -current_offs, R1_SP);
|
|
|
|
__ blr(); // Return to caller.
|
|
|
|
return start;
|
|
}
|
|
|
|
/**
|
|
* Arguments:
|
|
*
|
|
* Input:
|
|
* R3_ARG1 - out address
|
|
* R4_ARG2 - in address
|
|
* R5_ARG3 - offset
|
|
* R6_ARG4 - len
|
|
* R7_ARG5 - k
|
|
* Output:
|
|
* R3_RET - carry
|
|
*/
|
|
address generate_mulAdd() {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_mulAdd_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
address start = __ function_entry();
|
|
|
|
// C2 does not sign extend signed parameters to full 64-bit registers:
|
|
__ rldic (R5_ARG3, R5_ARG3, 2, 32); // always positive
|
|
__ clrldi(R6_ARG4, R6_ARG4, 32); // force zero bits on higher word
|
|
__ clrldi(R7_ARG5, R7_ARG5, 32); // force zero bits on higher word
|
|
|
|
__ muladd(R3_ARG1, R4_ARG2, R5_ARG3, R6_ARG4, R7_ARG5, R8, R9, R10);
|
|
|
|
// Moves output carry to return register
|
|
__ mr (R3_RET, R10);
|
|
|
|
__ blr();
|
|
|
|
return start;
|
|
}
|
|
|
|
/**
|
|
* Arguments:
|
|
*
|
|
* Input:
|
|
* R3_ARG1 - in address
|
|
* R4_ARG2 - in length
|
|
* R5_ARG3 - out address
|
|
* R6_ARG4 - out length
|
|
*/
|
|
address generate_squareToLen() {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_squareToLen_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
address start = __ function_entry();
|
|
|
|
// int args: the upper word is cleared (unsigned int-to-long conversion) below
|
|
const Register in = R3_ARG1;
|
|
const Register in_len = R4_ARG2;
|
|
__ clrldi(in_len, in_len, 32);
|
|
const Register out = R5_ARG3;
|
|
const Register out_len = R6_ARG4;
|
|
__ clrldi(out_len, out_len, 32);
|
|
|
|
// output
|
|
const Register ret = R3_RET;
|
|
|
|
// temporaries
|
|
const Register lplw_s = R7;
|
|
const Register in_aux = R8;
|
|
const Register out_aux = R9;
|
|
const Register piece = R10;
|
|
const Register product = R14;
|
|
const Register lplw = R15;
|
|
const Register i_minus1 = R16;
|
|
const Register carry = R17;
|
|
const Register offset = R18;
|
|
const Register off_aux = R19;
|
|
const Register t = R20;
|
|
const Register mlen = R21;
|
|
const Register len = R22;
|
|
const Register a = R23;
|
|
const Register b = R24;
|
|
const Register i = R25;
|
|
const Register c = R26;
|
|
const Register cs = R27;
|
|
|
|
// Labels
|
|
Label SKIP_LSHIFT, SKIP_DIAGONAL_SUM, SKIP_ADDONE, SKIP_LOOP_SQUARE;
|
|
Label LOOP_LSHIFT, LOOP_DIAGONAL_SUM, LOOP_ADDONE, LOOP_SQUARE;
|
|
|
|
// Save non-volatile regs (frameless).
|
|
int current_offs = -8;
|
|
__ std(R28, current_offs, R1_SP); current_offs -= 8;
|
|
__ std(R27, current_offs, R1_SP); current_offs -= 8;
|
|
__ std(R26, current_offs, R1_SP); current_offs -= 8;
|
|
__ std(R25, current_offs, R1_SP); current_offs -= 8;
|
|
__ std(R24, current_offs, R1_SP); current_offs -= 8;
|
|
__ std(R23, current_offs, R1_SP); current_offs -= 8;
|
|
__ std(R22, current_offs, R1_SP); current_offs -= 8;
|
|
__ std(R21, current_offs, R1_SP); current_offs -= 8;
|
|
__ std(R20, current_offs, R1_SP); current_offs -= 8;
|
|
__ std(R19, current_offs, R1_SP); current_offs -= 8;
|
|
__ std(R18, current_offs, R1_SP); current_offs -= 8;
|
|
__ std(R17, current_offs, R1_SP); current_offs -= 8;
|
|
__ std(R16, current_offs, R1_SP); current_offs -= 8;
|
|
__ std(R15, current_offs, R1_SP); current_offs -= 8;
|
|
__ std(R14, current_offs, R1_SP);
|
|
|
|
// Store the squares, right shifted one bit (i.e., divided by 2)
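    // Added explanatory sketch (an assumption based on the code below): each
    // 32-bit piece is squared into a 64-bit product, and the doubleword that is
    // stored is (previous product's low bit << 63) | (current product >> 1),
    // i.e. the running square shifted right by one bit. The bit shifted out
    // here is reinstated later by the left shift and the "set low bit" steps.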
|
|
__ subi (out_aux, out, 8);
|
|
__ subi (in_aux, in, 4);
|
|
__ cmpwi (CR0, in_len, 0);
|
|
// Initialize lplw outside of the loop
|
|
__ xorr (lplw, lplw, lplw);
|
|
__ ble (CR0, SKIP_LOOP_SQUARE); // in_len <= 0
|
|
__ mtctr (in_len);
|
|
|
|
__ bind(LOOP_SQUARE);
|
|
__ lwzu (piece, 4, in_aux);
|
|
__ mulld (product, piece, piece);
|
|
// shift left 63 bits and only keep the MSB
|
|
__ rldic (lplw_s, lplw, 63, 0);
|
|
__ mr (lplw, product);
|
|
// shift right 1 bit without sign extension
|
|
__ srdi (product, product, 1);
|
|
// join them to the same register and store it
|
|
__ orr (product, lplw_s, product);
|
|
#ifdef VM_LITTLE_ENDIAN
|
|
// Swap low and high words for little endian
|
|
__ rldicl (product, product, 32, 0);
|
|
#endif
|
|
__ stdu (product, 8, out_aux);
|
|
__ bdnz (LOOP_SQUARE);
|
|
|
|
__ bind(SKIP_LOOP_SQUARE);
|
|
|
|
// Add in off-diagonal sums
|
|
__ cmpwi (CR0, in_len, 0);
|
|
__ ble (CR0, SKIP_DIAGONAL_SUM);
|
|
    // Avoid CTR usage here in order to use it in mulAdd
__ subi (i_minus1, in_len, 1);
|
|
__ li (offset, 4);
|
|
|
|
__ bind(LOOP_DIAGONAL_SUM);
|
|
|
|
__ sldi (off_aux, out_len, 2);
|
|
__ sub (off_aux, off_aux, offset);
|
|
|
|
__ mr (len, i_minus1);
|
|
__ sldi (mlen, i_minus1, 2);
|
|
__ lwzx (t, in, mlen);
|
|
|
|
__ muladd (out, in, off_aux, len, t, a, b, carry);
|
|
|
|
// begin<addOne>
|
|
// off_aux = out_len*4 - 4 - mlen - offset*4 - 4;
|
|
__ addi (mlen, mlen, 4);
|
|
__ sldi (a, out_len, 2);
|
|
__ subi (a, a, 4);
|
|
__ sub (a, a, mlen);
|
|
__ subi (off_aux, offset, 4);
|
|
__ sub (off_aux, a, off_aux);
|
|
|
|
__ lwzx (b, off_aux, out);
|
|
__ add (b, b, carry);
|
|
__ stwx (b, off_aux, out);
|
|
|
|
// if (((uint64_t)s >> 32) != 0) {
|
|
__ srdi_ (a, b, 32);
|
|
__ beq (CR0, SKIP_ADDONE);
|
|
|
|
// while (--mlen >= 0) {
|
|
__ bind(LOOP_ADDONE);
|
|
__ subi (mlen, mlen, 4);
|
|
__ cmpwi (CR0, mlen, 0);
|
|
__ beq (CR0, SKIP_ADDONE);
|
|
|
|
// if (--offset_aux < 0) { // Carry out of number
|
|
__ subi (off_aux, off_aux, 4);
|
|
__ cmpwi (CR0, off_aux, 0);
|
|
__ blt (CR0, SKIP_ADDONE);
|
|
|
|
// } else {
|
|
__ lwzx (b, off_aux, out);
|
|
__ addi (b, b, 1);
|
|
__ stwx (b, off_aux, out);
|
|
__ cmpwi (CR0, b, 0);
|
|
__ bne (CR0, SKIP_ADDONE);
|
|
__ b (LOOP_ADDONE);
|
|
|
|
__ bind(SKIP_ADDONE);
|
|
// } } } end<addOne>
|
|
|
|
__ addi (offset, offset, 8);
|
|
__ subi (i_minus1, i_minus1, 1);
|
|
__ cmpwi (CR0, i_minus1, 0);
|
|
__ bge (CR0, LOOP_DIAGONAL_SUM);
|
|
|
|
__ bind(SKIP_DIAGONAL_SUM);
|
|
|
|
// Shift back up and set low bit
|
|
// Shifts 1 bit left up to len positions. Assumes no leading zeros
|
|
// begin<primitiveLeftShift>
|
|
__ cmpwi (CR0, out_len, 0);
|
|
__ ble (CR0, SKIP_LSHIFT);
|
|
__ li (i, 0);
|
|
__ lwz (c, 0, out);
|
|
__ subi (b, out_len, 1);
|
|
__ mtctr (b);
|
|
|
|
__ bind(LOOP_LSHIFT);
|
|
__ mr (b, c);
|
|
__ addi (cs, i, 4);
|
|
__ lwzx (c, out, cs);
|
|
|
|
__ sldi (b, b, 1);
|
|
__ srwi (cs, c, 31);
|
|
__ orr (b, b, cs);
|
|
__ stwx (b, i, out);
|
|
|
|
__ addi (i, i, 4);
|
|
__ bdnz (LOOP_LSHIFT);
|
|
|
|
__ sldi (c, out_len, 2);
|
|
__ subi (c, c, 4);
|
|
__ lwzx (b, out, c);
|
|
__ sldi (b, b, 1);
|
|
__ stwx (b, out, c);
|
|
|
|
__ bind(SKIP_LSHIFT);
|
|
// end<primitiveLeftShift>
|
|
|
|
// Set low bit
|
|
__ sldi (i, in_len, 2);
|
|
__ subi (i, i, 4);
|
|
__ lwzx (i, in, i);
|
|
__ sldi (c, out_len, 2);
|
|
__ subi (c, c, 4);
|
|
__ lwzx (b, out, c);
|
|
|
|
__ andi (i, i, 1);
|
|
__ orr (i, b, i);
|
|
|
|
__ stwx (i, out, c);
|
|
|
|
// Restore non-volatile regs.
|
|
current_offs = -8;
|
|
__ ld(R28, current_offs, R1_SP); current_offs -= 8;
|
|
__ ld(R27, current_offs, R1_SP); current_offs -= 8;
|
|
__ ld(R26, current_offs, R1_SP); current_offs -= 8;
|
|
__ ld(R25, current_offs, R1_SP); current_offs -= 8;
|
|
__ ld(R24, current_offs, R1_SP); current_offs -= 8;
|
|
__ ld(R23, current_offs, R1_SP); current_offs -= 8;
|
|
__ ld(R22, current_offs, R1_SP); current_offs -= 8;
|
|
__ ld(R21, current_offs, R1_SP); current_offs -= 8;
|
|
__ ld(R20, current_offs, R1_SP); current_offs -= 8;
|
|
__ ld(R19, current_offs, R1_SP); current_offs -= 8;
|
|
__ ld(R18, current_offs, R1_SP); current_offs -= 8;
|
|
__ ld(R17, current_offs, R1_SP); current_offs -= 8;
|
|
__ ld(R16, current_offs, R1_SP); current_offs -= 8;
|
|
__ ld(R15, current_offs, R1_SP); current_offs -= 8;
|
|
__ ld(R14, current_offs, R1_SP);
|
|
|
|
__ mr(ret, out);
|
|
__ blr();
|
|
|
|
return start;
|
|
}
|
|
|
|
  /**
   * Arguments:
   *
   * Inputs:
   *   R3_ARG1 - int crc
   *   R4_ARG2 - byte* buf
   *   R5_ARG3 - int length (of buffer)
   *
   * scratch:
   *   R2, R6-R12
   *
   * Output:
   *   R3_RET  - int crc result
   */
  // Compute CRC32 function.
  address generate_CRC32_updateBytes(StubId stub_id) {
    bool is_crc32c;
    switch (stub_id) {
    case StubId::stubgen_updateBytesCRC32_id:
      is_crc32c = false;
      break;
    case StubId::stubgen_updateBytesCRC32C_id:
      is_crc32c = true;
      break;
    default:
      ShouldNotReachHere();
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ function_entry();  // Remember stub start address (is rtn value).
    __ crc32(R3_ARG1, R4_ARG2, R5_ARG3, R2, R6, R7, R8, R9, R10, R11, R12, is_crc32c);
    __ blr();
    return start;
  }

  address generate_floatToFloat16() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "floatToFloat16");
    address start = __ function_entry();
    __ f2hf(R3_RET, F1_ARG1, F0);
    __ blr();
    return start;
  }

  address generate_float16ToFloat() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "float16ToFloat");
    address start = __ function_entry();
    __ hf2f(F1_RET, R3_ARG1);
    __ blr();
    return start;
  }

  address generate_method_entry_barrier() {
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_method_entry_barrier_id;
    StubCodeMark mark(this, stub_id);

    address stub_address = __ pc();

    int nbytes_save = MacroAssembler::num_volatile_regs * BytesPerWord;
    __ save_volatile_gprs(R1_SP, -nbytes_save, true);

    // Link register points to instruction in prologue of the guarded nmethod.
    // As the stub requires one layer of indirection (argument is of type address* and not address),
    // passing the link register's value directly doesn't work.
    // Since we have to save the link register on the stack anyway, we calculate the corresponding stack address
    // and pass that one instead.
    __ addi(R3_ARG1, R1_SP, _abi0(lr));

    __ save_LR(R0);
    __ push_frame_reg_args(nbytes_save, R0);

    __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier));
    __ mr(R0, R3_RET);

    __ pop_frame();
    __ restore_LR(R3_RET /* used as tmp register */);
    __ restore_volatile_gprs(R1_SP, -nbytes_save, true);

    __ cmpdi(CR0, R0, 0);

    // Return to prologue if no deoptimization is required (bnelr)
    __ bclr(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::equal), Assembler::bhintIsTaken);

    // Deoptimization required.
    // For actually handling the deoptimization, the 'wrong method stub' is invoked.
    __ load_const_optimized(R0, SharedRuntime::get_handle_wrong_method_stub());
    __ mtctr(R0);

    // Pop the frame built in the prologue.
    __ pop_frame();

    // Restore link register. Required as the 'wrong method stub' needs the caller's frame
    // to properly deoptimize this method (e.g. by re-resolving the call site for compiled methods).
    // This method's prologue is aborted.
    __ restore_LR(R0);

    __ bctr();
    return stub_address;
  }

#ifdef VM_LITTLE_ENDIAN
|
|
// The following Base64 decode intrinsic is based on an algorithm outlined
|
|
// in here:
|
|
// http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html
|
|
// in the section titled "Vector lookup (pshufb with bitmask)"
|
|
//
|
|
// This implementation differs in the following ways:
|
|
// * Instead of Intel SSE instructions, Power AltiVec VMX and VSX instructions
|
|
// are used instead. It turns out that some of the vector operations
|
|
// needed in the algorithm require fewer AltiVec instructions.
|
|
// * The algorithm in the above mentioned paper doesn't handle the
|
|
// Base64-URL variant in RFC 4648. Adjustments to both the code and to two
|
|
// lookup tables are needed for this.
|
|
// * The "Pack" section of the code is a complete rewrite for Power because we
|
|
// can utilize better instructions for this step.
|
|
//
|
|
|
|
// Offsets per group of Base64 characters
|
|
// Uppercase
|
|
#define UC (signed char)((-'A' + 0) & 0xff)
|
|
// Lowercase
|
|
#define LC (signed char)((-'a' + 26) & 0xff)
|
|
// Digits
|
|
#define DIG (signed char)((-'0' + 52) & 0xff)
|
|
// Plus sign (URL = 0)
|
|
#define PLS (signed char)((-'+' + 62) & 0xff)
|
|
// Hyphen (URL = 1)
|
|
#define HYP (signed char)((-'-' + 62) & 0xff)
|
|
// Slash (URL = 0)
|
|
#define SLS (signed char)((-'/' + 63) & 0xff)
|
|
// Underscore (URL = 1)
|
|
#define US (signed char)((-'_' + 63) & 0xff)
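// Added worked example (not in the original sources): during decoding these
// offsets are added to the ASCII code of each character to obtain its 6-bit
// value, e.g. 'C' (0x43) + UC == 2, 'z' (0x7a) + LC == 51, '5' (0x35) + DIG == 57,
// with all arithmetic performed modulo 256.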
|
|
|
|
// For P10 (or later) only
|
|
#define VALID_B64 0x80
|
|
#define VB64(x) (VALID_B64 | x)
|
|
|
|
#define BLK_OFFSETOF(x) (offsetof(constant_block, x))
|
|
|
|
// In little-endian mode, the lxv instruction loads the element at EA into
|
|
// element 15 of the vector register, EA+1 goes into element 14, and so
|
|
// on.
|
|
//
|
|
// To make a look-up table easier to read, ARRAY_TO_LXV_ORDER reverses the
|
|
// order of the elements in a vector initialization.
|
|
#define ARRAY_TO_LXV_ORDER(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15) e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0
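// Added illustration (an assumption about intent): ARRAY_TO_LXV_ORDER(0, 1, ..., 15)
// expands to the initializer 15, ..., 1, 0, so the value written first in the
// source text is stored at the highest address and, after a little-endian lxv,
// ends up in vector element 0. Lookup tables can therefore be written with the
// entry for index 0 listed first.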
|
|
|
|
//
|
|
// Base64 decodeBlock intrinsic
|
|
address generate_base64_decodeBlock() {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ function_entry();
|
|
|
|
typedef struct {
|
|
signed char offsetLUT_val[16];
|
|
signed char offsetLUT_URL_val[16];
|
|
unsigned char maskLUT_val[16];
|
|
unsigned char maskLUT_URL_val[16];
|
|
unsigned char bitposLUT_val[16];
|
|
unsigned char table_32_47_val[16];
|
|
unsigned char table_32_47_URL_val[16];
|
|
unsigned char table_48_63_val[16];
|
|
unsigned char table_64_79_val[16];
|
|
unsigned char table_80_95_val[16];
|
|
unsigned char table_80_95_URL_val[16];
|
|
unsigned char table_96_111_val[16];
|
|
unsigned char table_112_127_val[16];
|
|
unsigned char pack_lshift_val[16];
|
|
unsigned char pack_rshift_val[16];
|
|
unsigned char pack_permute_val[16];
|
|
} constant_block;
|
|
|
|
alignas(16) static const constant_block const_block = {
|
|
|
|
.offsetLUT_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
0, 0, PLS, DIG, UC, UC, LC, LC,
|
|
0, 0, 0, 0, 0, 0, 0, 0 ) },
|
|
|
|
.offsetLUT_URL_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
0, 0, HYP, DIG, UC, UC, LC, LC,
|
|
0, 0, 0, 0, 0, 0, 0, 0 ) },
|
|
|
|
.maskLUT_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
/* 0 */ (unsigned char)0b10101000,
|
|
/* 1 .. 9 */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
|
|
(unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
|
|
(unsigned char)0b11111000,
|
|
/* 10 */ (unsigned char)0b11110000,
|
|
/* 11 */ (unsigned char)0b01010100,
|
|
/* 12 .. 14 */ (unsigned char)0b01010000, (unsigned char)0b01010000, (unsigned char)0b01010000,
|
|
/* 15 */ (unsigned char)0b01010100 ) },
|
|
|
|
.maskLUT_URL_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
/* 0 */ (unsigned char)0b10101000,
|
|
/* 1 .. 9 */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
|
|
(unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
|
|
(unsigned char)0b11111000,
|
|
/* 10 */ (unsigned char)0b11110000,
|
|
/* 11 .. 12 */ (unsigned char)0b01010000, (unsigned char)0b01010000,
|
|
/* 13 */ (unsigned char)0b01010100,
|
|
/* 14 */ (unsigned char)0b01010000,
|
|
/* 15 */ (unsigned char)0b01110000 ) },
|
|
|
|
.bitposLUT_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (unsigned char)0x80,
|
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 ) },
|
|
|
|
// In the following table_*_val constants, a 0 value means the
|
|
// character is not in the Base64 character set
|
|
.table_32_47_val = {
|
|
ARRAY_TO_LXV_ORDER (
|
|
/* space .. '*' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '+' = 62 */ VB64(62), /* ',' .. '.' = 0 */ 0, 0, 0, /* '/' = 63 */ VB64(63) ) },
|
|
|
|
.table_32_47_URL_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
/* space .. ',' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '-' = 62 */ VB64(62), /* '.' .. '/' */ 0, 0 ) },
|
|
|
|
.table_48_63_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
/* '0' .. '9' = 52 .. 61 */ VB64(52), VB64(53), VB64(54), VB64(55), VB64(56), VB64(57), VB64(58), VB64(59), VB64(60), VB64(61),
|
|
/* ':' .. '?' = 0 */ 0, 0, 0, 0, 0, 0 ) },
|
|
|
|
.table_64_79_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
/* '@' = 0 */ 0, /* 'A' .. 'O' = 0 .. 14 */ VB64(0), VB64(1), VB64(2), VB64(3), VB64(4), VB64(5), VB64(6), VB64(7), VB64(8),
|
|
VB64(9), VB64(10), VB64(11), VB64(12), VB64(13), VB64(14) ) },
|
|
|
|
.table_80_95_val = {
|
|
ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22),
|
|
VB64(23), VB64(24), VB64(25), /* '[' .. '_' = 0 */ 0, 0, 0, 0, 0 ) },
|
|
|
|
.table_80_95_URL_val = {
|
|
ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22),
|
|
VB64(23), VB64(24), VB64(25), /* '[' .. '^' = 0 */ 0, 0, 0, 0, /* '_' = 63 */ VB64(63) ) },
|
|
|
|
.table_96_111_val = {
|
|
ARRAY_TO_LXV_ORDER(/* '`' = 0 */ 0, /* 'a' .. 'o' = 26 .. 40 */ VB64(26), VB64(27), VB64(28), VB64(29), VB64(30), VB64(31),
|
|
VB64(32), VB64(33), VB64(34), VB64(35), VB64(36), VB64(37), VB64(38), VB64(39), VB64(40) ) },
|
|
|
|
.table_112_127_val = {
|
|
ARRAY_TO_LXV_ORDER(/* 'p' .. 'z' = 41 .. 51 */ VB64(41), VB64(42), VB64(43), VB64(44), VB64(45), VB64(46), VB64(47), VB64(48),
|
|
VB64(49), VB64(50), VB64(51), /* '{' .. DEL = 0 */ 0, 0, 0, 0, 0 ) },
|
|
|
|
.pack_lshift_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2 ) },
|
|
|
|
.pack_rshift_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0 ) },
|
|
|
|
// The first 4 index values are "don't care" because
|
|
// we only use the first 12 bytes of the vector,
|
|
// which are decoded from 16 bytes of Base64 characters.
|
|
.pack_permute_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
0, 0, 0, 0,
|
|
0, 1, 2,
|
|
4, 5, 6,
|
|
8, 9, 10,
|
|
12, 13, 14 ) }
|
|
};
|
|
|
|
const unsigned block_size = 16; // number of bytes to process in each pass through the loop
|
|
const unsigned block_size_shift = 4;
|
|
|
|
// According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
|
|
Register s = R3_ARG1; // source starting address of Base64 characters
|
|
Register sp = R4_ARG2; // source offset
|
|
Register sl = R5_ARG3; // source length = # of Base64 characters to be processed
|
|
Register d = R6_ARG4; // destination address
|
|
Register dp = R7_ARG5; // destination offset
|
|
Register isURL = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
|
|
Register isMIME = R9_ARG7; // boolean, if non-zero indicates use of RFC 2045 MIME encoding - not used
|
|
|
|
// Local variables
|
|
Register const_ptr = R9; // used for loading constants
|
|
Register tmp_reg = R10; // used for speeding up load_constant_optimized()
|
|
|
|
// Re-use R9 and R10 to avoid using non-volatile registers (requires save/restore)
|
|
Register out = R9; // moving out (destination) pointer
|
|
Register in = R10; // moving in (source) pointer
|
|
|
|
// Volatile VSRS are 0..13, 32..51 (VR0..VR13)
|
|
// VR Constants
|
|
VectorRegister vec_0s = VR0;
|
|
VectorRegister vec_4s = VR1;
|
|
VectorRegister vec_8s = VR2;
|
|
VectorRegister vec_special_case_char = VR3;
|
|
VectorRegister pack_rshift = VR4;
|
|
VectorRegister pack_lshift = VR5;
|
|
|
|
// VSR Constants
|
|
VectorSRegister offsetLUT = VSR0;
|
|
VectorSRegister maskLUT = VSR1;
|
|
VectorSRegister bitposLUT = VSR2;
|
|
VectorSRegister vec_0xfs = VSR3;
|
|
VectorSRegister vec_special_case_offset = VSR4;
|
|
VectorSRegister pack_permute = VSR5;
|
|
|
|
// P10 (or later) VSR lookup constants
|
|
VectorSRegister table_32_47 = VSR0;
|
|
VectorSRegister table_48_63 = VSR1;
|
|
VectorSRegister table_64_79 = VSR2;
|
|
VectorSRegister table_80_95 = VSR3;
|
|
VectorSRegister table_96_111 = VSR4;
|
|
VectorSRegister table_112_127 = VSR6;
|
|
|
|
// Data read in and later converted
|
|
VectorRegister input = VR6;
|
|
// Variable for testing Base64 validity
|
|
VectorRegister non_match = VR10;
|
|
|
|
// P9 VR Variables for lookup
|
|
VectorRegister higher_nibble = VR7;
|
|
VectorRegister eq_special_case_char = VR8;
|
|
VectorRegister offsets = VR9;
|
|
|
|
// P9 VSR lookup variables
|
|
VectorSRegister bit = VSR6;
|
|
VectorSRegister lower_nibble = VSR7;
|
|
VectorSRegister M = VSR8;
|
|
|
|
// P10 (or later) VSR lookup variables
|
|
VectorSRegister xlate_a = VSR7;
|
|
VectorSRegister xlate_b = VSR8;
|
|
|
|
// Variables for pack
|
|
// VR
|
|
VectorRegister l = VR7; // reuse higher_nibble's register
|
|
VectorRegister r = VR8; // reuse eq_special_case_char's register
|
|
VectorRegister gathered = VR10; // reuse non_match's register
|
|
|
|
Label not_URL, calculate_size, loop_start, loop_exit, return_zero;
|
|
|
|
// The upper 32 bits of the non-pointer parameter registers are not
|
|
// guaranteed to be zero, so mask off those upper bits.
|
|
__ clrldi(sp, sp, 32);
|
|
__ clrldi(sl, sl, 32);
|
|
|
|
// Don't handle the last 4 characters of the source, because this
|
|
// VSX-based algorithm doesn't handle padding characters. Also the
|
|
// vector code will always write 16 bytes of decoded data on each pass,
|
|
// but only the first 12 of those 16 bytes are valid data (16 base64
|
|
// characters become 12 bytes of binary data), so for this reason we
|
|
// need to subtract an additional 8 bytes from the source length, in
|
|
// order not to write past the end of the destination buffer. The
|
|
// result of this subtraction implies that a Java function in the
|
|
// Base64 class will be used to process the last 12 characters.
|
|
__ sub(sl, sl, sp);
|
|
__ subi(sl, sl, 12);
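    // Added worked example (not in the original sources): for sl - sp == 100
    // Base64 characters, sl becomes 88 here, giving 88 >> 4 == 5 vector passes
    // below. Those passes consume 80 characters and produce 60 output bytes;
    // the remaining 20 characters (including any '=' padding) are left to the
    // Java code in the Base64 class.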
|
|
|
|
// Load CTR with the number of passes through the loop
|
|
// = sl >> block_size_shift. After the shift, if sl <= 0, there's too
|
|
// little data to be processed by this intrinsic.
|
|
__ srawi_(sl, sl, block_size_shift);
|
|
__ ble(CR0, return_zero);
|
|
__ mtctr(sl);
|
|
|
|
// Clear the other two parameter registers upper 32 bits.
|
|
__ clrldi(isURL, isURL, 32);
|
|
__ clrldi(dp, dp, 32);
|
|
|
|
// Load constant vec registers that need to be loaded from memory
|
|
__ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
|
|
__ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr);
|
|
__ lxv(pack_rshift->to_vsr(), BLK_OFFSETOF(pack_rshift_val), const_ptr);
|
|
__ lxv(pack_lshift->to_vsr(), BLK_OFFSETOF(pack_lshift_val), const_ptr);
|
|
__ lxv(pack_permute, BLK_OFFSETOF(pack_permute_val), const_ptr);
|
|
|
|
// Splat the constants that can use xxspltib
|
|
__ xxspltib(vec_0s->to_vsr(), 0);
|
|
__ xxspltib(vec_8s->to_vsr(), 8);
|
|
if (PowerArchitecturePPC64 >= 10) {
|
|
// Using VALID_B64 for the offsets effectively strips the upper bit
|
|
// of each byte that was selected from the table. Setting the upper
|
|
// bit gives us a way to distinguish between the 6-bit value of 0
|
|
// from an error code of 0, which will happen if the character is
|
|
// outside the range of the lookup, or is an illegal Base64
|
|
// character, such as %.
|
|
__ xxspltib(offsets->to_vsr(), VALID_B64);
|
|
|
|
__ lxv(table_48_63, BLK_OFFSETOF(table_48_63_val), const_ptr);
|
|
__ lxv(table_64_79, BLK_OFFSETOF(table_64_79_val), const_ptr);
|
|
__ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr);
|
|
__ lxv(table_96_111, BLK_OFFSETOF(table_96_111_val), const_ptr);
|
|
__ lxv(table_112_127, BLK_OFFSETOF(table_112_127_val), const_ptr);
|
|
} else {
|
|
__ xxspltib(vec_4s->to_vsr(), 4);
|
|
__ xxspltib(vec_0xfs, 0xf);
|
|
__ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr);
|
|
}
|
|
|
|
// The rest of the constants use different values depending on the
|
|
// setting of isURL
|
|
__ cmpwi(CR0, isURL, 0);
|
|
__ beq(CR0, not_URL);
|
|
|
|
// isURL != 0 (true)
|
|
if (PowerArchitecturePPC64 >= 10) {
|
|
__ lxv(table_32_47, BLK_OFFSETOF(table_32_47_URL_val), const_ptr);
|
|
__ lxv(table_80_95, BLK_OFFSETOF(table_80_95_URL_val), const_ptr);
|
|
} else {
|
|
__ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_URL_val), const_ptr);
|
|
__ lxv(maskLUT, BLK_OFFSETOF(maskLUT_URL_val), const_ptr);
|
|
__ xxspltib(vec_special_case_char->to_vsr(), '_');
|
|
__ xxspltib(vec_special_case_offset, (unsigned char)US);
|
|
}
|
|
__ b(calculate_size);
|
|
|
|
// isURL = 0 (false)
|
|
__ bind(not_URL);
|
|
if (PowerArchitecturePPC64 >= 10) {
|
|
__ lxv(table_32_47, BLK_OFFSETOF(table_32_47_val), const_ptr);
|
|
__ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr);
|
|
} else {
|
|
__ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_val), const_ptr);
|
|
__ lxv(maskLUT, BLK_OFFSETOF(maskLUT_val), const_ptr);
|
|
__ xxspltib(vec_special_case_char->to_vsr(), '/');
|
|
__ xxspltib(vec_special_case_offset, (unsigned char)SLS);
|
|
}
|
|
|
|
__ bind(calculate_size);
|
|
|
|
// out starts at d + dp
|
|
__ add(out, d, dp);
|
|
|
|
// in starts at s + sp
|
|
__ add(in, s, sp);
|
|
|
|
__ align(32);
|
|
__ bind(loop_start);
|
|
__ lxv(input->to_vsr(), 0, in); // offset=0
|
|
|
|
//
|
|
// Lookup
|
|
//
|
|
if (PowerArchitecturePPC64 >= 10) {
|
|
// Use xxpermx to do a lookup of each Base64 character in the
|
|
// input vector and translate it to a 6-bit value + 0x80.
|
|
// Characters which are not valid Base64 characters will result
|
|
// in a zero in the corresponding byte.
|
|
//
|
|
// Note that due to align(32) call above, the xxpermx instructions do
|
|
// not require align_prefix() calls, since the final xxpermx
|
|
// prefix+opcode is at byte 24.
|
|
__ xxpermx(xlate_a, table_32_47, table_48_63, input->to_vsr(), 1); // offset=4
|
|
__ xxpermx(xlate_b, table_64_79, table_80_95, input->to_vsr(), 2); // offset=12
|
|
__ xxlor(xlate_b, xlate_a, xlate_b); // offset=20
|
|
__ xxpermx(xlate_a, table_96_111, table_112_127, input->to_vsr(), 3); // offset=24
|
|
__ xxlor(input->to_vsr(), xlate_a, xlate_b);
|
|
// Check for non-Base64 characters by comparing each byte to zero.
|
|
__ vcmpequb_(non_match, input, vec_0s);
|
|
} else {
|
|
// Isolate the upper 4 bits of each character by shifting it right 4 bits
|
|
__ vsrb(higher_nibble, input, vec_4s);
|
|
// Isolate the lower 4 bits by masking
|
|
__ xxland(lower_nibble, input->to_vsr(), vec_0xfs);
|
|
|
|
// Get the offset (the value to subtract from the byte) by using
|
|
// a lookup table indexed by the upper 4 bits of the character
|
|
__ xxperm(offsets->to_vsr(), offsetLUT, higher_nibble->to_vsr());
|
|
|
|
// Find out which elements are the special case character (isURL ? '/' : '-')
|
|
__ vcmpequb(eq_special_case_char, input, vec_special_case_char);
|
|
|
|
// For each character in the input which is a special case
|
|
// character, replace its offset with one that is special for that
|
|
// character.
|
|
__ xxsel(offsets->to_vsr(), offsets->to_vsr(), vec_special_case_offset, eq_special_case_char->to_vsr());
|
|
|
|
// Use the lower_nibble to select a mask "M" from the lookup table.
|
|
__ xxperm(M, maskLUT, lower_nibble);
|
|
|
|
// "bit" is used to isolate which of the bits in M is relevant.
|
|
__ xxperm(bit, bitposLUT, higher_nibble->to_vsr());
|
|
|
|
      // Each element of non_match corresponds to one of the 16 input
// characters. Those elements that become 0x00 after the xxland
|
|
// instruction are invalid Base64 characters.
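      // Added worked example (for illustration only): for 'A' (0x41) the higher
      // nibble 4 selects bit 0x10 from bitposLUT and the lower nibble 1 selects
      // mask 0b11111000 from maskLUT; 0x10 & 0b11111000 != 0, so 'A' is accepted.
      // For '%' (0x25) the selected bit is not set in the corresponding mask, the
      // AND yields 0x00, and the character is flagged as invalid.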
|
|
__ xxland(non_match->to_vsr(), M, bit);
|
|
|
|
// Compare each element to zero
|
|
//
|
|
__ vcmpequb_(non_match, non_match, vec_0s);
|
|
}
|
|
    // vcmpequb_ sets the EQ bit of CR6 if no elements compare equal.
// Any element comparing equal to zero means there is an error in
|
|
// that element. Note that the comparison result register
|
|
// non_match is not referenced again. Only CR6-EQ matters.
|
|
__ bne_predict_not_taken(CR6, loop_exit);
|
|
|
|
// The Base64 characters had no errors, so add the offsets, which in
|
|
// the case of Power10 is a constant vector of all 0x80's (see earlier
|
|
// comment where the offsets register is loaded).
|
|
__ vaddubm(input, input, offsets);
|
|
|
|
// Pack
|
|
//
|
|
// In the tables below, b0, b1, .. b15 are the bytes of decoded
|
|
// binary data, the first line of each of the cells (except for
|
|
// the constants) uses the bit-field nomenclature from the
|
|
// above-linked paper, whereas the second line is more specific
|
|
// about which exact bits are present, and is constructed using the
|
|
// Power ISA 3.x document style, where:
|
|
//
|
|
// * The specifier after the colon depicts which bits are there.
|
|
// * The bit numbering is big endian style (bit 0 is the most
|
|
// significant).
|
|
// * || is a concatenate operator.
|
|
// * Strings of 0's are a field of zeros with the shown length, and
|
|
// likewise for strings of 1's.
|
|
|
|
// Note that only e12..e15 are shown here because the shifting
|
|
// and OR'ing pattern replicates for e8..e11, e4..7, and
|
|
// e0..e3.
|
|
//
|
|
// +======================+=================+======================+======================+=============+
|
|
// | Vector | e12 | e13 | e14 | e15 |
|
|
// | Element | | | | |
|
|
// +======================+=================+======================+======================+=============+
|
|
// | after vaddubm | 00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa |
|
|
// | | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
|
|
// +----------------------+-----------------+----------------------+----------------------+-------------+
|
|
// | pack_lshift | | << 6 | << 4 | << 2 |
|
|
// +----------------------+-----------------+----------------------+----------------------+-------------+
|
|
// | l after vslb | 00dddddd | cc000000 | bbbb0000 | aaaaaa00 |
|
|
// | | 00||b2:2..7 | b2:0..1||000000 | b1:0..3||0000 | b0:0..5||00 |
|
|
// +----------------------+-----------------+----------------------+----------------------+-------------+
|
|
// | l after vslo | cc000000 | bbbb0000 | aaaaaa00 | 00000000 |
|
|
// | | b2:0..1||000000 | b1:0..3||0000 | b0:0..5||00 | 00000000 |
|
|
// +----------------------+-----------------+----------------------+----------------------+-------------+
|
|
// | pack_rshift | | >> 2 | >> 4 | |
|
|
// +----------------------+-----------------+----------------------+----------------------+-------------+
|
|
// | r after vsrb | 00dddddd | 0000cccc | 000000bb | 00aaaaaa |
|
|
// | | 00||b2:2..7 | 0000||b1:4..7 | 000000||b0:6..7 | 00||b0:0..5 |
|
|
// +----------------------+-----------------+----------------------+----------------------+-------------+
|
|
// | gathered after xxlor | ccdddddd | bbbbcccc | aaaaaabb | 00aaaaaa |
|
|
// | | b2:0..7 | b1:0..7 | b0:0..7 | 00||b0:0..5 |
|
|
// +======================+=================+======================+======================+=============+
|
|
//
|
|
// Note: there is a typo in the above-linked paper that shows the result of the gathering process is:
|
|
// [ddddddcc|bbbbcccc|aaaaaabb]
|
|
// but should be:
|
|
// [ccdddddd|bbbbcccc|aaaaaabb]
|
|
//
|
|
__ vslb(l, input, pack_lshift);
|
|
// vslo of vec_8s shifts the vector by one octet toward lower
|
|
// element numbers, discarding element 0. This means it actually
|
|
// shifts to the right (not left) according to the order of the
|
|
// table above.
|
|
__ vslo(l, l, vec_8s);
|
|
__ vsrb(r, input, pack_rshift);
|
|
__ xxlor(gathered->to_vsr(), l->to_vsr(), r->to_vsr());
|
|
|
|
// Final rearrangement of bytes into their correct positions.
|
|
// +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
|
|
// | Vector | e0 | e1 | e2 | e3 | e4 | e5 | e6 | e7 | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 |
|
|
// | Elements | | | | | | | | | | | | | | | | |
|
|
// +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
|
|
// | after xxlor | b11 | b10 | b9 | xx | b8 | b7 | b6 | xx | b5 | b4 | b3 | xx | b2 | b1 | b0 | xx |
|
|
// +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
|
|
// | pack_permute | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 4 | 5 | 6 | 8 | 9 | 10 | 12 | 13 | 14 |
|
|
// +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
|
|
// | after xxperm | b11* | b11* | b11* | b11* | b11 | b10 | b9 | b8 | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
|
|
// +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
|
|
// xx bytes are not used to form the final data
|
|
// b0..b15 are the decoded and reassembled 8-bit bytes of data
|
|
// b11 with asterisk is a "don't care", because these bytes will be
|
|
// overwritten on the next iteration.
|
|
__ xxperm(gathered->to_vsr(), gathered->to_vsr(), pack_permute);
|
|
|
|
// We cannot use a static displacement on the store, since it's a
|
|
// multiple of 12, not 16. Note that this stxv instruction actually
|
|
// writes 16 bytes, even though only the first 12 are valid data.
|
|
__ stxv(gathered->to_vsr(), 0, out);
|
|
__ addi(out, out, 12);
|
|
__ addi(in, in, 16);
|
|
__ bdnz(loop_start);
|
|
|
|
__ bind(loop_exit);
|
|
|
|
// Return the number of out bytes produced, which is (out - (d + dp)) == out - d - dp;
|
|
__ sub(R3_RET, out, d);
|
|
__ sub(R3_RET, R3_RET, dp);
|
|
|
|
__ blr();
|
|
|
|
__ bind(return_zero);
|
|
__ li(R3_RET, 0);
|
|
__ blr();
|
|
|
|
return start;
|
|
}
|
|
|
|
#undef UC
#undef LC
#undef DIG
#undef PLS
#undef HYP
#undef SLS
#undef US

// This algorithm is based on the methods described in this paper:
|
|
// http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html
|
|
//
|
|
// The details of this implementation vary from the paper due to the
|
|
// difference in the ISA between SSE and AltiVec, especially in the
|
|
// splitting bytes section where there is no need on Power to mask after
|
|
  // the shift because the shift is byte-wise rather than across an entire
// 128-bit word.
|
|
//
|
|
// For the lookup part of the algorithm, different logic is used than
|
|
// described in the paper because of the availability of vperm, which can
|
|
// do a 64-byte table lookup in four instructions, while preserving the
|
|
// branchless nature.
|
|
//
|
|
// Description of the ENCODE_CORE macro
|
|
//
|
|
// Expand first 12 x 8-bit data bytes into 16 x 6-bit bytes (upper 2
|
|
// bits of each byte are zeros)
|
|
//
|
|
// (Note: e7..e0 are not shown because they follow the same pattern as
|
|
// e8..e15)
|
|
//
|
|
// In the table below, b0, b1, .. b15 are the bytes of unencoded
|
|
// binary data, the first line of each of the cells (except for
|
|
// the constants) uses the bit-field nomenclature from the
|
|
// above-linked paper, whereas the second line is more specific
|
|
// about which exact bits are present, and is constructed using the
|
|
// Power ISA 3.x document style, where:
|
|
//
|
|
// * The specifier after the colon depicts which bits are there.
|
|
// * The bit numbering is big endian style (bit 0 is the most
|
|
// significant).
|
|
// * || is a concatenate operator.
|
|
// * Strings of 0's are a field of zeros with the shown length, and
|
|
// likewise for strings of 1's.
|
|
//
|
|
// +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
|
|
// | Vector | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 |
|
|
// | Element | | | | | | | | |
|
|
// +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
|
|
// | after lxv | jjjjkkkk | iiiiiijj | gghhhhhh | ffffgggg | eeeeeeff | ccdddddd | bbbbcccc | aaaaaabb |
|
|
// | | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
|
|
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
|
// | xxperm indexes | 0 | 10 | 11 | 12 | 0 | 13 | 14 | 15 |
|
|
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
|
// | (1) after xxperm | | gghhhhhh | ffffgggg | eeeeeeff | | ccdddddd | bbbbcccc | aaaaaabb |
|
|
// | | (b15) | b5 | b4 | b3 | (b15) | b2 | b1 | b0 |
|
|
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
|
// | rshift_amount | 0 | 6 | 4 | 2 | 0 | 6 | 4 | 2 |
|
|
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
|
// | after vsrb | | 000000gg | 0000ffff | 00eeeeee | | 000000cc | 0000bbbb | 00aaaaaa |
|
|
// | | (b15) | 000000||b5:0..1 | 0000||b4:0..3 | 00||b3:0..5 | (b15) | 000000||b2:0..1 | 0000||b1:0..3 | 00||b0:0..5 |
|
|
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
|
// | rshift_mask | 00000000 | 000000||11 | 0000||1111 | 00||111111 | 00000000 | 000000||11 | 0000||1111 | 00||111111 |
|
|
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
|
// | rshift after vand | 00000000 | 000000gg | 0000ffff | 00eeeeee | 00000000 | 000000cc | 0000bbbb | 00aaaaaa |
|
|
// | | 00000000 | 000000||b5:0..1 | 0000||b4:0..3 | 00||b3:0..5 | 00000000 | 000000||b2:0..1 | 0000||b1:0..3 | 00||b0:0..5 |
|
|
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
|
// | 1 octet lshift (1) | gghhhhhh | ffffgggg | eeeeeeff | | ccdddddd | bbbbcccc | aaaaaabb | 00000000 |
|
|
// | | b5 | b4 | b3 | (b15) | b2 | b1 | b0 | 00000000 |
|
|
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
|
// | lshift_amount | 0 | 2 | 4 | 0 | 0 | 2 | 4 | 0 |
|
|
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
|
// | after vslb | gghhhhhh | ffgggg00 | eeff0000 | | ccdddddd | bbcccc00 | aabb0000 | 00000000 |
|
|
// | | b5 | b4:2..7||00 | b3:4..7||0000 | (b15) | b2:0..7 | b1:2..7||00 | b0:4..7||0000 | 00000000 |
|
|
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
|
// | lshift_mask | 00||111111 | 00||1111||00 | 00||11||0000 | 00000000 | 00||111111 | 00||1111||00 | 00||11||0000 | 00000000 |
|
|
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
|
// | lshift after vand | 00hhhhhh | 00gggg00 | 00ff0000 | 00000000 | 00dddddd | 00cccc00 | 00bb0000 | 00000000 |
|
|
// | | 00||b5:2..7 | 00||b4:4..7||00 | 00||b3:6..7||0000 | 00000000 | 00||b2:2..7 | 00||b1:4..7||00 | 00||b0:6..7||0000 | 00000000 |
|
|
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
|
// | after vor lshift, rshift | 00hhhhhh | 00gggggg | 00ffffff | 00eeeeee | 00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa |
|
|
// | | 00||b5:2..7 | 00||b4:4..7||b5:0..1 | 00||b3:6..7||b4:0..3 | 00||b3:0..5 | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
|
|
// +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
|
|
//
|
|
// Expand the first 12 bytes into 16 bytes, leaving every 4th byte
|
|
// blank for now.
|
|
// __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute);
|
|
//
|
|
// Generate two bit-shifted pieces - rshift and lshift - that will
|
|
// later be OR'd together.
|
|
//
|
|
// First the right-shifted piece
|
|
// __ vsrb(rshift, input, expand_rshift);
|
|
// __ vand(rshift, rshift, expand_rshift_mask);
|
|
//
|
|
// Now the left-shifted piece, which is done by octet shifting
|
|
// the input one byte to the left, then doing a variable shift,
|
|
// followed by a mask operation.
|
|
//
|
|
// __ vslo(lshift, input, vec_8s);
|
|
// __ vslb(lshift, lshift, expand_lshift);
|
|
// __ vand(lshift, lshift, expand_lshift_mask);
|
|
//
|
|
// Combine the two pieces by OR'ing
|
|
// __ vor(expanded, rshift, lshift);
|
|
//
|
|
// At this point, expanded is a vector containing a 6-bit value in each
|
|
// byte. These values are used as indexes into a 64-byte lookup table that
|
|
// is contained in four vector registers. The lookup operation is done
|
|
// using vperm instructions with the same indexes for the lower 32 and
|
|
// upper 32 bytes. To figure out which of the two looked-up bytes to use
|
|
// at each location, all values in expanded are compared to 31. Using
|
|
// vsel, values higher than 31 use the results from the upper 32 bytes of
|
|
// the lookup operation, while values less than or equal to 31 use the
|
|
// lower 32 bytes of the lookup operation.
|
|
//
|
|
// Note: it's tempting to use a xxpermx,xxpermx,vor sequence here on
|
|
// Power10 (or later), but experiments doing so on Power10 yielded a slight
|
|
// performance drop, perhaps due to the need for xxpermx instruction
|
|
// prefixes.
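  // Added worked example (not in the original sources): a 6-bit value of 40 is
  // greater than 31, so vsel picks the byte produced by the second vperm; that
  // vperm uses the low five bits (40 & 0x1f == 8) to select element 8 of
  // vec_base64_32_47, which is 'o' -- the Base64 alphabet character for 40.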
|
|
|
|
#define ENCODE_CORE \
|
|
__ xxperm(input->to_vsr(), input->to_vsr(), expand_permute); \
|
|
__ vsrb(rshift, input, expand_rshift); \
|
|
__ vand(rshift, rshift, expand_rshift_mask); \
|
|
__ vslo(lshift, input, vec_8s); \
|
|
__ vslb(lshift, lshift, expand_lshift); \
|
|
__ vand(lshift, lshift, expand_lshift_mask); \
|
|
__ vor(expanded, rshift, lshift); \
|
|
__ vperm(encoded_00_31, vec_base64_00_15, vec_base64_16_31, expanded); \
|
|
__ vperm(encoded_32_63, vec_base64_32_47, vec_base64_48_63, expanded); \
|
|
__ vcmpgtub(gt_31, expanded, vec_31s); \
|
|
__ vsel(expanded, encoded_00_31, encoded_32_63, gt_31);
|
|
|
|
// Intrinsic function prototype in Base64.java:
|
|
// private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) {
|
|
|
|
address generate_base64_encodeBlock() {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ function_entry();
|
|
|
|
typedef struct {
|
|
unsigned char expand_permute_val[16];
|
|
unsigned char expand_rshift_val[16];
|
|
unsigned char expand_rshift_mask_val[16];
|
|
unsigned char expand_lshift_val[16];
|
|
unsigned char expand_lshift_mask_val[16];
|
|
unsigned char base64_00_15_val[16];
|
|
unsigned char base64_16_31_val[16];
|
|
unsigned char base64_32_47_val[16];
|
|
unsigned char base64_48_63_val[16];
|
|
unsigned char base64_48_63_URL_val[16];
|
|
} constant_block;
|
|
|
|
alignas(16) static const constant_block const_block = {
|
|
.expand_permute_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
0, 4, 5, 6,
|
|
0, 7, 8, 9,
|
|
0, 10, 11, 12,
|
|
0, 13, 14, 15 ) },
|
|
|
|
.expand_rshift_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
0, 6, 4, 2,
|
|
0, 6, 4, 2,
|
|
0, 6, 4, 2,
|
|
0, 6, 4, 2 ) },
|
|
|
|
.expand_rshift_mask_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
0b00000000, 0b00000011, 0b00001111, 0b00111111,
|
|
0b00000000, 0b00000011, 0b00001111, 0b00111111,
|
|
0b00000000, 0b00000011, 0b00001111, 0b00111111,
|
|
0b00000000, 0b00000011, 0b00001111, 0b00111111 ) },
|
|
|
|
.expand_lshift_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
0, 2, 4, 0,
|
|
0, 2, 4, 0,
|
|
0, 2, 4, 0,
|
|
0, 2, 4, 0 ) },
|
|
|
|
.expand_lshift_mask_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
0b00111111, 0b00111100, 0b00110000, 0b00000000,
|
|
0b00111111, 0b00111100, 0b00110000, 0b00000000,
|
|
0b00111111, 0b00111100, 0b00110000, 0b00000000,
|
|
0b00111111, 0b00111100, 0b00110000, 0b00000000 ) },
|
|
|
|
.base64_00_15_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P' ) },
|
|
|
|
.base64_16_31_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
'Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f' ) },
|
|
|
|
.base64_32_47_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
'g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v' ) },
|
|
|
|
.base64_48_63_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
'w','x','y','z','0','1','2','3','4','5','6','7','8','9','+','/' ) },
|
|
|
|
.base64_48_63_URL_val = {
|
|
ARRAY_TO_LXV_ORDER(
|
|
'w','x','y','z','0','1','2','3','4','5','6','7','8','9','-','_' ) }
|
|
};
|
|
|
|
// Number of bytes to process in each pass through the main loop.
|
|
// 12 of the 16 bytes from each lxv are encoded to 16 Base64 bytes.
|
|
const unsigned block_size = 12;
|
|
|
|
// According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
|
|
Register src = R3_ARG1; // source starting address of Base64 characters
|
|
Register sp = R4_ARG2; // source starting position
|
|
Register sl = R5_ARG3; // total source length of the Base64 characters to be processed
|
|
Register dst = R6_ARG4; // destination address
|
|
Register dp = R7_ARG5; // destination starting position
|
|
Register isURL = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
|
|
|
|
// Local variables
|
|
Register const_ptr = R12; // used for loading constants (reuses isURL's register)
|
|
Register tmp_reg = R9; // used for speeding up load_constant()
|
|
|
|
Register size = R9; // number of bytes to process (reuses tmp_reg's register)
|
|
Register blocked_size = R10; // number of bytes to process a block at a time
|
|
Register block_modulo = R12; // == block_size (reuse const_ptr)
|
|
Register remaining = R12; // bytes remaining to process after the blocks are completed (reuse block_modulo's reg)
|
|
Register in = R4; // current input (source) pointer (reuse sp's register)
|
|
Register num_blocks = R11; // number of blocks to be processed by the loop
|
|
Register out = R8; // current output (destination) pointer (reuse const_ptr's register)
|
|
Register three = R9; // constant divisor (reuse size's register)
|
|
    Register bytes_to_write  = R10; // number of bytes to write with the stxvl instr (reuses blocked_size's register)
Register tmp1 = R7; // temp register for lxvl length (reuse dp's register)
|
|
Register modulo_chars = R7; // number of bytes written during the final write % 4 (reuse tmp1's register)
|
|
Register pad_char = R6; // literal '=' (reuse dst's register)
|
|
|
|
// Volatile VSRS are 0..13, 32..51 (VR0..VR13)
|
|
// VR Constants
|
|
VectorRegister vec_8s = VR0;
|
|
VectorRegister vec_31s = VR1;
|
|
VectorRegister vec_base64_00_15 = VR2;
|
|
VectorRegister vec_base64_16_31 = VR3;
|
|
VectorRegister vec_base64_32_47 = VR4;
|
|
VectorRegister vec_base64_48_63 = VR5;
|
|
VectorRegister expand_rshift = VR6;
|
|
VectorRegister expand_rshift_mask = VR7;
|
|
VectorRegister expand_lshift = VR8;
|
|
VectorRegister expand_lshift_mask = VR9;
|
|
|
|
// VR variables for expand
|
|
VectorRegister input = VR10;
|
|
VectorRegister rshift = VR11;
|
|
VectorRegister lshift = VR12;
|
|
VectorRegister expanded = VR13;
|
|
|
|
// VR variables for lookup
|
|
VectorRegister encoded_00_31 = VR10; // (reuse input)
|
|
VectorRegister encoded_32_63 = VR11; // (reuse rshift)
|
|
VectorRegister gt_31 = VR12; // (reuse lshift)
|
|
|
|
// VSR Constants
|
|
VectorSRegister expand_permute = VSR0;
|
|
|
|
Label not_URL, calculate_size, calculate_blocked_size, skip_loop;
|
|
Label loop_start, le_16_to_write, no_pad, one_pad_char;
|
|
|
|
// The upper 32 bits of the non-pointer parameter registers are not
|
|
// guaranteed to be zero, so mask off those upper bits.
|
|
__ clrldi(sp, sp, 32);
|
|
__ clrldi(sl, sl, 32);
|
|
__ clrldi(dp, dp, 32);
|
|
__ clrldi(isURL, isURL, 32);
|
|
|
|
// load up the constants
|
|
__ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
|
|
__ lxv(expand_permute, BLK_OFFSETOF(expand_permute_val), const_ptr);
|
|
__ lxv(expand_rshift->to_vsr(), BLK_OFFSETOF(expand_rshift_val), const_ptr);
|
|
__ lxv(expand_rshift_mask->to_vsr(), BLK_OFFSETOF(expand_rshift_mask_val), const_ptr);
|
|
__ lxv(expand_lshift->to_vsr(), BLK_OFFSETOF(expand_lshift_val), const_ptr);
|
|
__ lxv(expand_lshift_mask->to_vsr(), BLK_OFFSETOF(expand_lshift_mask_val), const_ptr);
|
|
__ lxv(vec_base64_00_15->to_vsr(), BLK_OFFSETOF(base64_00_15_val), const_ptr);
|
|
__ lxv(vec_base64_16_31->to_vsr(), BLK_OFFSETOF(base64_16_31_val), const_ptr);
|
|
__ lxv(vec_base64_32_47->to_vsr(), BLK_OFFSETOF(base64_32_47_val), const_ptr);
|
|
|
|
// Splat the constants that can use xxspltib
|
|
__ xxspltib(vec_8s->to_vsr(), 8);
|
|
__ xxspltib(vec_31s->to_vsr(), 31);
|
|
|
|
|
|
// Use a different translation lookup table depending on the
|
|
// setting of isURL
|
|
__ cmpdi(CR0, isURL, 0);
|
|
__ beq(CR0, not_URL);
|
|
__ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_URL_val), const_ptr);
|
|
__ b(calculate_size);
|
|
|
|
__ bind(not_URL);
|
|
__ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_val), const_ptr);
|
|
|
|
__ bind(calculate_size);
|
|
|
|
// size = sl - sp - 4 (*)
|
|
// (*) Don't process the last four bytes in the main loop because
|
|
// we don't want the lxv instruction to read past the end of the src
|
|
// data, in case those four bytes are on the start of an unmapped or
|
|
// otherwise inaccessible page.
|
|
//
|
|
__ sub(size, sl, sp);
|
|
__ subi(size, size, 4);
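    // Added worked example (not in the original sources): with sl - sp == 20
    // source bytes, size becomes 16, so the loop below runs one pass
    // (num_blocks == 16 / 12 == 1) and its lxv reads source bytes 0..15, which
    // stays inside the 20-byte input. The remaining 8 bytes (4 left over plus
    // the 4 reserved here) are handled by the length-limited lxvl/stxvl tail.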
|
|
__ cmpdi(CR7, size, block_size);
|
|
__ bgt(CR7, calculate_blocked_size);
|
|
__ mr(remaining, size);
|
|
// Add the 4 back into remaining again
|
|
__ addi(remaining, remaining, 4);
|
|
// make "in" point to the beginning of the source data: in = src + sp
|
|
__ add(in, src, sp);
|
|
// out = dst + dp
|
|
__ add(out, dst, dp);
|
|
__ b(skip_loop);
|
|
|
|
__ bind(calculate_blocked_size);
|
|
__ li(block_modulo, block_size);
|
|
// num_blocks = size / block_modulo
|
|
__ divwu(num_blocks, size, block_modulo);
|
|
    // blocked_size = num_blocks * block_modulo
__ mullw(blocked_size, num_blocks, block_modulo);
|
|
// remaining = size - blocked_size
|
|
__ sub(remaining, size, blocked_size);
|
|
__ mtctr(num_blocks);
|
|
|
|
// Add the 4 back in to remaining again
|
|
__ addi(remaining, remaining, 4);
|
|
|
|
// make "in" point to the beginning of the source data: in = src + sp
|
|
__ add(in, src, sp);
|
|
|
|
// out = dst + dp
|
|
__ add(out, dst, dp);
|
|
|
|
__ align(32);
|
|
__ bind(loop_start);
|
|
|
|
__ lxv(input->to_vsr(), 0, in);
|
|
|
|
ENCODE_CORE
|
|
|
|
__ stxv(expanded->to_vsr(), 0, out);
|
|
__ addi(in, in, 12);
|
|
__ addi(out, out, 16);
|
|
__ bdnz(loop_start);
|
|
|
|
__ bind(skip_loop);
|
|
|
|
// When there are less than 16 bytes left, we need to be careful not to
|
|
// read beyond the end of the src buffer, which might be in an unmapped
|
|
// page.
|
|
// Load the remaining bytes using lxvl.
|
|
__ rldicr(tmp1, remaining, 56, 7);
|
|
__ lxvl(input->to_vsr(), in, tmp1);
|
|
|
|
ENCODE_CORE
|
|
|
|
// bytes_to_write = ((remaining * 4) + 2) / 3
|
|
__ li(three, 3);
|
|
__ rlwinm(bytes_to_write, remaining, 2, 0, 29); // remaining * 4
|
|
__ addi(bytes_to_write, bytes_to_write, 2);
|
|
__ divwu(bytes_to_write, bytes_to_write, three);
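    // Added worked example (not in the original sources): the formula is the
    // usual ceiling of remaining * 4 / 3, e.g. remaining == 5 unencoded bytes
    // give (5 * 4 + 2) / 3 == 7 Base64 characters, and remaining == 12 gives
    // exactly 16.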
|
|
|
|
__ cmpwi(CR7, bytes_to_write, 16);
|
|
__ ble_predict_taken(CR7, le_16_to_write);
|
|
__ stxv(expanded->to_vsr(), 0, out);
|
|
|
|
// We've processed 12 of the 13-15 data bytes, so advance the pointers,
|
|
// and do one final pass for the remaining 1-3 bytes.
|
|
__ addi(in, in, 12);
|
|
__ addi(out, out, 16);
|
|
__ subi(remaining, remaining, 12);
|
|
__ subi(bytes_to_write, bytes_to_write, 16);
|
|
__ rldicr(tmp1, bytes_to_write, 56, 7);
|
|
__ lxvl(input->to_vsr(), in, tmp1);
|
|
|
|
ENCODE_CORE
|
|
|
|
__ bind(le_16_to_write);
|
|
    // shift bytes_to_write into the upper 8 bits of tmp1 for use by stxvl
__ rldicr(tmp1, bytes_to_write, 56, 7);
|
|
__ stxvl(expanded->to_vsr(), out, tmp1);
|
|
__ add(out, out, bytes_to_write);
|
|
|
|
__ li(pad_char, '=');
|
|
__ rlwinm_(modulo_chars, bytes_to_write, 0, 30, 31); // bytes_to_write % 4, set CR0
|
|
// Examples:
|
|
// remaining bytes_to_write modulo_chars num pad chars
|
|
// 0 0 0 0
|
|
// 1 2 2 2
|
|
// 2 3 3 1
|
|
// 3 4 0 0
|
|
// 4 6 2 2
|
|
// 5 7 3 1
|
|
// ...
|
|
// 12 16 0 0
|
|
// 13 18 2 2
|
|
// 14 19 3 1
|
|
// 15 20 0 0
|
|
__ beq(CR0, no_pad);
|
|
__ cmpwi(CR7, modulo_chars, 3);
|
|
__ beq(CR7, one_pad_char);
|
|
|
|
// two pad chars
|
|
__ stb(pad_char, out);
|
|
__ addi(out, out, 1);
|
|
|
|
__ bind(one_pad_char);
|
|
__ stb(pad_char, out);
|
|
|
|
__ bind(no_pad);
|
|
|
|
__ blr();
|
|
return start;
|
|
}
|
|
|
|
#endif // VM_LITTLE_ENDIAN

  void generate_lookup_secondary_supers_table_stub() {
    StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
    StubCodeMark mark(this, stub_id);

    const Register
      r_super_klass  = R4_ARG2,
      r_array_base   = R3_ARG1,
      r_array_length = R7_ARG5,
      r_array_index  = R6_ARG4,
      r_sub_klass    = R5_ARG3,
      r_bitmap       = R11_scratch1,
      result         = R8_ARG6;

    for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
      StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
      __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
                                             r_array_base, r_array_length, r_array_index,
                                             r_bitmap, result, slot);
      __ blr();
    }
  }

  // Slow path implementation for UseSecondarySupersTable.
  address generate_lookup_secondary_supers_table_slow_path_stub() {
    StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
    StubCodeMark mark(this, stub_id);

    address start = __ pc();
    const Register
      r_super_klass = R4_ARG2,
      r_array_base  = R3_ARG1,
      temp1         = R7_ARG5,
      r_array_index = R6_ARG4,
      r_bitmap      = R11_scratch1,
      result        = R8_ARG6;

    __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
    __ blr();

    return start;
  }

address generate_cont_thaw(StubId stub_id) {
|
|
if (!Continuations::enabled()) return nullptr;
|
|
|
|
Continuation::thaw_kind kind;
|
|
bool return_barrier;
|
|
bool return_barrier_exception;
|
|
|
|
switch (stub_id) {
|
|
case StubId::stubgen_cont_thaw_id:
|
|
kind = Continuation::thaw_top;
|
|
return_barrier = false;
|
|
return_barrier_exception = false;
|
|
break;
|
|
case StubId::stubgen_cont_returnBarrier_id:
|
|
kind = Continuation::thaw_return_barrier;
|
|
return_barrier = true;
|
|
return_barrier_exception = false;
|
|
break;
|
|
case StubId::stubgen_cont_returnBarrierExc_id:
|
|
kind = Continuation::thaw_return_barrier_exception;
|
|
return_barrier = true;
|
|
return_barrier_exception = true;
|
|
break;
|
|
default:
|
|
ShouldNotReachHere();
|
|
}
|
|
StubCodeMark mark(this, stub_id);
|
|
|
|
Register tmp1 = R10_ARG8;
|
|
Register tmp2 = R9_ARG7;
|
|
Register tmp3 = R8_ARG6;
|
|
Register nvtmp = R15_esp; // nonvolatile tmp register
|
|
FloatRegister nvftmp = F20; // nonvolatile fp tmp register
|
|
|
|
address start = __ pc();
|
|
|
|
if (kind == Continuation::thaw_top) {
|
|
__ clobber_nonvolatile_registers(); // Except R16_thread and R29_TOC
|
|
}
|
|
|
|
if (return_barrier) {
|
|
__ mr(nvtmp, R3_RET); __ fmr(nvftmp, F1_RET); // preserve possible return value from a method returning to the return barrier
|
|
DEBUG_ONLY(__ ld_ptr(tmp1, _abi0(callers_sp), R1_SP);)
|
|
__ ld_ptr(R1_SP, JavaThread::cont_entry_offset(), R16_thread);
|
|
#ifdef ASSERT
|
|
__ ld_ptr(tmp2, _abi0(callers_sp), R1_SP);
|
|
__ cmpd(CR0, tmp1, tmp2);
|
|
__ asm_assert_eq(FILE_AND_LINE ": callers sp is corrupt");
|
|
#endif
|
|
}
|
|
#ifdef ASSERT
|
|
__ ld_ptr(tmp1, JavaThread::cont_entry_offset(), R16_thread);
|
|
__ cmpd(CR0, R1_SP, tmp1);
|
|
__ asm_assert_eq(FILE_AND_LINE ": incorrect R1_SP");
|
|
#endif
|
|
|
|
__ li(R4_ARG2, return_barrier ? 1 : 0);
|
|
__ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), R16_thread, R4_ARG2);
|
|
|
|
#ifdef ASSERT
|
|
DEBUG_ONLY(__ ld_ptr(tmp1, JavaThread::cont_entry_offset(), R16_thread));
|
|
DEBUG_ONLY(__ cmpd(CR0, R1_SP, tmp1));
|
|
__ asm_assert_eq(FILE_AND_LINE ": incorrect R1_SP");
|
|
#endif
|
|
|
|
// R3_RET contains the size of the frames to thaw, 0 if overflow or no more frames
|
|
Label thaw_success;
|
|
__ cmpdi(CR0, R3_RET, 0);
|
|
__ bne(CR0, thaw_success);
|
|
__ load_const_optimized(tmp1, (SharedRuntime::throw_StackOverflowError_entry()), R0);
|
|
__ mtctr(tmp1); __ bctr();
|
|
__ bind(thaw_success);
|
|
|
|
__ addi(R3_RET, R3_RET, frame::native_abi_reg_args_size); // Large abi required for C++ calls.
|
|
__ neg(R3_RET, R3_RET);
|
|
// align down resulting in a smaller negative offset
|
|
__ clrrdi(R3_RET, R3_RET, exact_log2(frame::alignment_in_bytes));
|
|
DEBUG_ONLY(__ mr(tmp1, R1_SP);)
|
|
__ resize_frame(R3_RET, tmp2); // make room for the thawed frames
|
|
|
|
__ li(R4_ARG2, kind);
|
|
__ call_VM_leaf(Continuation::thaw_entry(), R16_thread, R4_ARG2);
|
|
__ mr(R1_SP, R3_RET); // R3_RET contains the SP of the thawed top frame
|
|
|
|
if (return_barrier) {
|
|
// we're now in the caller of the frame that returned to the barrier
|
|
__ mr(R3_RET, nvtmp); __ fmr(F1_RET, nvftmp); // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
|
|
} else {
|
|
      // we're now on the yield frame (which is at an address above us because the stack pointer has been pushed down)
__ li(R3_RET, 0); // return 0 (success) from doYield
|
|
}
|
|
|
|
if (return_barrier_exception) {
|
|
Register ex_pc = R17_tos; // nonvolatile register
|
|
__ ld(ex_pc, _abi0(lr), R1_SP); // LR
|
|
__ mr(nvtmp, R3_RET); // save return value containing the exception oop
|
|
// The thawed top frame has got a frame::java_abi. This is not sufficient for the runtime call.
|
|
__ push_frame_reg_args(0, tmp1);
|
|
__ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), R16_thread, ex_pc);
|
|
__ mtlr(R3_RET); // the exception handler
|
|
__ pop_frame();
|
|
// See OptoRuntime::generate_exception_blob for register arguments
|
|
__ mr(R3_ARG1, nvtmp); // exception oop
|
|
__ mr(R4_ARG2, ex_pc); // exception pc
|
|
} else {
|
|
// We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
|
|
__ ld(R0, _abi0(lr), R1_SP); // LR
|
|
__ mtlr(R0);
|
|
}
|
|
__ blr();
|
|
|
|
return start;
|
|
}

  address generate_cont_thaw() {
    return generate_cont_thaw(StubId::stubgen_cont_thaw_id);
  }

  // TODO: will probably need multiple return barriers depending on return type

  address generate_cont_returnBarrier() {
    return generate_cont_thaw(StubId::stubgen_cont_returnBarrier_id);
  }

  address generate_cont_returnBarrier_exception() {
    return generate_cont_thaw(StubId::stubgen_cont_returnBarrierExc_id);
  }

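  // Entered on the carrier thread after a preemption has frozen the continuation's
  // frames. Unless the preemption was cancelled, the enterSpecial frame is removed and
  // control returns to Continuation.run() so the continuation can be unmounted; if it
  // was cancelled (e.g. the contended monitor was acquired after freezing), the frames
  // are thawed again and execution resumes.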
  address generate_cont_preempt_stub() {
    if (!Continuations::enabled()) return nullptr;
    StubId stub_id = StubId::stubgen_cont_preempt_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    __ clobber_nonvolatile_registers(); // Except R16_thread and R29_TOC

    __ reset_last_Java_frame(false /*check_last_java_sp*/);

    // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
    __ ld_ptr(R1_SP, JavaThread::cont_entry_offset(), R16_thread);

    Label preemption_cancelled;
    __ lbz(R11_scratch1, in_bytes(JavaThread::preemption_cancelled_offset()), R16_thread);
    __ cmpwi(CR0, R11_scratch1, 0);
    __ bne(CR0, preemption_cancelled);

    // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
    SharedRuntime::continuation_enter_cleanup(_masm);
    __ pop_frame();
    __ restore_LR(R11_scratch1);
    __ blr();

    // We acquired the monitor after freezing the frames so call thaw to continue execution.
    __ bind(preemption_cancelled);
    __ li(R11_scratch1, 0); // false
    __ stb(R11_scratch1, in_bytes(JavaThread::preemption_cancelled_offset()), R16_thread);
    int simm16_offs = __ load_const_optimized(R11_scratch1, ContinuationEntry::thaw_call_pc_address(), R0, true);
    __ ld(R11_scratch1, simm16_offs, R11_scratch1);
    __ mtctr(R11_scratch1);
    __ bctr();

    return start;
  }

  // exception handler for upcall stubs
  address generate_upcall_stub_exception_handler() {
    StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // Native caller has no idea how to handle exceptions,
    // so we just crash here. Up to callee to catch exceptions.
    __ verify_oop(R3_ARG1);
    __ load_const_optimized(R12_scratch2, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception), R0);
    __ call_c(R12_scratch2);
    __ should_not_reach_here();

    return start;
  }

  // load Method* target of MethodHandle
  // R3_ARG1 = jobject receiver
  // R19_method = result Method*
  address generate_upcall_stub_load_target() {

    StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    __ resolve_global_jobject(R3_ARG1, R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS);
    // Load target method from receiver
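    // The loads below walk receiver.form.vmentry.method.vmtarget:
    // MethodHandle.form -> LambdaForm.vmentry -> MemberName.method -> ResolvedMethodName.vmtarget (a Method*).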
    __ load_heap_oop(R19_method, java_lang_invoke_MethodHandle::form_offset(), R3_ARG1,
                     R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
    __ load_heap_oop(R19_method, java_lang_invoke_LambdaForm::vmentry_offset(), R19_method,
                     R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
    __ load_heap_oop(R19_method, java_lang_invoke_MemberName::method_offset(), R19_method,
                     R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
    __ ld(R19_method, java_lang_invoke_ResolvedMethodName::vmtarget_offset(), R19_method);
    __ std(R19_method, in_bytes(JavaThread::callee_target_offset()), R16_thread); // just in case callee is deoptimized

    __ blr();

    return start;
  }

  // Initialization
  void generate_preuniverse_stubs() {
    // preuniverse stubs are not needed for ppc
  }

  void generate_initial_stubs() {
    // Generates all stubs and initializes the entry points

    // Entry points that exist in all platforms.
    // Note: This is code that could be shared among different platforms - however the
    //       benefit seems to be smaller than the disadvantage of having a
    //       much more complicated generator structure. See also comment in
    //       stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();
    StubRoutines::_call_stub_entry         = generate_call_stub(StubRoutines::_call_stub_return_address);
    StubRoutines::_catch_exception_entry   = generate_catch_exception();

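    // The table has room for the unsafe copy/set memory stubs generated later; each of
    // those stubs records its code range here so a fault inside it can be recognized
    // and resumed at the stub's error exit (see UnsafeMemoryAccess).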
    if (UnsafeMemoryAccess::_table == nullptr) {
      UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
    }

    // CRC32 Intrinsics.
    if (UseCRC32Intrinsics) {
      StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes(StubId::stubgen_updateBytesCRC32_id);
    }

    // CRC32C Intrinsics.
    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_CRC32_updateBytes(StubId::stubgen_updateBytesCRC32C_id);
    }

    if (VM_Version::supports_float16()) {
      // For results consistency both intrinsics should be enabled.
      StubRoutines::_hf2f = generate_float16ToFloat();
      StubRoutines::_f2hf = generate_floatToFloat16();
    }
  }

  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw             = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier    = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
    StubRoutines::_cont_preempt_stub     = generate_cont_preempt_stub();
  }

  void generate_final_stubs() {
    // Generates all stubs and initializes the entry points

    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();

    // nmethod entry barriers for concurrent class unloading
    StubRoutines::_method_entry_barrier = generate_method_entry_barrier();

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

#ifdef COMPILER2
    if (UseSecondarySupersTable) {
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (!InlineSecondarySupersTest) {
        generate_lookup_secondary_supers_table_stub();
      }
    }
#endif // COMPILER2

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
    StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
  }

  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }
    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }
    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }
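    // Unlike the stubs above, the Montgomery multiply/square intrinsics dispatch to the
    // shared C++ implementations in SharedRuntime rather than to generated assembly.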
    if (UseMontgomeryMultiplyIntrinsic) {
      StubRoutines::_montgomeryMultiply
        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
    }
    if (UseMontgomerySquareIntrinsic) {
      StubRoutines::_montgomerySquare
        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
    }
#endif

    // data cache line writeback
    if (VM_Version::supports_data_cache_line_flush()) {
      StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
      StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
    }

    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
    }

    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
    }

#ifdef VM_LITTLE_ENDIAN
    // Currently supported on PPC64LE only
    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
    }
#endif
#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
    switch(blob_id) {
    case BlobId::stubgen_preuniverse_id:
      generate_preuniverse_stubs();
      break;
    case BlobId::stubgen_initial_id:
      generate_initial_stubs();
      break;
    case BlobId::stubgen_continuation_id:
      generate_continuation_stubs();
      break;
    case BlobId::stubgen_compiler_id:
      generate_compiler_stubs();
      break;
    case BlobId::stubgen_final_id:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected blob id: %s", StubInfo::name(blob_id));
      break;
    };
  }
};

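// Platform entry point called by the shared stub-generation code: fills the given
// CodeBuffer with the stub group selected by blob_id.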
void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
  StubGenerator g(code, blob_id);
}