jdk/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
/*
* Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2025 SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "asm/macroAssembler.inline.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/barrierSetNMethod.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_ppc.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/powerOfTwo.hpp"
#if INCLUDE_ZGC
#include "gc/z/zBarrierSetAssembler.hpp"
#endif
// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp.
#define __ _masm->
#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif
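// Note on the macro below: with the ELFv1 ABI a stub address published in
// StubRoutines is a FunctionDescriptor, so STUB_ENTRY dereferences it to get the
// code entry point; with ELFv2 the published address already is the entry point.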
#if defined(ABI_ELFv2)
#define STUB_ENTRY(name) StubRoutines::name
#else
#define STUB_ENTRY(name) ((FunctionDescriptor*)StubRoutines::name)->entry()
#endif
class StubGenerator: public StubCodeGenerator {
private:
// Call stubs are used to call Java from C
//
// Arguments:
//
// R3 - call wrapper address : address
// R4 - result : intptr_t*
// R5 - result type : BasicType
// R6 - method : Method
// R7 - frame mgr entry point : address
// R8 - parameter block : intptr_t*
// R9 - parameter count in words : int
// R10 - thread : Thread*
//
address generate_call_stub(address& return_address) {
// Setup a new c frame, copy java arguments, call template interpreter or
// native_entry, and process result.
StubId stub_id = StubId::stubgen_call_stub_id;
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
int save_nonvolatile_registers_size = __ save_nonvolatile_registers_size(true, SuperwordUseVSX);
// some sanity checks
STATIC_ASSERT(StackAlignmentInBytes == 16);
assert((sizeof(frame::native_abi_minframe) % 16) == 0, "unaligned");
assert((sizeof(frame::native_abi_reg_args) % 16) == 0, "unaligned");
assert((save_nonvolatile_registers_size % 16) == 0, "unaligned");
assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
assert((sizeof(frame::entry_frame_locals) % 16) == 0, "unaligned");
Register r_arg_call_wrapper_addr = R3;
Register r_arg_result_addr = R4;
Register r_arg_result_type = R5;
Register r_arg_method = R6;
Register r_arg_entry = R7;
Register r_arg_argument_addr = R8;
Register r_arg_argument_count = R9;
Register r_arg_thread = R10;
Register r_entryframe_fp = R2; // volatile
Register r_argument_size = R11_scratch1; // volatile
Register r_top_of_arguments_addr = R21_tmp1;
{
// Stack on entry to call_stub:
//
// F1 [C_FRAME]
// ...
Register r_frame_size = R12_scratch2; // volatile
Label arguments_copied;
// Save LR/CR to caller's C_FRAME.
__ save_LR_CR(R0);
// Keep copy of our frame pointer (caller's SP).
__ mr(r_entryframe_fp, R1_SP);
// calculate frame size
STATIC_ASSERT(Interpreter::logStackElementSize == 3);
// space for arguments aligned up: ((arg_count + 1) * 8) &~ 15
__ addi(r_frame_size, r_arg_argument_count, 1);
__ rldicr(r_frame_size, r_frame_size, 3, 63 - 4);
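// Worked example (illustrative only): for arg_count = 5 the two instructions
// above compute (5 + 1) * 8 = 48 and then clear the low 4 bits, 48 & ~15 = 48,
// which equals align_up(5 * 8, 16). For arg_count = 4: (4 + 1) * 8 = 40 and
// 40 & ~15 = 32 = align_up(4 * 8, 16).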
// this is the pure space for arguments (excluding alignment padding)
__ sldi(r_argument_size, r_arg_argument_count, 3);
__ addi(r_frame_size, r_frame_size,
save_nonvolatile_registers_size + frame::entry_frame_locals_size + frame::top_ijava_frame_abi_size);
// push ENTRY_FRAME
__ push_frame(r_frame_size, R0);
// Save non-volatile registers to ENTRY_FRAME.
__ save_nonvolatile_registers(r_entryframe_fp, -(frame::entry_frame_locals_size + save_nonvolatile_registers_size),
true, SuperwordUseVSX);
BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
// Push ENTRY_FRAME including arguments:
//
// F0 [TOP_IJAVA_FRAME_ABI]
// alignment (optional)
// [outgoing Java arguments]
// [non-volatiles]
// [ENTRY_FRAME_LOCALS]
// F1 [C_FRAME]
// ...
// initialize call_stub locals (step 1)
__ std(r_arg_call_wrapper_addr, _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
__ std(r_arg_result_addr, _entry_frame_locals_neg(result_address), r_entryframe_fp);
__ std(r_arg_result_type, _entry_frame_locals_neg(result_type), r_entryframe_fp);
// we will save arguments_tos_address later
BLOCK_COMMENT("Copy Java arguments");
// copy Java arguments
// Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later.
__ addi(r_top_of_arguments_addr, r_entryframe_fp,
-(save_nonvolatile_registers_size + frame::entry_frame_locals_size));
__ sub(r_top_of_arguments_addr, r_top_of_arguments_addr, r_argument_size);
// any arguments to copy?
__ cmpdi(CR0, r_arg_argument_count, 0);
__ beq(CR0, arguments_copied);
// prepare loop and copy arguments in reverse order
{
Register r_argument_addr = R22_tmp2;
Register r_argumentcopy_addr = R23_tmp3;
// init CTR with arg_argument_count
__ mtctr(r_arg_argument_count);
// let r_argumentcopy_addr point to last outgoing Java argument
__ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
// let r_argument_addr point to last incoming java argument
__ add(r_argument_addr, r_arg_argument_addr, r_argument_size);
__ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
// now loop while CTR > 0 and copy arguments
{
Label next_argument;
__ bind(next_argument);
__ ld(R0, 0, r_argument_addr);
// argument_addr--;
__ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
__ std(R0, 0, r_argumentcopy_addr);
// argumentcopy_addr++;
__ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
__ bdnz(next_argument);
}
}
// Arguments copied, continue.
__ bind(arguments_copied);
}
{
BLOCK_COMMENT("Call template interpreter or native entry.");
assert_different_registers(r_arg_entry, r_top_of_arguments_addr, r_arg_method, r_arg_thread);
// Register state on entry to template interpreter / native entry:
//
// tos - intptr_t* sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
// R19_method - Method
// R16_thread - JavaThread*
// Tos must point to last argument - element_size.
const Register tos = R15_esp;
__ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
// initialize call_stub locals (step 2)
// now save tos as arguments_tos_address
__ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
// load argument registers for call
__ mr(R19_method, r_arg_method);
__ mr(R16_thread, r_arg_thread);
assert(tos != r_arg_method, "trashed r_arg_method");
assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
// Load the template interpreter's dispatch table base into R25_templateTableBase.
__ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R0);
// Stack on entry to template interpreter / native entry:
//
// F0 [TOP_IJAVA_FRAME_ABI]
// alignment (optional)
// [outgoing Java arguments]
// [non-volatiles]
// [ENTRY_FRAME_LOCALS]
// F1 [C_FRAME]
// ...
//
// global toc register
__ load_const_optimized(R29_TOC, MacroAssembler::global_toc(), R0);
// Remember the senderSP so the interpreter can pop c2i arguments off the stack
// when called via a c2i adapter.
// Pass initial_caller_sp to the frame manager.
__ mr(R21_sender_SP, R1_SP);
// Do a light-weight C-call here, r_arg_entry holds the address
// of the interpreter entry point (template interpreter or native entry)
// and save runtime-value of LR in return_address.
assert(r_arg_entry != tos && r_arg_entry != R19_method && r_arg_entry != R16_thread,
"trashed r_arg_entry");
return_address = __ call_stub(r_arg_entry);
}
{
BLOCK_COMMENT("Returned from template interpreter or native entry.");
// Now pop frame, process result, and return to caller.
// Stack on exit from template interpreter / native entry:
//
// F0 [ABI]
// ...
// [non-volatiles]
// [ENTRY_FRAME_LOCALS]
// F1 [C_FRAME]
// ...
//
// Just pop the topmost frame ...
//
Label ret_is_object;
Label ret_is_long;
Label ret_is_float;
Label ret_is_double;
Register r_lr = R11_scratch1;
Register r_cr = R12_scratch2;
// Reload some volatile registers which we've spilled before the call
// to template interpreter / native entry.
// Access all locals via frame pointer, because we know nothing about
// the topmost frame's size.
__ ld(r_entryframe_fp, _abi0(callers_sp), R1_SP); // restore after call
assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
__ ld(r_arg_result_addr, _entry_frame_locals_neg(result_address), r_entryframe_fp);
__ ld(r_arg_result_type, _entry_frame_locals_neg(result_type), r_entryframe_fp);
__ ld(r_cr, _abi0(cr), r_entryframe_fp);
__ ld(r_lr, _abi0(lr), r_entryframe_fp);
__ mtcr(r_cr); // restore CR
__ mtlr(r_lr); // restore LR
// Store result depending on type. Everything that is not
// T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
// Using volatile CRs.
__ cmpwi(CR1, r_arg_result_type, T_OBJECT);
__ cmpwi(CR5, r_arg_result_type, T_LONG);
__ cmpwi(CR6, r_arg_result_type, T_FLOAT);
__ cmpwi(CR7, r_arg_result_type, T_DOUBLE);
__ pop_cont_fastpath(); // kills CR0, uses R16_thread
// restore non-volatile registers
__ restore_nonvolatile_registers(r_entryframe_fp, -(frame::entry_frame_locals_size + save_nonvolatile_registers_size),
true, SuperwordUseVSX);
// pop frame
__ mr(R1_SP, r_entryframe_fp);
// Stack on exit from call_stub:
//
// 0 [C_FRAME]
// ...
//
// no call_stub frames left.
__ beq(CR1, ret_is_object);
__ beq(CR5, ret_is_long);
__ beq(CR6, ret_is_float);
__ beq(CR7, ret_is_double);
// default:
__ stw(R3_RET, 0, r_arg_result_addr);
__ blr(); // return to caller
// case T_OBJECT:
// case T_LONG:
__ bind(ret_is_object);
__ bind(ret_is_long);
__ std(R3_RET, 0, r_arg_result_addr);
__ blr(); // return to caller
// case T_FLOAT:
__ bind(ret_is_float);
__ stfs(F1_RET, 0, r_arg_result_addr);
__ blr(); // return to caller
// case T_DOUBLE:
__ bind(ret_is_double);
__ stfd(F1_RET, 0, r_arg_result_addr);
__ blr(); // return to caller
}
return start;
}
// Return point for a Java call if there's an exception thrown in
// Java code. The exception is caught and transformed into a
// pending exception stored in JavaThread that can be tested from
// within the VM.
//
address generate_catch_exception() {
StubId stub_id = StubId::stubgen_catch_exception_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
// Registers alive
//
// R16_thread
// R3_ARG1 - address of pending exception
// R4_ARG2 - return address in call stub
const Register exception_file = R21_tmp1;
const Register exception_line = R22_tmp2;
__ load_const(exception_file, (void*)__FILE__);
__ load_const(exception_line, (void*)__LINE__);
__ std(R3_ARG1, in_bytes(JavaThread::pending_exception_offset()), R16_thread);
// store into `char *'
__ std(exception_file, in_bytes(JavaThread::exception_file_offset()), R16_thread);
// store into `int'
__ stw(exception_line, in_bytes(JavaThread::exception_line_offset()), R16_thread);
// complete return to VM
assert(StubRoutines::_call_stub_return_address != nullptr, "must have been generated before");
__ mtlr(R4_ARG2);
// continue in call stub
__ blr();
return start;
}
// Continuation point for runtime calls returning with a pending
// exception. The pending exception check happened in the runtime
// or native call stub. The pending exception in Thread is
// converted into a Java-level exception.
//
// Read:
//
// LR: The pc the runtime library callee wants to return to.
// Since the exception occurred in the callee, the return pc
// from the point of view of Java is the exception pc.
// thread: Needed for method handles.
//
// Invalidate:
//
// volatile registers (except below).
//
// Update:
//
// R4_ARG2: exception
//
// (LR is unchanged and is live out).
//
address generate_forward_exception() {
StubId stub_id = StubId::stubgen_forward_exception_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
if (VerifyOops) {
// Get pending exception oop.
__ ld(R3_ARG1,
in_bytes(Thread::pending_exception_offset()),
R16_thread);
// Make sure that this code is only executed if there is a pending exception.
{
Label L;
__ cmpdi(CR0, R3_ARG1, 0);
__ bne(CR0, L);
__ stop("StubRoutines::forward exception: no pending exception (1)");
__ bind(L);
}
__ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
}
// Save LR/CR and copy exception pc (LR) into R4_ARG2.
__ save_LR(R4_ARG2);
__ push_frame_reg_args(0, R0);
// Find exception handler.
__ call_VM_leaf(CAST_FROM_FN_PTR(address,
SharedRuntime::exception_handler_for_return_address),
R16_thread,
R4_ARG2);
// Copy handler's address.
__ mtctr(R3_RET);
__ pop_frame();
__ restore_LR(R0);
// Set up the arguments for the exception handler:
// - R3_ARG1: exception oop
// - R4_ARG2: exception pc.
// Load pending exception oop.
__ ld(R3_ARG1,
in_bytes(Thread::pending_exception_offset()),
R16_thread);
// The exception pc is the return address in the caller.
// Must load it into R4_ARG2.
__ mflr(R4_ARG2);
#ifdef ASSERT
// Make sure exception is set.
{
Label L;
__ cmpdi(CR0, R3_ARG1, 0);
__ bne(CR0, L);
__ stop("StubRoutines::forward exception: no pending exception (2)");
__ bind(L);
}
#endif
// Clear the pending exception.
__ li(R0, 0);
__ std(R0,
in_bytes(Thread::pending_exception_offset()),
R16_thread);
// Jump to exception handler.
__ bctr();
return start;
}
#undef __
#define __ _masm->
#if !defined(PRODUCT)
// Wrapper which calls oopDesc::is_oop_or_null()
// Only called by MacroAssembler::verify_oop
static void verify_oop_helper(const char* message, oopDesc* o) {
if (!oopDesc::is_oop_or_null(o)) {
fatal("%s. oop: " PTR_FORMAT, message, p2i(o));
}
++ StubRoutines::_verify_oop_count;
}
#endif
// Return address of code to be called from code generated by
// MacroAssembler::verify_oop.
//
// Don't generate, rather use C++ code.
address generate_verify_oop() {
// this is actually a `FunctionDescriptor*'.
address start = nullptr;
#if !defined(PRODUCT)
start = CAST_FROM_FN_PTR(address, verify_oop_helper);
#endif
return start;
}
// Computes the Galois/Counter Mode (GCM) product and reduction.
//
// This function performs polynomial multiplication of the subkey H with
// the current GHASH state using vectorized polynomial multiplication (`vpmsumd`).
// The subkey H is divided into lower, middle, and higher halves.
// The multiplication results are reduced using `vConstC2` to stay within GF(2^128).
// The final computed value is stored back into `vState`.
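// Summary for orientation (illustrative): per 16-byte block this computes the
// GHASH recurrence
//   state = (state ^ block) * H   in GF(2^128)
// ('^' is XOR). The caller loads the block into vH, the XOR with vState is the
// first instruction below, and the remaining instructions form the product and
// reduce it modulo the GHASH polynomial.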
static void computeGCMProduct(MacroAssembler* _masm,
VectorRegister vLowerH, VectorRegister vH, VectorRegister vHigherH,
VectorRegister vConstC2, VectorRegister vZero, VectorRegister vState,
VectorRegister vLowProduct, VectorRegister vMidProduct, VectorRegister vHighProduct,
VectorRegister vReducedLow, VectorRegister vTmp8, VectorRegister vTmp9,
VectorRegister vCombinedResult, VectorRegister vSwappedH) {
__ vxor(vH, vH, vState);
__ vpmsumd(vLowProduct, vLowerH, vH); // L : Lower Half of subkey H
__ vpmsumd(vMidProduct, vSwappedH, vH); // M : Combined halves of subkey H
__ vpmsumd(vHighProduct, vHigherH, vH); // H : Higher Half of subkey H
__ vpmsumd(vReducedLow, vLowProduct, vConstC2); // Reduction
__ vsldoi(vTmp8, vMidProduct, vZero, 8); // mL : Extract the lower 64 bits of M
__ vsldoi(vTmp9, vZero, vMidProduct, 8); // mH : Extract the higher 64 bits of M
__ vxor(vLowProduct, vLowProduct, vTmp8); // LL + mL : Partial result for lower half
__ vxor(vHighProduct, vHighProduct, vTmp9); // HH + mH : Partial result for upper half
__ vsldoi(vLowProduct, vLowProduct, vLowProduct, 8); // Swap
__ vxor(vLowProduct, vLowProduct, vReducedLow);
__ vsldoi(vCombinedResult, vLowProduct, vLowProduct, 8); // Swap
__ vpmsumd(vLowProduct, vLowProduct, vConstC2); // Reduction using constant
__ vxor(vCombinedResult, vCombinedResult, vHighProduct); // Combine reduced Low & High products
__ vxor(vState, vLowProduct, vCombinedResult);
}
// Generate stub for ghash process blocks.
//
// Arguments for generated stub:
// state: R3_ARG1 (long[] state)
// subkeyH: R4_ARG2 (long[] subH)
// data: R5_ARG3 (byte[] data)
// blocks: R6_ARG4 (number of 16-byte blocks to process)
//
// The polynomials are processed in bit-reflected order for efficiency reasons.
// This optimization leverages the structure of the Galois field arithmetic
// to minimize the number of bit manipulations required during multiplication.
// For an explanation of how this works, refer to:
// Vinodh Gopal, Erdinc Ozturk, Wajdi Feghali, Jim Guilford, Gil Wolrich,
// Martin Dixon. "Optimized Galois-Counter-Mode Implementation on Intel®
// Architecture Processors"
// http://web.archive.org/web/20130609111954/http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/communications-ia-galois-counter-mode-paper.pdf
//
//
address generate_ghash_processBlocks() {
StubCodeMark mark(this, "StubRoutines", "ghash");
address start = __ function_entry();
// Registers for parameters
Register state = R3_ARG1; // long[] state
Register subkeyH = R4_ARG2; // long[] subH
Register data = R5_ARG3; // byte[] data
Register blocks = R6_ARG4;
Register temp1 = R8;
// Vector Registers
VectorRegister vZero = VR0;
VectorRegister vH = VR1;
VectorRegister vLowerH = VR2;
VectorRegister vHigherH = VR3;
VectorRegister vLowProduct = VR4;
VectorRegister vMidProduct = VR5;
VectorRegister vHighProduct = VR6;
VectorRegister vReducedLow = VR7;
VectorRegister vTmp8 = VR8;
VectorRegister vTmp9 = VR9;
VectorRegister vTmp10 = VR10;
VectorRegister vSwappedH = VR11;
VectorRegister vTmp12 = VR12;
VectorRegister loadOrder = VR13;
VectorRegister vHigh = VR14;
VectorRegister vLow = VR15;
VectorRegister vState = VR16;
VectorRegister vPerm = VR17;
VectorRegister vCombinedResult = VR18;
VectorRegister vConstC2 = VR19;
__ li(temp1, 0xc2);
__ sldi(temp1, temp1, 56);
__ vspltisb(vZero, 0);
__ mtvrd(vConstC2, temp1);
__ lxvd2x(vH->to_vsr(), subkeyH);
__ lxvd2x(vState->to_vsr(), state);
// Operations to obtain the lower and higher halves of subkey H.
__ vspltisb(vReducedLow, 1);
__ vspltisb(vTmp10, 7);
__ vsldoi(vTmp8, vZero, vReducedLow, 1); // 0x1
__ vor(vTmp8, vConstC2, vTmp8); // 0xC2...1
__ vsplt(vTmp9, 0, vH); // MSB of H
__ vsl(vH, vH, vReducedLow); // Carry = H<<7
__ vsrab(vTmp9, vTmp9, vTmp10);
__ vand(vTmp9, vTmp9, vTmp8); // Carry
__ vxor(vTmp10, vH, vTmp9);
__ vsldoi(vConstC2, vZero, vConstC2, 8);
__ vsldoi(vSwappedH, vTmp10, vTmp10, 8); // swap Lower and Higher Halves of subkey H
__ vsldoi(vLowerH, vZero, vSwappedH, 8); // H.L
__ vsldoi(vHigherH, vSwappedH, vZero, 8); // H.H
#ifdef ASSERT
__ cmpwi(CR0, blocks, 0); // Compare 'blocks' (R6_ARG4) with zero
__ asm_assert_ne("blocks should NOT be zero");
#endif
__ clrldi(blocks, blocks, 32);
__ mtctr(blocks);
__ lvsl(loadOrder, temp1);
#ifdef VM_LITTLE_ENDIAN
__ vspltisb(vTmp12, 0xf);
__ vxor(loadOrder, loadOrder, vTmp12);
#define LE_swap_bytes(x) __ vec_perm(x, x, x, loadOrder)
#else
#define LE_swap_bytes(x)
#endif
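// Little-endian note (illustrative): lvsl with a 16-byte aligned effective address
// yields the permute vector {0x00..0x0f}; XORing each byte with 0x0f turns loadOrder
// into {0x0f..0x00}, so LE_swap_bytes byte-reverses each loaded block into the
// big-endian layout the algorithm expects. On big-endian LE_swap_bytes is a no-op.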
// This code performs Karatsuba multiplication in Galois fields to compute the GHASH operation.
//
// The Karatsuba method breaks the multiplication of two 128-bit numbers into smaller parts,
// performing three 128-bit multiplications and combining the results efficiently.
//
// (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
// (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
//
// Inputs:
// - vH: The data vector (state), containing both B0 (lower half) and B1 (higher half).
// - vLowerH: Lower half of the subkey H (A0).
// - vHigherH: Higher half of the subkey H (A1).
// - vConstC2: Constant used for reduction (for final processing).
//
// References:
// Shay Gueron, Michael E. Kounavis.
// "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode"
// https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918
//
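// Note on the middle term (illustrative): vpmsumd XORs the two 64x64-bit carry-less
// products of its doubleword pairs, so multiplying the state by the half-swapped
// subkey (vSwappedH) yields the cross term A1*B0 ^ A0*B1 in a single instruction;
// computeGCMProduct relies on this rather than forming (A0+A1)(B0+B1) explicitly.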
Label L_aligned_loop, L_store, L_unaligned_loop, L_initialize_unaligned_loop;
__ andi(temp1, data, 15);
__ cmpwi(CR0, temp1, 0);
__ bne(CR0, L_initialize_unaligned_loop);
__ bind(L_aligned_loop);
__ lvx(vH, temp1, data);
LE_swap_bytes(vH);
computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
__ addi(data, data, 16);
__ bdnz(L_aligned_loop);
__ b(L_store);
__ bind(L_initialize_unaligned_loop);
__ li(temp1, 0);
__ lvsl(vPerm, temp1, data);
__ lvx(vHigh, temp1, data);
#ifdef VM_LITTLE_ENDIAN
__ vspltisb(vTmp12, -1);
__ vxor(vPerm, vPerm, vTmp12);
#endif
__ bind(L_unaligned_loop);
__ addi(data, data, 16);
__ lvx(vLow, temp1, data);
__ vec_perm(vH, vHigh, vLow, vPerm);
computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
__ vmr(vHigh, vLow);
__ bdnz(L_unaligned_loop);
__ bind(L_store);
__ stxvd2x(vState->to_vsr(), state);
__ blr();
return start;
}
// -XX:+OptimizeFill : convert fill/copy loops into intrinsic
//
// The code is implemented (ported from SPARC) as we believe it benefits JVM98; however,
// tracing (-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all!
//
// Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition
// for turning on loop predication optimization, and hence the behavior of "array range check"
// and "loop invariant check" could be influenced, which potentially boosted JVM98.
//
// Generate stub for disjoint short fill. If "aligned" is true, the
// "to" address is assumed to be heapword aligned.
//
// Arguments for generated stub:
// to: R3_ARG1
// value: R4_ARG2
// count: R5_ARG3 treated as signed
//
address generate_fill(StubId stub_id) {
BasicType t;
bool aligned;
switch (stub_id) {
case StubId::stubgen_jbyte_fill_id:
t = T_BYTE;
aligned = false;
break;
case StubId::stubgen_jshort_fill_id:
t = T_SHORT;
aligned = false;
break;
case StubId::stubgen_jint_fill_id:
t = T_INT;
aligned = false;
break;
case StubId::stubgen_arrayof_jbyte_fill_id:
t = T_BYTE;
aligned = true;
break;
case StubId::stubgen_arrayof_jshort_fill_id:
t = T_SHORT;
aligned = true;
break;
case StubId::stubgen_arrayof_jint_fill_id:
t = T_INT;
aligned = true;
break;
default:
ShouldNotReachHere();
}
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
const Register to = R3_ARG1; // destination array address
const Register value = R4_ARG2; // fill value
const Register count = R5_ARG3; // elements count
const Register temp = R6_ARG4; // temp register
//assert_clean_int(count, O3); // Make sure 'count' is clean int.
Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
int shift = -1;
switch (t) {
case T_BYTE:
shift = 2;
// Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
__ rldimi(value, value, 8, 48); // 8 bit -> 16 bit
__ cmpdi(CR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
__ blt(CR0, L_fill_elements);
__ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
break;
case T_SHORT:
shift = 1;
// Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
__ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
__ cmpdi(CR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
__ blt(CR0, L_fill_elements);
break;
case T_INT:
shift = 0;
__ cmpdi(CR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
__ blt(CR0, L_fill_4_bytes);
break;
default: ShouldNotReachHere();
}
if (!aligned && (t == T_BYTE || t == T_SHORT)) {
// Align source address at 4 bytes address boundary.
if (t == T_BYTE) {
// One byte misalignment happens only for byte arrays.
__ andi_(temp, to, 1);
__ beq(CR0, L_skip_align1);
__ stb(value, 0, to);
__ addi(to, to, 1);
__ addi(count, count, -1);
__ bind(L_skip_align1);
}
// Two bytes misalignment happens only for byte and short (char) arrays.
__ andi_(temp, to, 2);
__ beq(CR0, L_skip_align2);
__ sth(value, 0, to);
__ addi(to, to, 2);
__ addi(count, count, -(1 << (shift - 1)));
__ bind(L_skip_align2);
}
if (!aligned) {
// Align to 8 bytes, we know we are 4 byte aligned to start.
__ andi_(temp, to, 7);
__ beq(CR0, L_fill_32_bytes);
__ stw(value, 0, to);
__ addi(to, to, 4);
__ addi(count, count, -(1 << shift));
__ bind(L_fill_32_bytes);
}
__ li(temp, 8<<shift); // Prepare for 32 byte loop.
// Clone bytes int->long as above.
__ rldimi(value, value, 32, 0); // 32 bit -> 64 bit
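// Illustrative bit pattern: for T_BYTE with value = 0xAB the three rldimi steps
// produce 0xABAB, then 0xABABABAB, and finally 0xABABABABABABABAB, so each
// 8-byte store below writes eight copies of the fill byte.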
Label L_check_fill_8_bytes;
// Fill 32-byte chunks.
__ subf_(count, temp, count);
__ blt(CR0, L_check_fill_8_bytes);
Label L_fill_32_bytes_loop;
__ align(32);
__ bind(L_fill_32_bytes_loop);
__ std(value, 0, to);
__ std(value, 8, to);
__ subf_(count, temp, count); // Update count.
__ std(value, 16, to);
__ std(value, 24, to);
__ addi(to, to, 32);
__ bge(CR0, L_fill_32_bytes_loop);
__ bind(L_check_fill_8_bytes);
__ add_(count, temp, count);
__ beq(CR0, L_exit);
__ addic_(count, count, -(2 << shift));
__ blt(CR0, L_fill_4_bytes);
//
// Length is too short, just fill 8 bytes at a time.
//
Label L_fill_8_bytes_loop;
__ bind(L_fill_8_bytes_loop);
__ std(value, 0, to);
__ addic_(count, count, -(2 << shift));
__ addi(to, to, 8);
__ bge(CR0, L_fill_8_bytes_loop);
// Fill trailing 4 bytes.
__ bind(L_fill_4_bytes);
__ andi_(temp, count, 1<<shift);
__ beq(CR0, L_fill_2_bytes);
__ stw(value, 0, to);
if (t == T_BYTE || t == T_SHORT) {
__ addi(to, to, 4);
// Fill trailing 2 bytes.
__ bind(L_fill_2_bytes);
__ andi_(temp, count, 1<<(shift-1));
__ beq(CR0, L_fill_byte);
__ sth(value, 0, to);
if (t == T_BYTE) {
__ addi(to, to, 2);
// Fill trailing byte.
__ bind(L_fill_byte);
__ andi_(count, count, 1);
__ beq(CR0, L_exit);
__ stb(value, 0, to);
} else {
__ bind(L_fill_byte);
}
} else {
__ bind(L_fill_2_bytes);
}
__ bind(L_exit);
__ blr();
// Handle fills of less than 8 bytes. Int is handled elsewhere.
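// Illustrative: the remaining count (< 8) is decomposed by its bits; bit 0 selects
// a 1-byte store, bit 1 a 2-byte store, and bit 2 a 4-byte store, so any residue
// of 0..7 bytes is covered without a loop.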
if (t == T_BYTE) {
__ bind(L_fill_elements);
Label L_fill_2, L_fill_4;
__ andi_(temp, count, 1);
__ beq(CR0, L_fill_2);
__ stb(value, 0, to);
__ addi(to, to, 1);
__ bind(L_fill_2);
__ andi_(temp, count, 2);
__ beq(CR0, L_fill_4);
__ stb(value, 0, to);
__ stb(value, 1, to);
__ addi(to, to, 2);
__ bind(L_fill_4);
__ andi_(temp, count, 4);
__ beq(CR0, L_exit);
__ stb(value, 0, to);
__ stb(value, 1, to);
__ stb(value, 2, to);
__ stb(value, 3, to);
__ blr();
}
if (t == T_SHORT) {
Label L_fill_2;
__ bind(L_fill_elements);
__ andi_(temp, count, 1);
__ beq(CR0, L_fill_2);
__ sth(value, 0, to);
__ addi(to, to, 2);
__ bind(L_fill_2);
__ andi_(temp, count, 2);
__ beq(CR0, L_exit);
__ sth(value, 0, to);
__ sth(value, 2, to);
__ blr();
}
return start;
}
inline void assert_positive_int(Register count) {
#ifdef ASSERT
__ srdi_(R0, count, 31);
__ asm_assert_eq("missing zero extend");
#endif
}
// Generate overlap test for array copy stubs.
//
// Input:
// R3_ARG1 - from
// R4_ARG2 - to
// R5_ARG3 - element count
//
void array_overlap_test(address no_overlap_target, int log2_elem_size) {
Register tmp1 = R6_ARG4;
Register tmp2 = R7_ARG5;
assert_positive_int(R5_ARG3);
__ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
__ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
__ cmpld(CR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
__ cmpld(CR1, tmp1, tmp2);
__ crnand(CR0, Assembler::less, CR1, Assembler::less);
// Overlaps if Src before dst and distance smaller than size.
// Branch to forward copy routine otherwise (within range of 32kB).
__ bc(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::less), no_overlap_target);
// need to copy backwards
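// Equivalent condition (illustrative C): a backward copy is required iff
//   from < to && (to - from) < ((size_t)count << log2_elem_size)
// otherwise execution branches to the disjoint (forward) copy stub above.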
}
// This is the common error exit stub for UnsafeMemoryAccess.
address generate_unsafecopy_common_error_exit() {
address start_pc = __ pc();
Register tmp1 = R6_ARG4;
// The copy stub has probably changed the DSCR value; reset it.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp1, VM_Version::_dscr_val);
__ mtdscr(tmp1);
}
__ li(R3_RET, 0); // return 0
__ blr();
return start_pc;
}
// The guideline in the implementations of generate_disjoint_xxx_copy
// (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
// single instructions, but to avoid alignment interrupts (see subsequent
// comment). Furthermore, we try to minimize misaligned accesses, even
// though they cause no alignment interrupt.
//
// In Big-Endian mode, the PowerPC architecture requires implementations to
// automatically handle misaligned integer halfword and word accesses,
// word-aligned integer doubleword accesses, and word-aligned floating-point
// accesses. Other accesses may or may not generate an Alignment interrupt
// depending on the implementation.
// Alignment interrupt handling may require on the order of hundreds of cycles,
// so every effort should be made to avoid misaligned memory values.
//
//
// Generate stub for disjoint byte copy. If "aligned" is true, the
// "from" and "to" addresses are assumed to be heapword aligned.
//
// Arguments for generated stub:
// from: R3_ARG1
// to: R4_ARG2
// count: R5_ARG3 treated as signed
//
address generate_disjoint_byte_copy(StubId stub_id) {
bool aligned;
switch (stub_id) {
case StubId::stubgen_jbyte_disjoint_arraycopy_id:
aligned = false;
break;
case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
aligned = true;
break;
default:
ShouldNotReachHere();
}
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
assert_positive_int(R5_ARG3);
Register tmp1 = R6_ARG4;
Register tmp2 = R7_ARG5;
Register tmp3 = R8_ARG6;
Register tmp4 = R9_ARG7;
VectorSRegister tmp_vsr1 = VSR1;
VectorSRegister tmp_vsr2 = VSR2;
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
{
// UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
UnsafeMemoryAccessMark umam(this, !aligned, false);
// Don't try anything fancy if arrays don't have many elements.
__ li(tmp3, 0);
__ cmpwi(CR0, R5_ARG3, 17);
__ ble(CR0, l_6); // copy 4 at a time
if (!aligned) {
__ xorr(tmp1, R3_ARG1, R4_ARG2);
__ andi_(tmp1, tmp1, 3);
__ bne(CR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
// Copy elements if necessary to align to 4 bytes.
__ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
__ andi_(tmp1, tmp1, 3);
__ beq(CR0, l_2);
__ subf(R5_ARG3, tmp1, R5_ARG3);
__ bind(l_9);
__ lbz(tmp2, 0, R3_ARG1);
__ addic_(tmp1, tmp1, -1);
__ stb(tmp2, 0, R4_ARG2);
__ addi(R3_ARG1, R3_ARG1, 1);
__ addi(R4_ARG2, R4_ARG2, 1);
__ bne(CR0, l_9);
__ bind(l_2);
}
// copy 8 elements at a time
__ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
__ andi_(tmp1, tmp2, 7);
__ bne(CR0, l_7); // not same alignment -> to or from is aligned -> copy 8
// copy a 2-element word if necessary to align to 8 bytes
__ andi_(R0, R3_ARG1, 7);
__ beq(CR0, l_7);
__ lwzx(tmp2, R3_ARG1, tmp3);
__ addi(R5_ARG3, R5_ARG3, -4);
__ stwx(tmp2, R4_ARG2, tmp3);
{ // FasterArrayCopy
__ addi(R3_ARG1, R3_ARG1, 4);
__ addi(R4_ARG2, R4_ARG2, 4);
}
__ bind(l_7);
{ // FasterArrayCopy
__ cmpwi(CR0, R5_ARG3, 31);
__ ble(CR0, l_6); // copy 2 at a time if less than 32 elements remain
__ srdi(tmp1, R5_ARG3, 5);
__ andi_(R5_ARG3, R5_ARG3, 31);
__ mtctr(tmp1);
// Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. Not 16-byte align as
// loop contains < 8 instructions that fit inside a single
// i-cache sector.
__ align(32);
__ bind(l_10);
// Use loop with VSX load/store instructions to
// copy 32 elements at a time.
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
__ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
__ bdnz(l_10); // Dec CTR and loop if not zero.
// Restore DSCR pre-fetch value.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
__ mtdscr(tmp2);
}
} // FasterArrayCopy
__ bind(l_6);
// copy 4 elements at a time
__ cmpwi(CR0, R5_ARG3, 4);
__ blt(CR0, l_1);
__ srdi(tmp1, R5_ARG3, 2);
__ mtctr(tmp1); // is > 0
__ andi_(R5_ARG3, R5_ARG3, 3);
{ // FasterArrayCopy
__ addi(R3_ARG1, R3_ARG1, -4);
__ addi(R4_ARG2, R4_ARG2, -4);
__ bind(l_3);
__ lwzu(tmp2, 4, R3_ARG1);
__ stwu(tmp2, 4, R4_ARG2);
__ bdnz(l_3);
__ addi(R3_ARG1, R3_ARG1, 4);
__ addi(R4_ARG2, R4_ARG2, 4);
}
// do single element copy
__ bind(l_1);
__ cmpwi(CR0, R5_ARG3, 0);
__ beq(CR0, l_4);
{ // FasterArrayCopy
__ mtctr(R5_ARG3);
__ addi(R3_ARG1, R3_ARG1, -1);
__ addi(R4_ARG2, R4_ARG2, -1);
__ bind(l_5);
__ lbzu(tmp2, 1, R3_ARG1);
__ stbu(tmp2, 1, R4_ARG2);
__ bdnz(l_5);
}
}
__ bind(l_4);
__ li(R3_RET, 0); // return 0
__ blr();
return start;
}
// Generate stub for conjoint byte copy. If "aligned" is true, the
// "from" and "to" addresses are assumed to be heapword aligned.
//
// Arguments for generated stub:
// from: R3_ARG1
// to: R4_ARG2
// count: R5_ARG3 treated as signed
//
address generate_conjoint_byte_copy(StubId stub_id) {
bool aligned;
switch (stub_id) {
case StubId::stubgen_jbyte_arraycopy_id:
aligned = false;
break;
case StubId::stubgen_arrayof_jbyte_arraycopy_id:
aligned = true;
break;
default:
ShouldNotReachHere();
}
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
assert_positive_int(R5_ARG3);
Register tmp1 = R6_ARG4;
Register tmp2 = R7_ARG5;
Register tmp3 = R8_ARG6;
address nooverlap_target = aligned ?
STUB_ENTRY(arrayof_jbyte_disjoint_arraycopy()) :
STUB_ENTRY(jbyte_disjoint_arraycopy());
array_overlap_test(nooverlap_target, 0);
// Do reverse copy. We assume the case of actual overlap is rare enough
// that we don't have to optimize it.
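// In C terms (illustrative), the loop below is:
//   for (long i = count - 1; i >= 0; --i) to[i] = from[i];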
Label l_1, l_2;
{
// UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
UnsafeMemoryAccessMark umam(this, !aligned, false);
__ b(l_2);
__ bind(l_1);
__ stbx(tmp1, R4_ARG2, R5_ARG3);
__ bind(l_2);
__ addic_(R5_ARG3, R5_ARG3, -1);
__ lbzx(tmp1, R3_ARG1, R5_ARG3);
__ bge(CR0, l_1);
}
__ li(R3_RET, 0); // return 0
__ blr();
return start;
}
// Generate stub for disjoint short copy. If "aligned" is true, the
// "from" and "to" addresses are assumed to be heapword aligned.
//
// Arguments for generated stub:
// from: R3_ARG1
// to: R4_ARG2
// elm.count: R5_ARG3 treated as signed
//
// Strategy for aligned==true:
//
// If length <= 9:
// 1. copy 2 elements at a time (l_6)
// 2. copy last element if original element count was odd (l_1)
//
// If length > 9:
// 1. copy 4 elements at a time until less than 4 elements are left (l_7)
// 2. copy 2 elements at a time until less than 2 elements are left (l_6)
// 3. copy last element if one was left in step 2. (l_1)
//
//
// Strategy for aligned==false:
//
// If length <= 9: same as aligned==true case, but NOTE: load/stores
// can be unaligned (see comment below)
//
// If length > 9:
// 1. continue with step 6. if the alignment of from and to mod 4
// is different.
// 2. align from and to to 4 bytes by copying 1 element if necessary
// 3. at l_2 from and to are 4 byte aligned; continue with
// 5. if they cannot be aligned to 8 bytes because they have
// got different alignment mod 8.
// 4. at this point we know that both, from and to, have the same
// alignment mod 8, now copy one element if necessary to get
// 8 byte alignment of from and to.
// 5. copy 4 elements at a time until less than 4 elements are
// left; depending on step 3. all load/stores are aligned or
// either all loads or all stores are unaligned.
// 6. copy 2 elements at a time until less than 2 elements are
// left (l_6); arriving here from step 1., there is a chance
// that all accesses are unaligned.
// 7. copy last element if one was left in step 6. (l_1)
//
// There are unaligned data accesses using integer load/store
// instructions in this stub. POWER allows such accesses.
//
// According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
// Chapter 2: Effect of Operand Placement on Performance) unaligned
// integer load/stores have good performance. Only unaligned
// floating point load/stores can have poor performance.
//
// TODO:
//
// 1. check if aligning the backbranch target of loops is beneficial
//
address generate_disjoint_short_copy(StubId stub_id) {
bool aligned;
switch (stub_id) {
case StubId::stubgen_jshort_disjoint_arraycopy_id:
aligned = false;
break;
case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
aligned = true;
break;
default:
ShouldNotReachHere();
}
StubCodeMark mark(this, stub_id);
Register tmp1 = R6_ARG4;
Register tmp2 = R7_ARG5;
Register tmp3 = R8_ARG6;
Register tmp4 = R9_ARG7;
VectorSRegister tmp_vsr1 = VSR1;
VectorSRegister tmp_vsr2 = VSR2;
address start = __ function_entry();
assert_positive_int(R5_ARG3);
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
{
// UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
UnsafeMemoryAccessMark umam(this, !aligned, false);
// don't try anything fancy if arrays don't have many elements
__ li(tmp3, 0);
__ cmpwi(CR0, R5_ARG3, 9);
__ ble(CR0, l_6); // copy 2 at a time
if (!aligned) {
__ xorr(tmp1, R3_ARG1, R4_ARG2);
__ andi_(tmp1, tmp1, 3);
__ bne(CR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
// At this point it is guaranteed that both, from and to have the same alignment mod 4.
// Copy 1 element if necessary to align to 4 bytes.
__ andi_(tmp1, R3_ARG1, 3);
__ beq(CR0, l_2);
__ lhz(tmp2, 0, R3_ARG1);
__ addi(R3_ARG1, R3_ARG1, 2);
__ sth(tmp2, 0, R4_ARG2);
__ addi(R4_ARG2, R4_ARG2, 2);
__ addi(R5_ARG3, R5_ARG3, -1);
__ bind(l_2);
// At this point the positions of both, from and to, are at least 4 byte aligned.
// Copy 4 elements at a time.
// Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
__ xorr(tmp2, R3_ARG1, R4_ARG2);
__ andi_(tmp1, tmp2, 7);
__ bne(CR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
// Copy a 2-element word if necessary to align to 8 bytes.
__ andi_(R0, R3_ARG1, 7);
__ beq(CR0, l_7);
__ lwzx(tmp2, R3_ARG1, tmp3);
__ addi(R5_ARG3, R5_ARG3, -2);
__ stwx(tmp2, R4_ARG2, tmp3);
{ // FasterArrayCopy
__ addi(R3_ARG1, R3_ARG1, 4);
__ addi(R4_ARG2, R4_ARG2, 4);
}
}
__ bind(l_7);
// Copy 4 elements at a time; either the loads or the stores can
// be unaligned if aligned == false.
{ // FasterArrayCopy
__ cmpwi(CR0, R5_ARG3, 15);
__ ble(CR0, l_6); // copy 2 at a time if less than 16 elements remain
__ srdi(tmp1, R5_ARG3, 4);
__ andi_(R5_ARG3, R5_ARG3, 15);
__ mtctr(tmp1);
// Processor supports VSX, so use it to mass copy.
// Prefetch src data into L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. It's not aligned 16-byte
// as loop contains < 8 instructions that fit inside a single
// i-cache sector.
__ align(32);
__ bind(l_9);
// Use loop with VSX load/store instructions to
// copy 16 elements at a time.
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load from src.
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst.
__ lxvd2x(tmp_vsr2, R3_ARG1, tmp1); // Load from src + 16.
__ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32.
__ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32.
__ bdnz(l_9); // Dec CTR and loop if not zero.
// Restore DSCR pre-fetch value.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
__ mtdscr(tmp2);
}
} // FasterArrayCopy
__ bind(l_6);
// copy 2 elements at a time
{ // FasterArrayCopy
__ cmpwi(CR0, R5_ARG3, 2);
__ blt(CR0, l_1);
__ srdi(tmp1, R5_ARG3, 1);
__ andi_(R5_ARG3, R5_ARG3, 1);
__ addi(R3_ARG1, R3_ARG1, -4);
__ addi(R4_ARG2, R4_ARG2, -4);
__ mtctr(tmp1);
__ bind(l_3);
__ lwzu(tmp2, 4, R3_ARG1);
__ stwu(tmp2, 4, R4_ARG2);
__ bdnz(l_3);
__ addi(R3_ARG1, R3_ARG1, 4);
__ addi(R4_ARG2, R4_ARG2, 4);
}
// do single element copy
__ bind(l_1);
__ cmpwi(CR0, R5_ARG3, 0);
__ beq(CR0, l_4);
{ // FasterArrayCopy
__ mtctr(R5_ARG3);
__ addi(R3_ARG1, R3_ARG1, -2);
__ addi(R4_ARG2, R4_ARG2, -2);
__ bind(l_5);
__ lhzu(tmp2, 2, R3_ARG1);
__ sthu(tmp2, 2, R4_ARG2);
__ bdnz(l_5);
}
}
__ bind(l_4);
__ li(R3_RET, 0); // return 0
__ blr();
return start;
}
// Generate stub for conjoint short copy. If "aligned" is true, the
// "from" and "to" addresses are assumed to be heapword aligned.
//
// Arguments for generated stub:
// from: R3_ARG1
// to: R4_ARG2
// count: R5_ARG3 treated as signed
//
address generate_conjoint_short_copy(StubId stub_id) {
bool aligned;
switch (stub_id) {
case StubId::stubgen_jshort_arraycopy_id:
aligned = false;
break;
case StubId::stubgen_arrayof_jshort_arraycopy_id:
aligned = true;
break;
default:
ShouldNotReachHere();
}
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
assert_positive_int(R5_ARG3);
Register tmp1 = R6_ARG4;
Register tmp2 = R7_ARG5;
Register tmp3 = R8_ARG6;
address nooverlap_target = aligned ?
STUB_ENTRY(arrayof_jshort_disjoint_arraycopy()) :
STUB_ENTRY(jshort_disjoint_arraycopy());
array_overlap_test(nooverlap_target, 1);
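// Do reverse copy. In C terms (illustrative, names are hypothetical), the loop below is:
//   for (long off = 2 * count - 2; off >= 0; off -= 2)
//     *(jshort*)(to_bytes + off) = *(jshort*)(from_bytes + off);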
Label l_1, l_2;
{
// UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
UnsafeMemoryAccessMark umam(this, !aligned, false);
__ sldi(tmp1, R5_ARG3, 1);
__ b(l_2);
__ bind(l_1);
__ sthx(tmp2, R4_ARG2, tmp1);
__ bind(l_2);
__ addic_(tmp1, tmp1, -2);
__ lhzx(tmp2, R3_ARG1, tmp1);
__ bge(CR0, l_1);
}
__ li(R3_RET, 0); // return 0
__ blr();
return start;
}
// Generate core code for disjoint int copy (and oop copy on 32-bit). If "aligned"
// is true, the "from" and "to" addresses are assumed to be heapword aligned.
//
// Arguments:
// from: R3_ARG1
// to: R4_ARG2
// count: R5_ARG3 treated as signed
//
void generate_disjoint_int_copy_core(bool aligned) {
Register tmp1 = R6_ARG4;
Register tmp2 = R7_ARG5;
Register tmp3 = R8_ARG6;
Register tmp4 = R0;
VectorSRegister tmp_vsr1 = VSR1;
VectorSRegister tmp_vsr2 = VSR2;
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
// for short arrays, just do single element copy
__ li(tmp3, 0);
__ cmpwi(CR0, R5_ARG3, 5);
__ ble(CR0, l_2);
if (!aligned) {
// check if arrays have same alignment mod 8.
__ xorr(tmp1, R3_ARG1, R4_ARG2);
__ andi_(R0, tmp1, 7);
// Not the same alignment, but ld and std just need to be 4 byte aligned.
__ bne(CR0, l_4); // to OR from is 8 byte aligned -> copy 2 at a time
// copy 1 element to align to and from on an 8 byte boundary
__ andi_(R0, R3_ARG1, 7);
__ beq(CR0, l_4);
__ lwzx(tmp2, R3_ARG1, tmp3);
__ addi(R5_ARG3, R5_ARG3, -1);
__ stwx(tmp2, R4_ARG2, tmp3);
{ // FasterArrayCopy
__ addi(R3_ARG1, R3_ARG1, 4);
__ addi(R4_ARG2, R4_ARG2, 4);
}
__ bind(l_4);
}
{ // FasterArrayCopy
__ cmpwi(CR0, R5_ARG3, 7);
__ ble(CR0, l_2); // copy 1 at a time if less than 8 elements remain
__ srdi(tmp1, R5_ARG3, 3);
__ andi_(R5_ARG3, R5_ARG3, 7);
__ mtctr(tmp1);
// Processor supports VSX, so use it to mass copy.
// Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// Set DSCR pre-fetch to deepest.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. Not 16-byte align as
// loop contains < 8 instructions that fit inside a single
// i-cache sector.
__ align(32);
__ bind(l_7);
// Use loop with VSX load/store instructions to
// copy 8 elements at a time.
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
__ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
__ bdnz(l_7); // Dec CTR and loop if not zero.
// Restore DSCR pre-fetch value.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
__ mtdscr(tmp2);
}
} // FasterArrayCopy
// copy 1 element at a time
__ bind(l_2);
__ cmpwi(CR0, R5_ARG3, 0);
__ beq(CR0, l_1);
{ // FasterArrayCopy
__ mtctr(R5_ARG3);
__ addi(R3_ARG1, R3_ARG1, -4);
__ addi(R4_ARG2, R4_ARG2, -4);
__ bind(l_3);
__ lwzu(tmp2, 4, R3_ARG1);
__ stwu(tmp2, 4, R4_ARG2);
__ bdnz(l_3);
}
__ bind(l_1);
return;
}
// Generate stub for disjoint int copy. If "aligned" is true, the
// "from" and "to" addresses are assumed to be heapword aligned.
//
// Arguments for generated stub:
// from: R3_ARG1
// to: R4_ARG2
// count: R5_ARG3 treated as signed
//
address generate_disjoint_int_copy(StubId stub_id) {
bool aligned;
switch (stub_id) {
case StubId::stubgen_jint_disjoint_arraycopy_id:
aligned = false;
break;
case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
aligned = true;
break;
default:
ShouldNotReachHere();
}
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
assert_positive_int(R5_ARG3);
{
// UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
UnsafeMemoryAccessMark umam(this, !aligned, false);
generate_disjoint_int_copy_core(aligned);
}
__ li(R3_RET, 0); // return 0
__ blr();
return start;
}
// Generate core code for conjoint int copy (and oop copy on
// 32-bit). If "aligned" is true, the "from" and "to" addresses
// are assumed to be heapword aligned.
//
// Arguments:
// from: R3_ARG1
// to: R4_ARG2
// count: R5_ARG3 treated as signed
//
void generate_conjoint_int_copy_core(bool aligned) {
// Do reverse copy. We assume the case of actual overlap is rare enough
// that we don't have to optimize it.
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
Register tmp1 = R6_ARG4;
Register tmp2 = R7_ARG5;
Register tmp3 = R8_ARG6;
Register tmp4 = R0;
VectorSRegister tmp_vsr1 = VSR1;
VectorSRegister tmp_vsr2 = VSR2;
{ // FasterArrayCopy
__ cmpwi(CR0, R5_ARG3, 0);
__ beq(CR0, l_6);
__ sldi(R5_ARG3, R5_ARG3, 2);
__ add(R3_ARG1, R3_ARG1, R5_ARG3);
__ add(R4_ARG2, R4_ARG2, R5_ARG3);
__ srdi(R5_ARG3, R5_ARG3, 2);
if (!aligned) {
// check if arrays have same alignment mod 8.
__ xorr(tmp1, R3_ARG1, R4_ARG2);
__ andi_(R0, tmp1, 7);
// Not the same alignment, but ld and std just need to be 4 byte aligned.
__ bne(CR0, l_7); // to OR from is 8 byte aligned -> copy 2 at a time
// copy 1 element to align to and from on an 8 byte boundary
__ andi_(R0, R3_ARG1, 7);
__ beq(CR0, l_7);
__ addi(R3_ARG1, R3_ARG1, -4);
__ addi(R4_ARG2, R4_ARG2, -4);
__ addi(R5_ARG3, R5_ARG3, -1);
__ lwzx(tmp2, R3_ARG1);
__ stwx(tmp2, R4_ARG2);
__ bind(l_7);
}
__ cmpwi(CR0, R5_ARG3, 7);
__ ble(CR0, l_5); // copy 1 at a time if less than 8 elements remain
__ srdi(tmp1, R5_ARG3, 3);
__ andi(R5_ARG3, R5_ARG3, 7);
__ mtctr(tmp1);
// Processor supports VSX, so use it to mass copy.
// Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// Set DSCR pre-fetch to deepest.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. Not 16-byte align as
// loop contains < 8 instructions that fit inside a single
// i-cache sector.
__ align(32);
__ bind(l_4);
// Use loop with VSX load/store instructions to
// copy 8 elements at a time.
__ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
__ addi(R4_ARG2, R4_ARG2, -32); // Update dst-=32
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
__ bdnz(l_4);
// Restore DSCR pre-fetch value.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
__ mtdscr(tmp2);
}
__ cmpwi(CR0, R5_ARG3, 0);
__ beq(CR0, l_6);
__ bind(l_5);
__ mtctr(R5_ARG3);
__ bind(l_3);
__ lwz(R0, -4, R3_ARG1);
__ stw(R0, -4, R4_ARG2);
__ addi(R3_ARG1, R3_ARG1, -4);
__ addi(R4_ARG2, R4_ARG2, -4);
__ bdnz(l_3);
__ bind(l_6);
}
}
// Generate stub for conjoint int copy. If "aligned" is true, the
// "from" and "to" addresses are assumed to be heapword aligned.
//
// Arguments for generated stub:
// from: R3_ARG1
// to: R4_ARG2
// count: R5_ARG3 treated as signed
//
address generate_conjoint_int_copy(StubId stub_id) {
bool aligned;
switch (stub_id) {
case StubId::stubgen_jint_arraycopy_id:
aligned = false;
break;
case StubId::stubgen_arrayof_jint_arraycopy_id:
aligned = true;
break;
default:
ShouldNotReachHere();
}
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
assert_positive_int(R5_ARG3);
address nooverlap_target = aligned ?
STUB_ENTRY(arrayof_jint_disjoint_arraycopy()) :
STUB_ENTRY(jint_disjoint_arraycopy());
array_overlap_test(nooverlap_target, 2);
{
// UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
UnsafeMemoryAccessMark umam(this, !aligned, false);
generate_conjoint_int_copy_core(aligned);
}
__ li(R3_RET, 0); // return 0
__ blr();
return start;
}
// Generate core code for disjoint long copy (and oop copy on
// 64-bit). If "aligned" is true, the "from" and "to" addresses
// are assumed to be heapword aligned.
//
// Arguments:
// from: R3_ARG1
// to: R4_ARG2
// count: R5_ARG3 treated as signed
//
void generate_disjoint_long_copy_core(bool aligned) {
Register tmp1 = R6_ARG4;
Register tmp2 = R7_ARG5;
Register tmp3 = R8_ARG6;
Register tmp4 = R0;
Label l_1, l_2, l_3, l_4, l_5;
VectorSRegister tmp_vsr1 = VSR1;
VectorSRegister tmp_vsr2 = VSR2;
{ // FasterArrayCopy
__ cmpwi(CR0, R5_ARG3, 3);
__ ble(CR0, l_3); // copy 1 at a time if less than 4 elements remain
__ srdi(tmp1, R5_ARG3, 2);
__ andi_(R5_ARG3, R5_ARG3, 3);
__ mtctr(tmp1);
// Processor supports VSX, so use it to mass copy.
// Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// Set DSCR pre-fetch to deepest.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. Not 16-byte align as
// loop contains < 8 instructions that fit inside a single
// i-cache sector.
__ align(32);
__ bind(l_5);
// Use loop with VSX load/store instructions to
// copy 4 elements at a time.
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
__ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32
__ bdnz(l_5); // Dec CTR and loop if not zero.
// Restore DSCR pre-fetch value.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
__ mtdscr(tmp2);
}
} // FasterArrayCopy
// copy 1 element at a time
__ bind(l_3);
__ cmpwi(CR0, R5_ARG3, 0);
__ beq(CR0, l_1);
{ // FasterArrayCopy
__ mtctr(R5_ARG3);
__ addi(R3_ARG1, R3_ARG1, -8);
__ addi(R4_ARG2, R4_ARG2, -8);
__ bind(l_2);
__ ldu(R0, 8, R3_ARG1);
__ stdu(R0, 8, R4_ARG2);
__ bdnz(l_2);
}
__ bind(l_1);
}
// Generate stub for disjoint long copy. If "aligned" is true, the
// "from" and "to" addresses are assumed to be heapword aligned.
//
// Arguments for generated stub:
// from: R3_ARG1
// to: R4_ARG2
// count: R5_ARG3 treated as signed
//
address generate_disjoint_long_copy(StubId stub_id) {
bool aligned;
switch (stub_id) {
case StubId::stubgen_jlong_disjoint_arraycopy_id:
aligned = false;
break;
case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
aligned = true;
break;
default:
ShouldNotReachHere();
}
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
assert_positive_int(R5_ARG3);
{
// UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
UnsafeMemoryAccessMark umam(this, !aligned, false);
generate_disjoint_long_copy_core(aligned);
}
__ li(R3_RET, 0); // return 0
__ blr();
return start;
}
// Generate core code for conjoint long copy (and oop copy on
// 64-bit). If "aligned" is true, the "from" and "to" addresses
// are assumed to be heapword aligned.
//
// Arguments:
// from: R3_ARG1
// to: R4_ARG2
// count: R5_ARG3 treated as signed
//
void generate_conjoint_long_copy_core(bool aligned) {
Register tmp1 = R6_ARG4;
Register tmp2 = R7_ARG5;
Register tmp3 = R8_ARG6;
Register tmp4 = R0;
VectorSRegister tmp_vsr1 = VSR1;
VectorSRegister tmp_vsr2 = VSR2;
Label l_1, l_2, l_3, l_4, l_5;
__ cmpwi(CR0, R5_ARG3, 0);
__ beq(CR0, l_1);
{ // FasterArrayCopy
__ sldi(R5_ARG3, R5_ARG3, 3);
__ add(R3_ARG1, R3_ARG1, R5_ARG3);
__ add(R4_ARG2, R4_ARG2, R5_ARG3);
__ srdi(R5_ARG3, R5_ARG3, 3);
__ cmpwi(CR0, R5_ARG3, 3);
__ ble(CR0, l_5); // copy 1 at a time if less than 4 elements remain
__ srdi(tmp1, R5_ARG3, 2);
__ andi(R5_ARG3, R5_ARG3, 3);
__ mtctr(tmp1);
// Processor supports VSX, so use it to mass copy.
// Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// Set DSCR pre-fetch to deepest.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. Not 16-byte align as
// loop contains < 8 instructions that fit inside a single
// i-cache sector.
__ align(32);
__ bind(l_4);
// Use loop with VSX load/store instructions to
// copy 4 elements at a time.
__ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
__ addi(R4_ARG2, R4_ARG2, -32); // Update dst-=32
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
__ bdnz(l_4);
// Restore DSCR pre-fetch value.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
__ mtdscr(tmp2);
}
__ cmpwi(CR0, R5_ARG3, 0);
__ beq(CR0, l_1);
__ bind(l_5);
__ mtctr(R5_ARG3);
__ bind(l_3);
__ ld(R0, -8, R3_ARG1);
__ std(R0, -8, R4_ARG2);
__ addi(R3_ARG1, R3_ARG1, -8);
__ addi(R4_ARG2, R4_ARG2, -8);
__ bdnz(l_3);
}
__ bind(l_1);
}
// Generate stub for conjoint long copy. If "aligned" is true, the
// "from" and "to" addresses are assumed to be heapword aligned.
//
// Arguments for generated stub:
// from: R3_ARG1
// to: R4_ARG2
// count: R5_ARG3 treated as signed
//
address generate_conjoint_long_copy(StubId stub_id) {
bool aligned;
switch (stub_id) {
case StubId::stubgen_jlong_arraycopy_id:
aligned = false;
break;
case StubId::stubgen_arrayof_jlong_arraycopy_id:
aligned = true;
break;
default:
ShouldNotReachHere();
}
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
assert_positive_int(R5_ARG3);
address nooverlap_target = aligned ?
STUB_ENTRY(arrayof_jlong_disjoint_arraycopy()) :
STUB_ENTRY(jlong_disjoint_arraycopy());
array_overlap_test(nooverlap_target, 3);
{
// UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
UnsafeMemoryAccessMark umam(this, !aligned, false);
generate_conjoint_long_copy_core(aligned);
}
__ li(R3_RET, 0); // return 0
__ blr();
return start;
}
// Generate stub for conjoint oop copy. If "aligned" is true, the
// "from" and "to" addresses are assumed to be heapword aligned.
//
// Arguments for generated stub:
// from: R3_ARG1
// to: R4_ARG2
// count: R5_ARG3 treated as signed
// dest_uninitialized: G1 support
//
address generate_conjoint_oop_copy(StubId stub_id) {
bool aligned;
bool dest_uninitialized;
switch (stub_id) {
case StubId::stubgen_oop_arraycopy_id:
aligned = false;
dest_uninitialized = false;
break;
case StubId::stubgen_arrayof_oop_arraycopy_id:
aligned = true;
dest_uninitialized = false;
break;
case StubId::stubgen_oop_arraycopy_uninit_id:
aligned = false;
dest_uninitialized = true;
break;
case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
aligned = true;
dest_uninitialized = true;
break;
default:
ShouldNotReachHere();
}
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
assert_positive_int(R5_ARG3);
address nooverlap_target = aligned ?
STUB_ENTRY(arrayof_oop_disjoint_arraycopy(dest_uninitialized)) :
STUB_ENTRY(oop_disjoint_arraycopy(dest_uninitialized));
array_overlap_test(nooverlap_target, UseCompressedOops ? 2 : 3);
DecoratorSet decorators = IN_HEAP | IS_ARRAY;
if (dest_uninitialized) {
decorators |= IS_DEST_UNINITIALIZED;
}
if (aligned) {
decorators |= ARRAYCOPY_ALIGNED;
}
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
if (UseCompressedOops) {
generate_conjoint_int_copy_core(aligned);
} else {
#if INCLUDE_ZGC
if (UseZGC) {
ZBarrierSetAssembler *zbs = (ZBarrierSetAssembler*)bs;
zbs->generate_conjoint_oop_copy(_masm, dest_uninitialized);
} else
#endif
generate_conjoint_long_copy_core(aligned);
}
bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
__ li(R3_RET, 0); // return 0
__ blr();
return start;
}
// Generate stub for disjoint oop copy. If "aligned" is true, the
// "from" and "to" addresses are assumed to be heapword aligned.
//
// Arguments for generated stub:
// from: R3_ARG1
// to: R4_ARG2
// count: R5_ARG3 treated as signed
// dest_uninitialized: G1 support
//
address generate_disjoint_oop_copy(StubId stub_id) {
bool aligned;
bool dest_uninitialized;
switch (stub_id) {
case StubId::stubgen_oop_disjoint_arraycopy_id:
aligned = false;
dest_uninitialized = false;
break;
case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
aligned = true;
dest_uninitialized = false;
break;
case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
aligned = false;
dest_uninitialized = true;
break;
case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
aligned = true;
dest_uninitialized = true;
break;
default:
ShouldNotReachHere();
}
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
assert_positive_int(R5_ARG3);
DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
if (dest_uninitialized) {
decorators |= IS_DEST_UNINITIALIZED;
}
if (aligned) {
decorators |= ARRAYCOPY_ALIGNED;
}
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
if (UseCompressedOops) {
generate_disjoint_int_copy_core(aligned);
} else {
#if INCLUDE_ZGC
if (UseZGC) {
ZBarrierSetAssembler *zbs = (ZBarrierSetAssembler*)bs;
zbs->generate_disjoint_oop_copy(_masm, dest_uninitialized);
} else
#endif
generate_disjoint_long_copy_core(aligned);
}
bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
__ li(R3_RET, 0); // return 0
__ blr();
return start;
}
// Helper for generating a dynamic type check.
// Smashes only the given temp registers.
void generate_type_check(Register sub_klass,
Register super_check_offset,
Register super_klass,
Register temp1,
Register temp2,
Label& L_success) {
assert_different_registers(sub_klass, super_check_offset, super_klass);
BLOCK_COMMENT("type_check:");
Label L_miss;
__ check_klass_subtype_fast_path(sub_klass, super_klass, temp1, temp2, &L_success, &L_miss, nullptr,
super_check_offset);
__ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success);
// Fall through on failure!
__ bind(L_miss);
}
// Generate stub for checked oop copy.
//
// Arguments for generated stub:
// from: R3
// to: R4
// count: R5 treated as signed
// ckoff: R6 (super_check_offset)
// ckval: R7 (super_klass)
// ret: R3 zero for success; (-1^K) where K is partial transfer count
//
address generate_checkcast_copy(StubId stub_id) {
const Register R3_from = R3_ARG1; // source array address
const Register R4_to = R4_ARG2; // destination array address
const Register R5_count = R5_ARG3; // elements count
const Register R6_ckoff = R6_ARG4; // super_check_offset
const Register R7_ckval = R7_ARG5; // super_klass
const Register R8_offset = R8_ARG6; // loop var, with stride wordSize
const Register R9_remain = R9_ARG7; // loop var, with stride -1
const Register R10_oop = R10_ARG8; // actual oop copied
const Register R11_klass = R11_scratch1; // oop._klass
const Register R12_tmp = R12_scratch2;
const Register R2_tmp = R2;
bool dest_uninitialized;
switch (stub_id) {
case StubId::stubgen_checkcast_arraycopy_id:
dest_uninitialized = false;
break;
case StubId::stubgen_checkcast_arraycopy_uninit_id:
dest_uninitialized = true;
break;
default:
ShouldNotReachHere();
}
//__ align(CodeEntryAlignment);
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
// Assert that int is 64 bit sign extended and arrays are not conjoint.
#ifdef ASSERT
{
assert_positive_int(R5_ARG3);
const Register tmp1 = R11_scratch1, tmp2 = R12_scratch2;
Label no_overlap;
__ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
__ sldi(tmp2, R5_ARG3, LogBytesPerHeapOop); // size in bytes
__ cmpld(CR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
__ cmpld(CR1, tmp1, tmp2);
__ crnand(CR0, Assembler::less, CR1, Assembler::less);
// Overlaps if Src before dst and distance smaller than size.
// Branch to forward copy routine otherwise.
__ blt(CR0, no_overlap);
__ stop("overlap in checkcast_copy");
__ bind(no_overlap);
}
#endif
DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
if (dest_uninitialized) {
decorators |= IS_DEST_UNINITIALIZED;
}
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_from, R4_to, R5_count, /* preserve: */ R6_ckoff, R7_ckval);
//inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R12_tmp, R3_RET);
Label load_element, store_element, store_null, success, do_epilogue;
__ or_(R9_remain, R5_count, R5_count); // Initialize loop index, and test it.
__ li(R8_offset, 0); // Offset from start of arrays.
__ bne(CR0, load_element);
// Empty array: Nothing to do.
__ li(R3_RET, 0); // Return 0 on (trivial) success.
__ blr();
// ======== begin loop ========
// (Entry is load_element.)
__ align(OptoLoopAlignment);
__ bind(store_element);
if (UseCompressedOops) {
__ encode_heap_oop_not_null(R10_oop);
__ bind(store_null);
__ stw(R10_oop, R8_offset, R4_to);
} else {
__ bind(store_null);
#if INCLUDE_ZGC
if (UseZGC) {
__ store_heap_oop(R10_oop, R8_offset, R4_to, R11_scratch1, R12_tmp, noreg,
MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
dest_uninitialized ? IS_DEST_UNINITIALIZED : 0);
} else
#endif
__ std(R10_oop, R8_offset, R4_to);
}
__ addi(R8_offset, R8_offset, heapOopSize); // Step to next offset.
__ addic_(R9_remain, R9_remain, -1); // Decrement the count.
__ beq(CR0, success);
// ======== loop entry is here ========
__ bind(load_element);
#if INCLUDE_ZGC
if (UseZGC) {
__ load_heap_oop(R10_oop, R8_offset, R3_from,
R11_scratch1, R12_tmp,
MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
0, &store_null);
} else
#endif
__ load_heap_oop(R10_oop, R8_offset, R3_from,
R11_scratch1, R12_tmp,
MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
AS_RAW, &store_null);
__ load_klass(R11_klass, R10_oop); // Query the object klass.
generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp, R2_tmp,
// Branch to this on success:
store_element);
// ======== end loop ========
// It was a real error; we must depend on the caller to finish the job.
// Register R9_remain has number of *remaining* oops, R5_count number of *total* oops.
// Emit GC store barriers for the oops we have copied (R5_count minus R9_remain),
// and report their number to the caller.
__ subf_(R5_count, R9_remain, R5_count);
__ nand(R3_RET, R5_count, R5_count); // report (-1^K) to caller
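// (nand of R5_count with itself yields ~K == -1 - K == (-1 ^ K) for K
// elements copied, so the caller can recover the partial transfer count
// as ~R3_RET.)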
__ bne(CR0, do_epilogue);
__ blr();
__ bind(success);
__ li(R3_RET, 0);
__ bind(do_epilogue);
bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_to, R5_count, /* preserve */ R3_RET);
__ blr();
return start;
}
// Generate 'unsafe' array copy stub.
// Though just as safe as the other stubs, it takes an unscaled
// size_t argument instead of an element count.
//
// Arguments for generated stub:
// from: R3
// to: R4
// count: R5 byte count, treated as ssize_t, can be zero
//
// Examines the alignment of the operands and dispatches
// to a long, int, short, or byte copy loop.
//
address generate_unsafe_copy(address byte_copy_entry,
address short_copy_entry,
address int_copy_entry,
address long_copy_entry) {
const Register R3_from = R3_ARG1; // source array address
const Register R4_to = R4_ARG2; // destination array address
const Register R5_count = R5_ARG3; // elements count (as long on PPC64)
const Register R6_bits = R6_ARG4; // test copy of low bits
const Register R7_tmp = R7_ARG5;
//__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
// Bump this on entry, not on exit:
//inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R6_bits, R7_tmp);
Label short_copy, int_copy, long_copy;
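// The OR of both addresses and the byte count exposes the least-aligned of
// the three operands; testing its low bits selects the widest element size
// (8, 4, 2 or 1 bytes) for which source, destination and count are all
// aligned, and the count is rescaled to that element size before dispatch.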
__ orr(R6_bits, R3_from, R4_to);
__ orr(R6_bits, R6_bits, R5_count);
__ andi_(R0, R6_bits, (BytesPerLong-1));
__ beq(CR0, long_copy);
__ andi_(R0, R6_bits, (BytesPerInt-1));
__ beq(CR0, int_copy);
__ andi_(R0, R6_bits, (BytesPerShort-1));
__ beq(CR0, short_copy);
// byte_copy:
__ b(byte_copy_entry);
__ bind(short_copy);
__ srwi(R5_count, R5_count, LogBytesPerShort);
__ b(short_copy_entry);
__ bind(int_copy);
__ srwi(R5_count, R5_count, LogBytesPerInt);
__ b(int_copy_entry);
__ bind(long_copy);
__ srwi(R5_count, R5_count, LogBytesPerLong);
__ b(long_copy_entry);
return start;
}
// Perform range checks on the proposed arraycopy.
// Kills the two temps, but nothing else.
// Also, clean the sign bits of src_pos and dst_pos.
void arraycopy_range_checks(Register src, // source array oop
Register src_pos, // source position
Register dst, // destination array oop
Register dst_pos, // destination position
Register length, // length of copy
Register temp1, Register temp2,
Label& L_failed) {
BLOCK_COMMENT("arraycopy_range_checks:");
const Register array_length = temp1; // scratch
const Register end_pos = temp2; // scratch
// if (src_pos + length > arrayOop(src)->length() ) FAIL;
__ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), src);
__ add(end_pos, src_pos, length); // src_pos + length
__ cmpd(CR0, end_pos, array_length);
__ bgt(CR0, L_failed);
// if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
__ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), dst);
__ add(end_pos, dst_pos, length); // dst_pos + length
__ cmpd(CR0, end_pos, array_length);
__ bgt(CR0, L_failed);
BLOCK_COMMENT("arraycopy_range_checks done");
}
// Helper for generate_unsafe_setmemory
//
// Atomically fill an array of memory using 1-, 2-, 4-, or 8-byte chunks and return.
static void do_setmemory_atomic_loop(int elem_size, Register dest, Register size, Register byteVal,
MacroAssembler *_masm) {
Label L_Loop, L_Tail; // 2x unrolled loop
// Propagate byte to required width
if (elem_size > 1) __ rldimi(byteVal, byteVal, 8, 64 - 2 * 8);
if (elem_size > 2) __ rldimi(byteVal, byteVal, 16, 64 - 2 * 16);
if (elem_size > 4) __ rldimi(byteVal, byteVal, 32, 64 - 2 * 32);
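// Illustrative example: with byteVal = 0x5A the rldimi sequence produces
// 0x5A5A for 2-byte, 0x5A5A5A5A for 4-byte and 0x5A5A5A5A5A5A5A5A for
// 8-byte element sizes, so each store writes the fill byte across the
// whole element.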
__ srwi_(R0, size, exact_log2(2 * elem_size)); // size is a 32 bit value
__ beq(CR0, L_Tail);
__ mtctr(R0);
__ align(32); // loop alignment
__ bind(L_Loop);
__ store_sized_value(byteVal, 0, dest, elem_size);
__ store_sized_value(byteVal, elem_size, dest, elem_size);
__ addi(dest, dest, 2 * elem_size);
__ bdnz(L_Loop);
__ bind(L_Tail);
__ andi_(R0, size, elem_size);
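// If no trailing element remains (CR0.EQ set by the andi_ above), return
// directly via bclr; otherwise fall through and store one last element.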
__ bclr(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::equal), Assembler::bhintbhBCLRisReturn);
__ store_sized_value(byteVal, 0, dest, elem_size);
__ blr();
}
//
// Generate 'unsafe' set memory stub
// Though just as safe as the other stubs, it takes an unscaled
// size_t (# bytes) argument instead of an element count.
//
// Input:
// R3_ARG1 - destination array address
// R4_ARG2 - byte count (size_t)
// R5_ARG3 - byte value
//
address generate_unsafe_setmemory(address unsafe_byte_fill) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
address start = __ function_entry();
// bump this on entry, not on exit:
// inc_counter_np(SharedRuntime::_unsafe_set_memory_ctr);
{
Label L_fill8Bytes, L_fill4Bytes, L_fillBytes;
const Register dest = R3_ARG1;
const Register size = R4_ARG2;
const Register byteVal = R5_ARG3;
const Register rScratch1 = R6;
// fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)
// Check for pointer & size alignment
__ orr(rScratch1, dest, size);
__ andi_(R0, rScratch1, 7);
__ beq(CR0, L_fill8Bytes);
__ andi_(R0, rScratch1, 3);
__ beq(CR0, L_fill4Bytes);
__ andi_(R0, rScratch1, 1);
__ bne(CR0, L_fillBytes);
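// As in generate_unsafe_copy, OR-ing dest and size exposes the coarsest
// chunk size for which both the pointer and the length are aligned:
// 8-byte, 4-byte, 2-byte (fall-through below) or single-byte fills.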
// Mark remaining code as such which performs Unsafe accesses.
UnsafeMemoryAccessMark umam(this, true, false);
// At this point, we know the low bit of size is zero, so size is a
// multiple of 2.
do_setmemory_atomic_loop(2, dest, size, byteVal, _masm);
__ align(32);
__ bind(L_fill8Bytes);
// At this point, we know the low 3 bits of size are zero, so size is a
// multiple of 8.
do_setmemory_atomic_loop(8, dest, size, byteVal, _masm);
__ align(32);
__ bind(L_fill4Bytes);
// At this point, we know the low 2 bits of size are zero, so size is a
// multiple of 4.
do_setmemory_atomic_loop(4, dest, size, byteVal, _masm);
__ align(32);
__ bind(L_fillBytes);
do_setmemory_atomic_loop(1, dest, size, byteVal, _masm);
}
return start;
}
//
// Generate generic array copy stubs
//
// Input:
// R3 - src oop
// R4 - src_pos
// R5 - dst oop
// R6 - dst_pos
// R7 - element count
//
// Output:
// R3 == 0 - success
// R3 == -1 - need to call System.arraycopy
//
address generate_generic_copy(address entry_jbyte_arraycopy,
address entry_jshort_arraycopy,
address entry_jint_arraycopy,
address entry_oop_arraycopy,
address entry_disjoint_oop_arraycopy,
address entry_jlong_arraycopy,
address entry_checkcast_arraycopy) {
Label L_failed, L_objArray;
// Input registers
const Register src = R3_ARG1; // source array oop
const Register src_pos = R4_ARG2; // source position
const Register dst = R5_ARG3; // destination array oop
const Register dst_pos = R6_ARG4; // destination position
const Register length = R7_ARG5; // elements count
// registers used as temp
const Register src_klass = R8_ARG6; // source array klass
const Register dst_klass = R9_ARG7; // destination array klass
const Register lh = R10_ARG8; // layout helper
const Register temp = R2;
//__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_generic_arraycopy_id;
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
// Bump this on entry, not on exit:
//inc_counter_np(SharedRuntime::_generic_array_copy_ctr, lh, temp);
// In principle, the int arguments could be dirty.
//-----------------------------------------------------------------------
// Assembler stubs will be used for this call to arraycopy
// if the following conditions are met:
//
// (1) src and dst must not be null.
// (2) src_pos must not be negative.
// (3) dst_pos must not be negative.
// (4) length must not be negative.
// (5) src klass and dst klass should be the same and not null.
// (6) src and dst should be arrays.
// (7) src_pos + length must not exceed length of src.
// (8) dst_pos + length must not exceed length of dst.
BLOCK_COMMENT("arraycopy initial argument checks");
__ cmpdi(CR1, src, 0); // if (src == nullptr) return -1;
__ extsw_(src_pos, src_pos); // if (src_pos < 0) return -1;
__ cmpdi(CR5, dst, 0); // if (dst == nullptr) return -1;
__ cror(CR1, Assembler::equal, CR0, Assembler::less);
__ extsw_(dst_pos, dst_pos); // if (dst_pos < 0) return -1;
__ cror(CR5, Assembler::equal, CR0, Assembler::less);
__ extsw_(length, length); // if (length < 0) return -1;
__ cror(CR1, Assembler::equal, CR5, Assembler::equal);
__ cror(CR1, Assembler::equal, CR0, Assembler::less);
__ beq(CR1, L_failed);
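// The cror sequence above folds all null and sign checks into CR1.eq:
// CR1.eq = (src == nullptr) | (src_pos < 0) | (dst == nullptr)
//        | (dst_pos < 0) | (length < 0),
// so the single beq covers conditions (1) through (4).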
BLOCK_COMMENT("arraycopy argument klass checks");
__ load_klass(src_klass, src);
__ load_klass(dst_klass, dst);
// Load layout helper
//
// |array_tag|     | header_size | element_type |     |log2_element_size|
// 32        30    24            16              8     2                 0
//
// array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
//
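// Illustrative example: for a jint[] the layout helper has array_tag 0x3
// (typeArray), element_type T_INT, log2_element_size 2 (4-byte elements),
// and header_size equal to the base offset of the element data.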
int lh_offset = in_bytes(Klass::layout_helper_offset());
// Load the 32-bit layout helper value.
__ lwz(lh, lh_offset, src_klass);
// Handle objArrays completely differently...
jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
__ load_const_optimized(temp, objArray_lh, R0);
__ cmpw(CR0, lh, temp);
__ beq(CR0, L_objArray);
__ cmpd(CR5, src_klass, dst_klass); // if (src->klass() != dst->klass()) return -1;
__ cmpwi(CR6, lh, Klass::_lh_neutral_value); // if (!src->is_Array()) return -1;
__ crnand(CR5, Assembler::equal, CR6, Assembler::less);
__ beq(CR5, L_failed);
// At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
{ Label L;
jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
__ load_const_optimized(temp, lh_prim_tag_in_place, R0);
__ cmpw(CR0, lh, temp);
__ bge(CR0, L);
__ stop("must be a primitive array");
__ bind(L);
}
#endif
arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
temp, dst_klass, L_failed);
// TypeArrayKlass
//
// src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
// dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
//
const Register offset = dst_klass; // array offset
const Register elsize = src_klass; // log2 element size
__ rldicl(offset, lh, 64 - Klass::_lh_header_size_shift, 64 - exact_log2(Klass::_lh_header_size_mask + 1));
__ andi(elsize, lh, Klass::_lh_log2_element_size_mask);
__ add(src, offset, src); // src array offset
__ add(dst, offset, dst); // dst array offset
// Next registers should be set before the jump to corresponding stub.
const Register from = R3_ARG1; // source array address
const Register to = R4_ARG2; // destination array address
const Register count = R5_ARG3; // elements count
// 'from', 'to', 'count' registers should be set in this order
// since they are the same as 'src', 'src_pos', 'dst'.
BLOCK_COMMENT("scale indexes to element size");
__ sld(src_pos, src_pos, elsize);
__ sld(dst_pos, dst_pos, elsize);
__ add(from, src_pos, src); // src_addr
__ add(to, dst_pos, dst); // dst_addr
__ mr(count, length); // length
BLOCK_COMMENT("choose copy loop based on element size");
// Using conditional branches with range 32kB.
const int bo = Assembler::bcondCRbiIs1, bi = Assembler::bi0(CR0, Assembler::equal);
__ cmpwi(CR0, elsize, 0);
__ bc(bo, bi, entry_jbyte_arraycopy);
__ cmpwi(CR0, elsize, LogBytesPerShort);
__ bc(bo, bi, entry_jshort_arraycopy);
__ cmpwi(CR0, elsize, LogBytesPerInt);
__ bc(bo, bi, entry_jint_arraycopy);
#ifdef ASSERT
{ Label L;
__ cmpwi(CR0, elsize, LogBytesPerLong);
__ beq(CR0, L);
__ stop("must be long copy, but elsize is wrong");
__ bind(L);
}
#endif
__ b(entry_jlong_arraycopy);
// ObjArrayKlass
__ bind(L_objArray);
// live at this point: src_klass, dst_klass, src[_pos], dst[_pos], length
Label L_disjoint_plain_copy, L_checkcast_copy;
// test array classes for subtyping
__ cmpd(CR0, src_klass, dst_klass); // usual case is exact equality
__ bne(CR0, L_checkcast_copy);
// Identically typed arrays can be copied without element-wise checks.
arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
temp, lh, L_failed);
__ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
__ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
__ sldi(src_pos, src_pos, LogBytesPerHeapOop);
__ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
__ add(from, src_pos, src); // src_addr
__ add(to, dst_pos, dst); // dst_addr
__ mr(count, length); // length
__ b(entry_oop_arraycopy);
__ bind(L_checkcast_copy);
// live at this point: src_klass, dst_klass
{
// Before looking at dst.length, make sure dst is also an objArray.
__ lwz(temp, lh_offset, dst_klass);
__ cmpw(CR0, lh, temp);
__ bne(CR0, L_failed);
// It is safe to examine both src.length and dst.length.
arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
temp, lh, L_failed);
// Marshal the base address arguments now, freeing registers.
__ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
__ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
__ sldi(src_pos, src_pos, LogBytesPerHeapOop);
__ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
__ add(from, src_pos, src); // src_addr
__ add(to, dst_pos, dst); // dst_addr
__ mr(count, length); // length
Register sco_temp = R6_ARG4; // This register is free now.
assert_different_registers(from, to, count, sco_temp,
dst_klass, src_klass);
// Generate the type check.
int sco_offset = in_bytes(Klass::super_check_offset_offset());
__ lwz(sco_temp, sco_offset, dst_klass);
generate_type_check(src_klass, sco_temp, dst_klass,
temp, /* temp */ R10_ARG8, L_disjoint_plain_copy);
// Fetch destination element klass from the ObjArrayKlass header.
int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
// The checkcast_copy loop needs two extra arguments:
__ ld(R7_ARG5, ek_offset, dst_klass); // dest elem klass
__ lwz(R6_ARG4, sco_offset, R7_ARG5); // sco of elem klass
__ b(entry_checkcast_arraycopy);
}
__ bind(L_disjoint_plain_copy);
__ b(entry_disjoint_oop_arraycopy);
__ bind(L_failed);
__ li(R3_RET, -1); // return -1
__ blr();
return start;
}
// Arguments for generated stub:
// R3_ARG1 - source byte array address
// R4_ARG2 - destination byte array address
// R5_ARG3 - round key array
address generate_aescrypt_encryptBlock() {
assert(UseAES, "need AES instructions and misaligned SSE support");
StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
Label L_doLast, L_error;
Register from = R3_ARG1; // source array address
Register to = R4_ARG2; // destination array address
Register key = R5_ARG3; // round key array
Register keylen = R8;
Register temp = R9;
Register keypos = R10;
Register fifteen = R12;
VectorRegister vRet = VR0;
VectorRegister vKey1 = VR1;
VectorRegister vKey2 = VR2;
VectorRegister vKey3 = VR3;
VectorRegister vKey4 = VR4;
VectorRegister fromPerm = VR5;
VectorRegister keyPerm = VR6;
VectorRegister toPerm = VR7;
VectorRegister fSplt = VR8;
VectorRegister vTmp1 = VR9;
VectorRegister vTmp2 = VR10;
VectorRegister vTmp3 = VR11;
VectorRegister vTmp4 = VR12;
__ li (fifteen, 15);
// load unaligned from[0-15] to vRet
__ lvx (vRet, from);
__ lvx (vTmp1, fifteen, from);
__ lvsl (fromPerm, from);
#ifdef VM_LITTLE_ENDIAN
__ vspltisb (fSplt, 0x0f);
__ vxor (fromPerm, fromPerm, fSplt);
#endif
__ vperm (vRet, vRet, vTmp1, fromPerm);
// load keylen (44 or 52 or 60)
__ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
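// (A key schedule of 44/52/60 ints corresponds to AES-128/-192/-256,
// i.e. 10/12/14 cipher rounds.)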
// to load keys
__ load_perm (keyPerm, key);
#ifdef VM_LITTLE_ENDIAN
__ vspltisb (vTmp2, -16);
__ vrld (keyPerm, keyPerm, vTmp2);
__ vrld (keyPerm, keyPerm, vTmp2);
__ vsldoi (keyPerm, keyPerm, keyPerm, 8);
#endif
// load the 1st round key to vTmp1
__ lvx (vTmp1, key);
__ li (keypos, 16);
__ lvx (vKey1, keypos, key);
__ vec_perm (vTmp1, vKey1, keyPerm);
// 1st round
__ vxor (vRet, vRet, vTmp1);
// load the 2nd round key to vKey1
__ li (keypos, 32);
__ lvx (vKey2, keypos, key);
__ vec_perm (vKey1, vKey2, keyPerm);
// load the 3rd round key to vKey2
__ li (keypos, 48);
__ lvx (vKey3, keypos, key);
__ vec_perm (vKey2, vKey3, keyPerm);
// load the 4th round key to vKey3
__ li (keypos, 64);
__ lvx (vKey4, keypos, key);
__ vec_perm (vKey3, vKey4, keyPerm);
// load the 5th round key to vKey4
__ li (keypos, 80);
__ lvx (vTmp1, keypos, key);
__ vec_perm (vKey4, vTmp1, keyPerm);
// 2nd - 5th rounds
__ vcipher (vRet, vRet, vKey1);
__ vcipher (vRet, vRet, vKey2);
__ vcipher (vRet, vRet, vKey3);
__ vcipher (vRet, vRet, vKey4);
// load the 6th round key to vKey1
__ li (keypos, 96);
__ lvx (vKey2, keypos, key);
__ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
// load the 7th round key to vKey2
__ li (keypos, 112);
__ lvx (vKey3, keypos, key);
__ vec_perm (vKey2, vKey3, keyPerm);
// load the 8th round key to vKey3
__ li (keypos, 128);
__ lvx (vKey4, keypos, key);
__ vec_perm (vKey3, vKey4, keyPerm);
// load the 9th round key to vKey4
__ li (keypos, 144);
__ lvx (vTmp1, keypos, key);
__ vec_perm (vKey4, vTmp1, keyPerm);
// 6th - 9th rounds
__ vcipher (vRet, vRet, vKey1);
__ vcipher (vRet, vRet, vKey2);
__ vcipher (vRet, vRet, vKey3);
__ vcipher (vRet, vRet, vKey4);
// load the 10th round key to vKey1
__ li (keypos, 160);
__ lvx (vKey2, keypos, key);
__ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
// load the 11th round key to vKey2
__ li (keypos, 176);
__ lvx (vTmp1, keypos, key);
__ vec_perm (vKey2, vTmp1, keyPerm);
// if all round keys are loaded, skip next 4 rounds
__ cmpwi (CR0, keylen, 44);
__ beq (CR0, L_doLast);
// 10th - 11th rounds
__ vcipher (vRet, vRet, vKey1);
__ vcipher (vRet, vRet, vKey2);
// load the 12th round key to vKey1
__ li (keypos, 192);
__ lvx (vKey2, keypos, key);
__ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
// load the 13th round key to vKey2
__ li (keypos, 208);
__ lvx (vTmp1, keypos, key);
__ vec_perm (vKey2, vTmp1, keyPerm);
// if all round keys are loaded, skip next 2 rounds
__ cmpwi (CR0, keylen, 52);
__ beq (CR0, L_doLast);
#ifdef ASSERT
__ cmpwi (CR0, keylen, 60);
__ bne (CR0, L_error);
#endif
// 12th - 13th rounds
__ vcipher (vRet, vRet, vKey1);
__ vcipher (vRet, vRet, vKey2);
// load the 14th round key to vKey1
__ li (keypos, 224);
__ lvx (vKey2, keypos, key);
__ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
// load the 15th round key to vKey2
__ li (keypos, 240);
__ lvx (vTmp1, keypos, key);
__ vec_perm (vKey2, vTmp1, keyPerm);
__ bind(L_doLast);
// last two rounds
__ vcipher (vRet, vRet, vKey1);
__ vcipherlast (vRet, vRet, vKey2);
#ifdef VM_LITTLE_ENDIAN
// toPerm = 0x0F0E0D0C0B0A09080706050403020100
__ lvsl (toPerm, keypos); // keypos is a multiple of 16
__ vxor (toPerm, toPerm, fSplt);
// Swap Bytes
__ vperm (vRet, vRet, vRet, toPerm);
#endif
// store result (unaligned)
// Note: We can't use a read-modify-write sequence which touches additional Bytes.
Register lo = temp, hi = fifteen; // Reuse
__ vsldoi (vTmp1, vRet, vRet, 8);
__ mfvrd (hi, vRet);
__ mfvrd (lo, vTmp1);
__ std (hi, 0 LITTLE_ENDIAN_ONLY(+ 8), to);
__ std (lo, 0 BIG_ENDIAN_ONLY(+ 8), to);
__ blr();
#ifdef ASSERT
__ bind(L_error);
__ stop("aescrypt_encryptBlock: invalid key length");
#endif
return start;
}
// Arguments for generated stub:
// R3_ARG1 - source byte array address
// R4_ARG2 - destination byte array address
// R5_ARG3 - K (key) in little endian int array
address generate_aescrypt_decryptBlock() {
assert(UseAES, "need AES instructions and misaligned SSE support");
StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
Label L_doLast, L_do44, L_do52, L_error;
Register from = R3_ARG1; // source array address
Register to = R4_ARG2; // destination array address
Register key = R5_ARG3; // round key array
Register keylen = R8;
Register temp = R9;
Register keypos = R10;
Register fifteen = R12;
VectorRegister vRet = VR0;
VectorRegister vKey1 = VR1;
VectorRegister vKey2 = VR2;
VectorRegister vKey3 = VR3;
VectorRegister vKey4 = VR4;
VectorRegister vKey5 = VR5;
VectorRegister fromPerm = VR6;
VectorRegister keyPerm = VR7;
VectorRegister toPerm = VR8;
VectorRegister fSplt = VR9;
VectorRegister vTmp1 = VR10;
VectorRegister vTmp2 = VR11;
VectorRegister vTmp3 = VR12;
VectorRegister vTmp4 = VR13;
__ li (fifteen, 15);
// load unaligned from[0-15] to vRet
__ lvx (vRet, from);
__ lvx (vTmp1, fifteen, from);
__ lvsl (fromPerm, from);
#ifdef VM_LITTLE_ENDIAN
__ vspltisb (fSplt, 0x0f);
__ vxor (fromPerm, fromPerm, fSplt);
#endif
__ vperm (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
// load keylen (44 or 52 or 60)
__ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
// to load keys
__ load_perm (keyPerm, key);
#ifdef VM_LITTLE_ENDIAN
__ vxor (vTmp2, vTmp2, vTmp2);
__ vspltisb (vTmp2, -16);
__ vrld (keyPerm, keyPerm, vTmp2);
__ vrld (keyPerm, keyPerm, vTmp2);
__ vsldoi (keyPerm, keyPerm, keyPerm, 8);
#endif
__ cmpwi (CR0, keylen, 44);
__ beq (CR0, L_do44);
__ cmpwi (CR0, keylen, 52);
__ beq (CR0, L_do52);
#ifdef ASSERT
__ cmpwi (CR0, keylen, 60);
__ bne (CR0, L_error);
#endif
// load the 15th round key to vKey1
__ li (keypos, 240);
__ lvx (vKey1, keypos, key);
__ li (keypos, 224);
__ lvx (vKey2, keypos, key);
__ vec_perm (vKey1, vKey2, vKey1, keyPerm);
// load the 14th round key to vKey2
__ li (keypos, 208);
__ lvx (vKey3, keypos, key);
__ vec_perm (vKey2, vKey3, vKey2, keyPerm);
// load the 13th round key to vKey3
__ li (keypos, 192);
__ lvx (vKey4, keypos, key);
__ vec_perm (vKey3, vKey4, vKey3, keyPerm);
// load the 12th round key to vKey4
__ li (keypos, 176);
__ lvx (vKey5, keypos, key);
__ vec_perm (vKey4, vKey5, vKey4, keyPerm);
// load the 11th round key to vKey5
__ li (keypos, 160);
__ lvx (vTmp1, keypos, key);
__ vec_perm (vKey5, vTmp1, vKey5, keyPerm);
// 1st - 5th rounds
__ vxor (vRet, vRet, vKey1);
__ vncipher (vRet, vRet, vKey2);
__ vncipher (vRet, vRet, vKey3);
__ vncipher (vRet, vRet, vKey4);
__ vncipher (vRet, vRet, vKey5);
__ b (L_doLast);
__ align(32);
__ bind (L_do52);
// load the 13th round key to vKey1
__ li (keypos, 208);
__ lvx (vKey1, keypos, key);
__ li (keypos, 192);
__ lvx (vKey2, keypos, key);
__ vec_perm (vKey1, vKey2, vKey1, keyPerm);
// load the 12th round key to vKey2
__ li (keypos, 176);
__ lvx (vKey3, keypos, key);
__ vec_perm (vKey2, vKey3, vKey2, keyPerm);
// load the 11th round key to vKey3
__ li (keypos, 160);
__ lvx (vTmp1, keypos, key);
__ vec_perm (vKey3, vTmp1, vKey3, keyPerm);
// 1st - 3rd rounds
__ vxor (vRet, vRet, vKey1);
__ vncipher (vRet, vRet, vKey2);
__ vncipher (vRet, vRet, vKey3);
__ b (L_doLast);
__ align(32);
__ bind (L_do44);
// load the 11th round key to vKey1
__ li (keypos, 176);
__ lvx (vKey1, keypos, key);
__ li (keypos, 160);
__ lvx (vTmp1, keypos, key);
__ vec_perm (vKey1, vTmp1, vKey1, keyPerm);
// 1st round
__ vxor (vRet, vRet, vKey1);
__ bind (L_doLast);
// load the 10th round key to vKey1
__ li (keypos, 144);
__ lvx (vKey2, keypos, key);
__ vec_perm (vKey1, vKey2, vTmp1, keyPerm);
// load the 9th round key to vKey2
__ li (keypos, 128);
__ lvx (vKey3, keypos, key);
__ vec_perm (vKey2, vKey3, vKey2, keyPerm);
// load the 8th round key to vKey3
__ li (keypos, 112);
__ lvx (vKey4, keypos, key);
__ vec_perm (vKey3, vKey4, vKey3, keyPerm);
// load the 7th round key to vKey4
__ li (keypos, 96);
__ lvx (vKey5, keypos, key);
__ vec_perm (vKey4, vKey5, vKey4, keyPerm);
// load the 6th round key to vKey5
__ li (keypos, 80);
__ lvx (vTmp1, keypos, key);
__ vec_perm (vKey5, vTmp1, vKey5, keyPerm);
// last 10th - 6th rounds
__ vncipher (vRet, vRet, vKey1);
__ vncipher (vRet, vRet, vKey2);
__ vncipher (vRet, vRet, vKey3);
__ vncipher (vRet, vRet, vKey4);
__ vncipher (vRet, vRet, vKey5);
// load the 5th round key to vKey1
__ li (keypos, 64);
__ lvx (vKey2, keypos, key);
__ vec_perm (vKey1, vKey2, vTmp1, keyPerm);
// load the 4th round key to vKey2
__ li (keypos, 48);
__ lvx (vKey3, keypos, key);
__ vec_perm (vKey2, vKey3, vKey2, keyPerm);
// load the 3rd round key to vKey3
__ li (keypos, 32);
__ lvx (vKey4, keypos, key);
__ vec_perm (vKey3, vKey4, vKey3, keyPerm);
// load the 2nd round key to vKey4
__ li (keypos, 16);
__ lvx (vKey5, keypos, key);
__ vec_perm (vKey4, vKey5, vKey4, keyPerm);
// load the 1st round key to vKey5
__ lvx (vTmp1, key);
__ vec_perm (vKey5, vTmp1, vKey5, keyPerm);
// last 5th - 1st rounds
__ vncipher (vRet, vRet, vKey1);
__ vncipher (vRet, vRet, vKey2);
__ vncipher (vRet, vRet, vKey3);
__ vncipher (vRet, vRet, vKey4);
__ vncipherlast (vRet, vRet, vKey5);
#ifdef VM_LITTLE_ENDIAN
// toPerm = 0x0F0E0D0C0B0A09080706050403020100
__ lvsl (toPerm, keypos); // keypos is a multiple of 16
__ vxor (toPerm, toPerm, fSplt);
// Swap Bytes
__ vperm (vRet, vRet, vRet, toPerm);
#endif
// store result (unaligned)
// Note: We can't use a read-modify-write sequence which touches additional Bytes.
Register lo = temp, hi = fifteen; // Reuse
__ vsldoi (vTmp1, vRet, vRet, 8);
__ mfvrd (hi, vRet);
__ mfvrd (lo, vTmp1);
__ std (hi, 0 LITTLE_ENDIAN_ONLY(+ 8), to);
__ std (lo, 0 BIG_ENDIAN_ONLY(+ 8), to);
__ blr();
#ifdef ASSERT
__ bind(L_error);
__ stop("aescrypt_decryptBlock: invalid key length");
#endif
return start;
}
address generate_sha256_implCompress(StubId stub_id) {
assert(UseSHA, "need SHA instructions");
bool multi_block;
switch (stub_id) {
case StubId::stubgen_sha256_implCompress_id:
multi_block = false;
break;
case StubId::stubgen_sha256_implCompressMB_id:
multi_block = true;
break;
default:
ShouldNotReachHere();
}
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
__ sha256 (multi_block);
__ blr();
return start;
}
address generate_sha512_implCompress(StubId stub_id) {
assert(UseSHA, "need SHA instructions");
bool multi_block;
switch (stub_id) {
case StubId::stubgen_sha512_implCompress_id:
multi_block = false;
break;
case StubId::stubgen_sha512_implCompressMB_id:
multi_block = true;
break;
default:
ShouldNotReachHere();
}
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
__ sha512 (multi_block);
__ blr();
return start;
}
address generate_data_cache_writeback() {
const Register cacheline = R3_ARG1;
StubId stub_id = StubId::stubgen_data_cache_writeback_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
__ cache_wb(Address(cacheline));
__ blr();
return start;
}
address generate_data_cache_writeback_sync() {
const Register is_presync = R3_ARG1;
Register temp = R4;
Label SKIP;
StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
__ andi_(temp, is_presync, 1);
__ bne(CR0, SKIP);
__ cache_wbsync(false); // post sync => emit 'sync'
__ bind(SKIP); // pre sync => emit nothing
__ blr();
return start;
}
void generate_arraycopy_stubs() {
// generate the common exit first so later stubs can rely on it if
// they want an UnsafeMemoryAccess exit non-local to the stub
StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
// register the stub as the default exit with class UnsafeMemoryAccess
UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
// Note: the disjoint stubs must be generated first, some of
// the conjoint stubs use them.
// non-aligned disjoint versions
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id);
StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(StubId::stubgen_jshort_disjoint_arraycopy_id);
StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(StubId::stubgen_jint_disjoint_arraycopy_id);
StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_copy(StubId::stubgen_jlong_disjoint_arraycopy_id);
StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id);
StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id);
// aligned disjoint versions
StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id);
StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id);
StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id);
StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id);
StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id);
StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id);
// non-aligned conjoint versions
StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(StubId::stubgen_jbyte_arraycopy_id);
StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(StubId::stubgen_jshort_arraycopy_id);
StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(StubId::stubgen_jint_arraycopy_id);
StubRoutines::_jlong_arraycopy = generate_conjoint_long_copy(StubId::stubgen_jlong_arraycopy_id);
StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(StubId::stubgen_oop_arraycopy_id);
StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id);
// aligned conjoint versions
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id);
StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(StubId::stubgen_arrayof_jshort_arraycopy_id);
StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(StubId::stubgen_arrayof_jint_arraycopy_id);
StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(StubId::stubgen_arrayof_jlong_arraycopy_id);
StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(StubId::stubgen_arrayof_oop_arraycopy_id);
StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id);
// special/generic versions
StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id);
StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id);
StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(STUB_ENTRY(jbyte_arraycopy()),
STUB_ENTRY(jshort_arraycopy()),
STUB_ENTRY(jint_arraycopy()),
STUB_ENTRY(jlong_arraycopy()));
StubRoutines::_generic_arraycopy = generate_generic_copy(STUB_ENTRY(jbyte_arraycopy()),
STUB_ENTRY(jshort_arraycopy()),
STUB_ENTRY(jint_arraycopy()),
STUB_ENTRY(oop_arraycopy()),
STUB_ENTRY(oop_disjoint_arraycopy()),
STUB_ENTRY(jlong_arraycopy()),
STUB_ENTRY(checkcast_arraycopy()));
// fill routines
#ifdef COMPILER2
if (OptimizeFill) {
StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
}
StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(StubRoutines::_jbyte_fill);
#endif
}
// Stub for BigInteger::multiplyToLen()
//
// Arguments:
//
// Input:
// R3 - x address
// R4 - x length
// R5 - y address
// R6 - y length
// R7 - z address
//
address generate_multiplyToLen() {
StubId stub_id = StubId::stubgen_multiplyToLen_id;
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
const Register x = R3;
const Register xlen = R4;
const Register y = R5;
const Register ylen = R6;
const Register z = R7;
const Register tmp1 = R2; // TOC not used.
const Register tmp2 = R9;
const Register tmp3 = R10;
const Register tmp4 = R11;
const Register tmp5 = R12;
// non-volatile regs
const Register tmp6 = R31;
const Register tmp7 = R30;
const Register tmp8 = R29;
const Register tmp9 = R28;
const Register tmp10 = R27;
const Register tmp11 = R26;
const Register tmp12 = R25;
const Register tmp13 = R24;
BLOCK_COMMENT("Entry:");
// C2 does not respect int to long conversion for stub calls.
__ clrldi(xlen, xlen, 32);
__ clrldi(ylen, ylen, 32);
// Save non-volatile regs (frameless).
int current_offs = 8;
__ std(R24, -current_offs, R1_SP); current_offs += 8;
__ std(R25, -current_offs, R1_SP); current_offs += 8;
__ std(R26, -current_offs, R1_SP); current_offs += 8;
__ std(R27, -current_offs, R1_SP); current_offs += 8;
__ std(R28, -current_offs, R1_SP); current_offs += 8;
__ std(R29, -current_offs, R1_SP); current_offs += 8;
__ std(R30, -current_offs, R1_SP); current_offs += 8;
__ std(R31, -current_offs, R1_SP);
__ multiply_to_len(x, xlen, y, ylen, z, tmp1, tmp2, tmp3, tmp4, tmp5,
tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13);
// Restore non-volatile regs.
current_offs = 8;
__ ld(R24, -current_offs, R1_SP); current_offs += 8;
__ ld(R25, -current_offs, R1_SP); current_offs += 8;
__ ld(R26, -current_offs, R1_SP); current_offs += 8;
__ ld(R27, -current_offs, R1_SP); current_offs += 8;
__ ld(R28, -current_offs, R1_SP); current_offs += 8;
__ ld(R29, -current_offs, R1_SP); current_offs += 8;
__ ld(R30, -current_offs, R1_SP); current_offs += 8;
__ ld(R31, -current_offs, R1_SP);
__ blr(); // Return to caller.
return start;
}
/**
* Arguments:
*
* Input:
* R3_ARG1 - out address
* R4_ARG2 - in address
* R5_ARG3 - offset
* R6_ARG4 - len
* R7_ARG5 - k
* Output:
* R3_RET - carry
*/
address generate_mulAdd() {
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_mulAdd_id;
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
// C2 does not sign extend signed parameters to full 64-bit registers:
__ rldic (R5_ARG3, R5_ARG3, 2, 32); // always positive
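// (The rldic shifts the int offset left by 2, scaling it to a byte offset
// while clearing the upper 32 bits.)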
__ clrldi(R6_ARG4, R6_ARG4, 32); // force zero bits on higher word
__ clrldi(R7_ARG5, R7_ARG5, 32); // force zero bits on higher word
__ muladd(R3_ARG1, R4_ARG2, R5_ARG3, R6_ARG4, R7_ARG5, R8, R9, R10);
// Moves output carry to return register
__ mr (R3_RET, R10);
__ blr();
return start;
}
/**
* Arguments:
*
* Input:
* R3_ARG1 - in address
* R4_ARG2 - in length
* R5_ARG3 - out address
* R6_ARG4 - out length
*/
address generate_squareToLen() {
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_squareToLen_id;
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
// args - the upper 32 bits are cleared (zero-extended) due to int to long casting
const Register in = R3_ARG1;
const Register in_len = R4_ARG2;
__ clrldi(in_len, in_len, 32);
const Register out = R5_ARG3;
const Register out_len = R6_ARG4;
__ clrldi(out_len, out_len, 32);
// output
const Register ret = R3_RET;
// temporaries
const Register lplw_s = R7;
const Register in_aux = R8;
const Register out_aux = R9;
const Register piece = R10;
const Register product = R14;
const Register lplw = R15;
const Register i_minus1 = R16;
const Register carry = R17;
const Register offset = R18;
const Register off_aux = R19;
const Register t = R20;
const Register mlen = R21;
const Register len = R22;
const Register a = R23;
const Register b = R24;
const Register i = R25;
const Register c = R26;
const Register cs = R27;
// Labels
Label SKIP_LSHIFT, SKIP_DIAGONAL_SUM, SKIP_ADDONE, SKIP_LOOP_SQUARE;
Label LOOP_LSHIFT, LOOP_DIAGONAL_SUM, LOOP_ADDONE, LOOP_SQUARE;
// Save non-volatile regs (frameless).
int current_offs = -8;
__ std(R28, current_offs, R1_SP); current_offs -= 8;
__ std(R27, current_offs, R1_SP); current_offs -= 8;
__ std(R26, current_offs, R1_SP); current_offs -= 8;
__ std(R25, current_offs, R1_SP); current_offs -= 8;
__ std(R24, current_offs, R1_SP); current_offs -= 8;
__ std(R23, current_offs, R1_SP); current_offs -= 8;
__ std(R22, current_offs, R1_SP); current_offs -= 8;
__ std(R21, current_offs, R1_SP); current_offs -= 8;
__ std(R20, current_offs, R1_SP); current_offs -= 8;
__ std(R19, current_offs, R1_SP); current_offs -= 8;
__ std(R18, current_offs, R1_SP); current_offs -= 8;
__ std(R17, current_offs, R1_SP); current_offs -= 8;
__ std(R16, current_offs, R1_SP); current_offs -= 8;
__ std(R15, current_offs, R1_SP); current_offs -= 8;
__ std(R14, current_offs, R1_SP);
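// Algorithm sketch (mirrors BigInteger.squareToLen): store each input
// piece's square shifted right by one bit, add in the off-diagonal
// products via muladd, then shift the whole result left by one bit and
// OR in the low bit of the last input piece.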
// Store the squares, right shifted one bit (i.e., divided by 2)
__ subi (out_aux, out, 8);
__ subi (in_aux, in, 4);
__ cmpwi (CR0, in_len, 0);
// Initialize lplw outside of the loop
__ xorr (lplw, lplw, lplw);
__ ble (CR0, SKIP_LOOP_SQUARE); // in_len <= 0
__ mtctr (in_len);
__ bind(LOOP_SQUARE);
__ lwzu (piece, 4, in_aux);
__ mulld (product, piece, piece);
// shift left 63 bits and only keep the MSB
__ rldic (lplw_s, lplw, 63, 0);
__ mr (lplw, product);
// shift right 1 bit without sign extension
__ srdi (product, product, 1);
// join them to the same register and store it
__ orr (product, lplw_s, product);
#ifdef VM_LITTLE_ENDIAN
// Swap low and high words for little endian
__ rldicl (product, product, 32, 0);
#endif
__ stdu (product, 8, out_aux);
__ bdnz (LOOP_SQUARE);
__ bind(SKIP_LOOP_SQUARE);
// Add in off-diagonal sums
__ cmpwi (CR0, in_len, 0);
__ ble (CR0, SKIP_DIAGONAL_SUM);
// Avoid CTR usage here in order to use it at mulAdd
__ subi (i_minus1, in_len, 1);
__ li (offset, 4);
__ bind(LOOP_DIAGONAL_SUM);
__ sldi (off_aux, out_len, 2);
__ sub (off_aux, off_aux, offset);
__ mr (len, i_minus1);
__ sldi (mlen, i_minus1, 2);
__ lwzx (t, in, mlen);
__ muladd (out, in, off_aux, len, t, a, b, carry);
// begin<addOne>
// off_aux = out_len*4 - 4 - mlen - offset*4 - 4;
__ addi (mlen, mlen, 4);
__ sldi (a, out_len, 2);
__ subi (a, a, 4);
__ sub (a, a, mlen);
__ subi (off_aux, offset, 4);
__ sub (off_aux, a, off_aux);
__ lwzx (b, off_aux, out);
__ add (b, b, carry);
__ stwx (b, off_aux, out);
// if (((uint64_t)s >> 32) != 0) {
__ srdi_ (a, b, 32);
__ beq (CR0, SKIP_ADDONE);
// while (--mlen >= 0) {
__ bind(LOOP_ADDONE);
__ subi (mlen, mlen, 4);
__ cmpwi (CR0, mlen, 0);
__ beq (CR0, SKIP_ADDONE);
// if (--offset_aux < 0) { // Carry out of number
__ subi (off_aux, off_aux, 4);
__ cmpwi (CR0, off_aux, 0);
__ blt (CR0, SKIP_ADDONE);
// } else {
__ lwzx (b, off_aux, out);
__ addi (b, b, 1);
__ stwx (b, off_aux, out);
__ cmpwi (CR0, b, 0);
__ bne (CR0, SKIP_ADDONE);
__ b (LOOP_ADDONE);
__ bind(SKIP_ADDONE);
// } } } end<addOne>
__ addi (offset, offset, 8);
__ subi (i_minus1, i_minus1, 1);
__ cmpwi (CR0, i_minus1, 0);
__ bge (CR0, LOOP_DIAGONAL_SUM);
__ bind(SKIP_DIAGONAL_SUM);
// Shift back up and set low bit
// Shifts 1 bit left up to len positions. Assumes no leading zeros
// begin<primitiveLeftShift>
__ cmpwi (CR0, out_len, 0);
__ ble (CR0, SKIP_LSHIFT);
__ li (i, 0);
__ lwz (c, 0, out);
__ subi (b, out_len, 1);
__ mtctr (b);
__ bind(LOOP_LSHIFT);
__ mr (b, c);
__ addi (cs, i, 4);
__ lwzx (c, out, cs);
__ sldi (b, b, 1);
__ srwi (cs, c, 31);
__ orr (b, b, cs);
__ stwx (b, i, out);
__ addi (i, i, 4);
__ bdnz (LOOP_LSHIFT);
__ sldi (c, out_len, 2);
__ subi (c, c, 4);
__ lwzx (b, out, c);
__ sldi (b, b, 1);
__ stwx (b, out, c);
__ bind(SKIP_LSHIFT);
// end<primitiveLeftShift>
// Set low bit
__ sldi (i, in_len, 2);
__ subi (i, i, 4);
__ lwzx (i, in, i);
__ sldi (c, out_len, 2);
__ subi (c, c, 4);
__ lwzx (b, out, c);
__ andi (i, i, 1);
__ orr (i, b, i);
__ stwx (i, out, c);
// Restore non-volatile regs.
current_offs = -8;
__ ld(R28, current_offs, R1_SP); current_offs -= 8;
__ ld(R27, current_offs, R1_SP); current_offs -= 8;
__ ld(R26, current_offs, R1_SP); current_offs -= 8;
__ ld(R25, current_offs, R1_SP); current_offs -= 8;
__ ld(R24, current_offs, R1_SP); current_offs -= 8;
__ ld(R23, current_offs, R1_SP); current_offs -= 8;
__ ld(R22, current_offs, R1_SP); current_offs -= 8;
__ ld(R21, current_offs, R1_SP); current_offs -= 8;
__ ld(R20, current_offs, R1_SP); current_offs -= 8;
__ ld(R19, current_offs, R1_SP); current_offs -= 8;
__ ld(R18, current_offs, R1_SP); current_offs -= 8;
__ ld(R17, current_offs, R1_SP); current_offs -= 8;
__ ld(R16, current_offs, R1_SP); current_offs -= 8;
__ ld(R15, current_offs, R1_SP); current_offs -= 8;
__ ld(R14, current_offs, R1_SP);
__ mr(ret, out);
__ blr();
return start;
}
/**
* Arguments:
*
* Inputs:
* R3_ARG1 - int crc
* R4_ARG2 - byte* buf
* R5_ARG3 - int length (of buffer)
*
* scratch:
* R2, R6-R12
*
* Output:
* R3_RET - int crc result
*/
// Compute CRC32 function.
address generate_CRC32_updateBytes(StubId stub_id) {
bool is_crc32c;
switch (stub_id) {
case StubId::stubgen_updateBytesCRC32_id:
is_crc32c = false;
break;
case StubId::stubgen_updateBytesCRC32C_id:
is_crc32c = true;
break;
default:
ShouldNotReachHere();
}
__ align(CodeEntryAlignment);
StubCodeMark mark(this, stub_id);
address start = __ function_entry(); // Remember stub start address (is rtn value).
__ crc32(R3_ARG1, R4_ARG2, R5_ARG3, R2, R6, R7, R8, R9, R10, R11, R12, is_crc32c);
__ blr();
return start;
}
address generate_floatToFloat16() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "floatToFloat16");
address start = __ function_entry();
__ f2hf(R3_RET, F1_ARG1, F0);
__ blr();
return start;
}
address generate_float16ToFloat() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "float16ToFloat");
address start = __ function_entry();
__ hf2f(F1_RET, R3_ARG1);
__ blr();
return start;
}
address generate_method_entry_barrier() {
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_method_entry_barrier_id;
StubCodeMark mark(this, stub_id);
address stub_address = __ pc();
int nbytes_save = MacroAssembler::num_volatile_regs * BytesPerWord;
__ save_volatile_gprs(R1_SP, -nbytes_save, true);
// Link register points to instruction in prologue of the guarded nmethod.
// As the stub requires one layer of indirection (argument is of type address* and not address),
// passing the link register's value directly doesn't work.
// Since we have to save the link register on the stack anyway, we calculate the corresponding stack address
// and pass that one instead.
__ addi(R3_ARG1, R1_SP, _abi0(lr));
__ save_LR(R0);
__ push_frame_reg_args(nbytes_save, R0);
__ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier));
__ mr(R0, R3_RET);
__ pop_frame();
__ restore_LR(R3_RET /* used as tmp register */);
__ restore_volatile_gprs(R1_SP, -nbytes_save, true);
__ cmpdi(CR0, R0, 0);
// Return to prologue if no deoptimization is required (beqlr)
__ bclr(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::equal), Assembler::bhintIsTaken);
// Deoptimization required.
// For actually handling the deoptimization, the 'wrong method stub' is invoked.
__ load_const_optimized(R0, SharedRuntime::get_handle_wrong_method_stub());
__ mtctr(R0);
// Pop the frame built in the prologue.
__ pop_frame();
// Restore link register. Required as the 'wrong method stub' needs the caller's frame
// to properly deoptimize this method (e.g. by re-resolving the call site for compiled methods).
// This method's prologue is aborted.
__ restore_LR(R0);
__ bctr();
return stub_address;
}
#ifdef VM_LITTLE_ENDIAN
// The following Base64 decode intrinsic is based on an algorithm outlined
// here:
// http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html
// in the section titled "Vector lookup (pshufb with bitmask)"
//
// This implementation differs in the following ways:
// * Power AltiVec VMX and VSX instructions are used instead of Intel SSE
// instructions. It turns out that some of the vector operations
// needed in the algorithm require fewer AltiVec instructions.
// * The algorithm in the above mentioned paper doesn't handle the
// Base64-URL variant in RFC 4648. Adjustments to both the code and to two
// lookup tables are needed for this.
// * The "Pack" section of the code is a complete rewrite for Power because we
// can utilize better instructions for this step.
//
// Offsets per group of Base64 characters
// Uppercase
#define UC (signed char)((-'A' + 0) & 0xff)
// Lowercase
#define LC (signed char)((-'a' + 26) & 0xff)
// Digits
#define DIG (signed char)((-'0' + 52) & 0xff)
// Plus sign (URL = 0)
#define PLS (signed char)((-'+' + 62) & 0xff)
// Hyphen (URL = 1)
#define HYP (signed char)((-'-' + 62) & 0xff)
// Slash (URL = 0)
#define SLS (signed char)((-'/' + 63) & 0xff)
// Underscore (URL = 1)
#define US (signed char)((-'_' + 63) & 0xff)
// For P10 (or later) only
#define VALID_B64 0x80
#define VB64(x) (VALID_B64 | x)
#define BLK_OFFSETOF(x) (offsetof(constant_block, x))
// In little-endian mode, the lxv instruction loads the element at EA into
// element 15 of the vector register, EA+1 goes into element 14, and so
// on.
//
// To make a look-up table easier to read, ARRAY_TO_LXV_ORDER reverses the
// order of the elements in a vector initialization.
#define ARRAY_TO_LXV_ORDER(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15) e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0
//
// Base64 decodeBlock intrinsic
address generate_base64_decodeBlock() {
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
typedef struct {
signed char offsetLUT_val[16];
signed char offsetLUT_URL_val[16];
unsigned char maskLUT_val[16];
unsigned char maskLUT_URL_val[16];
unsigned char bitposLUT_val[16];
unsigned char table_32_47_val[16];
unsigned char table_32_47_URL_val[16];
unsigned char table_48_63_val[16];
unsigned char table_64_79_val[16];
unsigned char table_80_95_val[16];
unsigned char table_80_95_URL_val[16];
unsigned char table_96_111_val[16];
unsigned char table_112_127_val[16];
unsigned char pack_lshift_val[16];
unsigned char pack_rshift_val[16];
unsigned char pack_permute_val[16];
} constant_block;
alignas(16) static const constant_block const_block = {
.offsetLUT_val = {
ARRAY_TO_LXV_ORDER(
0, 0, PLS, DIG, UC, UC, LC, LC,
0, 0, 0, 0, 0, 0, 0, 0 ) },
.offsetLUT_URL_val = {
ARRAY_TO_LXV_ORDER(
0, 0, HYP, DIG, UC, UC, LC, LC,
0, 0, 0, 0, 0, 0, 0, 0 ) },
.maskLUT_val = {
ARRAY_TO_LXV_ORDER(
/* 0 */ (unsigned char)0b10101000,
/* 1 .. 9 */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
(unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
(unsigned char)0b11111000,
/* 10 */ (unsigned char)0b11110000,
/* 11 */ (unsigned char)0b01010100,
/* 12 .. 14 */ (unsigned char)0b01010000, (unsigned char)0b01010000, (unsigned char)0b01010000,
/* 15 */ (unsigned char)0b01010100 ) },
.maskLUT_URL_val = {
ARRAY_TO_LXV_ORDER(
/* 0 */ (unsigned char)0b10101000,
/* 1 .. 9 */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
(unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
(unsigned char)0b11111000,
/* 10 */ (unsigned char)0b11110000,
/* 11 .. 12 */ (unsigned char)0b01010000, (unsigned char)0b01010000,
/* 13 */ (unsigned char)0b01010100,
/* 14 */ (unsigned char)0b01010000,
/* 15 */ (unsigned char)0b01110000 ) },
.bitposLUT_val = {
ARRAY_TO_LXV_ORDER(
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (unsigned char)0x80,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 ) },
// In the following table_*_val constants, a 0 value means the
// character is not in the Base64 character set
.table_32_47_val = {
ARRAY_TO_LXV_ORDER (
/* space .. '*' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '+' = 62 */ VB64(62), /* ',' .. '.' = 0 */ 0, 0, 0, /* '/' = 63 */ VB64(63) ) },
.table_32_47_URL_val = {
ARRAY_TO_LXV_ORDER(
/* space .. ',' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '-' = 62 */ VB64(62), /* '.' .. '/' */ 0, 0 ) },
.table_48_63_val = {
ARRAY_TO_LXV_ORDER(
/* '0' .. '9' = 52 .. 61 */ VB64(52), VB64(53), VB64(54), VB64(55), VB64(56), VB64(57), VB64(58), VB64(59), VB64(60), VB64(61),
/* ':' .. '?' = 0 */ 0, 0, 0, 0, 0, 0 ) },
.table_64_79_val = {
ARRAY_TO_LXV_ORDER(
/* '@' = 0 */ 0, /* 'A' .. 'O' = 0 .. 14 */ VB64(0), VB64(1), VB64(2), VB64(3), VB64(4), VB64(5), VB64(6), VB64(7), VB64(8),
VB64(9), VB64(10), VB64(11), VB64(12), VB64(13), VB64(14) ) },
.table_80_95_val = {
ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22),
VB64(23), VB64(24), VB64(25), /* '[' .. '_' = 0 */ 0, 0, 0, 0, 0 ) },
.table_80_95_URL_val = {
ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22),
VB64(23), VB64(24), VB64(25), /* '[' .. '^' = 0 */ 0, 0, 0, 0, /* '_' = 63 */ VB64(63) ) },
.table_96_111_val = {
ARRAY_TO_LXV_ORDER(/* '`' = 0 */ 0, /* 'a' .. 'o' = 26 .. 40 */ VB64(26), VB64(27), VB64(28), VB64(29), VB64(30), VB64(31),
VB64(32), VB64(33), VB64(34), VB64(35), VB64(36), VB64(37), VB64(38), VB64(39), VB64(40) ) },
.table_112_127_val = {
ARRAY_TO_LXV_ORDER(/* 'p' .. 'z' = 41 .. 51 */ VB64(41), VB64(42), VB64(43), VB64(44), VB64(45), VB64(46), VB64(47), VB64(48),
VB64(49), VB64(50), VB64(51), /* '{' .. DEL = 0 */ 0, 0, 0, 0, 0 ) },
.pack_lshift_val = {
ARRAY_TO_LXV_ORDER(
0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2 ) },
.pack_rshift_val = {
ARRAY_TO_LXV_ORDER(
0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0 ) },
// The first 4 index values are "don't care" because
// we only use the first 12 bytes of the vector,
// which are decoded from 16 bytes of Base64 characters.
.pack_permute_val = {
ARRAY_TO_LXV_ORDER(
0, 0, 0, 0,
0, 1, 2,
4, 5, 6,
8, 9, 10,
12, 13, 14 ) }
};
const unsigned block_size = 16; // number of bytes to process in each pass through the loop
const unsigned block_size_shift = 4;
// According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
Register s = R3_ARG1; // source starting address of Base64 characters
Register sp = R4_ARG2; // source offset
Register sl = R5_ARG3; // source length = # of Base64 characters to be processed
Register d = R6_ARG4; // destination address
Register dp = R7_ARG5; // destination offset
Register isURL = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
Register isMIME = R9_ARG7; // boolean, if non-zero indicates use of RFC 2045 MIME encoding - not used
// Local variables
Register const_ptr = R9; // used for loading constants
Register tmp_reg = R10; // used for speeding up load_const_optimized()
// Re-use R9 and R10 to avoid using non-volatile registers (which would require save/restore)
Register out = R9; // moving out (destination) pointer
Register in = R10; // moving in (source) pointer
// Volatile VSRs are 0..13, 32..51 (VR0..VR19)
// VR Constants
VectorRegister vec_0s = VR0;
VectorRegister vec_4s = VR1;
VectorRegister vec_8s = VR2;
VectorRegister vec_special_case_char = VR3;
VectorRegister pack_rshift = VR4;
VectorRegister pack_lshift = VR5;
// VSR Constants
VectorSRegister offsetLUT = VSR0;
VectorSRegister maskLUT = VSR1;
VectorSRegister bitposLUT = VSR2;
VectorSRegister vec_0xfs = VSR3;
VectorSRegister vec_special_case_offset = VSR4;
VectorSRegister pack_permute = VSR5;
// P10 (or later) VSR lookup constants
VectorSRegister table_32_47 = VSR0;
VectorSRegister table_48_63 = VSR1;
VectorSRegister table_64_79 = VSR2;
VectorSRegister table_80_95 = VSR3;
VectorSRegister table_96_111 = VSR4;
VectorSRegister table_112_127 = VSR6;
// Data read in and later converted
VectorRegister input = VR6;
// Variable for testing Base64 validity
VectorRegister non_match = VR10;
// P9 VR Variables for lookup
VectorRegister higher_nibble = VR7;
VectorRegister eq_special_case_char = VR8;
VectorRegister offsets = VR9;
// P9 VSR lookup variables
VectorSRegister bit = VSR6;
VectorSRegister lower_nibble = VSR7;
VectorSRegister M = VSR8;
// P10 (or later) VSR lookup variables
VectorSRegister xlate_a = VSR7;
VectorSRegister xlate_b = VSR8;
// Variables for pack
// VR
VectorRegister l = VR7; // reuse higher_nibble's register
VectorRegister r = VR8; // reuse eq_special_case_char's register
VectorRegister gathered = VR10; // reuse non_match's register
Label not_URL, calculate_size, loop_start, loop_exit, return_zero;
// The upper 32 bits of the non-pointer parameter registers are not
// guaranteed to be zero, so mask off those upper bits.
__ clrldi(sp, sp, 32);
__ clrldi(sl, sl, 32);
// Don't handle the last 4 characters of the source, because this
// VSX-based algorithm doesn't handle padding characters. Also the
// vector code will always write 16 bytes of decoded data on each pass,
// but only the first 12 of those 16 bytes are valid data (16 base64
// characters become 12 bytes of binary data), so for this reason we
// need to subtract an additional 8 bytes from the source length, in
// order not to write past the end of the destination buffer. The
// result of this subtraction implies that a Java function in the
// Base64 class will be used to process at least the last 12 characters.
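// For example, with sp == 0 and sl == 100 source characters, sl becomes
// 100 - 12 = 88, giving CTR = 88 >> 4 = 5 passes that decode 5 * 16 = 80
// characters into 60 bytes; the remaining 20 characters (including any
// '=' padding) are left to the Java fallback.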
__ sub(sl, sl, sp);
__ subi(sl, sl, 12);
// Load CTR with the number of passes through the loop
// = sl >> block_size_shift. After the shift, if sl <= 0, there's too
// little data to be processed by this intrinsic.
__ srawi_(sl, sl, block_size_shift);
__ ble(CR0, return_zero);
__ mtctr(sl);
// Clear the upper 32 bits of the other two parameter registers.
__ clrldi(isURL, isURL, 32);
__ clrldi(dp, dp, 32);
// Load constant vec registers that need to be loaded from memory
__ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
__ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr);
__ lxv(pack_rshift->to_vsr(), BLK_OFFSETOF(pack_rshift_val), const_ptr);
__ lxv(pack_lshift->to_vsr(), BLK_OFFSETOF(pack_lshift_val), const_ptr);
__ lxv(pack_permute, BLK_OFFSETOF(pack_permute_val), const_ptr);
// Splat the constants that can use xxspltib
__ xxspltib(vec_0s->to_vsr(), 0);
__ xxspltib(vec_8s->to_vsr(), 8);
if (PowerArchitecturePPC64 >= 10) {
// Using VALID_B64 for the offsets effectively strips the upper bit
// of each byte that was selected from the table. Setting the upper
// bit gives us a way to distinguish the 6-bit value of 0 from an
// error code of 0, which will happen if the character is
// outside the range of the lookup, or is an illegal Base64
// character, such as %.
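// For example, assuming VALID_B64 is 0x80 (consistent with the
// "6-bit value + 0x80" and "all 0x80's" comments elsewhere in this
// function), a decoded 'A' becomes 0x80 | 0 == 0x80 rather than 0x00,
// so only genuinely invalid input bytes remain zero for the vcmpequb_
// check below; the later vaddubm of this same offsets vector wraps the
// value back into the 0..63 range.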
__ xxspltib(offsets->to_vsr(), VALID_B64);
__ lxv(table_48_63, BLK_OFFSETOF(table_48_63_val), const_ptr);
__ lxv(table_64_79, BLK_OFFSETOF(table_64_79_val), const_ptr);
__ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr);
__ lxv(table_96_111, BLK_OFFSETOF(table_96_111_val), const_ptr);
__ lxv(table_112_127, BLK_OFFSETOF(table_112_127_val), const_ptr);
} else {
__ xxspltib(vec_4s->to_vsr(), 4);
__ xxspltib(vec_0xfs, 0xf);
__ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr);
}
// The rest of the constants use different values depending on the
// setting of isURL
__ cmpwi(CR0, isURL, 0);
__ beq(CR0, not_URL);
// isURL != 0 (true)
if (PowerArchitecturePPC64 >= 10) {
__ lxv(table_32_47, BLK_OFFSETOF(table_32_47_URL_val), const_ptr);
__ lxv(table_80_95, BLK_OFFSETOF(table_80_95_URL_val), const_ptr);
} else {
__ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_URL_val), const_ptr);
__ lxv(maskLUT, BLK_OFFSETOF(maskLUT_URL_val), const_ptr);
__ xxspltib(vec_special_case_char->to_vsr(), '_');
__ xxspltib(vec_special_case_offset, (unsigned char)US);
}
__ b(calculate_size);
// isURL = 0 (false)
__ bind(not_URL);
if (PowerArchitecturePPC64 >= 10) {
__ lxv(table_32_47, BLK_OFFSETOF(table_32_47_val), const_ptr);
__ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr);
} else {
__ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_val), const_ptr);
__ lxv(maskLUT, BLK_OFFSETOF(maskLUT_val), const_ptr);
__ xxspltib(vec_special_case_char->to_vsr(), '/');
__ xxspltib(vec_special_case_offset, (unsigned char)SLS);
}
__ bind(calculate_size);
// out starts at d + dp
__ add(out, d, dp);
// in starts at s + sp
__ add(in, s, sp);
__ align(32);
__ bind(loop_start);
__ lxv(input->to_vsr(), 0, in); // offset=0
//
// Lookup
//
if (PowerArchitecturePPC64 >= 10) {
// Use xxpermx to do a lookup of each Base64 character in the
// input vector and translate it to a 6-bit value + 0x80.
// Characters which are not valid Base64 characters will result
// in a zero in the corresponding byte.
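// For example, assuming VB64(x) == (x | 0x80), the input byte 'A' (0x41)
// falls in the 64..95 range covered by the second xxpermx below, which
// selects table_64_79 entry 1 == VB64(0); the other two lookups produce
// 0 for that lane, so the xxlor combination leaves VB64(0) in place.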
//
// Note that due to align(32) call above, the xxpermx instructions do
// not require align_prefix() calls, since the final xxpermx
// prefix+opcode is at byte 24.
__ xxpermx(xlate_a, table_32_47, table_48_63, input->to_vsr(), 1); // offset=4
__ xxpermx(xlate_b, table_64_79, table_80_95, input->to_vsr(), 2); // offset=12
__ xxlor(xlate_b, xlate_a, xlate_b); // offset=20
__ xxpermx(xlate_a, table_96_111, table_112_127, input->to_vsr(), 3); // offset=24
__ xxlor(input->to_vsr(), xlate_a, xlate_b);
// Check for non-Base64 characters by comparing each byte to zero.
__ vcmpequb_(non_match, input, vec_0s);
} else {
// Isolate the upper 4 bits of each character by shifting it right 4 bits
__ vsrb(higher_nibble, input, vec_4s);
// Isolate the lower 4 bits by masking
__ xxland(lower_nibble, input->to_vsr(), vec_0xfs);
// Get the offset (the value to subtract from the byte) by using
// a lookup table indexed by the upper 4 bits of the character
__ xxperm(offsets->to_vsr(), offsetLUT, higher_nibble->to_vsr());
// Find out which elements are the special case character (isURL ? '_' : '/')
__ vcmpequb(eq_special_case_char, input, vec_special_case_char);
// For each character in the input which is a special case
// character, replace its offset with one that is special for that
// character.
__ xxsel(offsets->to_vsr(), offsets->to_vsr(), vec_special_case_offset, eq_special_case_char->to_vsr());
// Use the lower_nibble to select a mask "M" from the lookup table.
__ xxperm(M, maskLUT, lower_nibble);
// "bit" is used to isolate which of the bits in M is relevant.
__ xxperm(bit, bitposLUT, higher_nibble->to_vsr());
// Each element of non_match corresponds to one of the 16 input
// characters. Those elements that become 0x00 after the xxland
// instruction are invalid Base64 characters.
__ xxland(non_match->to_vsr(), M, bit);
// Compare each element to zero
//
__ vcmpequb_(non_match, non_match, vec_0s);
}
// vcmpequb_ sets the EQ bit of CR6 if no elements compare equal.
// Any element comparing equal to zero means there is an error in
// that element. Note that the comparison result register
// non_match is not referenced again. Only CR6-EQ matters.
__ bne_predict_not_taken(CR6, loop_exit);
// The Base64 characters had no errors, so add the offsets, which in
// the case of Power10 is a constant vector of all 0x80's (see earlier
// comment where the offsets register is loaded).
__ vaddubm(input, input, offsets);
// Pack
//
// In the tables below, b0, b1, .. b15 are the bytes of decoded
// binary data, the first line of each of the cells (except for
// the constants) uses the bit-field nomenclature from the
// above-linked paper, whereas the second line is more specific
// about which exact bits are present, and is constructed using the
// Power ISA 3.x document style, where:
//
// * The specifier after the colon depicts which bits are there.
// * The bit numbering is big endian style (bit 0 is the most
// significant).
// * || is a concatenate operator.
// * Strings of 0's are a field of zeros with the shown length, and
// likewise for strings of 1's.
// Note that only e12..e15 are shown here because the shifting
// and OR'ing pattern replicates for e8..e11, e4..7, and
// e0..e3.
//
// +======================+=================+======================+======================+=============+
// | Vector | e12 | e13 | e14 | e15 |
// | Element | | | | |
// +======================+=================+======================+======================+=============+
// | after vaddubm | 00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa |
// | | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
// +----------------------+-----------------+----------------------+----------------------+-------------+
// | pack_lshift | | << 6 | << 4 | << 2 |
// +----------------------+-----------------+----------------------+----------------------+-------------+
// | l after vslb | 00dddddd | cc000000 | bbbb0000 | aaaaaa00 |
// | | 00||b2:2..7 | b2:0..1||000000 | b1:0..3||0000 | b0:0..5||00 |
// +----------------------+-----------------+----------------------+----------------------+-------------+
// | l after vslo | cc000000 | bbbb0000 | aaaaaa00 | 00000000 |
// | | b2:0..1||000000 | b1:0..3||0000 | b0:0..5||00 | 00000000 |
// +----------------------+-----------------+----------------------+----------------------+-------------+
// | pack_rshift | | >> 2 | >> 4 | |
// +----------------------+-----------------+----------------------+----------------------+-------------+
// | r after vsrb | 00dddddd | 0000cccc | 000000bb | 00aaaaaa |
// | | 00||b2:2..7 | 0000||b1:4..7 | 000000||b0:6..7 | 00||b0:0..5 |
// +----------------------+-----------------+----------------------+----------------------+-------------+
// | gathered after xxlor | ccdddddd | bbbbcccc | aaaaaabb | 00aaaaaa |
// | | b2:0..7 | b1:0..7 | b0:0..7 | 00||b0:0..5 |
// +======================+=================+======================+======================+=============+
//
// Note: there is a typo in the above-linked paper that shows the result of the gathering process is:
// [ddddddcc|bbbbcccc|aaaaaabb]
// but should be:
// [ccdddddd|bbbbcccc|aaaaaabb]
//
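// Scalar equivalent of the pack steps below for one 4-character group,
// where a, b, c, d are the four 6-bit values and the results are 8-bit
// bytes (so the left shifts drop their high-order bits):
//   byte0 = (a << 2) | (b >> 4);
//   byte1 = (b << 4) | (c >> 2);
//   byte2 = (c << 6) | d;
// For example, "TWFu" gives (a, b, c, d) = (19, 22, 5, 46), which packs
// to 0x4D 0x61 0x6E, i.e. "Man".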
__ vslb(l, input, pack_lshift);
// vslo of vec_8s shifts the vector by one octet toward lower
// element numbers, discarding element 0. This means it actually
// shifts to the right (not left) according to the order of the
// table above.
__ vslo(l, l, vec_8s);
__ vsrb(r, input, pack_rshift);
__ xxlor(gathered->to_vsr(), l->to_vsr(), r->to_vsr());
// Final rearrangement of bytes into their correct positions.
// +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
// | Vector | e0 | e1 | e2 | e3 | e4 | e5 | e6 | e7 | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 |
// | Elements | | | | | | | | | | | | | | | | |
// +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
// | after xxlor | b11 | b10 | b9 | xx | b8 | b7 | b6 | xx | b5 | b4 | b3 | xx | b2 | b1 | b0 | xx |
// +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
// | pack_permute | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 4 | 5 | 6 | 8 | 9 | 10 | 12 | 13 | 14 |
// +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
// | after xxperm | b11* | b11* | b11* | b11* | b11 | b10 | b9 | b8 | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
// +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
// xx bytes are not used to form the final data
// b0..b15 are the decoded and reassembled 8-bit bytes of data
// b11 with asterisk is a "don't care", because these bytes will be
// overwritten on the next iteration.
__ xxperm(gathered->to_vsr(), gathered->to_vsr(), pack_permute);
// We cannot use a static displacement on the store, since out advances
// by 12 bytes per iteration and stxv requires a displacement that is a
// multiple of 16. Note that this stxv instruction actually
// writes 16 bytes, even though only the first 12 are valid data.
__ stxv(gathered->to_vsr(), 0, out);
__ addi(out, out, 12);
__ addi(in, in, 16);
__ bdnz(loop_start);
__ bind(loop_exit);
// Return the number of out bytes produced, which is (out - (d + dp)) == out - d - dp;
__ sub(R3_RET, out, d);
__ sub(R3_RET, R3_RET, dp);
__ blr();
__ bind(return_zero);
__ li(R3_RET, 0);
__ blr();
return start;
}
#undef UC
#undef LC
#undef DIG
#undef PLS
#undef HYP
#undef SLS
#undef US
// This algorithm is based on the methods described in this paper:
// http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html
//
// The details of this implementation vary from the paper due to the
// difference in the ISA between SSE and AltiVec, especially in the
// splitting bytes section where there is no need on Power to mask after
// the shift because the shift is byte-wise rather than across the entire
// 128-bit word.
//
// For the lookup part of the algorithm, different logic is used than
// described in the paper because of the availability of vperm, which can
// do a 64-byte table lookup in four instructions, while preserving the
// branchless nature.
//
// Description of the ENCODE_CORE macro
//
// Expand first 12 x 8-bit data bytes into 16 x 6-bit bytes (upper 2
// bits of each byte are zeros)
//
// (Note: e7..e0 are not shown because they follow the same pattern as
// e8..e15)
//
// In the table below, b0, b1, .. b15 are the bytes of unencoded
// binary data, the first line of each of the cells (except for
// the constants) uses the bit-field nomenclature from the
// above-linked paper, whereas the second line is more specific
// about which exact bits are present, and is constructed using the
// Power ISA 3.x document style, where:
//
// * The specifier after the colon depicts which bits are there.
// * The bit numbering is big endian style (bit 0 is the most
// significant).
// * || is a concatenate operator.
// * Strings of 0's are a field of zeros with the shown length, and
// likewise for strings of 1's.
//
// +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
// | Vector | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 |
// | Element | | | | | | | | |
// +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
// | after lxv | jjjjkkkk | iiiiiijj | gghhhhhh | ffffgggg | eeeeeeff | ccdddddd | bbbbcccc | aaaaaabb |
// | | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
// | xxperm indexes | 0 | 10 | 11 | 12 | 0 | 13 | 14 | 15 |
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
// | (1) after xxperm | | gghhhhhh | ffffgggg | eeeeeeff | | ccdddddd | bbbbcccc | aaaaaabb |
// | | (b15) | b5 | b4 | b3 | (b15) | b2 | b1 | b0 |
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
// | rshift_amount | 0 | 6 | 4 | 2 | 0 | 6 | 4 | 2 |
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
// | after vsrb | | 000000gg | 0000ffff | 00eeeeee | | 000000cc | 0000bbbb | 00aaaaaa |
// | | (b15) | 000000||b5:0..1 | 0000||b4:0..3 | 00||b3:0..5 | (b15) | 000000||b2:0..1 | 0000||b1:0..3 | 00||b0:0..5 |
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
// | rshift_mask | 00000000 | 000000||11 | 0000||1111 | 00||111111 | 00000000 | 000000||11 | 0000||1111 | 00||111111 |
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
// | rshift after vand | 00000000 | 000000gg | 0000ffff | 00eeeeee | 00000000 | 000000cc | 0000bbbb | 00aaaaaa |
// | | 00000000 | 000000||b5:0..1 | 0000||b4:0..3 | 00||b3:0..5 | 00000000 | 000000||b2:0..1 | 0000||b1:0..3 | 00||b0:0..5 |
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
// | 1 octet lshift (1) | gghhhhhh | ffffgggg | eeeeeeff | | ccdddddd | bbbbcccc | aaaaaabb | 00000000 |
// | | b5 | b4 | b3 | (b15) | b2 | b1 | b0 | 00000000 |
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
// | lshift_amount | 0 | 2 | 4 | 0 | 0 | 2 | 4 | 0 |
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
// | after vslb | gghhhhhh | ffgggg00 | eeff0000 | | ccdddddd | bbcccc00 | aabb0000 | 00000000 |
// | | b5 | b4:2..7||00 | b3:4..7||0000 | (b15) | b2:0..7 | b1:2..7||00 | b0:4..7||0000 | 00000000 |
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
// | lshift_mask | 00||111111 | 00||1111||00 | 00||11||0000 | 00000000 | 00||111111 | 00||1111||00 | 00||11||0000 | 00000000 |
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
// | lshift after vand | 00hhhhhh | 00gggg00 | 00ff0000 | 00000000 | 00dddddd | 00cccc00 | 00bb0000 | 00000000 |
// | | 00||b5:2..7 | 00||b4:4..7||00 | 00||b3:6..7||0000 | 00000000 | 00||b2:2..7 | 00||b1:4..7||00 | 00||b0:6..7||0000 | 00000000 |
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
// | after vor lshift, rshift | 00hhhhhh | 00gggggg | 00ffffff | 00eeeeee | 00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa |
// | | 00||b5:2..7 | 00||b4:4..7||b5:0..1 | 00||b3:6..7||b4:0..3 | 00||b3:0..5 | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
// +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
//
// Expand the first 12 bytes into 16 bytes, leaving every 4th byte
// blank for now.
// __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute);
//
// Generate two bit-shifted pieces - rshift and lshift - that will
// later be OR'd together.
//
// First the right-shifted piece
// __ vsrb(rshift, input, expand_rshift);
// __ vand(rshift, rshift, expand_rshift_mask);
//
// Now the left-shifted piece, which is done by octet shifting
// the input one byte to the left, then doing a variable shift,
// followed by a mask operation.
//
// __ vslo(lshift, input, vec_8s);
// __ vslb(lshift, lshift, expand_lshift);
// __ vand(lshift, lshift, expand_lshift_mask);
//
// Combine the two pieces by OR'ing
// __ vor(expanded, rshift, lshift);
//
// At this point, expanded is a vector containing a 6-bit value in each
// byte. These values are used as indexes into a 64-byte lookup table that
// is contained in four vector registers. The lookup operation is done
// using vperm instructions with the same indexes for the lower 32 and
// upper 32 bytes. To figure out which of the two looked-up bytes to use
// at each location, all values in expanded are compared to 31. Using
// vsel, values higher than 31 use the results from the upper 32 bytes of
// the lookup operation, while values less than or equal to 31 use the
// lower 32 bytes of the lookup operation.
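// For example, the 6-bit value 40 must encode to 'o': both vperm results
// use index 40 mod 32 == 8, so encoded_00_31 holds 'I' and encoded_32_63
// holds 'o' for that lane, and since 40 > 31 the vsel keeps the byte from
// encoded_32_63.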
//
// Note: it's tempting to use a xxpermx,xxpermx,vor sequence here on
// Power10 (or later), but experiments doing so on Power10 yielded a slight
// performance drop, perhaps due to the need for xxpermx instruction
// prefixes.
#define ENCODE_CORE \
__ xxperm(input->to_vsr(), input->to_vsr(), expand_permute); \
__ vsrb(rshift, input, expand_rshift); \
__ vand(rshift, rshift, expand_rshift_mask); \
__ vslo(lshift, input, vec_8s); \
__ vslb(lshift, lshift, expand_lshift); \
__ vand(lshift, lshift, expand_lshift_mask); \
__ vor(expanded, rshift, lshift); \
__ vperm(encoded_00_31, vec_base64_00_15, vec_base64_16_31, expanded); \
__ vperm(encoded_32_63, vec_base64_32_47, vec_base64_48_63, expanded); \
__ vcmpgtub(gt_31, expanded, vec_31s); \
__ vsel(expanded, encoded_00_31, encoded_32_63, gt_31);
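// Scalar equivalent of ENCODE_CORE for one 3-byte group, where b0, b1, b2
// are the input bytes and each e* is a 6-bit index into the 64-entry
// alphabet held in vec_base64_00_15 .. vec_base64_48_63:
//   e0 = b0 >> 2;
//   e1 = ((b0 & 0x3) << 4) | (b1 >> 4);
//   e2 = ((b1 & 0xf) << 2) | (b2 >> 6);
//   e3 = b2 & 0x3f;
// For example, "Man" (0x4D 0x61 0x6E) maps to (19, 22, 5, 46), which
// encodes to "TWFu".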
// Intrinsic function prototype in Base64.java:
// private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) {
address generate_base64_encodeBlock() {
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
StubCodeMark mark(this, stub_id);
address start = __ function_entry();
typedef struct {
unsigned char expand_permute_val[16];
unsigned char expand_rshift_val[16];
unsigned char expand_rshift_mask_val[16];
unsigned char expand_lshift_val[16];
unsigned char expand_lshift_mask_val[16];
unsigned char base64_00_15_val[16];
unsigned char base64_16_31_val[16];
unsigned char base64_32_47_val[16];
unsigned char base64_48_63_val[16];
unsigned char base64_48_63_URL_val[16];
} constant_block;
alignas(16) static const constant_block const_block = {
.expand_permute_val = {
ARRAY_TO_LXV_ORDER(
0, 4, 5, 6,
0, 7, 8, 9,
0, 10, 11, 12,
0, 13, 14, 15 ) },
.expand_rshift_val = {
ARRAY_TO_LXV_ORDER(
0, 6, 4, 2,
0, 6, 4, 2,
0, 6, 4, 2,
0, 6, 4, 2 ) },
.expand_rshift_mask_val = {
ARRAY_TO_LXV_ORDER(
0b00000000, 0b00000011, 0b00001111, 0b00111111,
0b00000000, 0b00000011, 0b00001111, 0b00111111,
0b00000000, 0b00000011, 0b00001111, 0b00111111,
0b00000000, 0b00000011, 0b00001111, 0b00111111 ) },
.expand_lshift_val = {
ARRAY_TO_LXV_ORDER(
0, 2, 4, 0,
0, 2, 4, 0,
0, 2, 4, 0,
0, 2, 4, 0 ) },
.expand_lshift_mask_val = {
ARRAY_TO_LXV_ORDER(
0b00111111, 0b00111100, 0b00110000, 0b00000000,
0b00111111, 0b00111100, 0b00110000, 0b00000000,
0b00111111, 0b00111100, 0b00110000, 0b00000000,
0b00111111, 0b00111100, 0b00110000, 0b00000000 ) },
.base64_00_15_val = {
ARRAY_TO_LXV_ORDER(
'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P' ) },
.base64_16_31_val = {
ARRAY_TO_LXV_ORDER(
'Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f' ) },
.base64_32_47_val = {
ARRAY_TO_LXV_ORDER(
'g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v' ) },
.base64_48_63_val = {
ARRAY_TO_LXV_ORDER(
'w','x','y','z','0','1','2','3','4','5','6','7','8','9','+','/' ) },
.base64_48_63_URL_val = {
ARRAY_TO_LXV_ORDER(
'w','x','y','z','0','1','2','3','4','5','6','7','8','9','-','_' ) }
};
// Number of bytes to process in each pass through the main loop.
// 12 of the 16 bytes from each lxv are encoded to 16 Base64 bytes.
const unsigned block_size = 12;
// According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
Register src = R3_ARG1; // source starting address of the bytes to be encoded
Register sp = R4_ARG2; // source starting position
Register sl = R5_ARG3; // total number of source bytes to be processed
Register dst = R6_ARG4; // destination address
Register dp = R7_ARG5; // destination starting position
Register isURL = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
// Local variables
Register const_ptr = R12; // used for loading constants
Register tmp_reg = R9; // used for speeding up load_const_optimized()
Register size = R9; // number of bytes to process (reuses tmp_reg's register)
Register blocked_size = R10; // number of bytes to process a block at a time
Register block_modulo = R12; // == block_size (reuse const_ptr)
Register remaining = R12; // bytes remaining to process after the blocks are completed (reuse block_modulo's reg)
Register in = R4; // current input (source) pointer (reuse sp's register)
Register num_blocks = R11; // number of blocks to be processed by the loop
Register out = R8; // current output (destination) pointer (reuse isURL's register)
Register three = R9; // constant divisor (reuse size's register)
Register bytes_to_write = R10; // number of bytes to write with the stxvl instr (reuse blocked_size's register)
Register tmp1 = R7; // temp register for lxvl length (reuse dp's register)
Register modulo_chars = R7; // number of bytes written during the final write % 4 (reuse tmp1's register)
Register pad_char = R6; // literal '=' (reuse dst's register)
// Volatile VSRs are 0..13, 32..51 (VR0..VR19)
// VR Constants
VectorRegister vec_8s = VR0;
VectorRegister vec_31s = VR1;
VectorRegister vec_base64_00_15 = VR2;
VectorRegister vec_base64_16_31 = VR3;
VectorRegister vec_base64_32_47 = VR4;
VectorRegister vec_base64_48_63 = VR5;
VectorRegister expand_rshift = VR6;
VectorRegister expand_rshift_mask = VR7;
VectorRegister expand_lshift = VR8;
VectorRegister expand_lshift_mask = VR9;
// VR variables for expand
VectorRegister input = VR10;
VectorRegister rshift = VR11;
VectorRegister lshift = VR12;
VectorRegister expanded = VR13;
// VR variables for lookup
VectorRegister encoded_00_31 = VR10; // (reuse input)
VectorRegister encoded_32_63 = VR11; // (reuse rshift)
VectorRegister gt_31 = VR12; // (reuse lshift)
// VSR Constants
VectorSRegister expand_permute = VSR0;
Label not_URL, calculate_size, calculate_blocked_size, skip_loop;
Label loop_start, le_16_to_write, no_pad, one_pad_char;
// The upper 32 bits of the non-pointer parameter registers are not
// guaranteed to be zero, so mask off those upper bits.
__ clrldi(sp, sp, 32);
__ clrldi(sl, sl, 32);
__ clrldi(dp, dp, 32);
__ clrldi(isURL, isURL, 32);
// load up the constants
__ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
__ lxv(expand_permute, BLK_OFFSETOF(expand_permute_val), const_ptr);
__ lxv(expand_rshift->to_vsr(), BLK_OFFSETOF(expand_rshift_val), const_ptr);
__ lxv(expand_rshift_mask->to_vsr(), BLK_OFFSETOF(expand_rshift_mask_val), const_ptr);
__ lxv(expand_lshift->to_vsr(), BLK_OFFSETOF(expand_lshift_val), const_ptr);
__ lxv(expand_lshift_mask->to_vsr(), BLK_OFFSETOF(expand_lshift_mask_val), const_ptr);
__ lxv(vec_base64_00_15->to_vsr(), BLK_OFFSETOF(base64_00_15_val), const_ptr);
__ lxv(vec_base64_16_31->to_vsr(), BLK_OFFSETOF(base64_16_31_val), const_ptr);
__ lxv(vec_base64_32_47->to_vsr(), BLK_OFFSETOF(base64_32_47_val), const_ptr);
// Splat the constants that can use xxspltib
__ xxspltib(vec_8s->to_vsr(), 8);
__ xxspltib(vec_31s->to_vsr(), 31);
// Use a different translation lookup table depending on the
// setting of isURL
__ cmpdi(CR0, isURL, 0);
__ beq(CR0, not_URL);
__ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_URL_val), const_ptr);
__ b(calculate_size);
__ bind(not_URL);
__ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_val), const_ptr);
__ bind(calculate_size);
// size = sl - sp - 4 (*)
// (*) Don't process the last four bytes in the main loop because
// we don't want the lxv instruction to read past the end of the src
// data, in case those four bytes are at the start of an unmapped or
// otherwise inaccessible page.
//
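// For example, with sl - sp == 28 source bytes: size = 24, so the blocked
// path is taken with num_blocks = 2 and blocked_size = 24; remaining
// becomes 0 + 4 = 4, and the tail code below encodes those last 4 bytes
// with lxvl/stxvl plus two '=' pad characters.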
__ sub(size, sl, sp);
__ subi(size, size, 4);
__ cmpdi(CR7, size, block_size);
__ bgt(CR7, calculate_blocked_size);
__ mr(remaining, size);
// Add the 4 back into remaining again
__ addi(remaining, remaining, 4);
// make "in" point to the beginning of the source data: in = src + sp
__ add(in, src, sp);
// out = dst + dp
__ add(out, dst, dp);
__ b(skip_loop);
__ bind(calculate_blocked_size);
__ li(block_modulo, block_size);
// num_blocks = size / block_modulo
__ divwu(num_blocks, size, block_modulo);
// blocked_size = num_blocks * block_modulo
__ mullw(blocked_size, num_blocks, block_modulo);
// remaining = size - blocked_size
__ sub(remaining, size, blocked_size);
__ mtctr(num_blocks);
// Add the 4 back in to remaining again
__ addi(remaining, remaining, 4);
// make "in" point to the beginning of the source data: in = src + sp
__ add(in, src, sp);
// out = dst + dp
__ add(out, dst, dp);
__ align(32);
__ bind(loop_start);
__ lxv(input->to_vsr(), 0, in);
ENCODE_CORE
__ stxv(expanded->to_vsr(), 0, out);
__ addi(in, in, 12);
__ addi(out, out, 16);
__ bdnz(loop_start);
__ bind(skip_loop);
// When there are fewer than 16 bytes left, we need to be careful not to
// read beyond the end of the src buffer, which might be in an unmapped
// page.
// Load the remaining bytes using lxvl.
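// lxvl (and stxvl below) take the byte count from the most-significant
// byte of the length register, so rotate "remaining" left by 56 bits to
// place its low-order byte there.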
__ rldicr(tmp1, remaining, 56, 7);
__ lxvl(input->to_vsr(), in, tmp1);
ENCODE_CORE
// bytes_to_write = ((remaining * 4) + 2) / 3
__ li(three, 3);
__ rlwinm(bytes_to_write, remaining, 2, 0, 29); // remaining * 4
__ addi(bytes_to_write, bytes_to_write, 2);
__ divwu(bytes_to_write, bytes_to_write, three);
__ cmpwi(CR7, bytes_to_write, 16);
__ ble_predict_taken(CR7, le_16_to_write);
__ stxv(expanded->to_vsr(), 0, out);
// We've processed 12 of the 13-15 data bytes, so advance the pointers,
// and do one final pass for the remaining 1-3 bytes.
__ addi(in, in, 12);
__ addi(out, out, 16);
__ subi(remaining, remaining, 12);
__ subi(bytes_to_write, bytes_to_write, 16);
__ rldicr(tmp1, bytes_to_write, 56, 7);
__ lxvl(input->to_vsr(), in, tmp1);
ENCODE_CORE
__ bind(le_16_to_write);
// shift bytes_to_write into the upper 8 bits of tmp1 for use by stxvl
__ rldicr(tmp1, bytes_to_write, 56, 7);
__ stxvl(expanded->to_vsr(), out, tmp1);
__ add(out, out, bytes_to_write);
__ li(pad_char, '=');
__ rlwinm_(modulo_chars, bytes_to_write, 0, 30, 31); // bytes_to_write % 4, set CR0
// Examples:
// remaining bytes_to_write modulo_chars num pad chars
// 0 0 0 0
// 1 2 2 2
// 2 3 3 1
// 3 4 0 0
// 4 6 2 2
// 5 7 3 1
// ...
// 12 16 0 0
// 13 18 2 2
// 14 19 3 1
// 15 20 0 0
__ beq(CR0, no_pad);
__ cmpwi(CR7, modulo_chars, 3);
__ beq(CR7, one_pad_char);
// two pad chars
__ stb(pad_char, out);
__ addi(out, out, 1);
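// fall through to one_pad_char to write the second '='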
__ bind(one_pad_char);
__ stb(pad_char, out);
__ bind(no_pad);
__ blr();
return start;
}
#endif // VM_LITTLE_ENDIAN
void generate_lookup_secondary_supers_table_stub() {
StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
StubCodeMark mark(this, stub_id);
const Register
r_super_klass = R4_ARG2,
r_array_base = R3_ARG1,
r_array_length = R7_ARG5,
r_array_index = R6_ARG4,
r_sub_klass = R5_ARG3,
r_bitmap = R11_scratch1,
result = R8_ARG6;
for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
__ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
r_array_base, r_array_length, r_array_index,
r_bitmap, result, slot);
__ blr();
}
}
// Slow path implementation for UseSecondarySupersTable.
address generate_lookup_secondary_supers_table_slow_path_stub() {
StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
const Register
r_super_klass = R4_ARG2,
r_array_base = R3_ARG1,
temp1 = R7_ARG5,
r_array_index = R6_ARG4,
r_bitmap = R11_scratch1,
result = R8_ARG6;
__ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
__ blr();
return start;
}
address generate_cont_thaw(StubId stub_id) {
if (!Continuations::enabled()) return nullptr;
Continuation::thaw_kind kind;
bool return_barrier;
bool return_barrier_exception;
switch (stub_id) {
case StubId::stubgen_cont_thaw_id:
kind = Continuation::thaw_top;
return_barrier = false;
return_barrier_exception = false;
break;
case StubId::stubgen_cont_returnBarrier_id:
kind = Continuation::thaw_return_barrier;
return_barrier = true;
return_barrier_exception = false;
break;
case StubId::stubgen_cont_returnBarrierExc_id:
kind = Continuation::thaw_return_barrier_exception;
return_barrier = true;
return_barrier_exception = true;
break;
default:
ShouldNotReachHere();
}
StubCodeMark mark(this, stub_id);
Register tmp1 = R10_ARG8;
Register tmp2 = R9_ARG7;
Register tmp3 = R8_ARG6;
Register nvtmp = R15_esp; // nonvolatile tmp register
FloatRegister nvftmp = F20; // nonvolatile fp tmp register
address start = __ pc();
if (kind == Continuation::thaw_top) {
__ clobber_nonvolatile_registers(); // Except R16_thread and R29_TOC
}
if (return_barrier) {
__ mr(nvtmp, R3_RET); __ fmr(nvftmp, F1_RET); // preserve possible return value from a method returning to the return barrier
DEBUG_ONLY(__ ld_ptr(tmp1, _abi0(callers_sp), R1_SP);)
__ ld_ptr(R1_SP, JavaThread::cont_entry_offset(), R16_thread);
#ifdef ASSERT
__ ld_ptr(tmp2, _abi0(callers_sp), R1_SP);
__ cmpd(CR0, tmp1, tmp2);
__ asm_assert_eq(FILE_AND_LINE ": callers sp is corrupt");
#endif
}
#ifdef ASSERT
__ ld_ptr(tmp1, JavaThread::cont_entry_offset(), R16_thread);
__ cmpd(CR0, R1_SP, tmp1);
__ asm_assert_eq(FILE_AND_LINE ": incorrect R1_SP");
#endif
__ li(R4_ARG2, return_barrier ? 1 : 0);
__ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), R16_thread, R4_ARG2);
#ifdef ASSERT
DEBUG_ONLY(__ ld_ptr(tmp1, JavaThread::cont_entry_offset(), R16_thread));
DEBUG_ONLY(__ cmpd(CR0, R1_SP, tmp1));
__ asm_assert_eq(FILE_AND_LINE ": incorrect R1_SP");
#endif
// R3_RET contains the size of the frames to thaw, 0 if overflow or no more frames
Label thaw_success;
__ cmpdi(CR0, R3_RET, 0);
__ bne(CR0, thaw_success);
__ load_const_optimized(tmp1, (SharedRuntime::throw_StackOverflowError_entry()), R0);
__ mtctr(tmp1); __ bctr();
__ bind(thaw_success);
__ addi(R3_RET, R3_RET, frame::native_abi_reg_args_size); // Large abi required for C++ calls.
__ neg(R3_RET, R3_RET);
// align down resulting in a smaller negative offset
__ clrrdi(R3_RET, R3_RET, exact_log2(frame::alignment_in_bytes));
DEBUG_ONLY(__ mr(tmp1, R1_SP);)
__ resize_frame(R3_RET, tmp2); // make room for the thawed frames
__ li(R4_ARG2, kind);
__ call_VM_leaf(Continuation::thaw_entry(), R16_thread, R4_ARG2);
__ mr(R1_SP, R3_RET); // R3_RET contains the SP of the thawed top frame
if (return_barrier) {
// we're now in the caller of the frame that returned to the barrier
__ mr(R3_RET, nvtmp); __ fmr(F1_RET, nvftmp); // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
} else {
// we're now on the yield frame (which is at an address above us because the SP has been pushed down)
__ li(R3_RET, 0); // return 0 (success) from doYield
}
if (return_barrier_exception) {
Register ex_pc = R17_tos; // nonvolatile register
__ ld(ex_pc, _abi0(lr), R1_SP); // LR
__ mr(nvtmp, R3_RET); // save return value containing the exception oop
// The thawed top frame has a frame::java_abi. This is not sufficient for the runtime call.
__ push_frame_reg_args(0, tmp1);
__ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), R16_thread, ex_pc);
__ mtlr(R3_RET); // the exception handler
__ pop_frame();
// See OptoRuntime::generate_exception_blob for register arguments
__ mr(R3_ARG1, nvtmp); // exception oop
__ mr(R4_ARG2, ex_pc); // exception pc
} else {
// We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
__ ld(R0, _abi0(lr), R1_SP); // LR
__ mtlr(R0);
}
__ blr();
return start;
}
address generate_cont_thaw() {
return generate_cont_thaw(StubId::stubgen_cont_thaw_id);
}
// TODO: will probably need multiple return barriers depending on return type
address generate_cont_returnBarrier() {
return generate_cont_thaw(StubId::stubgen_cont_returnBarrier_id);
}
address generate_cont_returnBarrier_exception() {
return generate_cont_thaw(StubId::stubgen_cont_returnBarrierExc_id);
}
address generate_cont_preempt_stub() {
if (!Continuations::enabled()) return nullptr;
StubId stub_id = StubId::stubgen_cont_preempt_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
__ clobber_nonvolatile_registers(); // Except R16_thread and R29_TOC
__ reset_last_Java_frame(false /*check_last_java_sp*/);
// Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
__ ld_ptr(R1_SP, JavaThread::cont_entry_offset(), R16_thread);
Label preemption_cancelled;
__ lbz(R11_scratch1, in_bytes(JavaThread::preemption_cancelled_offset()), R16_thread);
__ cmpwi(CR0, R11_scratch1, 0);
__ bne(CR0, preemption_cancelled);
// Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
SharedRuntime::continuation_enter_cleanup(_masm);
__ pop_frame();
__ restore_LR(R11_scratch1);
__ blr();
// We acquired the monitor after freezing the frames so call thaw to continue execution.
__ bind(preemption_cancelled);
__ li(R11_scratch1, 0); // false
__ stb(R11_scratch1, in_bytes(JavaThread::preemption_cancelled_offset()), R16_thread);
int simm16_offs = __ load_const_optimized(R11_scratch1, ContinuationEntry::thaw_call_pc_address(), R0, true);
__ ld(R11_scratch1, simm16_offs, R11_scratch1);
__ mtctr(R11_scratch1);
__ bctr();
return start;
}
// exception handler for upcall stubs
address generate_upcall_stub_exception_handler() {
StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
// Native caller has no idea how to handle exceptions,
// so we just crash here. Up to callee to catch exceptions.
__ verify_oop(R3_ARG1);
__ load_const_optimized(R12_scratch2, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception), R0);
__ call_c(R12_scratch2);
__ should_not_reach_here();
return start;
}
// load Method* target of MethodHandle
// R3_ARG1 = jobject receiver
// R19_method = result Method*
address generate_upcall_stub_load_target() {
StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
__ resolve_global_jobject(R3_ARG1, R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS);
// Load target method from receiver
__ load_heap_oop(R19_method, java_lang_invoke_MethodHandle::form_offset(), R3_ARG1,
R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
__ load_heap_oop(R19_method, java_lang_invoke_LambdaForm::vmentry_offset(), R19_method,
R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
__ load_heap_oop(R19_method, java_lang_invoke_MemberName::method_offset(), R19_method,
R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
__ ld(R19_method, java_lang_invoke_ResolvedMethodName::vmtarget_offset(), R19_method);
__ std(R19_method, in_bytes(JavaThread::callee_target_offset()), R16_thread); // just in case callee is deoptimized
__ blr();
return start;
}
// Initialization
void generate_preuniverse_stubs() {
// preuniverse stubs are not needed for ppc
}
void generate_initial_stubs() {
// Generates all stubs and initializes the entry points
// Entry points that exist in all platforms.
// Note: This is code that could be shared among different platforms - however the
// benefit seems to be smaller than the disadvantage of having a
// much more complicated generator structure. See also comment in
// stubRoutines.hpp.
StubRoutines::_forward_exception_entry = generate_forward_exception();
StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
StubRoutines::_catch_exception_entry = generate_catch_exception();
if (UnsafeMemoryAccess::_table == nullptr) {
UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
}
// CRC32 Intrinsics.
if (UseCRC32Intrinsics) {
StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes(StubId::stubgen_updateBytesCRC32_id);
}
// CRC32C Intrinsics.
if (UseCRC32CIntrinsics) {
StubRoutines::_updateBytesCRC32C = generate_CRC32_updateBytes(StubId::stubgen_updateBytesCRC32C_id);
}
if (VM_Version::supports_float16()) {
// For results consistency both intrinsics should be enabled.
StubRoutines::_hf2f = generate_float16ToFloat();
StubRoutines::_f2hf = generate_floatToFloat16();
}
}
void generate_continuation_stubs() {
// Continuation stubs:
StubRoutines::_cont_thaw = generate_cont_thaw();
StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
}
void generate_final_stubs() {
// Generates all stubs and initializes the entry points
// support for verify_oop (must happen after universe_init)
StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
// nmethod entry barriers for concurrent class unloading
StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
// arraycopy stubs used by compilers
generate_arraycopy_stubs();
#ifdef COMPILER2
if (UseSecondarySupersTable) {
StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
if (!InlineSecondarySupersTest) {
generate_lookup_secondary_supers_table_stub();
}
}
#endif // COMPILER2
StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
}
void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI
#ifdef COMPILER2
if (UseMultiplyToLenIntrinsic) {
StubRoutines::_multiplyToLen = generate_multiplyToLen();
}
if (UseSquareToLenIntrinsic) {
StubRoutines::_squareToLen = generate_squareToLen();
}
if (UseMulAddIntrinsic) {
StubRoutines::_mulAdd = generate_mulAdd();
}
if (UseMontgomeryMultiplyIntrinsic) {
StubRoutines::_montgomeryMultiply
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
}
if (UseMontgomerySquareIntrinsic) {
StubRoutines::_montgomerySquare
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
}
#endif
// data cache line writeback
if (VM_Version::supports_data_cache_line_flush()) {
StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
}
if (UseGHASHIntrinsics) {
StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
}
if (UseAESIntrinsics) {
StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
}
if (UseSHA256Intrinsics) {
StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
}
if (UseSHA512Intrinsics) {
StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
}
#ifdef VM_LITTLE_ENDIAN
// Currently supported on PPC64LE only
if (UseBASE64Intrinsics) {
StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
}
#endif
#endif // COMPILER2_OR_JVMCI
}
public:
StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
switch(blob_id) {
case BlobId::stubgen_preuniverse_id:
generate_preuniverse_stubs();
break;
case BlobId::stubgen_initial_id:
generate_initial_stubs();
break;
case BlobId::stubgen_continuation_id:
generate_continuation_stubs();
break;
case BlobId::stubgen_compiler_id:
generate_compiler_stubs();
break;
case BlobId::stubgen_final_id:
generate_final_stubs();
break;
default:
fatal("unexpected blob id: %s", StubInfo::name(blob_id));
break;
};
}
};
void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
StubGenerator g(code, blob_id);
}