mirror of
https://github.com/openjdk/jdk.git
synced 2026-05-31 07:48:49 +00:00
8256431: [PPC64] Implement Base64 encodeBlock() for Power64-LE
Reviewed-by: mdoerr
This commit is contained in:
parent
172af1524d
commit
0849117d5c
@ -264,6 +264,7 @@ class Assembler : public AbstractAssembler {
|
||||
SUBFME_OPCODE = (31u << OPCODE_SHIFT | 232u << 1),
|
||||
SUBFZE_OPCODE = (31u << OPCODE_SHIFT | 200u << 1),
|
||||
DIVW_OPCODE = (31u << OPCODE_SHIFT | 491u << 1),
|
||||
DIVWU_OPCODE = (31u << OPCODE_SHIFT | 459u << 1),
|
||||
MULLW_OPCODE = (31u << OPCODE_SHIFT | 235u << 1),
|
||||
MULHW_OPCODE = (31u << OPCODE_SHIFT | 75u << 1),
|
||||
MULHWU_OPCODE = (31u << OPCODE_SHIFT | 11u << 1),
|
||||
@ -524,10 +525,13 @@ class Assembler : public AbstractAssembler {
|
||||
|
||||
// Vector-Scalar (VSX) instruction support.
|
||||
LXV_OPCODE = (61u << OPCODE_SHIFT | 1u ),
|
||||
LXVL_OPCODE = (31u << OPCODE_SHIFT | 269u << 1),
|
||||
STXV_OPCODE = (61u << OPCODE_SHIFT | 5u ),
|
||||
STXVL_OPCODE = (31u << OPCODE_SHIFT | 397u << 1),
|
||||
LXVD2X_OPCODE = (31u << OPCODE_SHIFT | 844u << 1),
|
||||
STXVD2X_OPCODE = (31u << OPCODE_SHIFT | 972u << 1),
|
||||
MTVSRD_OPCODE = (31u << OPCODE_SHIFT | 179u << 1),
|
||||
MTVSRDD_OPCODE = (31u << OPCODE_SHIFT | 435u << 1),
|
||||
MTVSRWZ_OPCODE = (31u << OPCODE_SHIFT | 243u << 1),
|
||||
MFVSRD_OPCODE = (31u << OPCODE_SHIFT | 51u << 1),
|
||||
MTVSRWA_OPCODE = (31u << OPCODE_SHIFT | 211u << 1),
|
||||
@ -1343,6 +1347,8 @@ class Assembler : public AbstractAssembler {
|
||||
inline void divd_( Register d, Register a, Register b);
|
||||
inline void divw( Register d, Register a, Register b);
|
||||
inline void divw_( Register d, Register a, Register b);
|
||||
inline void divwu( Register d, Register a, Register b);
|
||||
inline void divwu_( Register d, Register a, Register b);
|
||||
|
||||
// Fixed-Point Arithmetic Instructions with Overflow detection
|
||||
inline void addo( Register d, Register a, Register b);
|
||||
@ -2263,6 +2269,8 @@ class Assembler : public AbstractAssembler {
|
||||
// Vector-Scalar (VSX) instructions.
|
||||
inline void lxv( VectorSRegister d, int si16, Register a);
|
||||
inline void stxv( VectorSRegister d, int si16, Register a);
|
||||
inline void lxvl( VectorSRegister d, Register a, Register b);
|
||||
inline void stxvl( VectorSRegister d, Register a, Register b);
|
||||
inline void lxvd2x( VectorSRegister d, Register a);
|
||||
inline void lxvd2x( VectorSRegister d, Register a, Register b);
|
||||
inline void stxvd2x( VectorSRegister d, Register a);
|
||||
@ -2277,6 +2285,7 @@ class Assembler : public AbstractAssembler {
|
||||
inline void xxmrglw( VectorSRegister d, VectorSRegister a, VectorSRegister b);
|
||||
inline void mtvsrd( VectorSRegister d, Register a);
|
||||
inline void mfvsrd( Register d, VectorSRegister a);
|
||||
inline void mtvsrdd( VectorSRegister d, Register a, Register b);
|
||||
inline void mtvsrwz( VectorSRegister d, Register a);
|
||||
inline void mfvsrwz( Register d, VectorSRegister a);
|
||||
inline void xxspltw( VectorSRegister d, VectorSRegister b, int ui2);
|
||||
|
||||
@ -127,6 +127,8 @@ inline void Assembler::divd( Register d, Register a, Register b) { emit_int32(
|
||||
// divd_: emits DIVD for (d, a, b) with the OE field cleared and the Rc field set.
inline void Assembler::divd_(Register d, Register a, Register b) {
  emit_int32(DIVD_OPCODE | rt(d) | ra(a) | rb(b) | oe(0) | rc(1));
}
|
||||
// divw: emits DIVW for (d, a, b) with both the OE and Rc fields cleared.
inline void Assembler::divw(Register d, Register a, Register b) {
  emit_int32(DIVW_OPCODE | rt(d) | ra(a) | rb(b) | oe(0) | rc(0));
}
|
||||
// divw_: emits DIVW for (d, a, b) with the OE field cleared and the Rc field set.
inline void Assembler::divw_(Register d, Register a, Register b) {
  emit_int32(DIVW_OPCODE | rt(d) | ra(a) | rb(b) | oe(0) | rc(1));
}
|
||||
// divwu: emits DIVWU for (d, a, b) with both the OE and Rc fields cleared.
inline void Assembler::divwu(Register d, Register a, Register b) {
  emit_int32(DIVWU_OPCODE | rt(d) | ra(a) | rb(b) | oe(0) | rc(0));
}
|
||||
// divwu_: emits DIVWU for (d, a, b) with the OE field cleared and the Rc field set.
inline void Assembler::divwu_(Register d, Register a, Register b) {
  emit_int32(DIVWU_OPCODE | rt(d) | ra(a) | rb(b) | oe(0) | rc(1));
}
|
||||
|
||||
// Fixed-Point Arithmetic Instructions with Overflow detection
|
||||
// addo: emits ADD for (d, a, b) with the OE field set and the Rc field cleared.
inline void Assembler::addo(Register d, Register a, Register b) {
  emit_int32(ADD_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(0));
}
|
||||
@ -792,11 +794,14 @@ inline void Assembler::lvsr( VectorRegister d, Register s1, Register s2) { emit
|
||||
// Vector-Scalar (VSX) instructions.
|
||||
// lxv: emits LXV targeting VSR d, addressed by base register a plus the
// displacement ui16. The displacement must be a multiple of 16 (asserted).
// NOTE(review): the displacement is encoded with uimm(); confirm whether
// negative displacements are meant to be supported here.
inline void Assembler::lxv(VectorSRegister d, int ui16, Register a) {
  assert(is_aligned(ui16, 16), "displacement must be a multiple of 16");
  emit_int32(LXV_OPCODE | vsrt_dq(d) | ra0mem(a) | uimm(ui16, 16));
}
|
||||
// stxv: emits STXV for source VSR d, addressed by base register a plus the
// displacement ui16. The displacement must be a multiple of 16 (asserted).
// NOTE(review): the displacement is encoded with uimm(); confirm whether
// negative displacements are meant to be supported here.
inline void Assembler::stxv(VectorSRegister d, int ui16, Register a) {
  assert(is_aligned(ui16, 16), "displacement must be a multiple of 16");
  emit_int32(STXV_OPCODE | vsrs_dq(d) | ra0mem(a) | uimm(ui16, 16));
}
|
||||
// lxvl: emits LXVL targeting VSR d, with s1 as the base address register
// and b as the register operand placed in the RB field.
inline void Assembler::lxvl(VectorSRegister d, Register s1, Register b) {
  emit_int32(LXVL_OPCODE | vsrt(d) | ra0mem(s1) | rb(b));
}
|
||||
// stxvl: emits STXVL for VSR d, with s1 as the base address register and
// b as the register operand placed in the RB field.
// NOTE(review): the other VSX stores in this file encode the source
// register with vsrs(); confirm vsrt() produces the same field here.
inline void Assembler::stxvl(VectorSRegister d, Register s1, Register b) {
  emit_int32(STXVL_OPCODE | vsrt(d) | ra0mem(s1) | rb(b));
}
|
||||
// lxvd2x (single address register): emits LXVD2X targeting VSR d with the
// RA field set to 0 and s1 in the RB field.
inline void Assembler::lxvd2x(VectorSRegister d, Register s1) {
  emit_int32(LXVD2X_OPCODE | vsrt(d) | ra(0) | rb(s1));
}
|
||||
// lxvd2x (two address registers): emits LXVD2X targeting VSR d with s1 as
// the base register and s2 in the RB field.
inline void Assembler::lxvd2x(VectorSRegister d, Register s1, Register s2) {
  emit_int32(LXVD2X_OPCODE | vsrt(d) | ra0mem(s1) | rb(s2));
}
|
||||
// stxvd2x (single address register): emits STXVD2X for source VSR d with
// the RA field set to 0 and s1 in the RB field.
inline void Assembler::stxvd2x(VectorSRegister d, Register s1) {
  emit_int32(STXVD2X_OPCODE | vsrs(d) | ra(0) | rb(s1));
}
|
||||
// stxvd2x (two address registers): emits STXVD2X for source VSR d with s1
// as the base register and s2 in the RB field.
inline void Assembler::stxvd2x(VectorSRegister d, Register s1, Register s2) {
  emit_int32(STXVD2X_OPCODE | vsrs(d) | ra0mem(s1) | rb(s2));
}
|
||||
// mtvsrd: emits MTVSRD with VSR d as the target and general register a as
// the source.
inline void Assembler::mtvsrd(VectorSRegister d, Register a) {
  emit_int32(MTVSRD_OPCODE | vsrt(d) | ra(a));
}
|
||||
// mtvsrdd: emits MTVSRDD with VSR d as the target and general registers a
// and b as the two sources.
inline void Assembler::mtvsrdd(VectorSRegister d, Register a, Register b) {
  emit_int32(MTVSRDD_OPCODE | vsrt(d) | ra(a) | rb(b));
}
|
||||
// mfvsrd: emits MFVSRD with VSR a as the source; the destination general
// register d is encoded in the RA field.
inline void Assembler::mfvsrd(Register d, VectorSRegister a) {
  emit_int32(MFVSRD_OPCODE | vsrs(a) | ra(d));
}
|
||||
// mtvsrwz: emits MTVSRWZ with VSR d as the target and general register a
// as the source.
inline void Assembler::mtvsrwz(VectorSRegister d, Register a) {
  emit_int32(MTVSRWZ_OPCODE | vsrt(d) | ra(a));
}
|
||||
// mfvsrwz: emits MFVSRWZ with VSR a as the source; the destination general
// register d is encoded in the RA field.
inline void Assembler::mfvsrwz(Register d, VectorSRegister a) {
  emit_int32(MFVSRWZ_OPCODE | vsrs(a) | ra(d));
}
|
||||
|
||||
@ -4016,6 +4016,422 @@ class StubGenerator: public StubCodeGenerator {
|
||||
#undef SLS
|
||||
#undef US
|
||||
|
||||
// This algorithm is based on the methods described in this paper:
|
||||
// http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html
|
||||
//
|
||||
// The details of this implementation vary from the paper due to the
|
||||
// difference in the ISA between SSE and AltiVec, especially in the
|
||||
// splitting bytes section where there is no need on Power to mask after
|
||||
// the shift because the shift is byte-wise rather than across an entire
|
||||
// 128-bit word.
|
||||
//
|
||||
// For the lookup part of the algorithm, different logic is used than
|
||||
// described in the paper because of the availability of vperm, which can
|
||||
// do a 64-byte table lookup in four instructions, while preserving the
|
||||
// branchless nature.
|
||||
//
|
||||
// Description of the ENCODE_CORE macro
|
||||
//
|
||||
// Expand first 12 x 8-bit data bytes into 16 x 6-bit bytes (upper 2
|
||||
// bits of each byte are zeros)
|
||||
//
|
||||
// (Note: e7..e0 are not shown because they follow the same pattern as
|
||||
// e8..e15)
|
||||
//
|
||||
// In the table below, b0, b1, .. b15 are the bytes of unencoded
|
||||
// binary data, the first line of each of the cells (except for
|
||||
// the constants) uses the bit-field nomenclature from the
|
||||
// above-linked paper, whereas the second line is more specific
|
||||
// about which exact bits are present, and is constructed using the
|
||||
// Power ISA 3.x document style, where:
|
||||
//
|
||||
// * The specifier after the colon depicts which bits are there.
|
||||
// * The bit numbering is big endian style (bit 0 is the most
|
||||
// significant).
|
||||
// * || is a concatenate operator.
|
||||
// * Strings of 0's are a field of zeros with the shown length, and
|
||||
// likewise for strings of 1's.
|
||||
//
|
||||
// +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
|
||||
// | Vector | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 |
|
||||
// | Element | | | | | | | | |
|
||||
// +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
|
||||
// | after lxv | jjjjkkkk | iiiiiijj | gghhhhhh | ffffgggg | eeeeeeff | ccdddddd | bbbbcccc | aaaaaabb |
|
||||
// | | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
|
||||
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
||||
// | xxperm indexes | 0 | 10 | 11 | 12 | 0 | 13 | 14 | 15 |
|
||||
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
||||
// | (1) after xxperm | | gghhhhhh | ffffgggg | eeeeeeff | | ccdddddd | bbbbcccc | aaaaaabb |
|
||||
// | | (b15) | b5 | b4 | b3 | (b15) | b2 | b1 | b0 |
|
||||
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
||||
// | rshift_amount | 0 | 6 | 4 | 2 | 0 | 6 | 4 | 2 |
|
||||
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
||||
// | after vsrb | | 000000gg | 0000ffff | 00eeeeee | | 000000cc | 0000bbbb | 00aaaaaa |
|
||||
// | | (b15) | 000000||b5:0..1 | 0000||b4:0..3 | 00||b3:0..5 | (b15) | 000000||b2:0..1 | 0000||b1:0..3 | 00||b0:0..5 |
|
||||
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
||||
// | rshift_mask | 00000000 | 000000||11 | 0000||1111 | 00||111111 | 00000000 | 000000||11 | 0000||1111 | 00||111111 |
|
||||
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
||||
// | rshift after vand | 00000000 | 000000gg | 0000ffff | 00eeeeee | 00000000 | 000000cc | 0000bbbb | 00aaaaaa |
|
||||
// | | 00000000 | 000000||b5:0..1 | 0000||b4:0..3 | 00||b3:0..5 | 00000000 | 000000||b2:0..1 | 0000||b1:0..3 | 00||b0:0..5 |
|
||||
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
||||
// | 1 octet lshift (1) | gghhhhhh | ffffgggg | eeeeeeff | | ccdddddd | bbbbcccc | aaaaaabb | 00000000 |
|
||||
// | | b5 | b4 | b3 | (b15) | b2 | b1 | b0 | 00000000 |
|
||||
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
||||
// | lshift_amount | 0 | 2 | 4 | 0 | 0 | 2 | 4 | 0 |
|
||||
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
||||
// | after vslb | gghhhhhh | ffgggg00 | eeff0000 | | ccdddddd | bbcccc00 | aabb0000 | 00000000 |
|
||||
// | | b5 | b4:2..7||00 | b3:4..7||0000 | (b15) | b2:0..7 | b1:2..7||00 | b0:4..7||0000 | 00000000 |
|
||||
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
||||
// | lshift_mask | 00||111111 | 00||1111||00 | 00||11||0000 | 00000000 | 00||111111 | 00||1111||00 | 00||11||0000 | 00000000 |
|
||||
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
||||
// | lshift after vand | 00hhhhhh | 00gggg00 | 00ff0000 | 00000000 | 00dddddd | 00cccc00 | 00bb0000 | 00000000 |
|
||||
// | | 00||b5:2..7 | 00||b4:4..7||00 | 00||b3:6..7||0000 | 00000000 | 00||b2:2..7 | 00||b1:4..7||00 | 00||b0:6..7||0000 | 00000000 |
|
||||
// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
|
||||
// | after vor lshift, rshift | 00hhhhhh | 00gggggg | 00ffffff | 00eeeeee | 00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa |
|
||||
// | | 00||b5:2..7 | 00||b4:4..7||b5:0..1 | 00||b3:6..7||b4:0..3 | 00||b3:0..5 | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
|
||||
// +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
|
||||
//
|
||||
// Expand the first 12 bytes into 16 bytes, leaving every 4th byte
|
||||
// blank for now.
|
||||
// __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute);
|
||||
//
|
||||
// Generate two bit-shifted pieces - rshift and lshift - that will
|
||||
// later be OR'd together.
|
||||
//
|
||||
// First the right-shifted piece
|
||||
// __ vsrb(rshift, input, expand_rshift);
|
||||
// __ vand(rshift, rshift, expand_rshift_mask);
|
||||
//
|
||||
// Now the left-shifted piece, which is done by octet shifting
|
||||
// the input one byte to the left, then doing a variable shift,
|
||||
// followed by a mask operation.
|
||||
//
|
||||
// __ vslo(lshift, input, vec_8s);
|
||||
// __ vslb(lshift, lshift, expand_lshift);
|
||||
// __ vand(lshift, lshift, expand_lshift_mask);
|
||||
//
|
||||
// Combine the two pieces by OR'ing
|
||||
// __ vor(expanded, rshift, lshift);
|
||||
//
|
||||
// At this point, expanded is a vector containing a 6-bit value in each
|
||||
// byte. These values are used as indexes into a 64-byte lookup table that
|
||||
// is contained in four vector registers. The lookup operation is done
|
||||
// using vperm instructions with the same indexes for the lower 32 and
|
||||
// upper 32 bytes. To figure out which of the two looked-up bytes to use
|
||||
// at each location, all values in expanded are compared to 31. Using
|
||||
// vsel, values higher than 31 use the results from the upper 32 bytes of
|
||||
// the lookup operation, while values less than or equal to 31 use the
|
||||
// lower 32 bytes of the lookup operation. Power10 and beyond can save the
|
||||
// compare instruction, because the comparison is done within xxpermx
|
||||
// itself. TODO: use xxpermx,xxpermx,vor on P10 when instruction prefixes are
|
||||
// available in assembler_ppc.*
|
||||
|
||||
// ENCODE_CORE: turns the 12 source bytes held in 'input' into 16 six-bit
// indexes and translates them through the four lookup-table vectors; the 16
// encoded bytes end up in 'expanded'. The macro uses the register aliases
// declared by the stub generator that expands it.
#define ENCODE_CORE                                                          \
  /* spread the first 12 bytes over 16 lanes (every 4th lane is filler) */   \
  __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute);               \
  /* right-shifted piece: per-byte shift, then mask */                       \
  __ vsrb(rshift, input, expand_rshift);                                     \
  __ vand(rshift, rshift, expand_rshift_mask);                               \
  /* left-shifted piece: one-octet shift, per-byte shift, then mask */       \
  __ vslo(lshift, input, vec_8s);                                            \
  __ vslb(lshift, lshift, expand_lshift);                                    \
  __ vand(lshift, lshift, expand_lshift_mask);                               \
  /* merge both pieces into one 6-bit index per byte */                      \
  __ vor(expanded, rshift, lshift);                                          \
  /* look every index up in the lower and in the upper 32 table bytes */     \
  __ vperm(encoded_00_31, vec_base64_00_15, vec_base64_16_31, expanded);     \
  __ vperm(encoded_32_63, vec_base64_32_47, vec_base64_48_63, expanded);     \
  /* indexes above 31 take the upper-table result, the rest the lower one */ \
  __ vcmpgtub(gt_31, expanded, vec_31s);                                     \
  __ vsel(expanded, encoded_00_31, encoded_32_63, gt_31);
|
||||
|
||||
// Intrinsic function prototype in Base64.java:
|
||||
// private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) {
|
||||
|
||||
// Generates the stub that implements the Base64 encodeBlock() intrinsic and
// returns the stub's entry address.
//
// Stub arguments (see the Java prototype above):
//   src   - source byte array address
//   sp    - source start position
//   sl    - source end position (the stub encodes sl - sp bytes)
//   dst   - destination byte array address
//   dp    - destination start position
//   isURL - non-zero selects the URL-safe alphabet ('-' and '_')
//
// Structure of the generated code:
//   1. Load the shuffle/shift/mask constants and the lookup table.
//   2. Main loop: 12 source bytes -> 16 encoded bytes per iteration, using
//      full 16-byte loads. The last 4 source bytes are kept out of the loop
//      so that lxv never reads past the end of the source data.
//   3. Tail: up to 16 leftover bytes are handled with the length-limited
//      lxvl/stxvl instructions, in at most two passes.
//   4. Append zero, one or two '=' pad characters.
address generate_base64_encodeBlock() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "base64_encodeBlock");
  address start = __ function_entry();

  typedef struct {
    unsigned char expand_permute_val[16];
    unsigned char expand_rshift_val[16];
    unsigned char expand_rshift_mask_val[16];
    unsigned char expand_lshift_val[16];
    unsigned char expand_lshift_mask_val[16];
    unsigned char base64_00_15_val[16];
    unsigned char base64_16_31_val[16];
    unsigned char base64_32_47_val[16];
    unsigned char base64_48_63_val[16];
    unsigned char base64_48_63_URL_val[16];
  } constant_block;

  static const constant_block VEC_ALIGN const_block = {
    .expand_permute_val = {
      ARRAY_TO_LXV_ORDER(
        0,  4,  5,  6,
        0,  7,  8,  9,
        0, 10, 11, 12,
        0, 13, 14, 15 ) },

    .expand_rshift_val = {
      ARRAY_TO_LXV_ORDER(
        0, 6, 4, 2,
        0, 6, 4, 2,
        0, 6, 4, 2,
        0, 6, 4, 2 ) },

    .expand_rshift_mask_val = {
      ARRAY_TO_LXV_ORDER(
        0b00000000, 0b00000011, 0b00001111, 0b00111111,
        0b00000000, 0b00000011, 0b00001111, 0b00111111,
        0b00000000, 0b00000011, 0b00001111, 0b00111111,
        0b00000000, 0b00000011, 0b00001111, 0b00111111 ) },

    .expand_lshift_val = {
      ARRAY_TO_LXV_ORDER(
        0, 2, 4, 0,
        0, 2, 4, 0,
        0, 2, 4, 0,
        0, 2, 4, 0 ) },

    .expand_lshift_mask_val = {
      ARRAY_TO_LXV_ORDER(
        0b00111111, 0b00111100, 0b00110000, 0b00000000,
        0b00111111, 0b00111100, 0b00110000, 0b00000000,
        0b00111111, 0b00111100, 0b00110000, 0b00000000,
        0b00111111, 0b00111100, 0b00110000, 0b00000000 ) },

    .base64_00_15_val = {
      ARRAY_TO_LXV_ORDER(
        'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P' ) },

    .base64_16_31_val = {
      ARRAY_TO_LXV_ORDER(
        'Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f' ) },

    .base64_32_47_val = {
      ARRAY_TO_LXV_ORDER(
        'g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v' ) },

    .base64_48_63_val = {
      ARRAY_TO_LXV_ORDER(
        'w','x','y','z','0','1','2','3','4','5','6','7','8','9','+','/' ) },

    .base64_48_63_URL_val = {
      ARRAY_TO_LXV_ORDER(
        'w','x','y','z','0','1','2','3','4','5','6','7','8','9','-','_' ) }
  };
  #define BLK_OFFSETOF(x) (offsetof(constant_block, x))

  // Number of bytes to process in each pass through the main loop.
  // 12 of the 16 bytes from each lxv are encoded to 16 Base64 bytes.
  const unsigned block_size = 12;

  // According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
  Register src   = R3_ARG1; // address of the source array holding the bytes to encode
  Register sp    = R4_ARG2; // source starting position
  Register sl    = R5_ARG3; // source end position (sl - sp bytes are encoded)
  Register dst   = R6_ARG4; // destination address
  Register dp    = R7_ARG5; // destination starting position
  Register isURL = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding

  // Local variables
  Register const_ptr      = R12; // used for loading constants
  Register tmp_reg        = R9;  // used for speeding up load_constant()

  Register size           = R9;  // number of bytes to process (reuses tmp_reg's register)
  Register blocked_size   = R10; // number of bytes to process a block at a time
  Register block_modulo   = R12; // == block_size (reuse const_ptr)
  Register remaining      = R12; // bytes remaining to process after the blocks are completed (reuse block_modulo's reg)
  Register in             = R4;  // current input (source) pointer (reuse sp's register)
  Register num_blocks     = R11; // number of blocks to be processed by the unrolled loop
  Register out            = R8;  // current output (destination) pointer (reuse isURL's register)
  Register three          = R9;  // constant divisor (reuse size's register)
  Register bytes_to_write = R10; // number of bytes to write with the stxvl instr (reused blocked_size's register)
  Register tmp1           = R7;  // temp register for lxvl length (reuse dp's register)
  Register modulo_chars   = R7;  // number of bytes written during the final write % 4 (reuse tmp1's register)
  Register pad_char       = R6;  // literal '=' (reuse dst's register)

  // Volatile VSRS are 0..13, 32..51 (VR0..VR13)
  // VR Constants
  VectorRegister vec_8s             = VR0;
  VectorRegister vec_31s            = VR1;
  VectorRegister vec_base64_00_15   = VR2;
  VectorRegister vec_base64_16_31   = VR3;
  VectorRegister vec_base64_32_47   = VR4;
  VectorRegister vec_base64_48_63   = VR5;
  VectorRegister expand_rshift      = VR6;
  VectorRegister expand_rshift_mask = VR7;
  VectorRegister expand_lshift      = VR8;
  VectorRegister expand_lshift_mask = VR9;

  // VR variables for expand
  VectorRegister input              = VR10;
  VectorRegister rshift             = VR11;
  VectorRegister lshift             = VR12;
  VectorRegister expanded           = VR13;

  // VR variables for lookup
  VectorRegister encoded_00_31      = VR10; // (reuse input)
  VectorRegister encoded_32_63      = VR11; // (reuse rshift)
  VectorRegister gt_31              = VR12; // (reuse lshift)

  // VSR Constants
  VectorSRegister expand_permute    = VSR0;

  Label not_URL, calculate_size, calculate_blocked_size, skip_loop;
  Label loop_start, le_16_to_write, no_pad, one_pad_char;

  // The upper 32 bits of the non-pointer parameter registers are not
  // guaranteed to be zero, so mask off those upper bits.
  __ clrldi(sp, sp, 32);
  __ clrldi(sl, sl, 32);
  __ clrldi(dp, dp, 32);
  __ clrldi(isURL, isURL, 32);

  // load up the constants
  __ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
  __ lxv(expand_permute,               BLK_OFFSETOF(expand_permute_val),     const_ptr);
  __ lxv(expand_rshift->to_vsr(),      BLK_OFFSETOF(expand_rshift_val),      const_ptr);
  __ lxv(expand_rshift_mask->to_vsr(), BLK_OFFSETOF(expand_rshift_mask_val), const_ptr);
  __ lxv(expand_lshift->to_vsr(),      BLK_OFFSETOF(expand_lshift_val),      const_ptr);
  __ lxv(expand_lshift_mask->to_vsr(), BLK_OFFSETOF(expand_lshift_mask_val), const_ptr);
  __ lxv(vec_base64_00_15->to_vsr(),   BLK_OFFSETOF(base64_00_15_val),       const_ptr);
  __ lxv(vec_base64_16_31->to_vsr(),   BLK_OFFSETOF(base64_16_31_val),       const_ptr);
  __ lxv(vec_base64_32_47->to_vsr(),   BLK_OFFSETOF(base64_32_47_val),       const_ptr);

  // Splat the constants that can use xxspltib
  __ xxspltib(vec_8s->to_vsr(), 8);
  __ xxspltib(vec_31s->to_vsr(), 31);


  // Use a different translation lookup table depending on the
  // setting of isURL
  __ cmpdi(CCR0, isURL, 0);
  __ beq(CCR0, not_URL);
  __ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_URL_val), const_ptr);
  __ b(calculate_size);

  __ bind(not_URL);
  __ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_val), const_ptr);

  __ bind(calculate_size);

  // size = sl - sp - 4 (*)
  // (*) Don't process the last four bytes in the main loop because
  // we don't want the lxv instruction to read past the end of the src
  // data, in case those four bytes are on the start of an unmapped or
  // otherwise inaccessible page.
  //
  __ sub(size, sl, sp);
  __ subi(size, size, 4);
  __ cmpdi(CCR7, size, block_size);
  __ bgt(CCR7, calculate_blocked_size);
  // Not enough data for the main loop: everything is handled by the tail
  // code below.
  __ mr(remaining, size);
  // Add the 4 back into remaining again
  __ addi(remaining, remaining, 4);
  // make "in" point to the beginning of the source data: in = src + sp
  __ add(in, src, sp);
  // out = dst + dp
  __ add(out, dst, dp);
  __ b(skip_loop);

  __ bind(calculate_blocked_size);
  __ li(block_modulo, block_size);
  // num_blocks = size / block_modulo
  __ divwu(num_blocks, size, block_modulo);
  // blocked_size = num_blocks * block_modulo
  __ mullw(blocked_size, num_blocks, block_modulo);
  // remaining = size - blocked_size
  __ sub(remaining, size, blocked_size);
  __ mtctr(num_blocks);

  // Add the 4 back in to remaining again
  __ addi(remaining, remaining, 4);

  // make "in" point to the beginning of the source data: in = src + sp
  __ add(in, src, sp);

  // out = dst + dp
  __ add(out, dst, dp);

  __ align(32);
  __ bind(loop_start);

  __ lxv(input->to_vsr(), 0, in);

  ENCODE_CORE

  __ stxv(expanded->to_vsr(), 0, out);
  __ addi(in, in, 12);
  __ addi(out, out, 16);
  __ bdnz(loop_start);

  __ bind(skip_loop);

  // When there are less than 16 bytes left, we need to be careful not to
  // read beyond the end of the src buffer, which might be in an unmapped
  // page.
  // Load the remaining bytes using lxvl (the length goes into the upper
  // 8 bits of the length register).
  __ rldicr(tmp1, remaining, 56, 7);
  __ lxvl(input->to_vsr(), in, tmp1);

  ENCODE_CORE

  // bytes_to_write = ((remaining * 4) + 2) / 3
  __ li(three, 3);
  __ rlwinm(bytes_to_write, remaining, 2, 0, 29); // remaining * 4
  __ addi(bytes_to_write, bytes_to_write, 2);
  __ divwu(bytes_to_write, bytes_to_write, three);

  __ cmpwi(CCR7, bytes_to_write, 16);
  __ ble_predict_taken(CCR7, le_16_to_write);
  __ stxv(expanded->to_vsr(), 0, out);

  // We've processed 12 of the 13-16 data bytes, so advance the pointers,
  // and do one final pass for the remaining 1-4 bytes.
  __ addi(in, in, 12);
  __ addi(out, out, 16);
  __ subi(remaining, remaining, 12);
  __ subi(bytes_to_write, bytes_to_write, 16);
  // The load length must be the number of source bytes still left
  // (remaining), not the number of characters to emit (bytes_to_write):
  // the latter is larger, so using it would read past the end of the
  // source and let a stray byte leak into the last encoded character.
  __ rldicr(tmp1, remaining, 56, 7);
  __ lxvl(input->to_vsr(), in, tmp1);

  ENCODE_CORE

  __ bind(le_16_to_write);
  // shift bytes_to_write into the upper 8 bits of tmp1 for use by stxvl
  __ rldicr(tmp1, bytes_to_write, 56, 7);
  __ stxvl(expanded->to_vsr(), out, tmp1);
  __ add(out, out, bytes_to_write);

  __ li(pad_char, '=');
  __ rlwinm_(modulo_chars, bytes_to_write, 0, 30, 31); // bytes_to_write % 4, set CCR0
  // Examples:
  //    remaining  bytes_to_write  modulo_chars  num pad chars
  //        0            0               0            0
  //        1            2               2            2
  //        2            3               3            1
  //        3            4               0            0
  //        4            6               2            2
  //        5            7               3            1
  //        ...
  //       12           16               0            0
  //       13           18               2            2
  //       14           19               3            1
  //       15           20               0            0
  __ beq(CCR0, no_pad);
  __ cmpwi(CCR7, modulo_chars, 3);
  __ beq(CCR7, one_pad_char);

  // two pad chars: store the first one here and fall through to store the
  // second one.
  __ stb(pad_char, out);
  __ addi(out, out, 1);

  __ bind(one_pad_char);
  __ stb(pad_char, out);

  __ bind(no_pad);

  __ blr();
  return start;
}
|
||||
|
||||
#endif // VM_LITTLE_ENDIAN
|
||||
|
||||
// Initialization
|
||||
@ -4121,6 +4537,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
// Currently supported on PPC64LE only
|
||||
if (UseBASE64Intrinsics) {
|
||||
StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
|
||||
StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user