8256431: [PPC64] Implement Base64 encodeBlock() for Power64-LE

Reviewed-by: mdoerr
2026-07-18 15:09:37 +00:00 · 2020-12-22 14:19:32 +00:00 · 2020-12-22 14:19:32 +00:00 · 0849117d5c
commit 0849117d5c
parent 172af1524d
3 changed files with 431 additions and 0 deletions
--- a/src/hotspot/cpu/ppc/assembler_ppc.hpp
+++ b/src/hotspot/cpu/ppc/assembler_ppc.hpp
@ -264,6 +264,7 @@ class Assembler : public AbstractAssembler {
    SUBFME_OPCODE = (31u << OPCODE_SHIFT | 232u << 1),
    SUBFZE_OPCODE = (31u << OPCODE_SHIFT | 200u << 1),
    DIVW_OPCODE   = (31u << OPCODE_SHIFT | 491u << 1),
+    DIVWU_OPCODE  = (31u << OPCODE_SHIFT | 459u << 1),
    MULLW_OPCODE  = (31u << OPCODE_SHIFT | 235u << 1),
    MULHW_OPCODE  = (31u << OPCODE_SHIFT |  75u << 1),
    MULHWU_OPCODE = (31u << OPCODE_SHIFT |  11u << 1),
@ -524,10 +525,13 @@ class Assembler : public AbstractAssembler {

    // Vector-Scalar (VSX) instruction support.
    LXV_OPCODE     = (61u << OPCODE_SHIFT |    1u     ),
+    LXVL_OPCODE    = (31u << OPCODE_SHIFT |  269u << 1),
    STXV_OPCODE    = (61u << OPCODE_SHIFT |    5u     ),
+    STXVL_OPCODE   = (31u << OPCODE_SHIFT |  397u << 1),
    LXVD2X_OPCODE  = (31u << OPCODE_SHIFT |  844u << 1),
    STXVD2X_OPCODE = (31u << OPCODE_SHIFT |  972u << 1),
    MTVSRD_OPCODE  = (31u << OPCODE_SHIFT |  179u << 1),
+    MTVSRDD_OPCODE = (31u << OPCODE_SHIFT |  435u << 1),
    MTVSRWZ_OPCODE = (31u << OPCODE_SHIFT |  243u << 1),
    MFVSRD_OPCODE  = (31u << OPCODE_SHIFT |   51u << 1),
    MTVSRWA_OPCODE = (31u << OPCODE_SHIFT |  211u << 1),
@ -1343,6 +1347,8 @@ class Assembler : public AbstractAssembler {
  inline void divd_(  Register d, Register a, Register b);
  inline void divw(   Register d, Register a, Register b);
  inline void divw_(  Register d, Register a, Register b);
+  inline void divwu(  Register d, Register a, Register b);
+  inline void divwu_( Register d, Register a, Register b);

  // Fixed-Point Arithmetic Instructions with Overflow detection
  inline void addo(    Register d, Register a, Register b);
@ -2263,6 +2269,8 @@ class Assembler : public AbstractAssembler {
  // Vector-Scalar (VSX) instructions.
  inline void lxv(      VectorSRegister d, int si16, Register a);
  inline void stxv(     VectorSRegister d, int si16, Register a);
+  inline void lxvl(     VectorSRegister d, Register a, Register b);
+  inline void stxvl(    VectorSRegister d, Register a, Register b);
  inline void lxvd2x(   VectorSRegister d, Register a);
  inline void lxvd2x(   VectorSRegister d, Register a, Register b);
  inline void stxvd2x(  VectorSRegister d, Register a);
@ -2277,6 +2285,7 @@ class Assembler : public AbstractAssembler {
  inline void xxmrglw(  VectorSRegister d, VectorSRegister a, VectorSRegister b);
  inline void mtvsrd(   VectorSRegister d, Register a);
  inline void mfvsrd(   Register        d, VectorSRegister a);
+  inline void mtvsrdd(  VectorSRegister d, Register a, Register b);
  inline void mtvsrwz(  VectorSRegister d, Register a);
  inline void mfvsrwz(  Register        d, VectorSRegister a);
  inline void xxspltw(  VectorSRegister d, VectorSRegister b, int ui2);
--- a/src/hotspot/cpu/ppc/assembler_ppc.inline.hpp
+++ b/src/hotspot/cpu/ppc/assembler_ppc.inline.hpp
@ -127,6 +127,8 @@ inline void Assembler::divd(   Register d, Register a, Register b) { emit_int32(
 inline void Assembler::divd_(  Register d, Register a, Register b) { emit_int32(DIVD_OPCODE   | rt(d) | ra(a) | rb(b) | oe(0) | rc(1)); }
 inline void Assembler::divw(   Register d, Register a, Register b) { emit_int32(DIVW_OPCODE   | rt(d) | ra(a) | rb(b) | oe(0) | rc(0)); }
 inline void Assembler::divw_(  Register d, Register a, Register b) { emit_int32(DIVW_OPCODE   | rt(d) | ra(a) | rb(b) | oe(0) | rc(1)); }
+inline void Assembler::divwu(  Register d, Register a, Register b) { emit_int32(DIVWU_OPCODE  | rt(d) | ra(a) | rb(b) | oe(0) | rc(0)); } // Divide Word Unsigned, OE=0, Rc=0
+inline void Assembler::divwu_( Register d, Register a, Register b) { emit_int32(DIVWU_OPCODE  | rt(d) | ra(a) | rb(b) | oe(0) | rc(1)); } // Divide Word Unsigned, record form (Rc=1)

 // Fixed-Point Arithmetic Instructions with Overflow detection
 inline void Assembler::addo(    Register d, Register a, Register b) { emit_int32(ADD_OPCODE    | rt(d) | ra(a) | rb(b) | oe(1) | rc(0)); }
@ -792,11 +794,14 @@ inline void Assembler::lvsr(  VectorRegister d, Register s1, Register s2) { emit
 // Vector-Scalar (VSX) instructions.
 inline void Assembler::lxv(     VectorSRegister d, int ui16, Register a)     { assert(is_aligned(ui16, 16), "displacement must be a multiple of 16"); emit_int32( LXV_OPCODE  | vsrt_dq(d) | ra0mem(a) | uimm(ui16, 16)); }
 inline void Assembler::stxv(    VectorSRegister d, int ui16, Register a)     { assert(is_aligned(ui16, 16), "displacement must be a multiple of 16"); emit_int32( STXV_OPCODE  | vsrs_dq(d) | ra0mem(a) | uimm(ui16, 16)); }
+inline void Assembler::lxvl(    VectorSRegister d, Register s1, Register b)  { emit_int32( LXVL_OPCODE    | vsrt(d) | ra0mem(s1) | rb(b)); } // Load VSX Vector with Length: callers place the byte count in the upper 8 bits of b
+inline void Assembler::stxvl(   VectorSRegister d, Register s1, Register b)  { emit_int32( STXVL_OPCODE   | vsrt(d) | ra0mem(s1) | rb(b)); } // Store VSX Vector with Length: callers place the byte count in the upper 8 bits of b
 inline void Assembler::lxvd2x(  VectorSRegister d, Register s1)              { emit_int32( LXVD2X_OPCODE  | vsrt(d) | ra(0) | rb(s1)); }
 inline void Assembler::lxvd2x(  VectorSRegister d, Register s1, Register s2) { emit_int32( LXVD2X_OPCODE  | vsrt(d) | ra0mem(s1) | rb(s2)); }
 inline void Assembler::stxvd2x( VectorSRegister d, Register s1)              { emit_int32( STXVD2X_OPCODE | vsrs(d) | ra(0) | rb(s1)); }
 inline void Assembler::stxvd2x( VectorSRegister d, Register s1, Register s2) { emit_int32( STXVD2X_OPCODE | vsrs(d) | ra0mem(s1) | rb(s2)); }
 inline void Assembler::mtvsrd(  VectorSRegister d, Register a)               { emit_int32( MTVSRD_OPCODE  | vsrt(d)  | ra(a)); }
+inline void Assembler::mtvsrdd( VectorSRegister d, Register a, Register b)   { emit_int32( MTVSRDD_OPCODE | vsrt(d)  | ra(a) | rb(b)); } // Move To VSR Double Doubleword
 inline void Assembler::mfvsrd(  Register d, VectorSRegister a)               { emit_int32( MFVSRD_OPCODE  | vsrs(a)  | ra(d)); }
 inline void Assembler::mtvsrwz( VectorSRegister d, Register a)               { emit_int32( MTVSRWZ_OPCODE | vsrt(d) | ra(a)); }
 inline void Assembler::mfvsrwz( Register d, VectorSRegister a)               { emit_int32( MFVSRWZ_OPCODE | vsrs(a) | ra(d)); }
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@ -4016,6 +4016,422 @@ class StubGenerator: public StubCodeGenerator {
 #undef SLS
 #undef US

+// This algorithm is based on the methods described in this paper:
+// http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html
+//
+// The details of this implementation vary from the paper due to the
+// difference in the ISA between SSE and AltiVec, especially in the
+// splitting bytes section where there is no need on Power to mask after
+// the shift because the shift is byte-wise rather than across an entire
+// 128-bit word.
+//
+// For the lookup part of the algorithm, different logic is used than
+// described in the paper because of the availability of vperm, which can
+// do a 64-byte table lookup in four instructions, while preserving the
+// branchless nature.
+//
+// Description of the ENCODE_CORE macro
+//
+// Expand first 12 x 8-bit data bytes into 16 x 6-bit bytes (upper 2
+// bits of each byte are zeros)
+//
+// (Note: e7..e0 are not shown because they follow the same pattern as
+// e8..e15)
+//
+// In the table below, b0, b1, .. b15 are the bytes of unencoded
+// binary data, the first line of each of the cells (except for
+// the constants) uses the bit-field nomenclature from the
+// above-linked paper, whereas the second line is more specific
+// about which exact bits are present, and is constructed using the
+// Power ISA 3.x document style, where:
+//
+// * The specifier after the colon depicts which bits are there.
+// * The bit numbering is big endian style (bit 0 is the most
+//   significant).
+// * || is a concatenate operator.
+// * Strings of 0's are a field of zeros with the shown length, and
+//   likewise for strings of 1's.
+//
+// +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
+// |          Vector          |     e8      |          e9          |         e10          |     e11     |     e12     |         e13          |         e14          |     e15     |
+// |         Element          |             |                      |                      |             |             |                      |                      |             |
+// +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
+// |        after lxv         |  jjjjkkkk   |       iiiiiijj       |       gghhhhhh       |  ffffgggg   |  eeeeeeff   |       ccdddddd       |       bbbbcccc       |  aaaaaabb   |
+// |                          |     b7      |          b6          |          b5          |     b4      |     b3      |          b2          |          b1          |     b0      |
+// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
+// |      xxperm indexes      |      0      |          10          |          11          |     12      |      0      |          13          |          14          |     15      |
+// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
+// |     (1) after xxperm     |             |       gghhhhhh       |       ffffgggg       |  eeeeeeff   |             |       ccdddddd       |       bbbbcccc       |  aaaaaabb   |
+// |                          |    (b15)    |          b5          |          b4          |     b3      |    (b15)    |          b2          |          b1          |     b0      |
+// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
+// |      rshift_amount       |      0      |          6           |          4           |      2      |      0      |          6           |          4           |      2      |
+// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
+// |        after vsrb        |             |       000000gg       |       0000ffff       |  00eeeeee   |             |       000000cc       |       0000bbbb       |  00aaaaaa   |
+// |                          |    (b15)    |   000000||b5:0..1    |    0000||b4:0..3     | 00||b3:0..5 |    (b15)    |   000000||b2:0..1    |    0000||b1:0..3     | 00||b0:0..5 |
+// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
+// |       rshift_mask        |  00000000   |      000000||11      |      0000||1111      | 00||111111  |  00000000   |      000000||11      |      0000||1111      | 00||111111  |
+// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
+// |    rshift after vand     |  00000000   |       000000gg       |       0000ffff       |  00eeeeee   |  00000000   |       000000cc       |       0000bbbb       |  00aaaaaa   |
+// |                          |  00000000   |   000000||b5:0..1    |    0000||b4:0..3     | 00||b3:0..5 |  00000000   |   000000||b2:0..1    |    0000||b1:0..3     | 00||b0:0..5 |
+// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
+// |    1 octet lshift (1)    |  gghhhhhh   |       ffffgggg       |       eeeeeeff       |             |  ccdddddd   |       bbbbcccc       |       aaaaaabb       |  00000000   |
+// |                          |     b5      |          b4          |          b3          |    (b15)    |     b2      |          b1          |          b0          |  00000000   |
+// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
+// |      lshift_amount       |      0      |          2           |          4           |      0      |      0      |          2           |          4           |      0      |
+// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
+// |        after vslb        |  gghhhhhh   |       ffgggg00       |       eeff0000       |             |  ccdddddd   |       bbcccc00       |       aabb0000       |  00000000   |
+// |                          |     b5      |     b4:2..7||00      |    b3:4..7||0000     |    (b15)    |   b2:0..7   |     b1:2..7||00      |    b0:4..7||0000     |  00000000   |
+// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
+// |       lshift_mask        | 00||111111  |     00||1111||00     |     00||11||0000     |  00000000   | 00||111111  |     00||1111||00     |     00||11||0000     |  00000000   |
+// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
+// |    lshift after vand     |  00hhhhhh   |       00gggg00       |       00ff0000       |  00000000   |  00dddddd   |       00cccc00       |       00bb0000       |  00000000   |
+// |                          | 00||b5:2..7 |   00||b4:4..7||00    |  00||b3:6..7||0000   |  00000000   | 00||b2:2..7 |   00||b1:4..7||00    |  00||b0:6..7||0000   |  00000000   |
+// +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
+// | after vor lshift, rshift |  00hhhhhh   |       00gggggg       |       00ffffff       |  00eeeeee   |  00dddddd   |       00cccccc       |       00bbbbbb       |  00aaaaaa   |
+// |                          | 00||b5:2..7 | 00||b4:4..7||b5:0..1 | 00||b3:6..7||b4:0..3 | 00||b3:0..5 | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
+// +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
+//
+// Expand the first 12 bytes into 16 bytes, leaving every 4th byte
+// blank for now.
+// __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute);
+//
+// Generate two bit-shifted pieces - rshift and lshift - that will
+// later be OR'd together.
+//
+// First the right-shifted piece
+// __ vsrb(rshift, input, expand_rshift);
+// __ vand(rshift, rshift, expand_rshift_mask);
+//
+// Now the left-shifted piece, which is done by octet shifting
+// the input one byte to the left, then doing a variable shift,
+// followed by a mask operation.
+//
+// __ vslo(lshift, input, vec_8s);
+// __ vslb(lshift, lshift, expand_lshift);
+// __ vand(lshift, lshift, expand_lshift_mask);
+//
+// Combine the two pieces by OR'ing
+// __ vor(expanded, rshift, lshift);
+//
+// At this point, expanded is a vector containing a 6-bit value in each
+// byte.  These values are used as indexes into a 64-byte lookup table that
+// is contained in four vector registers.  The lookup operation is done
+// using vperm instructions with the same indexes for the lower 32 and
+// upper 32 bytes.  To figure out which of the two looked-up bytes to use
+// at each location, all values in expanded are compared to 31.  Using
+// vsel, values higher than 31 use the results from the upper 32 bytes of
+// the lookup operation, while values less than or equal to 31 use the
+// lower 32 bytes of the lookup operation.  Power10 and beyond can save the
+// compare instruction, because the comparison is done within xxpermx
+// itself. TODO: use xxpermx,xxpermx,vor on P10 when instruction prefixes are
+// available in assembler_ppc.*
+
+#define ENCODE_CORE                                                        \
+    __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute);           \
+    __ vsrb(rshift, input, expand_rshift);                                 \
+    __ vand(rshift, rshift, expand_rshift_mask);                           \
+    __ vslo(lshift, input, vec_8s);                                        \
+    __ vslb(lshift, lshift, expand_lshift);                                \
+    __ vand(lshift, lshift, expand_lshift_mask);                           \
+    __ vor(expanded, rshift, lshift);                                      \
+    __ vperm(encoded_00_31, vec_base64_00_15, vec_base64_16_31, expanded); \
+    __ vperm(encoded_32_63, vec_base64_32_47, vec_base64_48_63, expanded); \
+    __ vcmpgtub(gt_31, expanded, vec_31s);                                 \
+    __ vsel(expanded, encoded_00_31, encoded_32_63, gt_31); // 'expanded' now holds the looked-up Base64 characters
+
+// Intrinsic function prototype in Base64.java:
+// private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) {
+// The stub encodes the sl - sp bytes at src + sp to dst + dp, appending '=' padding when needed.
+  address generate_base64_encodeBlock() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "base64_encodeBlock");
+    address start   = __ function_entry();
+
+    typedef struct {
+      unsigned char expand_permute_val[16];
+      unsigned char expand_rshift_val[16];
+      unsigned char expand_rshift_mask_val[16];
+      unsigned char expand_lshift_val[16];
+      unsigned char expand_lshift_mask_val[16];
+      unsigned char base64_00_15_val[16];
+      unsigned char base64_16_31_val[16];
+      unsigned char base64_32_47_val[16];
+      unsigned char base64_48_63_val[16];
+      unsigned char base64_48_63_URL_val[16];
+    } constant_block;
+
+    static const constant_block VEC_ALIGN const_block = {
+      .expand_permute_val = {
+        ARRAY_TO_LXV_ORDER(
+        0,  4,  5,  6,
+        0,  7,  8,  9,
+        0, 10, 11, 12,
+        0, 13, 14, 15 ) },
+
+      .expand_rshift_val = {
+        ARRAY_TO_LXV_ORDER(
+        0, 6, 4, 2,
+        0, 6, 4, 2,
+        0, 6, 4, 2,
+        0, 6, 4, 2 ) },
+
+      .expand_rshift_mask_val = {
+        ARRAY_TO_LXV_ORDER(
+        0b00000000, 0b00000011, 0b00001111, 0b00111111,
+        0b00000000, 0b00000011, 0b00001111, 0b00111111,
+        0b00000000, 0b00000011, 0b00001111, 0b00111111,
+        0b00000000, 0b00000011, 0b00001111, 0b00111111 ) },
+
+      .expand_lshift_val = {
+        ARRAY_TO_LXV_ORDER(
+        0, 2, 4, 0,
+        0, 2, 4, 0,
+        0, 2, 4, 0,
+        0, 2, 4, 0 ) },
+
+      .expand_lshift_mask_val = {
+        ARRAY_TO_LXV_ORDER(
+        0b00111111, 0b00111100, 0b00110000, 0b00000000,
+        0b00111111, 0b00111100, 0b00110000, 0b00000000,
+        0b00111111, 0b00111100, 0b00110000, 0b00000000,
+        0b00111111, 0b00111100, 0b00110000, 0b00000000 ) },
+
+      .base64_00_15_val = {
+        ARRAY_TO_LXV_ORDER(
+        'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P' ) },
+
+      .base64_16_31_val = {
+        ARRAY_TO_LXV_ORDER(
+        'Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f' ) },
+
+      .base64_32_47_val = {
+        ARRAY_TO_LXV_ORDER(
+        'g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v' ) },
+
+      .base64_48_63_val = {
+        ARRAY_TO_LXV_ORDER(
+        'w','x','y','z','0','1','2','3','4','5','6','7','8','9','+','/' ) },
+
+      .base64_48_63_URL_val = {
+        ARRAY_TO_LXV_ORDER(
+        'w','x','y','z','0','1','2','3','4','5','6','7','8','9','-','_' ) }
+    };
+    #define BLK_OFFSETOF(x) (offsetof(constant_block, x))
+
+    // Number of bytes to process in each pass through the main loop.
+    // 12 of the 16 bytes from each lxv are encoded to 16 Base64 bytes.
+    const unsigned block_size = 12;
+
+    // According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
+    Register src       = R3_ARG1; // source starting address of the data to be encoded
+    Register sp        = R4_ARG2; // source starting position
+    Register sl        = R5_ARG3; // source limit (end position); bytes to encode = sl - sp
+    Register dst       = R6_ARG4; // destination address
+    Register dp        = R7_ARG5; // destination starting position
+    Register isURL     = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
+
+    // Local variables
+    Register const_ptr     = R12; // used for loading constants
+    Register tmp_reg       = R9;  // scratch register passed to load_const_optimized()
+
+    Register size           = R9;  // number of bytes to process (reuses tmp_reg's register)
+    Register blocked_size   = R10; // number of bytes to process a block at a time
+    Register block_modulo   = R12; // == block_size (reuse const_ptr)
+    Register remaining      = R12; // bytes remaining to process after the blocks are completed (reuse block_modulo's reg)
+    Register in             = R4;  // current input (source) pointer (reuse sp's register)
+    Register num_blocks     = R11; // number of blocks to be processed by the main loop
+    Register out            = R8;  // current output (destination) pointer (reuse isURL's register)
+    Register three          = R9;  // constant divisor (reuse size's register)
+    Register bytes_to_write = R10; // number of bytes to write with the stxvl instr (reused blocked_size's register)
+    Register tmp1           = R7;  // temp register for lxvl length (reuse dp's register)
+    Register modulo_chars   = R7;  // number of bytes written during the final write % 4 (reuse tmp1's register)
+    Register pad_char       = R6;  // literal '=' (reuse dst's register)
+
+    // Volatile VSRS are 0..13, 32..51 (VR0..VR13)
+    // VR Constants
+    VectorRegister  vec_8s             = VR0;
+    VectorRegister  vec_31s            = VR1;
+    VectorRegister  vec_base64_00_15   = VR2;
+    VectorRegister  vec_base64_16_31   = VR3;
+    VectorRegister  vec_base64_32_47   = VR4;
+    VectorRegister  vec_base64_48_63   = VR5;
+    VectorRegister  expand_rshift      = VR6;
+    VectorRegister  expand_rshift_mask = VR7;
+    VectorRegister  expand_lshift      = VR8;
+    VectorRegister  expand_lshift_mask = VR9;
+
+    // VR variables for expand
+    VectorRegister  input              = VR10;
+    VectorRegister  rshift             = VR11;
+    VectorRegister  lshift             = VR12;
+    VectorRegister  expanded           = VR13;
+
+    // VR variables for lookup
+    VectorRegister  encoded_00_31      = VR10; // (reuse input)
+    VectorRegister  encoded_32_63      = VR11; // (reuse rshift)
+    VectorRegister  gt_31              = VR12; // (reuse lshift)
+
+    // VSR Constants
+    VectorSRegister expand_permute     = VSR0;
+
+    Label not_URL, calculate_size, calculate_blocked_size, skip_loop;
+    Label loop_start, le_16_to_write, no_pad, one_pad_char;
+
+    // The upper 32 bits of the non-pointer parameter registers are not
+    // guaranteed to be zero, so mask off those upper bits.
+    __ clrldi(sp, sp, 32);
+    __ clrldi(sl, sl, 32);
+    __ clrldi(dp, dp, 32);
+    __ clrldi(isURL, isURL, 32);
+
+    // load up the constants
+    __ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
+    __ lxv(expand_permute,               BLK_OFFSETOF(expand_permute_val),     const_ptr);
+    __ lxv(expand_rshift->to_vsr(),      BLK_OFFSETOF(expand_rshift_val),      const_ptr);
+    __ lxv(expand_rshift_mask->to_vsr(), BLK_OFFSETOF(expand_rshift_mask_val), const_ptr);
+    __ lxv(expand_lshift->to_vsr(),      BLK_OFFSETOF(expand_lshift_val),      const_ptr);
+    __ lxv(expand_lshift_mask->to_vsr(), BLK_OFFSETOF(expand_lshift_mask_val), const_ptr);
+    __ lxv(vec_base64_00_15->to_vsr(),   BLK_OFFSETOF(base64_00_15_val),       const_ptr);
+    __ lxv(vec_base64_16_31->to_vsr(),   BLK_OFFSETOF(base64_16_31_val),       const_ptr);
+    __ lxv(vec_base64_32_47->to_vsr(),   BLK_OFFSETOF(base64_32_47_val),       const_ptr);
+
+    // Splat the constants that can use xxspltib
+    __ xxspltib(vec_8s->to_vsr(), 8);
+    __ xxspltib(vec_31s->to_vsr(), 31);
+
+
+    // Use a different translation lookup table depending on the
+    // setting of isURL
+    __ cmpdi(CCR0, isURL, 0);
+    __ beq(CCR0, not_URL);
+    __ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_URL_val), const_ptr);
+    __ b(calculate_size);
+
+    __ bind(not_URL);
+    __ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_val), const_ptr);
+
+    __ bind(calculate_size);
+
+    // size = sl - sp - 4 (*)
+    // (*) Don't process the last four bytes in the main loop because
+    // we don't want the lxv instruction to read past the end of the src
+    // data, in case those four bytes are on the start of an unmapped or
+    // otherwise inaccessible page.
+    //
+    __ sub(size, sl, sp);
+    __ subi(size, size, 4);
+    __ cmpdi(CCR7, size, block_size);
+    __ bgt(CCR7, calculate_blocked_size);
+    __ mr(remaining, size);
+    // Add the 4 back into remaining again
+    __ addi(remaining, remaining, 4);
+    // make "in" point to the beginning of the source data: in = src + sp
+    __ add(in, src, sp);
+    // out = dst + dp
+    __ add(out, dst, dp);
+    __ b(skip_loop);
+
+    __ bind(calculate_blocked_size);
+    __ li(block_modulo, block_size);
+    // num_blocks = size / block_modulo
+    __ divwu(num_blocks, size, block_modulo);
+    // blocked_size = num_blocks * block_modulo
+    __ mullw(blocked_size, num_blocks, block_modulo);
+    // remaining = size - blocked_size
+    __ sub(remaining, size, blocked_size);
+    __ mtctr(num_blocks);
+
+    // Add the 4 back in to remaining again
+    __ addi(remaining, remaining, 4);
+
+    // make "in" point to the beginning of the source data: in = src + sp
+    __ add(in, src, sp);
+
+    // out = dst + dp
+    __ add(out, dst, dp);
+
+    __ align(32);
+    __ bind(loop_start);
+
+    __ lxv(input->to_vsr(), 0, in);
+
+    ENCODE_CORE
+
+    __ stxv(expanded->to_vsr(), 0, out);
+    __ addi(in, in, 12);
+    __ addi(out, out, 16);
+    __ bdnz(loop_start);
+
+    __ bind(skip_loop);
+
+    // When there are less than 16 bytes left, we need to be careful not to
+    // read beyond the end of the src buffer, which might be in an unmapped
+    // page.
+    // Load the remaining bytes using lxvl.
+    __ rldicr(tmp1, remaining, 56, 7); // shift the byte count into the upper 8 bits for lxvl
+    __ lxvl(input->to_vsr(), in, tmp1);
+
+    ENCODE_CORE
+
+    // bytes_to_write = ((remaining * 4) + 2) / 3
+    __ li(three, 3);
+    __ rlwinm(bytes_to_write, remaining, 2, 0, 29); // remaining * 4
+    __ addi(bytes_to_write, bytes_to_write, 2);
+    __ divwu(bytes_to_write, bytes_to_write, three);
+
+    __ cmpwi(CCR7, bytes_to_write, 16);
+    __ ble_predict_taken(CCR7, le_16_to_write);
+    __ stxv(expanded->to_vsr(), 0, out);
+
+    // We've processed 12 of the 13-15 data bytes, so advance the pointers,
+    // and do one final pass for the remaining 1-3 bytes.
+    __ addi(in, in, 12);
+    __ addi(out, out, 16);
+    __ subi(remaining, remaining, 12);
+    __ subi(bytes_to_write, bytes_to_write, 16);
+    __ rldicr(tmp1, remaining, 56, 7); // load only the input bytes that are left (not the output count), as in the first pass
+    __ lxvl(input->to_vsr(), in, tmp1);
+
+    ENCODE_CORE
+
+    __ bind(le_16_to_write);
+    // shift bytes_to_write into the upper 8 bits of tmp1 for use by stxvl
+    __ rldicr(tmp1, bytes_to_write, 56, 7);
+    __ stxvl(expanded->to_vsr(), out, tmp1);
+    __ add(out, out, bytes_to_write);
+
+    __ li(pad_char, '=');
+    __ rlwinm_(modulo_chars, bytes_to_write, 0, 30, 31); // bytes_to_write % 4, set CCR0
+    // Examples:
+    //    remaining  bytes_to_write  modulo_chars  num pad chars
+    //        0            0               0            0
+    //        1            2               2            2
+    //        2            3               3            1
+    //        3            4               0            0
+    //        4            6               2            2
+    //        5            7               3            1
+    //        ...
+    //       12           16               0            0
+    //       13           18               2            2
+    //       14           19               3            1
+    //       15           20               0            0
+    __ beq(CCR0, no_pad);
+    __ cmpwi(CCR7, modulo_chars, 3);
+    __ beq(CCR7, one_pad_char);
+
+    // two pad chars
+    __ stb(pad_char, out);
+    __ addi(out, out, 1);
+
+    __ bind(one_pad_char);
+    __ stb(pad_char, out);
+
+    __ bind(no_pad);
+
+    __ blr();
+    return start;
+  }
+
 #endif // VM_LITTLE_ENDIAN

  // Initialization
@ -4121,6 +4537,7 @@ class StubGenerator: public StubCodeGenerator {
    // Currently supported on PPC64LE only
    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
+      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
    }
 #endif
  }