8154156: PPC64: improve array copy stubs by using vector instructions

Reviewed-by: goetz, mdoerr
2026-04-14 08:58:46 +00:00 · 2016-05-23 10:35:51 -03:00 · 2016-05-23 10:35:51 -03:00 · 066208e368
commit 066208e368
parent 05540f90da
7 changed files with 214 additions and 28 deletions
--- a/hotspot/src/cpu/ppc/vm/assembler_ppc.hpp
+++ b/hotspot/src/cpu/ppc/vm/assembler_ppc.hpp
@ -503,6 +503,10 @@ class Assembler : public AbstractAssembler {
    LVSL_OPCODE    = (31u << OPCODE_SHIFT |    6u << 1),
    LVSR_OPCODE    = (31u << OPCODE_SHIFT |   38u << 1),

+    // Vector-Scalar (VSX) instruction support.
+    LXVD2X_OPCODE  = (31u << OPCODE_SHIFT |  844u << 1),
+    STXVD2X_OPCODE = (31u << OPCODE_SHIFT |  972u << 1),
+
    // Vector Permute and Formatting
    VPKPX_OPCODE   = (4u  << OPCODE_SHIFT |  782u     ),
    VPKSHSS_OPCODE = (4u  << OPCODE_SHIFT |  398u     ),
@ -1085,6 +1089,19 @@ class Assembler : public AbstractAssembler {
  static int vrs(   VectorRegister r)  { return  vrs(r->encoding());}
  static int vrt(   VectorRegister r)  { return  vrt(r->encoding());}

+  // Support Vector-Scalar (VSX) instructions.
+  static int vsra(      int         x)  { return  opp_u_field(x,            15, 11); }
+  static int vsrb(      int         x)  { return  opp_u_field(x,            20, 16); }
+  static int vsrc(      int         x)  { return  opp_u_field(x,            25, 21); }
+  static int vsrs(      int         x)  { return  opp_u_field(x,            10,  6); }
+  static int vsrt(      int         x)  { return  opp_u_field(x,            10,  6); }
+
+  static int vsra(   VectorSRegister r)  { return  vsra(r->encoding());}
+  static int vsrb(   VectorSRegister r)  { return  vsrb(r->encoding());}
+  static int vsrc(   VectorSRegister r)  { return  vsrc(r->encoding());}
+  static int vsrs(   VectorSRegister r)  { return  vsrs(r->encoding());}
+  static int vsrt(   VectorSRegister r)  { return  vsrt(r->encoding());}
+
  static int vsplt_uim( int        x)  { return  opp_u_field(x,             15, 12); } // for vsplt* instructions
  static int vsplti_sim(int        x)  { return  opp_u_field(x,             15, 11); } // for vsplti* instructions
  static int vsldoi_shb(int        x)  { return  opp_u_field(x,             25, 22); } // for vsldoi instruction
@ -2065,6 +2082,10 @@ class Assembler : public AbstractAssembler {
  inline void mtvscr(   VectorRegister b);
  inline void mfvscr(   VectorRegister d);

+  // Vector-Scalar (VSX) instructions.
+  inline void lxvd2x(   VectorSRegister d, Register a, Register b);
+  inline void stxvd2x(  VectorSRegister d, Register a, Register b);
+
  // AES (introduced with Power 8)
  inline void vcipher(     VectorRegister d, VectorRegister a, VectorRegister b);
  inline void vcipherlast( VectorRegister d, VectorRegister a, VectorRegister b);
--- a/hotspot/src/cpu/ppc/vm/assembler_ppc.inline.hpp
+++ b/hotspot/src/cpu/ppc/vm/assembler_ppc.inline.hpp
@ -721,6 +721,10 @@ inline void Assembler::stvxl( VectorRegister d, Register s1, Register s2) { emit
 inline void Assembler::lvsl(  VectorRegister d, Register s1, Register s2) { emit_int32( LVSL_OPCODE   | vrt(d) | ra0mem(s1) | rb(s2)); }
 inline void Assembler::lvsr(  VectorRegister d, Register s1, Register s2) { emit_int32( LVSR_OPCODE   | vrt(d) | ra0mem(s1) | rb(s2)); }

+// Vector-Scalar (VSX) instructions.
+inline void Assembler::lxvd2x (VectorSRegister d, Register s1, Register s2) { emit_int32( LXVD2X_OPCODE  | vsrt(d) | ra(s1) | rb(s2)); }
+inline void Assembler::stxvd2x(VectorSRegister d, Register s1, Register s2) { emit_int32( STXVD2X_OPCODE | vsrt(d) | ra(s1) | rb(s2)); }
+
 inline void Assembler::vpkpx(   VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKPX_OPCODE   | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vpkshss( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKSHSS_OPCODE | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vpkswss( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKSWSS_OPCODE | vrt(d) | vra(a) | vrb(b)); }
--- a/hotspot/src/cpu/ppc/vm/register_ppc.cpp
+++ b/hotspot/src/cpu/ppc/vm/register_ppc.cpp
@ -75,3 +75,14 @@ const char* VectorRegisterImpl::name() const {
  };
  return is_valid() ? names[encoding()] : "vnoreg";
 }
+
+const char* VectorSRegisterImpl::name() const {
+  const char* names[number_of_registers] = {
+    "VSR0",  "VSR1",  "VSR2",  "VSR3",  "VSR4",  "VSR5",  "VSR6",  "VSR7",
+    "VSR8",  "VSR9",  "VSR10", "VSR11", "VSR12", "VSR13", "VSR14", "VSR15",
+    "VSR16", "VSR17", "VSR18", "VSR19", "VSR20", "VSR21", "VSR22", "VSR23",
+    "VSR24", "VSR25", "VSR26", "VSR27", "VSR28", "VSR29", "VSR30", "VSR31"
+  };
+  return is_valid() ? names[encoding()] : "vsnoreg";
+}
+
--- a/hotspot/src/cpu/ppc/vm/register_ppc.hpp
+++ b/hotspot/src/cpu/ppc/vm/register_ppc.hpp
@ -491,6 +491,106 @@ CONSTANT_REGISTER_DECLARATION(VectorRegister, VR31, (31));
 #endif // DONT_USE_REGISTER_DEFINES


+// Use VectorSRegister as a shortcut.
+class VectorSRegisterImpl;
+typedef VectorSRegisterImpl* VectorSRegister;
+
+inline VectorSRegister as_VectorSRegister(int encoding) {
+  return (VectorSRegister)(intptr_t)encoding;
+}
+
+// The implementation of Vector-Scalar (VSX) registers on POWER architecture.
+class VectorSRegisterImpl: public AbstractRegisterImpl {
+ public:
+  enum {
+    number_of_registers = 32
+  };
+
+  // construction
+  inline friend VectorSRegister as_VectorSRegister(int encoding);
+
+  // accessors
+  int encoding() const { assert(is_valid(), "invalid register"); return value(); }
+
+  // testers
+  bool is_valid() const { return 0 <=  value() &&  value() < number_of_registers; }
+
+  const char* name() const;
+};
+
+// The Vector-Scalar (VSX) registers of the POWER architecture.
+
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, vsnoreg, (-1));
+
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR0,  ( 0));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR1,  ( 1));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR2,  ( 2));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR3,  ( 3));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR4,  ( 4));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR5,  ( 5));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR6,  ( 6));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR7,  ( 7));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR8,  ( 8));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR9,  ( 9));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR10, (10));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR11, (11));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR12, (12));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR13, (13));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR14, (14));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR15, (15));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR16, (16));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR17, (17));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR18, (18));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR19, (19));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR20, (20));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR21, (21));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR22, (22));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR23, (23));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR24, (24));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR25, (25));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR26, (26));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR27, (27));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR28, (28));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR29, (29));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR30, (30));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR31, (31));
+
+#ifndef DONT_USE_REGISTER_DEFINES
+#define vsnoregi ((VectorSRegister)(vsnoreg_VectorSRegisterEnumValue))
+#define VSR0    ((VectorSRegister)(   VSR0_VectorSRegisterEnumValue))
+#define VSR1    ((VectorSRegister)(   VSR1_VectorSRegisterEnumValue))
+#define VSR2    ((VectorSRegister)(   VSR2_VectorSRegisterEnumValue))
+#define VSR3    ((VectorSRegister)(   VSR3_VectorSRegisterEnumValue))
+#define VSR4    ((VectorSRegister)(   VSR4_VectorSRegisterEnumValue))
+#define VSR5    ((VectorSRegister)(   VSR5_VectorSRegisterEnumValue))
+#define VSR6    ((VectorSRegister)(   VSR6_VectorSRegisterEnumValue))
+#define VSR7    ((VectorSRegister)(   VSR7_VectorSRegisterEnumValue))
+#define VSR8    ((VectorSRegister)(   VSR8_VectorSRegisterEnumValue))
+#define VSR9    ((VectorSRegister)(   VSR9_VectorSRegisterEnumValue))
+#define VSR10   ((VectorSRegister)(  VSR10_VectorSRegisterEnumValue))
+#define VSR11   ((VectorSRegister)(  VSR11_VectorSRegisterEnumValue))
+#define VSR12   ((VectorSRegister)(  VSR12_VectorSRegisterEnumValue))
+#define VSR13   ((VectorSRegister)(  VSR13_VectorSRegisterEnumValue))
+#define VSR14   ((VectorSRegister)(  VSR14_VectorSRegisterEnumValue))
+#define VSR15   ((VectorSRegister)(  VSR15_VectorSRegisterEnumValue))
+#define VSR16   ((VectorSRegister)(  VSR16_VectorSRegisterEnumValue))
+#define VSR17   ((VectorSRegister)(  VSR17_VectorSRegisterEnumValue))
+#define VSR18   ((VectorSRegister)(  VSR18_VectorSRegisterEnumValue))
+#define VSR19   ((VectorSRegister)(  VSR19_VectorSRegisterEnumValue))
+#define VSR20   ((VectorSRegister)(  VSR20_VectorSRegisterEnumValue))
+#define VSR21   ((VectorSRegister)(  VSR21_VectorSRegisterEnumValue))
+#define VSR22   ((VectorSRegister)(  VSR22_VectorSRegisterEnumValue))
+#define VSR23   ((VectorSRegister)(  VSR23_VectorSRegisterEnumValue))
+#define VSR24   ((VectorSRegister)(  VSR24_VectorSRegisterEnumValue))
+#define VSR25   ((VectorSRegister)(  VSR25_VectorSRegisterEnumValue))
+#define VSR26   ((VectorSRegister)(  VSR26_VectorSRegisterEnumValue))
+#define VSR27   ((VectorSRegister)(  VSR27_VectorSRegisterEnumValue))
+#define VSR28   ((VectorSRegister)(  VSR28_VectorSRegisterEnumValue))
+#define VSR29   ((VectorSRegister)(  VSR29_VectorSRegisterEnumValue))
+#define VSR30   ((VectorSRegister)(  VSR30_VectorSRegisterEnumValue))
+#define VSR31   ((VectorSRegister)(  VSR31_VectorSRegisterEnumValue))
+#endif // DONT_USE_REGISTER_DEFINES
+
 // Maximum number of incoming arguments that can be passed in i registers.
 const int PPC_ARGS_IN_REGS_NUM = 8;

--- a/hotspot/src/cpu/ppc/vm/stubGenerator_ppc.cpp
+++ b/hotspot/src/cpu/ppc/vm/stubGenerator_ppc.cpp
@ -1341,10 +1341,13 @@ class StubGenerator: public StubCodeGenerator {
    Register tmp3 = R8_ARG6;
    Register tmp4 = R9_ARG7;

+    VectorSRegister tmp_vsr1  = VSR1;
+    VectorSRegister tmp_vsr2  = VSR2;
+
    address start = __ function_entry();
    assert_positive_int(R5_ARG3);

-      Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
+    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;

    // don't try anything fancy if arrays don't have many elements
    __ li(tmp3, 0);
@ -1403,22 +1406,60 @@ class StubGenerator: public StubCodeGenerator {
      __ andi_(R5_ARG3, R5_ARG3, 15);
      __ mtctr(tmp1);

-      __ bind(l_8);
-      // Use unrolled version for mass copying (copy 16 elements a time).
-      // Load feeding store gets zero latency on Power6, however not on Power5.
-      // Therefore, the following sequence is made for the good of both.
-      __ ld(tmp1, 0, R3_ARG1);
-      __ ld(tmp2, 8, R3_ARG1);
-      __ ld(tmp3, 16, R3_ARG1);
-      __ ld(tmp4, 24, R3_ARG1);
-      __ std(tmp1, 0, R4_ARG2);
-      __ std(tmp2, 8, R4_ARG2);
-      __ std(tmp3, 16, R4_ARG2);
-      __ std(tmp4, 24, R4_ARG2);
-      __ addi(R3_ARG1, R3_ARG1, 32);
-      __ addi(R4_ARG2, R4_ARG2, 32);
-      __ bdnz(l_8);
-    }
+      if (!VM_Version::has_vsx()) {
+
+        __ bind(l_8);
+        // Use unrolled version for mass copying (copy 16 elements a time).
+        // Load feeding store gets zero latency on Power6, however not on Power5.
+        // Therefore, the following sequence is made for the good of both.
+        __ ld(tmp1, 0, R3_ARG1);
+        __ ld(tmp2, 8, R3_ARG1);
+        __ ld(tmp3, 16, R3_ARG1);
+        __ ld(tmp4, 24, R3_ARG1);
+        __ std(tmp1, 0, R4_ARG2);
+        __ std(tmp2, 8, R4_ARG2);
+        __ std(tmp3, 16, R4_ARG2);
+        __ std(tmp4, 24, R4_ARG2);
+        __ addi(R3_ARG1, R3_ARG1, 32);
+        __ addi(R4_ARG2, R4_ARG2, 32);
+        __ bdnz(l_8);
+
+      } else { // Processor supports VSX, so use it to mass copy.
+
+        // Prefetch src data into L2 cache.
+        __ dcbt(R3_ARG1, 0);
+
+        // If supported set DSCR pre-fetch to deepest.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+          __ mtdscr(tmp2);
+        }
+        __ li(tmp1, 16);
+
+        // Backbranch target aligned to 32-byte. It's not aligned 16-byte
+        // as loop contains < 8 instructions that fit inside a single
+        // i-cache sector.
+        __ align(32);
+
+        __ bind(l_9);
+        // Use loop with VSX load/store instructions to
+        // copy 16 elements a time.
+        __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load from src.
+        __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst.
+        __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
+        __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
+        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
+        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32.
+        __ bdnz(l_9);                        // Dec CTR and loop if not zero.
+
+        // Restore DSCR pre-fetch value.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+          __ mtdscr(tmp2);
+        }
+
+      }
+    } // FasterArrayCopy
    __ bind(l_6);

    // copy 2 elements at a time
--- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp
+++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp
@ -38,7 +38,7 @@
 # include <sys/sysinfo.h>

 bool VM_Version::_is_determine_features_test_running = false;
-
+uint64_t VM_Version::_dscr_val = 0;

 #define MSG(flag)   \
  if (flag && !FLAG_IS_DEFAULT(flag))                                  \
@ -111,7 +111,7 @@ void VM_Version::initialize() {
  // Create and print feature-string.
  char buf[(num_features+1) * 16]; // Max 16 chars per feature.
  jio_snprintf(buf, sizeof(buf),
-               "ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s",
+               "ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
               (has_fsqrt()   ? " fsqrt"   : ""),
               (has_isel()    ? " isel"    : ""),
               (has_lxarxeh() ? " lxarxeh" : ""),
@ -125,7 +125,8 @@ void VM_Version::initialize() {
               (has_vcipher() ? " aes"     : ""),
               (has_vpmsumb() ? " vpmsumb" : ""),
               (has_tcheck()  ? " tcheck"  : ""),
-               (has_mfdscr()  ? " mfdscr"  : "")
+               (has_mfdscr()  ? " mfdscr"  : ""),
+               (has_vsx()     ? " vsx"     : "")
               // Make sure number of %s matches num_features!
              );
  _features_string = os::strdup(buf);
@ -643,6 +644,7 @@ void VM_Version::determine_features() {
  a->vpmsumb(VR0, VR1, VR2);                   // code[11] -> vpmsumb
  a->tcheck(0);                                // code[12] -> tcheck
  a->mfdscr(R0);                               // code[13] -> mfdscr
+  a->lxvd2x(VSR0, 0, R3_ARG1);                 // code[14] -> vsx
  a->blr();

  // Emit function to set one cache line to zero. Emit function descriptor and get pointer to it.
@ -691,6 +693,7 @@ void VM_Version::determine_features() {
  if (code[feature_cntr++]) features |= vpmsumb_m;
  if (code[feature_cntr++]) features |= tcheck_m;
  if (code[feature_cntr++]) features |= mfdscr_m;
+  if (code[feature_cntr++]) features |= vsx_m;

  // Print the detection code.
  if (PrintAssembly) {
@ -733,31 +736,31 @@ void VM_Version::config_dscr() {
  }

  // Apply the configuration if needed.
-  uint64_t dscr_val = (*get_dscr)();
+  _dscr_val = (*get_dscr)();
  if (Verbose) {
-    tty->print_cr("dscr value was 0x%lx" , dscr_val);
+    tty->print_cr("dscr value was 0x%lx" , _dscr_val);
  }
  bool change_requested = false;
  if (DSCR_PPC64 != (uintx)-1) {
-    dscr_val = DSCR_PPC64;
+    _dscr_val = DSCR_PPC64;
    change_requested = true;
  }
  if (DSCR_DPFD_PPC64 <= 7) {
    uint64_t mask = 0x7;
-    if ((dscr_val & mask) != DSCR_DPFD_PPC64) {
-      dscr_val = (dscr_val & ~mask) | (DSCR_DPFD_PPC64);
+    if ((_dscr_val & mask) != DSCR_DPFD_PPC64) {
+      _dscr_val = (_dscr_val & ~mask) | (DSCR_DPFD_PPC64);
      change_requested = true;
    }
  }
  if (DSCR_URG_PPC64 <= 7) {
    uint64_t mask = 0x7 << 6;
-    if ((dscr_val & mask) != DSCR_DPFD_PPC64 << 6) {
-      dscr_val = (dscr_val & ~mask) | (DSCR_URG_PPC64 << 6);
+    if ((_dscr_val & mask) != DSCR_DPFD_PPC64 << 6) {
+      _dscr_val = (_dscr_val & ~mask) | (DSCR_URG_PPC64 << 6);
      change_requested = true;
    }
  }
  if (change_requested) {
-    (*set_dscr)(dscr_val);
+    (*set_dscr)(_dscr_val);
    if (Verbose) {
      tty->print_cr("dscr was set to 0x%lx" , (*get_dscr)());
    }
--- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.hpp
+++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.hpp
@ -46,6 +46,7 @@ protected:
    vpmsumb,
    tcheck,
    mfdscr,
+    vsx,
    num_features // last entry to count features
  };
  enum Feature_Flag_Set {
@ -64,6 +65,7 @@ protected:
    vpmsumb_m             = (1 << vpmsumb),
    tcheck_m              = (1 << tcheck ),
    mfdscr_m              = (1 << mfdscr ),
+    vsx_m                 = (1 << vsx    ),
    all_features_m        = (unsigned long)-1
  };

@ -97,10 +99,14 @@ public:
  static bool has_vpmsumb() { return (_features & vpmsumb_m) != 0; }
  static bool has_tcheck()  { return (_features & tcheck_m) != 0; }
  static bool has_mfdscr()  { return (_features & mfdscr_m) != 0; }
+  static bool has_vsx()     { return (_features & vsx_m) != 0; }

  // Assembler testing
  static void allow_all();
  static void revert();
+
+  // POWER 8: DSCR current value.
+  static uint64_t _dscr_val;
 };

 #endif // CPU_PPC_VM_VM_VERSION_PPC_HPP