8154156: PPC64: improve array copy stubs by using vector instructions

Reviewed-by: goetz, mdoerr
This commit is contained in:
Gustavo Romero 2016-05-23 10:35:51 -03:00 committed by Martin Doerr
parent 05540f90da
commit 066208e368
7 changed files with 214 additions and 28 deletions

View File

@ -503,6 +503,10 @@ class Assembler : public AbstractAssembler {
LVSL_OPCODE = (31u << OPCODE_SHIFT | 6u << 1),
LVSR_OPCODE = (31u << OPCODE_SHIFT | 38u << 1),
// Vector-Scalar (VSX) instruction support.
LXVD2X_OPCODE = (31u << OPCODE_SHIFT | 844u << 1),
STXVD2X_OPCODE = (31u << OPCODE_SHIFT | 972u << 1),
// Vector Permute and Formatting
VPKPX_OPCODE = (4u << OPCODE_SHIFT | 782u ),
VPKSHSS_OPCODE = (4u << OPCODE_SHIFT | 398u ),
@ -1085,6 +1089,19 @@ class Assembler : public AbstractAssembler {
static int vrs( VectorRegister r) { return vrs(r->encoding());}
static int vrt( VectorRegister r) { return vrt(r->encoding());}
// Support Vector-Scalar (VSX) instructions.
// Operand-field encoders for VSX registers, mirroring the vra/vrb/vrs/vrt
// helpers used for VectorRegister operands.
// The int overloads hand the register number and a fixed bit range to
// opp_u_field(); the VectorSRegister overloads forward r->encoding() to the
// matching int overload.
// vsrs (source) and vsrt (target) use the same (10, 6) range.
// NOTE(review): every range is five bits wide, which matches the 32 registers
// declared by VectorSRegisterImpl; presumably no register-extension bit is
// needed for that set -- confirm before adding higher-numbered VSRs.
static int vsra( int x) { return opp_u_field(x, 15, 11); }
static int vsrb( int x) { return opp_u_field(x, 20, 16); }
static int vsrc( int x) { return opp_u_field(x, 25, 21); }
static int vsrs( int x) { return opp_u_field(x, 10, 6); }
static int vsrt( int x) { return opp_u_field(x, 10, 6); }
static int vsra( VectorSRegister r) { return vsra(r->encoding());}
static int vsrb( VectorSRegister r) { return vsrb(r->encoding());}
static int vsrc( VectorSRegister r) { return vsrc(r->encoding());}
static int vsrs( VectorSRegister r) { return vsrs(r->encoding());}
static int vsrt( VectorSRegister r) { return vsrt(r->encoding());}
static int vsplt_uim( int x) { return opp_u_field(x, 15, 12); } // for vsplt* instructions
static int vsplti_sim(int x) { return opp_u_field(x, 15, 11); } // for vsplti* instructions
static int vsldoi_shb(int x) { return opp_u_field(x, 25, 22); } // for vsldoi instruction
@ -2065,6 +2082,10 @@ class Assembler : public AbstractAssembler {
inline void mtvscr( VectorRegister b);
inline void mfvscr( VectorRegister d);
// Vector-Scalar (VSX) instructions.
// lxvd2x is used as a VSX load and stxvd2x as a VSX store by the array copy
// stub. 'd' is the VSX register operand; 'a' and 'b' are the two general
// registers that the inline definitions encode into the instruction word.
inline void lxvd2x( VectorSRegister d, Register a, Register b);
inline void stxvd2x( VectorSRegister d, Register a, Register b);
// AES (introduced with Power 8)
inline void vcipher( VectorRegister d, VectorRegister a, VectorRegister b);
inline void vcipherlast( VectorRegister d, VectorRegister a, VectorRegister b);

View File

@ -721,6 +721,10 @@ inline void Assembler::stvxl( VectorRegister d, Register s1, Register s2) { emit
inline void Assembler::lvsl( VectorRegister d, Register s1, Register s2) { emit_int32( LVSL_OPCODE | vrt(d) | ra0mem(s1) | rb(s2)); }
inline void Assembler::lvsr( VectorRegister d, Register s1, Register s2) { emit_int32( LVSR_OPCODE | vrt(d) | ra0mem(s1) | rb(s2)); }
// Vector-Scalar (VSX) instructions.
// Each emitter ORs its opcode with the VSX register field (vsrt) and the two
// general-register fields, then emits the resulting 32-bit word.
// The first address register goes through plain ra() rather than ra0mem() as
// lvsl/lvsr do; callers in this change pass 0 for s1.
inline void Assembler::lxvd2x(VectorSRegister d, Register s1, Register s2) {
  emit_int32(LXVD2X_OPCODE | vsrt(d) | ra(s1) | rb(s2));
}

inline void Assembler::stxvd2x(VectorSRegister d, Register s1, Register s2) {
  emit_int32(STXVD2X_OPCODE | vsrt(d) | ra(s1) | rb(s2));
}
inline void Assembler::vpkpx( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKPX_OPCODE | vrt(d) | vra(a) | vrb(b)); }
inline void Assembler::vpkshss( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKSHSS_OPCODE | vrt(d) | vra(a) | vrb(b)); }
inline void Assembler::vpkswss( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKSWSS_OPCODE | vrt(d) | vra(a) | vrb(b)); }

View File

@ -75,3 +75,14 @@ const char* VectorRegisterImpl::name() const {
};
return is_valid() ? names[encoding()] : "vnoreg";
}
// Returns the printable name of this VSX register ("VSR0".."VSR31"), or
// "vsnoreg" when the register is not valid (e.g. the vsnoreg sentinel).
const char* VectorSRegisterImpl::name() const {
  // Static and const: the table is constant data, so it no longer has to be
  // rebuilt on the stack each time a register name is requested.
  static const char* const names[number_of_registers] = {
    "VSR0",  "VSR1",  "VSR2",  "VSR3",  "VSR4",  "VSR5",  "VSR6",  "VSR7",
    "VSR8",  "VSR9",  "VSR10", "VSR11", "VSR12", "VSR13", "VSR14", "VSR15",
    "VSR16", "VSR17", "VSR18", "VSR19", "VSR20", "VSR21", "VSR22", "VSR23",
    "VSR24", "VSR25", "VSR26", "VSR27", "VSR28", "VSR29", "VSR30", "VSR31"
  };
  // Check validity first: encoding() asserts on an invalid register.
  return is_valid() ? names[encoding()] : "vsnoreg";
}

View File

@ -491,6 +491,106 @@ CONSTANT_REGISTER_DECLARATION(VectorRegister, VR31, (31));
#endif // DONT_USE_REGISTER_DEFINES
// VectorSRegister is the handle type used for VSX registers: a pointer to
// VectorSRegisterImpl whose bit pattern carries the register encoding.
class VectorSRegisterImpl;
typedef VectorSRegisterImpl* VectorSRegister;

// Converts a raw encoding into a VectorSRegister handle. The value is widened
// to pointer size first and then reinterpreted as the handle; no range check
// is performed here.
inline VectorSRegister as_VectorSRegister(int encoding) {
  const intptr_t raw_bits = (intptr_t)encoding;
  return (VectorSRegister)raw_bits;
}
// The implementation of Vector-Scalar (VSX) registers on POWER architecture.
// A register is identified by value(), inherited from AbstractRegisterImpl;
// valid registers have a value in [0, number_of_registers).
class VectorSRegisterImpl: public AbstractRegisterImpl {
 public:
  enum {
    number_of_registers = 32
  };

  // construction
  inline friend VectorSRegister as_VectorSRegister(int encoding);

  // testers
  // True when value() lies inside the declared register range.
  bool is_valid() const {
    return value() >= 0 && value() < number_of_registers;
  }

  // accessors
  // Register number for instruction encoding; asserts that the register is valid.
  int encoding() const {
    assert(is_valid(), "invalid register");
    return value();
  }

  // Printable register name (defined out of line).
  const char* name() const;
};
// The Vector-Scalar (VSX) registers of the POWER architecture.
// One constant is declared per register, with the register number as its
// value. vsnoreg (-1) is the "no register" sentinel: it lies outside the
// range accepted by VectorSRegisterImpl::is_valid().
CONSTANT_REGISTER_DECLARATION(VectorSRegister, vsnoreg, (-1));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR0, ( 0));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR1, ( 1));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR2, ( 2));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR3, ( 3));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR4, ( 4));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR5, ( 5));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR6, ( 6));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR7, ( 7));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR8, ( 8));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR9, ( 9));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR10, (10));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR11, (11));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR12, (12));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR13, (13));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR14, (14));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR15, (15));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR16, (16));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR17, (17));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR18, (18));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR19, (19));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR20, (20));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR21, (21));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR22, (22));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR23, (23));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR24, (24));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR25, (25));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR26, (26));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR27, (27));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR28, (28));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR29, (29));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR30, (30));
CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR31, (31));
// Macro shortcuts for the VSX register constants: unless
// DONT_USE_REGISTER_DEFINES is set, each register name expands directly to
// its enum value cast to VectorSRegister.
#ifndef DONT_USE_REGISTER_DEFINES
// The sentinel was originally defined only under the misspelled name
// 'vsnoregi', leaving 'vsnoreg' without a macro shortcut. Define the intended
// name and keep the old spelling as an alias so existing uses still compile.
#define vsnoreg ((VectorSRegister)(vsnoreg_VectorSRegisterEnumValue))
#define vsnoregi ((VectorSRegister)(vsnoreg_VectorSRegisterEnumValue))
#define VSR0 ((VectorSRegister)( VSR0_VectorSRegisterEnumValue))
#define VSR1 ((VectorSRegister)( VSR1_VectorSRegisterEnumValue))
#define VSR2 ((VectorSRegister)( VSR2_VectorSRegisterEnumValue))
#define VSR3 ((VectorSRegister)( VSR3_VectorSRegisterEnumValue))
#define VSR4 ((VectorSRegister)( VSR4_VectorSRegisterEnumValue))
#define VSR5 ((VectorSRegister)( VSR5_VectorSRegisterEnumValue))
#define VSR6 ((VectorSRegister)( VSR6_VectorSRegisterEnumValue))
#define VSR7 ((VectorSRegister)( VSR7_VectorSRegisterEnumValue))
#define VSR8 ((VectorSRegister)( VSR8_VectorSRegisterEnumValue))
#define VSR9 ((VectorSRegister)( VSR9_VectorSRegisterEnumValue))
#define VSR10 ((VectorSRegister)( VSR10_VectorSRegisterEnumValue))
#define VSR11 ((VectorSRegister)( VSR11_VectorSRegisterEnumValue))
#define VSR12 ((VectorSRegister)( VSR12_VectorSRegisterEnumValue))
#define VSR13 ((VectorSRegister)( VSR13_VectorSRegisterEnumValue))
#define VSR14 ((VectorSRegister)( VSR14_VectorSRegisterEnumValue))
#define VSR15 ((VectorSRegister)( VSR15_VectorSRegisterEnumValue))
#define VSR16 ((VectorSRegister)( VSR16_VectorSRegisterEnumValue))
#define VSR17 ((VectorSRegister)( VSR17_VectorSRegisterEnumValue))
#define VSR18 ((VectorSRegister)( VSR18_VectorSRegisterEnumValue))
#define VSR19 ((VectorSRegister)( VSR19_VectorSRegisterEnumValue))
#define VSR20 ((VectorSRegister)( VSR20_VectorSRegisterEnumValue))
#define VSR21 ((VectorSRegister)( VSR21_VectorSRegisterEnumValue))
#define VSR22 ((VectorSRegister)( VSR22_VectorSRegisterEnumValue))
#define VSR23 ((VectorSRegister)( VSR23_VectorSRegisterEnumValue))
#define VSR24 ((VectorSRegister)( VSR24_VectorSRegisterEnumValue))
#define VSR25 ((VectorSRegister)( VSR25_VectorSRegisterEnumValue))
#define VSR26 ((VectorSRegister)( VSR26_VectorSRegisterEnumValue))
#define VSR27 ((VectorSRegister)( VSR27_VectorSRegisterEnumValue))
#define VSR28 ((VectorSRegister)( VSR28_VectorSRegisterEnumValue))
#define VSR29 ((VectorSRegister)( VSR29_VectorSRegisterEnumValue))
#define VSR30 ((VectorSRegister)( VSR30_VectorSRegisterEnumValue))
#define VSR31 ((VectorSRegister)( VSR31_VectorSRegisterEnumValue))
#endif // DONT_USE_REGISTER_DEFINES
// Maximum number of incoming arguments that can be passed in i registers.
const int PPC_ARGS_IN_REGS_NUM = 8;

View File

@ -1341,10 +1341,13 @@ class StubGenerator: public StubCodeGenerator {
Register tmp3 = R8_ARG6;
Register tmp4 = R9_ARG7;
VectorSRegister tmp_vsr1 = VSR1;
VectorSRegister tmp_vsr2 = VSR2;
address start = __ function_entry();
assert_positive_int(R5_ARG3);
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
// don't try anything fancy if arrays don't have many elements
__ li(tmp3, 0);
@ -1403,22 +1406,60 @@ class StubGenerator: public StubCodeGenerator {
__ andi_(R5_ARG3, R5_ARG3, 15);
__ mtctr(tmp1);
__ bind(l_8);
// Use unrolled version for mass copying (copy 16 elements a time).
// Load feeding store gets zero latency on Power6, however not on Power5.
// Therefore, the following sequence is made for the good of both.
__ ld(tmp1, 0, R3_ARG1);
__ ld(tmp2, 8, R3_ARG1);
__ ld(tmp3, 16, R3_ARG1);
__ ld(tmp4, 24, R3_ARG1);
__ std(tmp1, 0, R4_ARG2);
__ std(tmp2, 8, R4_ARG2);
__ std(tmp3, 16, R4_ARG2);
__ std(tmp4, 24, R4_ARG2);
__ addi(R3_ARG1, R3_ARG1, 32);
__ addi(R4_ARG2, R4_ARG2, 32);
__ bdnz(l_8);
}
if (!VM_Version::has_vsx()) {
__ bind(l_8);
// Use unrolled version for mass copying (copy 16 elements a time).
// Load feeding store gets zero latency on Power6, however not on Power5.
// Therefore, the following sequence is made for the good of both.
__ ld(tmp1, 0, R3_ARG1);
__ ld(tmp2, 8, R3_ARG1);
__ ld(tmp3, 16, R3_ARG1);
__ ld(tmp4, 24, R3_ARG1);
__ std(tmp1, 0, R4_ARG2);
__ std(tmp2, 8, R4_ARG2);
__ std(tmp3, 16, R4_ARG2);
__ std(tmp4, 24, R4_ARG2);
__ addi(R3_ARG1, R3_ARG1, 32);
__ addi(R4_ARG2, R4_ARG2, 32);
__ bdnz(l_8);
} else { // Processor supports VSX, so use it to mass copy.
// Prefetch src data into L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Align the back-branch target to 32 bytes rather than 16 bytes:
// the loop contains fewer than 8 instructions, so with this alignment
// it fits inside a single i-cache sector.
__ align(32);
__ bind(l_9);
// Use loop with VSX load/store instructions to
// copy 16 elements a time.
__ lxvd2x(tmp_vsr1, 0, R3_ARG1); // Load from src.
__ stxvd2x(tmp_vsr1, 0, R4_ARG2); // Store to dst.
__ lxvd2x(tmp_vsr2, R3_ARG1, tmp1); // Load from src + 16.
__ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32.
__ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32.
__ bdnz(l_9); // Dec CTR and loop if not zero.
// Restore DSCR pre-fetch value.
if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val);
__ mtdscr(tmp2);
}
}
} // FasterArrayCopy
__ bind(l_6);
// copy 2 elements at a time

View File

@ -38,7 +38,7 @@
# include <sys/sysinfo.h>
bool VM_Version::_is_determine_features_test_running = false;
uint64_t VM_Version::_dscr_val = 0;
#define MSG(flag) \
if (flag && !FLAG_IS_DEFAULT(flag)) \
@ -111,7 +111,7 @@ void VM_Version::initialize() {
// Create and print feature-string.
char buf[(num_features+1) * 16]; // Max 16 chars per feature.
jio_snprintf(buf, sizeof(buf),
"ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s",
"ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
(has_fsqrt() ? " fsqrt" : ""),
(has_isel() ? " isel" : ""),
(has_lxarxeh() ? " lxarxeh" : ""),
@ -125,7 +125,8 @@ void VM_Version::initialize() {
(has_vcipher() ? " aes" : ""),
(has_vpmsumb() ? " vpmsumb" : ""),
(has_tcheck() ? " tcheck" : ""),
(has_mfdscr() ? " mfdscr" : "")
(has_mfdscr() ? " mfdscr" : ""),
(has_vsx() ? " vsx" : "")
// Make sure number of %s matches num_features!
);
_features_string = os::strdup(buf);
@ -643,6 +644,7 @@ void VM_Version::determine_features() {
a->vpmsumb(VR0, VR1, VR2); // code[11] -> vpmsumb
a->tcheck(0); // code[12] -> tcheck
a->mfdscr(R0); // code[13] -> mfdscr
a->lxvd2x(VSR0, 0, R3_ARG1); // code[14] -> vsx
a->blr();
// Emit function to set one cache line to zero. Emit function descriptor and get pointer to it.
@ -691,6 +693,7 @@ void VM_Version::determine_features() {
if (code[feature_cntr++]) features |= vpmsumb_m;
if (code[feature_cntr++]) features |= tcheck_m;
if (code[feature_cntr++]) features |= mfdscr_m;
if (code[feature_cntr++]) features |= vsx_m;
// Print the detection code.
if (PrintAssembly) {
@ -733,31 +736,31 @@ void VM_Version::config_dscr() {
}
// Apply the configuration if needed.
uint64_t dscr_val = (*get_dscr)();
_dscr_val = (*get_dscr)();
if (Verbose) {
tty->print_cr("dscr value was 0x%lx" , dscr_val);
tty->print_cr("dscr value was 0x%lx" , _dscr_val);
}
bool change_requested = false;
if (DSCR_PPC64 != (uintx)-1) {
dscr_val = DSCR_PPC64;
_dscr_val = DSCR_PPC64;
change_requested = true;
}
if (DSCR_DPFD_PPC64 <= 7) {
uint64_t mask = 0x7;
if ((dscr_val & mask) != DSCR_DPFD_PPC64) {
dscr_val = (dscr_val & ~mask) | (DSCR_DPFD_PPC64);
if ((_dscr_val & mask) != DSCR_DPFD_PPC64) {
_dscr_val = (_dscr_val & ~mask) | (DSCR_DPFD_PPC64);
change_requested = true;
}
}
if (DSCR_URG_PPC64 <= 7) {
uint64_t mask = 0x7 << 6;
if ((dscr_val & mask) != DSCR_DPFD_PPC64 << 6) {
dscr_val = (dscr_val & ~mask) | (DSCR_URG_PPC64 << 6);
if ((_dscr_val & mask) != DSCR_DPFD_PPC64 << 6) {
_dscr_val = (_dscr_val & ~mask) | (DSCR_URG_PPC64 << 6);
change_requested = true;
}
}
if (change_requested) {
(*set_dscr)(dscr_val);
(*set_dscr)(_dscr_val);
if (Verbose) {
tty->print_cr("dscr was set to 0x%lx" , (*get_dscr)());
}

View File

@ -46,6 +46,7 @@ protected:
vpmsumb,
tcheck,
mfdscr,
vsx,
num_features // last entry to count features
};
enum Feature_Flag_Set {
@ -64,6 +65,7 @@ protected:
vpmsumb_m = (1 << vpmsumb),
tcheck_m = (1 << tcheck ),
mfdscr_m = (1 << mfdscr ),
vsx_m = (1 << vsx ),
all_features_m = (unsigned long)-1
};
@ -97,10 +99,14 @@ public:
static bool has_vpmsumb() { return (_features & vpmsumb_m) != 0; }
static bool has_tcheck() { return (_features & tcheck_m) != 0; }
static bool has_mfdscr() { return (_features & mfdscr_m) != 0; }
// True when the vsx bit is set in _features. The bit is recorded by
// determine_features(), which probes the CPU with an lxvd2x instruction.
static bool has_vsx() { return (_features & vsx_m) != 0; }
// Assembler testing
static void allow_all();
static void revert();
// POWER 8: DSCR current value.
static uint64_t _dscr_val;
};
#endif // CPU_PPC_VM_VM_VERSION_PPC_HPP