mirror of
https://github.com/openjdk/jdk.git
synced 2026-03-14 18:03:44 +00:00
8318650: Optimized subword gather for x86 targets.
Reviewed-by: sviswanathan, epeter, psandoz
This commit is contained in:
parent
6d5699617f
commit
185e711bfe
@ -169,9 +169,7 @@ source %{
|
||||
case Op_VectorMaskGen:
|
||||
case Op_LoadVectorMasked:
|
||||
case Op_StoreVectorMasked:
|
||||
case Op_LoadVectorGather:
|
||||
case Op_StoreVectorScatter:
|
||||
case Op_LoadVectorGatherMasked:
|
||||
case Op_StoreVectorScatterMasked:
|
||||
case Op_PopulateIndex:
|
||||
case Op_CompressM:
|
||||
@ -180,6 +178,12 @@ source %{
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case Op_LoadVectorGather:
|
||||
case Op_LoadVectorGatherMasked:
|
||||
if (UseSVE == 0 || is_subword_type(bt)) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case Op_MulAddVS2VI:
|
||||
if (length_in_bytes != 16) {
|
||||
return false;
|
||||
|
||||
@ -133,6 +133,11 @@
|
||||
return true;
|
||||
}
|
||||
|
||||
// Does target support predicated operation emulation.
|
||||
static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Does the CPU supports vector variable rotate instructions?
|
||||
static constexpr bool supports_vector_variable_rotates(void) {
|
||||
return false;
|
||||
|
||||
@ -126,6 +126,11 @@
|
||||
return VM_Version::has_simd();
|
||||
}
|
||||
|
||||
// Does target support predicated operation emulation.
|
||||
static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Does the CPU supports vector variable rotate instructions?
|
||||
static constexpr bool supports_vector_variable_rotates(void) {
|
||||
return false; // not supported
|
||||
|
||||
@ -133,6 +133,11 @@
|
||||
return false;
|
||||
}
|
||||
|
||||
// Does target support predicated operation emulation.
|
||||
static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Does the CPU supports vector variable rotate instructions?
|
||||
static constexpr bool supports_vector_variable_rotates(void) {
|
||||
return false;
|
||||
|
||||
@ -132,6 +132,11 @@
|
||||
return false;
|
||||
}
|
||||
|
||||
// Does target support predicated operation emulation.
|
||||
static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Does the CPU supports vector variable rotate instructions?
|
||||
static constexpr bool supports_vector_variable_rotates(void) {
|
||||
return false;
|
||||
|
||||
@ -73,6 +73,11 @@ source %{
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case Op_LoadVectorGatherMasked:
|
||||
if (is_subword_type(bt)) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case Op_VectorCastHF2F:
|
||||
case Op_VectorCastF2HF:
|
||||
return UseZvfh;
|
||||
|
||||
@ -124,6 +124,11 @@
|
||||
return false;
|
||||
}
|
||||
|
||||
// Does target support predicated operation emulation.
|
||||
static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Does the CPU supports vector variable rotate instructions?
|
||||
static constexpr bool supports_vector_variable_rotates(void) {
|
||||
return false;
|
||||
|
||||
@ -13652,9 +13652,13 @@ void Assembler::notq(Register dst) {
|
||||
emit_int16((unsigned char)0xF7, (0xD0 | encode));
|
||||
}
|
||||
|
||||
void Assembler::btq(Register dst, Register src) {
|
||||
int encode = prefixq_and_encode(src->encoding(), dst->encoding());
|
||||
emit_int24(0x0F, (unsigned char)0xA3, (encode | 0xC0));
|
||||
}
|
||||
|
||||
void Assembler::btq(Register src, int imm8) {
|
||||
assert(isByte(imm8), "not a byte");
|
||||
InstructionMark im(this);
|
||||
int encode = prefixq_and_encode(src->encoding());
|
||||
emit_int16(0x0f, 0xba);
|
||||
emit_int8(0xe0|encode);
|
||||
|
||||
@ -1736,6 +1736,7 @@ private:
|
||||
void btrq(Address dst, int imm8);
|
||||
void btq(Register src, int imm8);
|
||||
#endif
|
||||
void btq(Register dst, Register src);
|
||||
|
||||
void orw(Register dst, Register src);
|
||||
|
||||
|
||||
@ -1796,6 +1796,130 @@ void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src,
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _LP64
|
||||
void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
|
||||
XMMRegister dst, Register base,
|
||||
Register idx_base,
|
||||
Register offset, Register mask,
|
||||
Register mask_idx, Register rtmp,
|
||||
int vlen_enc) {
|
||||
vpxor(dst, dst, dst, vlen_enc);
|
||||
if (elem_bt == T_SHORT) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
// dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
|
||||
Label skip_load;
|
||||
btq(mask, mask_idx);
|
||||
jccb(Assembler::carryClear, skip_load);
|
||||
movl(rtmp, Address(idx_base, i * 4));
|
||||
if (offset != noreg) {
|
||||
addl(rtmp, offset);
|
||||
}
|
||||
pinsrw(dst, Address(base, rtmp, Address::times_2), i);
|
||||
bind(skip_load);
|
||||
incq(mask_idx);
|
||||
}
|
||||
} else {
|
||||
assert(elem_bt == T_BYTE, "");
|
||||
for (int i = 0; i < 8; i++) {
|
||||
// dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
|
||||
Label skip_load;
|
||||
btq(mask, mask_idx);
|
||||
jccb(Assembler::carryClear, skip_load);
|
||||
movl(rtmp, Address(idx_base, i * 4));
|
||||
if (offset != noreg) {
|
||||
addl(rtmp, offset);
|
||||
}
|
||||
pinsrb(dst, Address(base, rtmp), i);
|
||||
bind(skip_load);
|
||||
incq(mask_idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // _LP64
|
||||
|
||||
void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
|
||||
Register base, Register idx_base,
|
||||
Register offset, Register rtmp,
|
||||
int vlen_enc) {
|
||||
vpxor(dst, dst, dst, vlen_enc);
|
||||
if (elem_bt == T_SHORT) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
// dst[i] = src[offset + idx_base[i]]
|
||||
movl(rtmp, Address(idx_base, i * 4));
|
||||
if (offset != noreg) {
|
||||
addl(rtmp, offset);
|
||||
}
|
||||
pinsrw(dst, Address(base, rtmp, Address::times_2), i);
|
||||
}
|
||||
} else {
|
||||
assert(elem_bt == T_BYTE, "");
|
||||
for (int i = 0; i < 8; i++) {
|
||||
// dst[i] = src[offset + idx_base[i]]
|
||||
movl(rtmp, Address(idx_base, i * 4));
|
||||
if (offset != noreg) {
|
||||
addl(rtmp, offset);
|
||||
}
|
||||
pinsrb(dst, Address(base, rtmp), i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Gather using hybrid algorithm, first partially unroll scalar loop
|
||||
* to accumulate values from gather indices into a quad-word(64bit) slice.
|
||||
* A slice may hold 8 bytes or 4 short values. This is followed by a vector
|
||||
* permutation to place the slice into appropriate vector lane
|
||||
* locations in destination vector. Following pseudo code describes the
|
||||
* algorithm in detail:
|
||||
*
|
||||
* DST_VEC = ZERO_VEC
|
||||
* PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
|
||||
* TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
|
||||
* FOREACH_ITER:
|
||||
* TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
|
||||
* TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
|
||||
* DST_VEC = DST_VEC OR TEMP_PERM_VEC
|
||||
* PERM_INDEX = PERM_INDEX - TWO_VEC
|
||||
*
|
||||
* With each iteration, doubleword permute indices (0,1) corresponding
|
||||
* to gathered quadword gets right shifted by two lane positions.
|
||||
*
|
||||
*/
|
||||
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
|
||||
Register base, Register idx_base,
|
||||
Register offset, Register mask,
|
||||
XMMRegister xtmp1, XMMRegister xtmp2,
|
||||
XMMRegister temp_dst, Register rtmp,
|
||||
Register mask_idx, Register length,
|
||||
int vector_len, int vlen_enc) {
|
||||
Label GATHER8_LOOP;
|
||||
assert(is_subword_type(elem_ty), "");
|
||||
movl(length, vector_len);
|
||||
vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
|
||||
vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
|
||||
vallones(xtmp2, vlen_enc);
|
||||
vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
|
||||
vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
|
||||
load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
|
||||
|
||||
bind(GATHER8_LOOP);
|
||||
// TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
|
||||
if (mask == noreg) {
|
||||
vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
|
||||
} else {
|
||||
LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
|
||||
}
|
||||
// TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
|
||||
vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
|
||||
// PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
|
||||
vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
|
||||
// DST_VEC = DST_VEC OR TEMP_PERM_VEC
|
||||
vpor(dst, dst, temp_dst, vlen_enc);
|
||||
addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1));
|
||||
subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
|
||||
jcc(Assembler::notEqual, GATHER8_LOOP);
|
||||
}
|
||||
|
||||
void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
|
||||
switch(typ) {
|
||||
case T_INT:
|
||||
|
||||
@ -500,4 +500,16 @@ public:
|
||||
void vector_rearrange_int_float(BasicType bt, XMMRegister dst, XMMRegister shuffle,
|
||||
XMMRegister src, int vlen_enc);
|
||||
|
||||
|
||||
void vgather_subword(BasicType elem_ty, XMMRegister dst, Register base, Register idx_base, Register offset,
|
||||
Register mask, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
|
||||
Register midx, Register length, int vector_len, int vlen_enc);
|
||||
|
||||
#ifdef _LP64
|
||||
void vgather8b_masked_offset(BasicType elem_bt, XMMRegister dst, Register base, Register idx_base,
|
||||
Register offset, Register mask, Register midx, Register rtmp, int vlen_enc);
|
||||
#endif
|
||||
void vgather8b_offset(BasicType elem_bt, XMMRegister dst, Register base, Register idx_base,
|
||||
Register offset, Register rtmp, int vlen_enc);
|
||||
|
||||
#endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP
|
||||
|
||||
@ -154,6 +154,16 @@
|
||||
return (UseAVX >= 2);
|
||||
}
|
||||
|
||||
// Does target support predicated operation emulation.
|
||||
static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) {
|
||||
switch(vopc) {
|
||||
case Op_LoadVectorGatherMasked:
|
||||
return is_subword_type(bt) && VM_Version::supports_avx2();
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Does the CPU supports vector variable rotate instructions?
|
||||
static constexpr bool supports_vector_variable_rotates(void) {
|
||||
return true;
|
||||
@ -214,6 +224,9 @@
|
||||
return 7;
|
||||
case Op_MulVL:
|
||||
return VM_Version::supports_avx512vldq() ? 0 : 6;
|
||||
case Op_LoadVectorGather:
|
||||
case Op_LoadVectorGatherMasked:
|
||||
return is_subword_type(ety) ? 50 : 0;
|
||||
case Op_VectorCastF2X: // fall through
|
||||
case Op_VectorCastD2X:
|
||||
return is_floating_point_type(ety) ? 0 : (is_subword_type(ety) ? 35 : 30);
|
||||
|
||||
@ -1569,6 +1569,7 @@ bool Matcher::match_rule_supported(int opcode) {
|
||||
}
|
||||
break;
|
||||
case Op_LoadVectorGather:
|
||||
case Op_LoadVectorGatherMasked:
|
||||
if (UseAVX < 2) {
|
||||
return false;
|
||||
}
|
||||
@ -1906,6 +1907,17 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
|
||||
}
|
||||
break;
|
||||
case Op_LoadVectorGatherMasked:
|
||||
if (!is_subword_type(bt) && size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
|
||||
return false;
|
||||
}
|
||||
if (is_subword_type(bt) &&
|
||||
(!is_LP64 ||
|
||||
(size_in_bits > 256 && !VM_Version::supports_avx512bw()) ||
|
||||
(size_in_bits < 64) ||
|
||||
(bt == T_SHORT && !VM_Version::supports_bmi2()))) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case Op_StoreVectorScatterMasked:
|
||||
case Op_StoreVectorScatter:
|
||||
if (is_subword_type(bt)) {
|
||||
@ -1915,7 +1927,10 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
|
||||
}
|
||||
// fallthrough
|
||||
case Op_LoadVectorGather:
|
||||
if (size_in_bits == 64 ) {
|
||||
if (!is_subword_type(bt) && size_in_bits == 64) {
|
||||
return false;
|
||||
}
|
||||
if (is_subword_type(bt) && size_in_bits < 64) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
@ -4024,10 +4039,11 @@ instruct storeV(memory mem, vec src) %{
|
||||
|
||||
// ---------------------------------------- Gather ------------------------------------
|
||||
|
||||
// Gather INT, LONG, FLOAT, DOUBLE
|
||||
// Gather BYTE, SHORT, INT, LONG, FLOAT, DOUBLE
|
||||
|
||||
instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
|
||||
predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
|
||||
predicate(!VM_Version::supports_avx512vl() && !is_subword_type(Matcher::vector_element_basic_type(n)) &&
|
||||
Matcher::vector_length_in_bytes(n) <= 32);
|
||||
match(Set dst (LoadVectorGather mem idx));
|
||||
effect(TEMP dst, TEMP tmp, TEMP mask);
|
||||
format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
|
||||
@ -4044,7 +4060,8 @@ instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
|
||||
|
||||
|
||||
instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
|
||||
predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
|
||||
predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
|
||||
!is_subword_type(Matcher::vector_element_basic_type(n)));
|
||||
match(Set dst (LoadVectorGather mem idx));
|
||||
effect(TEMP dst, TEMP tmp, TEMP ktmp);
|
||||
format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and ktmp as TEMP" %}
|
||||
@ -4059,7 +4076,8 @@ instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
|
||||
%}
|
||||
|
||||
instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
|
||||
predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
|
||||
predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
|
||||
!is_subword_type(Matcher::vector_element_basic_type(n)));
|
||||
match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
|
||||
effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
|
||||
format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and ktmp as TEMP" %}
|
||||
@ -4077,6 +4095,238 @@ instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRe
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegI rtmp) %{
|
||||
predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
|
||||
match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
|
||||
effect(TEMP tmp, TEMP rtmp);
|
||||
format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %}
|
||||
ins_encode %{
|
||||
int vlen_enc = vector_length_encoding(this);
|
||||
BasicType elem_bt = Matcher::vector_element_basic_type(this);
|
||||
__ lea($tmp$$Register, $mem$$Address);
|
||||
__ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp$$Register, vlen_enc);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegP idx_base_temp,
|
||||
vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
|
||||
predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
|
||||
match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
|
||||
effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
|
||||
format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
|
||||
ins_encode %{
|
||||
int vlen_enc = vector_length_encoding(this);
|
||||
int vector_len = Matcher::vector_length(this);
|
||||
BasicType elem_bt = Matcher::vector_element_basic_type(this);
|
||||
__ lea($tmp$$Register, $mem$$Address);
|
||||
__ movptr($idx_base_temp$$Register, $idx_base$$Register);
|
||||
__ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, noreg, $xtmp1$$XMMRegister,
|
||||
$xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vgather_subwordLE8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegI rtmp, rFlagsReg cr) %{
|
||||
predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
|
||||
match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
|
||||
effect(TEMP tmp, TEMP rtmp, KILL cr);
|
||||
format %{ "vector_gatherLE8_off $dst, $mem, $idx_base, $offset\t! using $tmp and $rtmp as TEMP" %}
|
||||
ins_encode %{
|
||||
int vlen_enc = vector_length_encoding(this);
|
||||
BasicType elem_bt = Matcher::vector_element_basic_type(this);
|
||||
__ lea($tmp$$Register, $mem$$Address);
|
||||
__ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, $rtmp$$Register, vlen_enc);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
|
||||
instruct vgather_subwordGT8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegP idx_base_temp,
|
||||
vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
|
||||
predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
|
||||
match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
|
||||
effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
|
||||
format %{ "vector_gatherGT8_off $dst, $mem, $idx_base, $offset\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
|
||||
ins_encode %{
|
||||
int vlen_enc = vector_length_encoding(this);
|
||||
int vector_len = Matcher::vector_length(this);
|
||||
BasicType elem_bt = Matcher::vector_element_basic_type(this);
|
||||
__ lea($tmp$$Register, $mem$$Address);
|
||||
__ movptr($idx_base_temp$$Register, $idx_base$$Register);
|
||||
__ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, noreg, $xtmp1$$XMMRegister,
|
||||
$xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
|
||||
#ifdef _LP64
|
||||
instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
|
||||
predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
|
||||
match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
|
||||
effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
|
||||
format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
|
||||
ins_encode %{
|
||||
int vlen_enc = vector_length_encoding(this);
|
||||
BasicType elem_bt = Matcher::vector_element_basic_type(this);
|
||||
__ xorq($mask_idx$$Register, $mask_idx$$Register);
|
||||
__ lea($tmp$$Register, $mem$$Address);
|
||||
__ kmovql($rtmp2$$Register, $mask$$KRegister);
|
||||
__ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
|
||||
vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
|
||||
predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
|
||||
match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
|
||||
effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
|
||||
format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
|
||||
ins_encode %{
|
||||
int vlen_enc = vector_length_encoding(this);
|
||||
int vector_len = Matcher::vector_length(this);
|
||||
BasicType elem_bt = Matcher::vector_element_basic_type(this);
|
||||
__ xorq($mask_idx$$Register, $mask_idx$$Register);
|
||||
__ lea($tmp$$Register, $mem$$Address);
|
||||
__ movptr($idx_base_temp$$Register, $idx_base$$Register);
|
||||
__ kmovql($rtmp2$$Register, $mask$$KRegister);
|
||||
__ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
|
||||
$xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vgather_masked_subwordLE8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
|
||||
predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
|
||||
match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
|
||||
effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
|
||||
format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
|
||||
ins_encode %{
|
||||
int vlen_enc = vector_length_encoding(this);
|
||||
BasicType elem_bt = Matcher::vector_element_basic_type(this);
|
||||
__ xorq($mask_idx$$Register, $mask_idx$$Register);
|
||||
__ lea($tmp$$Register, $mem$$Address);
|
||||
__ kmovql($rtmp2$$Register, $mask$$KRegister);
|
||||
__ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
|
||||
$rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vgather_masked_subwordGT8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
|
||||
vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
|
||||
predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
|
||||
match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
|
||||
effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
|
||||
format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
|
||||
ins_encode %{
|
||||
int vlen_enc = vector_length_encoding(this);
|
||||
int vector_len = Matcher::vector_length(this);
|
||||
BasicType elem_bt = Matcher::vector_element_basic_type(this);
|
||||
__ xorq($mask_idx$$Register, $mask_idx$$Register);
|
||||
__ lea($tmp$$Register, $mem$$Address);
|
||||
__ movptr($idx_base_temp$$Register, $idx_base$$Register);
|
||||
__ kmovql($rtmp2$$Register, $mask$$KRegister);
|
||||
__ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
|
||||
$xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
|
||||
predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
|
||||
match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
|
||||
effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
|
||||
format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
|
||||
ins_encode %{
|
||||
int vlen_enc = vector_length_encoding(this);
|
||||
BasicType elem_bt = Matcher::vector_element_basic_type(this);
|
||||
__ lea($tmp$$Register, $mem$$Address);
|
||||
__ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
|
||||
if (elem_bt == T_SHORT) {
|
||||
__ movl($mask_idx$$Register, 0x55555555);
|
||||
__ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
|
||||
}
|
||||
__ xorl($mask_idx$$Register, $mask_idx$$Register);
|
||||
__ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegP tmp, rRegP idx_base_temp,
|
||||
vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
|
||||
predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
|
||||
match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
|
||||
effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
|
||||
format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
|
||||
ins_encode %{
|
||||
int vlen_enc = vector_length_encoding(this);
|
||||
int vector_len = Matcher::vector_length(this);
|
||||
BasicType elem_bt = Matcher::vector_element_basic_type(this);
|
||||
__ lea($tmp$$Register, $mem$$Address);
|
||||
__ movptr($idx_base_temp$$Register, $idx_base$$Register);
|
||||
__ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
|
||||
if (elem_bt == T_SHORT) {
|
||||
__ movl($mask_idx$$Register, 0x55555555);
|
||||
__ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
|
||||
}
|
||||
__ xorl($mask_idx$$Register, $mask_idx$$Register);
|
||||
__ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
|
||||
$xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vgather_masked_subwordLE8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
|
||||
predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
|
||||
match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
|
||||
effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
|
||||
format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
|
||||
ins_encode %{
|
||||
int vlen_enc = vector_length_encoding(this);
|
||||
BasicType elem_bt = Matcher::vector_element_basic_type(this);
|
||||
__ lea($tmp$$Register, $mem$$Address);
|
||||
__ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
|
||||
if (elem_bt == T_SHORT) {
|
||||
__ movl($mask_idx$$Register, 0x55555555);
|
||||
__ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
|
||||
}
|
||||
__ xorl($mask_idx$$Register, $mask_idx$$Register);
|
||||
__ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
|
||||
$rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
instruct vgather_masked_subwordGT8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegP tmp, rRegP idx_base_temp,
|
||||
vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
|
||||
predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
|
||||
match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
|
||||
effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
|
||||
format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
|
||||
ins_encode %{
|
||||
int vlen_enc = vector_length_encoding(this);
|
||||
int vector_len = Matcher::vector_length(this);
|
||||
BasicType elem_bt = Matcher::vector_element_basic_type(this);
|
||||
__ xorl($mask_idx$$Register, $mask_idx$$Register);
|
||||
__ lea($tmp$$Register, $mem$$Address);
|
||||
__ movptr($idx_base_temp$$Register, $idx_base$$Register);
|
||||
__ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
|
||||
if (elem_bt == T_SHORT) {
|
||||
__ movl($mask_idx$$Register, 0x55555555);
|
||||
__ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
|
||||
}
|
||||
__ xorl($mask_idx$$Register, $mask_idx$$Register);
|
||||
__ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
|
||||
$xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
#endif
|
||||
|
||||
// ====================Scatter=======================================
|
||||
|
||||
// Scatter INT, LONG, FLOAT, DOUBLE
|
||||
|
||||
@ -1050,6 +1050,8 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
|
||||
} break;
|
||||
case Op_CountTrailingZerosV:
|
||||
case Op_CountLeadingZerosV:
|
||||
case Op_LoadVectorGather:
|
||||
case Op_LoadVectorGatherMasked:
|
||||
case Op_ReverseV:
|
||||
case Op_RoundVF:
|
||||
case Op_RoundVD:
|
||||
|
||||
@ -2474,7 +2474,22 @@ void Matcher::find_shared_post_visit(Node* n, uint opcode) {
|
||||
n->del_req(3);
|
||||
break;
|
||||
}
|
||||
case Op_LoadVectorGather:
|
||||
if (is_subword_type(n->bottom_type()->is_vect()->element_basic_type())) {
|
||||
Node* pair = new BinaryNode(n->in(MemNode::ValueIn), n->in(MemNode::ValueIn+1));
|
||||
n->set_req(MemNode::ValueIn, pair);
|
||||
n->del_req(MemNode::ValueIn+1);
|
||||
}
|
||||
break;
|
||||
case Op_LoadVectorGatherMasked:
|
||||
if (is_subword_type(n->bottom_type()->is_vect()->element_basic_type())) {
|
||||
Node* pair2 = new BinaryNode(n->in(MemNode::ValueIn + 1), n->in(MemNode::ValueIn + 2));
|
||||
Node* pair1 = new BinaryNode(n->in(MemNode::ValueIn), pair2);
|
||||
n->set_req(MemNode::ValueIn, pair1);
|
||||
n->del_req(MemNode::ValueIn+2);
|
||||
n->del_req(MemNode::ValueIn+1);
|
||||
break;
|
||||
} // fall-through
|
||||
case Op_StoreVectorScatter: {
|
||||
Node* pair = new BinaryNode(n->in(MemNode::ValueIn), n->in(MemNode::ValueIn+1));
|
||||
n->set_req(MemNode::ValueIn, pair);
|
||||
|
||||
@ -302,6 +302,7 @@ bool LibraryCallKit::arch_supports_vector(int sopc, int num_elem, BasicType type
|
||||
is_supported = Matcher::match_rule_supported_vector_masked(sopc, num_elem, type);
|
||||
}
|
||||
}
|
||||
is_supported |= Matcher::supports_vector_predicate_op_emulation(sopc, num_elem, type);
|
||||
|
||||
if (!is_supported) {
|
||||
#ifndef PRODUCT
|
||||
@ -1500,8 +1501,8 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) {
|
||||
}
|
||||
|
||||
// Check whether the predicated gather/scatter node is supported by architecture.
|
||||
if (!arch_supports_vector(is_scatter ? Op_StoreVectorScatterMasked : Op_LoadVectorGatherMasked, num_elem, elem_bt,
|
||||
(VectorMaskUseType) (VecMaskUseLoad | VecMaskUsePred))) {
|
||||
VectorMaskUseType mask = (VectorMaskUseType) (VecMaskUseLoad | VecMaskUsePred);
|
||||
if (!arch_supports_vector(is_scatter ? Op_StoreVectorScatterMasked : Op_LoadVectorGatherMasked, num_elem, elem_bt, mask)) {
|
||||
if (C->print_intrinsics()) {
|
||||
tty->print_cr(" ** not supported: arity=%d op=%s vlen=%d etype=%s is_masked_op=1",
|
||||
is_scatter, is_scatter ? "scatterMasked" : "gatherMasked",
|
||||
@ -1522,7 +1523,8 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) {
|
||||
}
|
||||
|
||||
// Check that the vector holding indices is supported by architecture
|
||||
if (!arch_supports_vector(Op_LoadVector, num_elem, T_INT, VecMaskNotUsed)) {
|
||||
// For sub-word gathers expander receive index array.
|
||||
if (!is_subword_type(elem_bt) && !arch_supports_vector(Op_LoadVector, num_elem, T_INT, VecMaskNotUsed)) {
|
||||
if (C->print_intrinsics()) {
|
||||
tty->print_cr(" ** not supported: arity=%d op=%s/loadindex vlen=%d etype=int is_masked_op=%d",
|
||||
is_scatter, is_scatter ? "scatter" : "gather",
|
||||
@ -1564,12 +1566,15 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Node* index_vect = nullptr;
|
||||
const TypeInstPtr* vbox_idx_type = TypeInstPtr::make_exact(TypePtr::NotNull, vbox_idx_klass);
|
||||
Node* index_vect = unbox_vector(argument(8), vbox_idx_type, T_INT, num_elem);
|
||||
if (index_vect == nullptr) {
|
||||
set_map(old_map);
|
||||
set_sp(old_sp);
|
||||
return false;
|
||||
if (!is_subword_type(elem_bt)) {
|
||||
index_vect = unbox_vector(argument(8), vbox_idx_type, T_INT, num_elem);
|
||||
if (index_vect == nullptr) {
|
||||
set_map(old_map);
|
||||
set_sp(old_sp);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
Node* mask = nullptr;
|
||||
@ -1608,10 +1613,23 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) {
|
||||
set_memory(vstore, addr_type);
|
||||
} else {
|
||||
Node* vload = nullptr;
|
||||
Node* index = argument(11);
|
||||
Node* indexMap = argument(12);
|
||||
Node* indexM = argument(13);
|
||||
if (mask != nullptr) {
|
||||
vload = gvn().transform(new LoadVectorGatherMaskedNode(control(), memory(addr), addr, addr_type, vector_type, index_vect, mask));
|
||||
if (is_subword_type(elem_bt)) {
|
||||
Node* index_arr_base = array_element_address(indexMap, indexM, T_INT);
|
||||
vload = gvn().transform(new LoadVectorGatherMaskedNode(control(), memory(addr), addr, addr_type, vector_type, index_arr_base, mask, index));
|
||||
} else {
|
||||
vload = gvn().transform(new LoadVectorGatherMaskedNode(control(), memory(addr), addr, addr_type, vector_type, index_vect, mask));
|
||||
}
|
||||
} else {
|
||||
vload = gvn().transform(new LoadVectorGatherNode(control(), memory(addr), addr, addr_type, vector_type, index_vect));
|
||||
if (is_subword_type(elem_bt)) {
|
||||
Node* index_arr_base = array_element_address(indexMap, indexM, T_INT);
|
||||
vload = gvn().transform(new LoadVectorGatherNode(control(), memory(addr), addr, addr_type, vector_type, index_arr_base, index));
|
||||
} else {
|
||||
vload = gvn().transform(new LoadVectorGatherNode(control(), memory(addr), addr, addr_type, vector_type, index_vect));
|
||||
}
|
||||
}
|
||||
Node* box = box_vector(vload, vbox_type, elem_bt, num_elem);
|
||||
set_result(box);
|
||||
|
||||
@ -890,16 +890,26 @@ class LoadVectorNode : public LoadNode {
|
||||
// Load Vector from memory via index map
|
||||
class LoadVectorGatherNode : public LoadVectorNode {
|
||||
public:
|
||||
LoadVectorGatherNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices)
|
||||
LoadVectorGatherNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices, Node* offset = nullptr)
|
||||
: LoadVectorNode(c, mem, adr, at, vt) {
|
||||
init_class_id(Class_LoadVectorGather);
|
||||
assert(indices->bottom_type()->is_vect(), "indices must be in vector");
|
||||
add_req(indices);
|
||||
assert(req() == MemNode::ValueIn + 1, "match_edge expects that last input is in MemNode::ValueIn");
|
||||
DEBUG_ONLY(bool is_subword = is_subword_type(vt->element_basic_type()));
|
||||
assert(is_subword || indices->bottom_type()->is_vect(), "indices must be in vector");
|
||||
assert(is_subword || !offset, "");
|
||||
assert(req() == MemNode::ValueIn + 1, "match_edge expects that index input is in MemNode::ValueIn");
|
||||
if (offset) {
|
||||
add_req(offset);
|
||||
}
|
||||
}
|
||||
|
||||
virtual int Opcode() const;
|
||||
virtual uint match_edge(uint idx) const { return idx == MemNode::Address || idx == MemNode::ValueIn; }
|
||||
virtual uint match_edge(uint idx) const {
|
||||
return idx == MemNode::Address ||
|
||||
idx == MemNode::ValueIn ||
|
||||
((is_subword_type(vect_type()->element_basic_type())) &&
|
||||
idx == MemNode::ValueIn + 1);
|
||||
}
|
||||
};
|
||||
|
||||
//------------------------------StoreVectorNode--------------------------------
|
||||
@ -1003,20 +1013,23 @@ class LoadVectorMaskedNode : public LoadVectorNode {
|
||||
// Load Vector from memory via index map under the influence of a predicate register(mask).
|
||||
class LoadVectorGatherMaskedNode : public LoadVectorNode {
|
||||
public:
|
||||
LoadVectorGatherMaskedNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices, Node* mask)
|
||||
LoadVectorGatherMaskedNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices, Node* mask, Node* offset = nullptr)
|
||||
: LoadVectorNode(c, mem, adr, at, vt) {
|
||||
init_class_id(Class_LoadVectorGatherMasked);
|
||||
assert(indices->bottom_type()->is_vect(), "indices must be in vector");
|
||||
assert(mask->bottom_type()->isa_vectmask(), "sanity");
|
||||
add_req(indices);
|
||||
add_req(mask);
|
||||
assert(req() == MemNode::ValueIn + 2, "match_edge expects that last input is in MemNode::ValueIn+1");
|
||||
if (is_subword_type(vt->element_basic_type())) {
|
||||
add_req(offset);
|
||||
}
|
||||
}
|
||||
|
||||
virtual int Opcode() const;
|
||||
virtual uint match_edge(uint idx) const { return idx == MemNode::Address ||
|
||||
idx == MemNode::ValueIn ||
|
||||
idx == MemNode::ValueIn + 1; }
|
||||
idx == MemNode::ValueIn + 1 ||
|
||||
(is_subword_type(vect_type()->is_vect()->element_basic_type()) &&
|
||||
idx == MemNode::ValueIn + 2); }
|
||||
};
|
||||
|
||||
//------------------------------StoreVectorScatterMaskedNode--------------------------------
|
||||
|
||||
@ -893,6 +893,12 @@ final class Byte128Vector extends ByteVector {
|
||||
return super.fromArray0Template(Byte128Mask.class, a, offset, (Byte128Mask) m, offsetInRange); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
ByteVector fromArray0(byte[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Byte> m) {
|
||||
return super.fromArray0Template(Byte128Mask.class, a, offset, indexMap, mapOffset, (Byte128Mask) m);
|
||||
}
|
||||
|
||||
|
||||
@ForceInline
|
||||
|
||||
@ -925,6 +925,12 @@ final class Byte256Vector extends ByteVector {
|
||||
return super.fromArray0Template(Byte256Mask.class, a, offset, (Byte256Mask) m, offsetInRange); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
ByteVector fromArray0(byte[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Byte> m) {
|
||||
return super.fromArray0Template(Byte256Mask.class, a, offset, indexMap, mapOffset, (Byte256Mask) m);
|
||||
}
|
||||
|
||||
|
||||
@ForceInline
|
||||
|
||||
@ -989,6 +989,12 @@ final class Byte512Vector extends ByteVector {
|
||||
return super.fromArray0Template(Byte512Mask.class, a, offset, (Byte512Mask) m, offsetInRange); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
ByteVector fromArray0(byte[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Byte> m) {
|
||||
return super.fromArray0Template(Byte512Mask.class, a, offset, indexMap, mapOffset, (Byte512Mask) m);
|
||||
}
|
||||
|
||||
|
||||
@ForceInline
|
||||
|
||||
@ -877,6 +877,12 @@ final class Byte64Vector extends ByteVector {
|
||||
return super.fromArray0Template(Byte64Mask.class, a, offset, (Byte64Mask) m, offsetInRange); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
ByteVector fromArray0(byte[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Byte> m) {
|
||||
return super.fromArray0Template(Byte64Mask.class, a, offset, indexMap, mapOffset, (Byte64Mask) m);
|
||||
}
|
||||
|
||||
|
||||
@ForceInline
|
||||
|
||||
@ -863,6 +863,12 @@ final class ByteMaxVector extends ByteVector {
|
||||
return super.fromArray0Template(ByteMaxMask.class, a, offset, (ByteMaxMask) m, offsetInRange); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
ByteVector fromArray0(byte[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Byte> m) {
|
||||
return super.fromArray0Template(ByteMaxMask.class, a, offset, indexMap, mapOffset, (ByteMaxMask) m);
|
||||
}
|
||||
|
||||
|
||||
@ForceInline
|
||||
|
||||
@ -3049,7 +3049,35 @@ public abstract class ByteVector extends AbstractVector<Byte> {
|
||||
byte[] a, int offset,
|
||||
int[] indexMap, int mapOffset) {
|
||||
ByteSpecies vsp = (ByteSpecies) species;
|
||||
return vsp.vOp(n -> a[offset + indexMap[mapOffset + n]]);
|
||||
IntVector.IntSpecies isp = IntVector.species(vsp.indexShape());
|
||||
Objects.requireNonNull(a);
|
||||
Objects.requireNonNull(indexMap);
|
||||
Class<? extends ByteVector> vectorType = vsp.vectorType();
|
||||
|
||||
|
||||
// Constant folding should sweep out following conditonal logic.
|
||||
VectorSpecies<Integer> lsp;
|
||||
if (isp.length() > IntVector.SPECIES_PREFERRED.length()) {
|
||||
lsp = IntVector.SPECIES_PREFERRED;
|
||||
} else {
|
||||
lsp = isp;
|
||||
}
|
||||
|
||||
// Check indices are within array bounds.
|
||||
for (int i = 0; i < vsp.length(); i += lsp.length()) {
|
||||
IntVector vix = IntVector
|
||||
.fromArray(lsp, indexMap, mapOffset + i)
|
||||
.add(offset);
|
||||
VectorIntrinsics.checkIndex(vix, a.length);
|
||||
}
|
||||
|
||||
return VectorSupport.loadWithMap(
|
||||
vectorType, null, byte.class, vsp.laneCount(),
|
||||
lsp.vectorType(),
|
||||
a, ARRAY_BASE, null, null,
|
||||
a, offset, indexMap, mapOffset, vsp,
|
||||
(c, idx, iMap, idy, s, vm) ->
|
||||
s.vOp(n -> c[idx + iMap[idy+n]]));
|
||||
}
|
||||
|
||||
/**
|
||||
@ -3094,8 +3122,13 @@ public abstract class ByteVector extends AbstractVector<Byte> {
|
||||
byte[] a, int offset,
|
||||
int[] indexMap, int mapOffset,
|
||||
VectorMask<Byte> m) {
|
||||
ByteSpecies vsp = (ByteSpecies) species;
|
||||
return vsp.vOp(m, n -> a[offset + indexMap[mapOffset + n]]);
|
||||
if (m.allTrue()) {
|
||||
return fromArray(species, a, offset, indexMap, mapOffset);
|
||||
}
|
||||
else {
|
||||
ByteSpecies vsp = (ByteSpecies) species;
|
||||
return vsp.dummyVector().fromArray0(a, offset, indexMap, mapOffset, m);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -3760,6 +3793,49 @@ public abstract class ByteVector extends AbstractVector<Byte> {
|
||||
(arr_, off_, i) -> arr_[off_ + i]));
|
||||
}
|
||||
|
||||
/*package-private*/
|
||||
abstract
|
||||
ByteVector fromArray0(byte[] a, int offset,
|
||||
int[] indexMap, int mapOffset,
|
||||
VectorMask<Byte> m);
|
||||
@ForceInline
|
||||
final
|
||||
<M extends VectorMask<Byte>>
|
||||
ByteVector fromArray0Template(Class<M> maskClass, byte[] a, int offset,
|
||||
int[] indexMap, int mapOffset, M m) {
|
||||
ByteSpecies vsp = vspecies();
|
||||
IntVector.IntSpecies isp = IntVector.species(vsp.indexShape());
|
||||
Objects.requireNonNull(a);
|
||||
Objects.requireNonNull(indexMap);
|
||||
m.check(vsp);
|
||||
Class<? extends ByteVector> vectorType = vsp.vectorType();
|
||||
|
||||
|
||||
// Constant folding should sweep out following conditonal logic.
|
||||
VectorSpecies<Integer> lsp;
|
||||
if (isp.length() > IntVector.SPECIES_PREFERRED.length()) {
|
||||
lsp = IntVector.SPECIES_PREFERRED;
|
||||
} else {
|
||||
lsp = isp;
|
||||
}
|
||||
|
||||
// Check indices are within array bounds.
|
||||
// FIXME: Check index under mask controlling.
|
||||
for (int i = 0; i < vsp.length(); i += lsp.length()) {
|
||||
IntVector vix = IntVector
|
||||
.fromArray(lsp, indexMap, mapOffset + i)
|
||||
.add(offset);
|
||||
VectorIntrinsics.checkIndex(vix, a.length);
|
||||
}
|
||||
|
||||
return VectorSupport.loadWithMap(
|
||||
vectorType, maskClass, byte.class, vsp.laneCount(),
|
||||
lsp.vectorType(),
|
||||
a, ARRAY_BASE, null, m,
|
||||
a, offset, indexMap, mapOffset, vsp,
|
||||
(c, idx, iMap, idy, s, vm) ->
|
||||
s.vOp(vm, n -> c[idx + iMap[idy+n]]));
|
||||
}
|
||||
|
||||
|
||||
/*package-private*/
|
||||
|
||||
@ -877,6 +877,12 @@ final class Short128Vector extends ShortVector {
|
||||
return super.fromArray0Template(Short128Mask.class, a, offset, (Short128Mask) m, offsetInRange); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
ShortVector fromArray0(short[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Short> m) {
|
||||
return super.fromArray0Template(Short128Mask.class, a, offset, indexMap, mapOffset, (Short128Mask) m);
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
|
||||
@ -893,6 +893,12 @@ final class Short256Vector extends ShortVector {
|
||||
return super.fromArray0Template(Short256Mask.class, a, offset, (Short256Mask) m, offsetInRange); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
ShortVector fromArray0(short[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Short> m) {
|
||||
return super.fromArray0Template(Short256Mask.class, a, offset, indexMap, mapOffset, (Short256Mask) m);
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
|
||||
@ -925,6 +925,12 @@ final class Short512Vector extends ShortVector {
|
||||
return super.fromArray0Template(Short512Mask.class, a, offset, (Short512Mask) m, offsetInRange); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
ShortVector fromArray0(short[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Short> m) {
|
||||
return super.fromArray0Template(Short512Mask.class, a, offset, indexMap, mapOffset, (Short512Mask) m);
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
|
||||
@ -869,6 +869,12 @@ final class Short64Vector extends ShortVector {
|
||||
return super.fromArray0Template(Short64Mask.class, a, offset, (Short64Mask) m, offsetInRange); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
ShortVector fromArray0(short[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Short> m) {
|
||||
return super.fromArray0Template(Short64Mask.class, a, offset, indexMap, mapOffset, (Short64Mask) m);
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
|
||||
@ -863,6 +863,12 @@ final class ShortMaxVector extends ShortVector {
|
||||
return super.fromArray0Template(ShortMaxMask.class, a, offset, (ShortMaxMask) m, offsetInRange); // specialize
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
ShortVector fromArray0(short[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Short> m) {
|
||||
return super.fromArray0Template(ShortMaxMask.class, a, offset, indexMap, mapOffset, (ShortMaxMask) m);
|
||||
}
|
||||
|
||||
@ForceInline
|
||||
@Override
|
||||
|
||||
@ -3050,7 +3050,35 @@ public abstract class ShortVector extends AbstractVector<Short> {
|
||||
short[] a, int offset,
|
||||
int[] indexMap, int mapOffset) {
|
||||
ShortSpecies vsp = (ShortSpecies) species;
|
||||
return vsp.vOp(n -> a[offset + indexMap[mapOffset + n]]);
|
||||
IntVector.IntSpecies isp = IntVector.species(vsp.indexShape());
|
||||
Objects.requireNonNull(a);
|
||||
Objects.requireNonNull(indexMap);
|
||||
Class<? extends ShortVector> vectorType = vsp.vectorType();
|
||||
|
||||
|
||||
// Constant folding should sweep out following conditonal logic.
|
||||
VectorSpecies<Integer> lsp;
|
||||
if (isp.length() > IntVector.SPECIES_PREFERRED.length()) {
|
||||
lsp = IntVector.SPECIES_PREFERRED;
|
||||
} else {
|
||||
lsp = isp;
|
||||
}
|
||||
|
||||
// Check indices are within array bounds.
|
||||
for (int i = 0; i < vsp.length(); i += lsp.length()) {
|
||||
IntVector vix = IntVector
|
||||
.fromArray(lsp, indexMap, mapOffset + i)
|
||||
.add(offset);
|
||||
VectorIntrinsics.checkIndex(vix, a.length);
|
||||
}
|
||||
|
||||
return VectorSupport.loadWithMap(
|
||||
vectorType, null, short.class, vsp.laneCount(),
|
||||
lsp.vectorType(),
|
||||
a, ARRAY_BASE, null, null,
|
||||
a, offset, indexMap, mapOffset, vsp,
|
||||
(c, idx, iMap, idy, s, vm) ->
|
||||
s.vOp(n -> c[idx + iMap[idy+n]]));
|
||||
}
|
||||
|
||||
/**
|
||||
@ -3095,8 +3123,13 @@ public abstract class ShortVector extends AbstractVector<Short> {
|
||||
short[] a, int offset,
|
||||
int[] indexMap, int mapOffset,
|
||||
VectorMask<Short> m) {
|
||||
ShortSpecies vsp = (ShortSpecies) species;
|
||||
return vsp.vOp(m, n -> a[offset + indexMap[mapOffset + n]]);
|
||||
if (m.allTrue()) {
|
||||
return fromArray(species, a, offset, indexMap, mapOffset);
|
||||
}
|
||||
else {
|
||||
ShortSpecies vsp = (ShortSpecies) species;
|
||||
return vsp.dummyVector().fromArray0(a, offset, indexMap, mapOffset, m);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -3746,6 +3779,49 @@ public abstract class ShortVector extends AbstractVector<Short> {
|
||||
(arr_, off_, i) -> arr_[off_ + i]));
|
||||
}
|
||||
|
||||
/*package-private*/
|
||||
abstract
|
||||
ShortVector fromArray0(short[] a, int offset,
|
||||
int[] indexMap, int mapOffset,
|
||||
VectorMask<Short> m);
|
||||
@ForceInline
|
||||
final
|
||||
<M extends VectorMask<Short>>
|
||||
ShortVector fromArray0Template(Class<M> maskClass, short[] a, int offset,
|
||||
int[] indexMap, int mapOffset, M m) {
|
||||
ShortSpecies vsp = vspecies();
|
||||
IntVector.IntSpecies isp = IntVector.species(vsp.indexShape());
|
||||
Objects.requireNonNull(a);
|
||||
Objects.requireNonNull(indexMap);
|
||||
m.check(vsp);
|
||||
Class<? extends ShortVector> vectorType = vsp.vectorType();
|
||||
|
||||
|
||||
// Constant folding should sweep out following conditonal logic.
|
||||
VectorSpecies<Integer> lsp;
|
||||
if (isp.length() > IntVector.SPECIES_PREFERRED.length()) {
|
||||
lsp = IntVector.SPECIES_PREFERRED;
|
||||
} else {
|
||||
lsp = isp;
|
||||
}
|
||||
|
||||
// Check indices are within array bounds.
|
||||
// FIXME: Check index under mask controlling.
|
||||
for (int i = 0; i < vsp.length(); i += lsp.length()) {
|
||||
IntVector vix = IntVector
|
||||
.fromArray(lsp, indexMap, mapOffset + i)
|
||||
.add(offset);
|
||||
VectorIntrinsics.checkIndex(vix, a.length);
|
||||
}
|
||||
|
||||
return VectorSupport.loadWithMap(
|
||||
vectorType, maskClass, short.class, vsp.laneCount(),
|
||||
lsp.vectorType(),
|
||||
a, ARRAY_BASE, null, m,
|
||||
a, offset, indexMap, mapOffset, vsp,
|
||||
(c, idx, iMap, idy, s, vm) ->
|
||||
s.vOp(vm, n -> c[idx + iMap[idy+n]]));
|
||||
}
|
||||
|
||||
/*package-private*/
|
||||
abstract
|
||||
|
||||
@ -3622,7 +3622,35 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
|
||||
$type$[] a, int offset,
|
||||
int[] indexMap, int mapOffset) {
|
||||
$Type$Species vsp = ($Type$Species) species;
|
||||
return vsp.vOp(n -> a[offset + indexMap[mapOffset + n]]);
|
||||
IntVector.IntSpecies isp = IntVector.species(vsp.indexShape());
|
||||
Objects.requireNonNull(a);
|
||||
Objects.requireNonNull(indexMap);
|
||||
Class<? extends $abstractvectortype$> vectorType = vsp.vectorType();
|
||||
|
||||
|
||||
// Constant folding should sweep out following conditonal logic.
|
||||
VectorSpecies<Integer> lsp;
|
||||
if (isp.length() > IntVector.SPECIES_PREFERRED.length()) {
|
||||
lsp = IntVector.SPECIES_PREFERRED;
|
||||
} else {
|
||||
lsp = isp;
|
||||
}
|
||||
|
||||
// Check indices are within array bounds.
|
||||
for (int i = 0; i < vsp.length(); i += lsp.length()) {
|
||||
IntVector vix = IntVector
|
||||
.fromArray(lsp, indexMap, mapOffset + i)
|
||||
.add(offset);
|
||||
VectorIntrinsics.checkIndex(vix, a.length);
|
||||
}
|
||||
|
||||
return VectorSupport.loadWithMap(
|
||||
vectorType, null, $type$.class, vsp.laneCount(),
|
||||
lsp.vectorType(),
|
||||
a, ARRAY_BASE, null, null,
|
||||
a, offset, indexMap, mapOffset, vsp,
|
||||
(c, idx, iMap, idy, s, vm) ->
|
||||
s.vOp(n -> c[idx + iMap[idy+n]]));
|
||||
}
|
||||
#else[byteOrShort]
|
||||
@ForceInline
|
||||
@ -3714,17 +3742,6 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
|
||||
* where the mask is set
|
||||
* @see $abstractvectortype$#toIntArray()
|
||||
*/
|
||||
#if[byteOrShort]
|
||||
@ForceInline
|
||||
public static
|
||||
$abstractvectortype$ fromArray(VectorSpecies<$Boxtype$> species,
|
||||
$type$[] a, int offset,
|
||||
int[] indexMap, int mapOffset,
|
||||
VectorMask<$Boxtype$> m) {
|
||||
$Type$Species vsp = ($Type$Species) species;
|
||||
return vsp.vOp(m, n -> a[offset + indexMap[mapOffset + n]]);
|
||||
}
|
||||
#else[byteOrShort]
|
||||
@ForceInline
|
||||
public static
|
||||
$abstractvectortype$ fromArray(VectorSpecies<$Boxtype$> species,
|
||||
@ -3739,7 +3756,6 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
|
||||
return vsp.dummyVector().fromArray0(a, offset, indexMap, mapOffset, m);
|
||||
}
|
||||
}
|
||||
#end[byteOrShort]
|
||||
|
||||
#if[short]
|
||||
/**
|
||||
@ -4793,12 +4809,51 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
|
||||
(arr_, off_, i) -> arr_[off_ + i]));
|
||||
}
|
||||
|
||||
#if[!byteOrShort]
|
||||
/*package-private*/
|
||||
abstract
|
||||
$abstractvectortype$ fromArray0($type$[] a, int offset,
|
||||
int[] indexMap, int mapOffset,
|
||||
VectorMask<$Boxtype$> m);
|
||||
#if[byteOrShort]
|
||||
@ForceInline
|
||||
final
|
||||
<M extends VectorMask<$Boxtype$>>
|
||||
$abstractvectortype$ fromArray0Template(Class<M> maskClass, $type$[] a, int offset,
|
||||
int[] indexMap, int mapOffset, M m) {
|
||||
$Type$Species vsp = vspecies();
|
||||
IntVector.IntSpecies isp = IntVector.species(vsp.indexShape());
|
||||
Objects.requireNonNull(a);
|
||||
Objects.requireNonNull(indexMap);
|
||||
m.check(vsp);
|
||||
Class<? extends $abstractvectortype$> vectorType = vsp.vectorType();
|
||||
|
||||
|
||||
// Constant folding should sweep out following conditonal logic.
|
||||
VectorSpecies<Integer> lsp;
|
||||
if (isp.length() > IntVector.SPECIES_PREFERRED.length()) {
|
||||
lsp = IntVector.SPECIES_PREFERRED;
|
||||
} else {
|
||||
lsp = isp;
|
||||
}
|
||||
|
||||
// Check indices are within array bounds.
|
||||
// FIXME: Check index under mask controlling.
|
||||
for (int i = 0; i < vsp.length(); i += lsp.length()) {
|
||||
IntVector vix = IntVector
|
||||
.fromArray(lsp, indexMap, mapOffset + i)
|
||||
.add(offset);
|
||||
VectorIntrinsics.checkIndex(vix, a.length);
|
||||
}
|
||||
|
||||
return VectorSupport.loadWithMap(
|
||||
vectorType, maskClass, $type$.class, vsp.laneCount(),
|
||||
lsp.vectorType(),
|
||||
a, ARRAY_BASE, null, m,
|
||||
a, offset, indexMap, mapOffset, vsp,
|
||||
(c, idx, iMap, idy, s, vm) ->
|
||||
s.vOp(vm, n -> c[idx + iMap[idy+n]]));
|
||||
}
|
||||
#else[byteOrShort]
|
||||
@ForceInline
|
||||
final
|
||||
<M extends VectorMask<$Boxtype$>>
|
||||
@ -4852,7 +4907,7 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
|
||||
(c, idx, iMap, idy, s, vm) ->
|
||||
s.vOp(vm, n -> c[idx + iMap[idy+n]]));
|
||||
}
|
||||
#end[!byteOrShort]
|
||||
#end[byteOrShort]
|
||||
|
||||
#if[short]
|
||||
/*package-private*/
|
||||
|
||||
@ -1151,14 +1151,12 @@ final class $vectortype$ extends $abstractvectortype$ {
|
||||
return super.fromArray0Template($masktype$.class, a, offset, ($masktype$) m, offsetInRange); // specialize
|
||||
}
|
||||
|
||||
#if[!byteOrShort]
|
||||
@ForceInline
|
||||
@Override
|
||||
final
|
||||
$abstractvectortype$ fromArray0($type$[] a, int offset, int[] indexMap, int mapOffset, VectorMask<$Boxtype$> m) {
|
||||
return super.fromArray0Template($masktype$.class, a, offset, indexMap, mapOffset, ($masktype$) m);
|
||||
}
|
||||
#end[!byteOrShort]
|
||||
|
||||
#if[short]
|
||||
@ForceInline
|
||||
|
||||
@ -0,0 +1,357 @@
|
||||
/*
|
||||
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
package org.openjdk.bench.jdk.incubator.vector;
|
||||
|
||||
import jdk.incubator.vector.*;
|
||||
import java.util.Random;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.openjdk.jmh.annotations.*;
|
||||
|
||||
@OutputTimeUnit(TimeUnit.MILLISECONDS)
|
||||
@State(Scope.Thread)
|
||||
@Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
|
||||
public class GatherOperationsBenchmark {
|
||||
@Param({"64", "256", "1024", "4096"})
|
||||
int SIZE;
|
||||
byte [] barr;
|
||||
byte [] bres;
|
||||
short [] sarr;
|
||||
short [] sres;
|
||||
int [] index;
|
||||
|
||||
static final VectorSpecies<Short> S64 = ShortVector.SPECIES_64;
|
||||
static final VectorSpecies<Short> S128 = ShortVector.SPECIES_128;
|
||||
static final VectorSpecies<Short> S256 = ShortVector.SPECIES_256;
|
||||
static final VectorSpecies<Short> S512 = ShortVector.SPECIES_512;
|
||||
static final VectorSpecies<Byte> B64 = ByteVector.SPECIES_64;
|
||||
static final VectorSpecies<Byte> B128 = ByteVector.SPECIES_128;
|
||||
static final VectorSpecies<Byte> B256 = ByteVector.SPECIES_256;
|
||||
static final VectorSpecies<Byte> B512 = ByteVector.SPECIES_512;
|
||||
|
||||
@Setup(Level.Trial)
|
||||
public void BmSetup() {
|
||||
Random r = new Random(1245);
|
||||
index = new int[SIZE];
|
||||
barr = new byte[SIZE];
|
||||
bres = new byte[SIZE];
|
||||
sarr = new short[SIZE];
|
||||
sres = new short[SIZE];
|
||||
for (int i = 0; i < SIZE; i++) {
|
||||
barr[i] = (byte)i;
|
||||
sarr[i] = (short)i;
|
||||
index[i] = r.nextInt(SIZE-1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Benchmark
|
||||
public void microByteGather64() {
|
||||
for (int i = 0; i < SIZE; i += B64.length()) {
|
||||
ByteVector.fromArray(B64, barr, 0, index, i)
|
||||
.intoArray(bres, i);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Benchmark
|
||||
public void microByteGather64_NZ_OFF() {
|
||||
for (int i = 0; i < SIZE; i += B64.length()) {
|
||||
ByteVector.fromArray(B64, barr, 1, index, i)
|
||||
.intoArray(bres, i);
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void microByteGather64_MASK() {
|
||||
VectorMask<Byte> VMASK = VectorMask.fromLong(B64, 0x5555555555555555L);
|
||||
for (int i = 0; i < SIZE; i += B64.length()) {
|
||||
ByteVector.fromArray(B64, barr, 0, index, i, VMASK)
|
||||
.intoArray(bres, i);
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void microByteGather64_MASK_NZ_OFF() {
|
||||
VectorMask<Byte> VMASK = VectorMask.fromLong(B64, 0x5555555555555555L);
|
||||
for (int i = 0; i < SIZE; i += B64.length()) {
|
||||
ByteVector.fromArray(B64, barr, 1, index, i, VMASK)
|
||||
.intoArray(bres, i);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Benchmark
|
||||
public void microByteGather128() {
|
||||
for (int i = 0; i < SIZE; i += B128.length()) {
|
||||
ByteVector.fromArray(B128, barr, 0, index, i)
|
||||
.intoArray(bres, i);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Benchmark
|
||||
public void microByteGather128_NZ_OFF() {
|
||||
for (int i = 0; i < SIZE; i += B128.length()) {
|
||||
ByteVector.fromArray(B128, barr, 1, index, i)
|
||||
.intoArray(bres, i);
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void microByteGather128_MASK() {
|
||||
VectorMask<Byte> VMASK = VectorMask.fromLong(B128, 0x5555555555555555L);
|
||||
for (int i = 0; i < SIZE; i += B128.length()) {
|
||||
ByteVector.fromArray(B128, barr, 0, index, i, VMASK)
|
||||
.intoArray(bres, i);
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void microByteGather128_MASK_NZ_OFF() {
|
||||
VectorMask<Byte> VMASK = VectorMask.fromLong(B128, 0x5555555555555555L);
|
||||
for (int i = 0; i < SIZE; i += B128.length()) {
|
||||
ByteVector.fromArray(B128, barr, 1, index, i, VMASK)
|
||||
.intoArray(bres, i);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Benchmark
|
||||
public void microByteGather256() {
|
||||
for (int i = 0; i < SIZE; i += B256.length()) {
|
||||
ByteVector.fromArray(B256, barr, 0, index, i)
|
||||
.intoArray(bres, i);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Benchmark
|
||||
public void microByteGather256_NZ_OFF() {
|
||||
for (int i = 0; i < SIZE; i += B256.length()) {
|
||||
ByteVector.fromArray(B256, barr, 1, index, i)
|
||||
.intoArray(bres, i);
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void microByteGather256_MASK() {
|
||||
VectorMask<Byte> VMASK = VectorMask.fromLong(B256, 0x5555555555555555L);
|
||||
for (int i = 0; i < SIZE; i += B256.length()) {
|
||||
ByteVector.fromArray(B256, barr, 0, index, i, VMASK)
|
||||
.intoArray(bres, i);
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void microByteGather256_MASK_NZ_OFF() {
|
||||
VectorMask<Byte> VMASK = VectorMask.fromLong(B256, 0x5555555555555555L);
|
||||
for (int i = 0; i < SIZE; i += B256.length()) {
|
||||
ByteVector.fromArray(B256, barr, 1, index, i, VMASK)
|
||||
.intoArray(bres, i);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Benchmark
|
||||
public void microByteGather512() {
|
||||
for (int i = 0; i < SIZE; i += B512.length()) {
|
||||
ByteVector.fromArray(B512, barr, 0, index, i)
|
||||
.intoArray(bres, i);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Benchmark
|
||||
public void microByteGather512_NZ_OFF() {
|
||||
for (int i = 0; i < SIZE; i += B512.length()) {
|
||||
ByteVector.fromArray(B512, barr, 1, index, i)
|
||||
.intoArray(bres, i);
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void microByteGather512_MASK() {
|
||||
VectorMask<Byte> VMASK = VectorMask.fromLong(B512, 0x5555555555555555L);
|
||||
for (int i = 0; i < SIZE; i += B512.length()) {
|
||||
ByteVector.fromArray(B512, barr, 0, index, i, VMASK)
|
||||
.intoArray(bres, i);
|
||||
}
|
||||
}
|
||||
|
||||
    @Benchmark
    public void microByteGather512_MASK_NZ_OFF() {
        // Masked 512-bit byte gather with a non-zero array base offset (1).
        VectorMask<Byte> VMASK = VectorMask.fromLong(B512, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += B512.length()) {
            ByteVector.fromArray(B512, barr, 1, index, i, VMASK)
                      .intoArray(bres, i);
        }
    }
|
||||
|
||||
|
||||
    @Benchmark
    public void microShortGather64() {
        // Unmasked short gather at 64-bit species: loads one vector from sarr
        // using offsets index[i .. i+len) (base offset 0), stores into sres.
        for (int i = 0; i < SIZE; i += S64.length()) {
            ShortVector.fromArray(S64, sarr, 0, index, i)
                       .intoArray(sres, i);
        }
    }
|
||||
|
||||
|
||||
    @Benchmark
    public void microShortGather64_NZ_OFF() {
        // 64-bit short gather with a non-zero array base offset of 1.
        for (int i = 0; i < SIZE; i += S64.length()) {
            ShortVector.fromArray(S64, sarr, 1, index, i)
                       .intoArray(sres, i);
        }
    }
|
||||
|
||||
    @Benchmark
    public void microShortGather64_MASK() {
        // Masked 64-bit short gather: 0x5555... enables alternating lanes,
        // so only even-numbered lanes are gathered from sarr.
        VectorMask<Short> VMASK = VectorMask.fromLong(S64, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S64.length()) {
            ShortVector.fromArray(S64, sarr, 0, index, i, VMASK)
                       .intoArray(sres, i);
        }
    }
|
||||
|
||||
    @Benchmark
    public void microShortGather64_MASK_NZ_OFF() {
        // Masked 64-bit short gather with a non-zero array base offset (1).
        VectorMask<Short> VMASK = VectorMask.fromLong(S64, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S64.length()) {
            ShortVector.fromArray(S64, sarr, 1, index, i, VMASK)
                       .intoArray(sres, i);
        }
    }
|
||||
|
||||
|
||||
    @Benchmark
    public void microShortGather128() {
        // Unmasked short gather at 128-bit species (base offset 0).
        for (int i = 0; i < SIZE; i += S128.length()) {
            ShortVector.fromArray(S128, sarr, 0, index, i)
                       .intoArray(sres, i);
        }
    }
|
||||
|
||||
|
||||
    @Benchmark
    public void microShortGather128_NZ_OFF() {
        // 128-bit short gather with a non-zero array base offset of 1.
        for (int i = 0; i < SIZE; i += S128.length()) {
            ShortVector.fromArray(S128, sarr, 1, index, i)
                       .intoArray(sres, i);
        }
    }
|
||||
|
||||
    @Benchmark
    public void microShortGather128_MASK() {
        // Masked 128-bit short gather: 0x5555... enables alternating lanes.
        VectorMask<Short> VMASK = VectorMask.fromLong(S128, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S128.length()) {
            ShortVector.fromArray(S128, sarr, 0, index, i, VMASK)
                       .intoArray(sres, i);
        }
    }
|
||||
|
||||
    @Benchmark
    public void microShortGather128_MASK_NZ_OFF() {
        // Masked 128-bit short gather with a non-zero array base offset (1).
        VectorMask<Short> VMASK = VectorMask.fromLong(S128, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S128.length()) {
            ShortVector.fromArray(S128, sarr, 1, index, i, VMASK)
                       .intoArray(sres, i);
        }
    }
|
||||
|
||||
|
||||
    @Benchmark
    public void microShortGather256() {
        // Unmasked short gather at 256-bit species (base offset 0).
        for (int i = 0; i < SIZE; i += S256.length()) {
            ShortVector.fromArray(S256, sarr, 0, index, i)
                       .intoArray(sres, i);
        }
    }
|
||||
|
||||
|
||||
    @Benchmark
    public void microShortGather256_NZ_OFF() {
        // 256-bit short gather with a non-zero array base offset of 1.
        for (int i = 0; i < SIZE; i += S256.length()) {
            ShortVector.fromArray(S256, sarr, 1, index, i)
                       .intoArray(sres, i);
        }
    }
|
||||
|
||||
    @Benchmark
    public void microShortGather256_MASK() {
        // Masked 256-bit short gather: 0x5555... enables alternating lanes.
        VectorMask<Short> VMASK = VectorMask.fromLong(S256, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S256.length()) {
            ShortVector.fromArray(S256, sarr, 0, index, i, VMASK)
                       .intoArray(sres, i);
        }
    }
|
||||
|
||||
    @Benchmark
    public void microShortGather256_MASK_NZ_OFF() {
        // Masked 256-bit short gather with a non-zero array base offset (1).
        VectorMask<Short> VMASK = VectorMask.fromLong(S256, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S256.length()) {
            ShortVector.fromArray(S256, sarr, 1, index, i, VMASK)
                       .intoArray(sres, i);
        }
    }
|
||||
|
||||
|
||||
    @Benchmark
    public void microShortGather512() {
        // Unmasked short gather at 512-bit species (base offset 0).
        for (int i = 0; i < SIZE; i += S512.length()) {
            ShortVector.fromArray(S512, sarr, 0, index, i)
                       .intoArray(sres, i);
        }
    }
|
||||
|
||||
|
||||
    @Benchmark
    public void microShortGather512_NZ_OFF() {
        // 512-bit short gather with a non-zero array base offset of 1.
        for (int i = 0; i < SIZE; i += S512.length()) {
            ShortVector.fromArray(S512, sarr, 1, index, i)
                       .intoArray(sres, i);
        }
    }
|
||||
|
||||
    @Benchmark
    public void microShortGather512_MASK() {
        // Masked 512-bit short gather: 0x5555... enables alternating lanes.
        VectorMask<Short> VMASK = VectorMask.fromLong(S512, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S512.length()) {
            ShortVector.fromArray(S512, sarr, 0, index, i, VMASK)
                       .intoArray(sres, i);
        }
    }
|
||||
|
||||
    @Benchmark
    public void microShortGather512_MASK_NZ_OFF() {
        // Masked 512-bit short gather with a non-zero array base offset (1).
        VectorMask<Short> VMASK = VectorMask.fromLong(S512, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S512.length()) {
            ShortVector.fromArray(S512, sarr, 1, index, i, VMASK)
                       .intoArray(sres, i);
        }
    }
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user