8318650: Optimized subword gather for x86 targets.

Reviewed-by: sviswanathan, epeter, psandoz
This commit is contained in:
Jatin Bhateja 2024-04-21 23:21:17 +00:00
parent 6d5699617f
commit 185e711bfe
32 changed files with 1157 additions and 49 deletions

View File

@ -169,9 +169,7 @@ source %{
case Op_VectorMaskGen:
case Op_LoadVectorMasked:
case Op_StoreVectorMasked:
case Op_LoadVectorGather:
case Op_StoreVectorScatter:
case Op_LoadVectorGatherMasked:
case Op_StoreVectorScatterMasked:
case Op_PopulateIndex:
case Op_CompressM:
@ -180,6 +178,12 @@ source %{
return false;
}
break;
case Op_LoadVectorGather:
case Op_LoadVectorGatherMasked:
if (UseSVE == 0 || is_subword_type(bt)) {
return false;
}
break;
case Op_MulAddVS2VI:
if (length_in_bytes != 16) {
return false;

View File

@ -133,6 +133,11 @@
return true;
}
// Does target support predicated operation emulation.
// Hook asking whether a predicated vector op with no direct instruction can
// be emulated for the given opcode/length/element type. This target offers
// no such emulation, so always answer no.
static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) {
  return false;
}
// Does the CPU supports vector variable rotate instructions?
static constexpr bool supports_vector_variable_rotates(void) {
return false;

View File

@ -126,6 +126,11 @@
return VM_Version::has_simd();
}
// Does target support predicated operation emulation.
// Hook asking whether a predicated vector op with no direct instruction can
// be emulated for the given opcode/length/element type. This target offers
// no such emulation, so always answer no.
static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) {
  return false;
}
// Does the CPU supports vector variable rotate instructions?
static constexpr bool supports_vector_variable_rotates(void) {
return false; // not supported

View File

@ -133,6 +133,11 @@
return false;
}
// Does target support predicated operation emulation.
// Hook asking whether a predicated vector op with no direct instruction can
// be emulated for the given opcode/length/element type. This target offers
// no such emulation, so always answer no.
static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) {
  return false;
}
// Does the CPU supports vector variable rotate instructions?
static constexpr bool supports_vector_variable_rotates(void) {
return false;

View File

@ -132,6 +132,11 @@
return false;
}
// Does target support predicated operation emulation.
// Hook asking whether a predicated vector op with no direct instruction can
// be emulated for the given opcode/length/element type. This target offers
// no such emulation, so always answer no.
static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) {
  return false;
}
// Does the CPU supports vector variable rotate instructions?
static constexpr bool supports_vector_variable_rotates(void) {
return false;

View File

@ -73,6 +73,11 @@ source %{
return false;
}
break;
case Op_LoadVectorGatherMasked:
if (is_subword_type(bt)) {
return false;
}
break;
case Op_VectorCastHF2F:
case Op_VectorCastF2HF:
return UseZvfh;

View File

@ -124,6 +124,11 @@
return false;
}
// Does target support predicated operation emulation.
// Hook asking whether a predicated vector op with no direct instruction can
// be emulated for the given opcode/length/element type. This target offers
// no such emulation, so always answer no.
static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) {
  return false;
}
// Does the CPU supports vector variable rotate instructions?
static constexpr bool supports_vector_variable_rotates(void) {
return false;

View File

@ -13652,9 +13652,13 @@ void Assembler::notq(Register dst) {
emit_int16((unsigned char)0xF7, (0xD0 | encode));
}
// BT r/m64, r64 — tests the bit of dst selected by src (mod 64) and copies
// it into CF. Encoding: REX.W 0F A3 /r with src in the reg field and dst in
// the r/m field; 0xC0 forces register-direct mod bits.
void Assembler::btq(Register dst, Register src) {
  int encode = prefixq_and_encode(src->encoding(), dst->encoding());
  emit_int24(0x0F, (unsigned char)0xA3, (encode | 0xC0));
}
void Assembler::btq(Register src, int imm8) {
assert(isByte(imm8), "not a byte");
InstructionMark im(this);
int encode = prefixq_and_encode(src->encoding());
emit_int16(0x0f, 0xba);
emit_int8(0xe0|encode);

View File

@ -1736,6 +1736,7 @@ private:
void btrq(Address dst, int imm8);
void btq(Register src, int imm8);
#endif
void btq(Register dst, Register src);
void orw(Register dst, Register src);

View File

@ -1796,6 +1796,130 @@ void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src,
}
}
#ifdef _LP64
void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
                                                XMMRegister dst, Register base,
                                                Register idx_base,
                                                Register offset, Register mask,
                                                Register mask_idx, Register rtmp,
                                                int vlen_enc) {
  // Gather one masked 64-bit slice (4 shorts or 8 bytes):
  //   dst[lane] = mask bit set ? base[offset + idx_base[lane]] : 0
  // 'mask' is a scalar bit-mask in a GPR; 'mask_idx' is the running bit
  // position, incremented once per lane so callers can chain slices.
  assert(elem_bt == T_SHORT || elem_bt == T_BYTE, "");
  vpxor(dst, dst, dst, vlen_enc); // start all-zero: masked-off lanes stay 0
  const bool is_short = (elem_bt == T_SHORT);
  const int lane_cnt = is_short ? 4 : 8;
  for (int lane = 0; lane < lane_cnt; lane++) {
    Label skip_load;
    // CF = mask bit at mask_idx; bypass the load when the lane is masked off.
    btq(mask, mask_idx);
    jccb(Assembler::carryClear, skip_load);
    movl(rtmp, Address(idx_base, lane * 4)); // int index for this lane
    if (offset != noreg) {
      addl(rtmp, offset);
    }
    if (is_short) {
      pinsrw(dst, Address(base, rtmp, Address::times_2), lane);
    } else {
      pinsrb(dst, Address(base, rtmp), lane);
    }
    bind(skip_load);
    incq(mask_idx);
  }
}
#endif // _LP64
void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register offset, Register rtmp,
                                         int vlen_enc) {
  // Gather one 64-bit slice (4 shorts or 8 bytes), element by element:
  //   dst[lane] = base[offset + idx_base[lane]]
  assert(elem_bt == T_SHORT || elem_bt == T_BYTE, "");
  vpxor(dst, dst, dst, vlen_enc); // clear the slice before inserting lanes
  const bool is_short = (elem_bt == T_SHORT);
  const int lane_cnt = is_short ? 4 : 8;
  for (int lane = 0; lane < lane_cnt; lane++) {
    // Indices are ints: load lane'th index, then add the optional offset.
    movl(rtmp, Address(idx_base, lane * 4));
    if (offset != noreg) {
      addl(rtmp, offset);
    }
    if (is_short) {
      pinsrw(dst, Address(base, rtmp, Address::times_2), lane);
    } else {
      pinsrb(dst, Address(base, rtmp), lane);
    }
  }
}
/*
* Gather using hybrid algorithm, first partially unroll scalar loop
* to accumulate values from gather indices into a quad-word(64bit) slice.
* A slice may hold 8 bytes or 4 short values. This is followed by a vector
* permutation to place the slice into appropriate vector lane
* locations in destination vector. Following pseudo code describes the
* algorithm in detail:
*
* DST_VEC = ZERO_VEC
* PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
* TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
* FOREACH_ITER:
* TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
* TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
* DST_VEC = DST_VEC OR TEMP_PERM_VEC
* PERM_INDEX = PERM_INDEX - TWO_VEC
*
* With each iteration, doubleword permute indices (0,1) corresponding
* to gathered quadword gets right shifted by two lane positions.
*
*/
// Sub-word gather driver: repeatedly gathers a 64-bit slice of elements
// (via vgather8b_offset / vgather8b_masked_offset), then uses a doubleword
// permute to steer that slice into its destination lanes, OR-accumulating
// into dst. See the pseudo-code comment above for the algorithm.
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register offset, Register mask,
                                        XMMRegister xtmp1, XMMRegister xtmp2,
                                        XMMRegister temp_dst, Register rtmp,
                                        Register mask_idx, Register length,
                                        int vector_len, int vlen_enc) {
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len); // remaining element count
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
  vallones(xtmp2, vlen_enc); // xtmp2 = {-1, ...}
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); // 0 - (-1): xtmp2 = {1, 1, ...}
  vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
  bind(GATHER8_LOOP);
  // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
  if (mask == noreg) {
    vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
  } else {
    // NOTE(review): masked helper is compiled on LP64 only; on 32-bit this
    // emits nothing — presumably matcher predicates keep 32-bit off this
    // path. Confirm against the .ad predicates.
    LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
  }
  // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
  vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
  // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
  vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
  // DST_VEC = DST_VEC OR TEMP_PERM_VEC
  vpor(dst, dst, temp_dst, vlen_enc);
  // Each iteration consumes one 8-byte slice: 8 int indices (32 bytes) for
  // T_BYTE, 4 int indices (16 bytes) for T_SHORT.
  addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1));
  subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
  jcc(Assembler::notEqual, GATHER8_LOOP);
}
void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
switch(typ) {
case T_INT:

View File

@ -500,4 +500,16 @@ public:
void vector_rearrange_int_float(BasicType bt, XMMRegister dst, XMMRegister shuffle,
XMMRegister src, int vlen_enc);
void vgather_subword(BasicType elem_ty, XMMRegister dst, Register base, Register idx_base, Register offset,
Register mask, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
Register midx, Register length, int vector_len, int vlen_enc);
#ifdef _LP64
void vgather8b_masked_offset(BasicType elem_bt, XMMRegister dst, Register base, Register idx_base,
Register offset, Register mask, Register midx, Register rtmp, int vlen_enc);
#endif
void vgather8b_offset(BasicType elem_bt, XMMRegister dst, Register base, Register idx_base,
Register offset, Register rtmp, int vlen_enc);
#endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP

View File

@ -154,6 +154,16 @@
return (UseAVX >= 2);
}
// Does target support predicated operation emulation.
// Reports whether a predicated vector op lacking a direct instruction can be
// emulated on x86. Currently only masked sub-word gathers qualify, and they
// need at least AVX2.
static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) {
  if (vopc != Op_LoadVectorGatherMasked) {
    return false;
  }
  return is_subword_type(bt) && VM_Version::supports_avx2();
}
// Does the CPU supports vector variable rotate instructions?
static constexpr bool supports_vector_variable_rotates(void) {
return true;
@ -214,6 +224,9 @@
return 7;
case Op_MulVL:
return VM_Version::supports_avx512vldq() ? 0 : 6;
case Op_LoadVectorGather:
case Op_LoadVectorGatherMasked:
return is_subword_type(ety) ? 50 : 0;
case Op_VectorCastF2X: // fall through
case Op_VectorCastD2X:
return is_floating_point_type(ety) ? 0 : (is_subword_type(ety) ? 35 : 30);

View File

@ -1569,6 +1569,7 @@ bool Matcher::match_rule_supported(int opcode) {
}
break;
case Op_LoadVectorGather:
case Op_LoadVectorGatherMasked:
if (UseAVX < 2) {
return false;
}
@ -1906,6 +1907,17 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
}
break;
case Op_LoadVectorGatherMasked:
if (!is_subword_type(bt) && size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
return false;
}
if (is_subword_type(bt) &&
(!is_LP64 ||
(size_in_bits > 256 && !VM_Version::supports_avx512bw()) ||
(size_in_bits < 64) ||
(bt == T_SHORT && !VM_Version::supports_bmi2()))) {
return false;
}
break;
case Op_StoreVectorScatterMasked:
case Op_StoreVectorScatter:
if (is_subword_type(bt)) {
@ -1915,7 +1927,10 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
}
// fallthrough
case Op_LoadVectorGather:
if (size_in_bits == 64 ) {
if (!is_subword_type(bt) && size_in_bits == 64) {
return false;
}
if (is_subword_type(bt) && size_in_bits < 64) {
return false;
}
break;
@ -4024,10 +4039,11 @@ instruct storeV(memory mem, vec src) %{
// ---------------------------------------- Gather ------------------------------------
// Gather INT, LONG, FLOAT, DOUBLE
// Gather BYTE, SHORT, INT, LONG, FLOAT, DOUBLE
instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
predicate(!VM_Version::supports_avx512vl() && !is_subword_type(Matcher::vector_element_basic_type(n)) &&
Matcher::vector_length_in_bytes(n) <= 32);
match(Set dst (LoadVectorGather mem idx));
effect(TEMP dst, TEMP tmp, TEMP mask);
format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
@ -4044,7 +4060,8 @@ instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
!is_subword_type(Matcher::vector_element_basic_type(n)));
match(Set dst (LoadVectorGather mem idx));
effect(TEMP dst, TEMP tmp, TEMP ktmp);
format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and ktmp as TEMP" %}
@ -4059,7 +4076,8 @@ instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
%}
instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
!is_subword_type(Matcher::vector_element_basic_type(n)));
match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and ktmp as TEMP" %}
@ -4077,6 +4095,238 @@ instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRe
%}
ins_pipe( pipe_slow );
%}
// Sub-word (byte/short) gather, vector length <= 8 bytes, constant-zero
// offset. All lanes fit in one 64-bit slice, so a single call to
// vgather8b_offset suffices — no permutation loop needed.
instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegI rtmp) %{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
  match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
  effect(TEMP tmp, TEMP rtmp);
  format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    // Materialize the base address of the memory operand.
    __ lea($tmp$$Register, $mem$$Address);
    __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp$$Register, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
// Sub-word gather, vector wider than 8 bytes, zero offset: gathers one
// 64-bit slice per iteration and permutes it into place
// (see C2_MacroAssembler::vgather_subword).
instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegP idx_base_temp,
                             vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
  match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
  effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
  format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    int vector_len = Matcher::vector_length(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ lea($tmp$$Register, $mem$$Address);
    // vgather_subword advances the index pointer; work on a scratch copy.
    __ movptr($idx_base_temp$$Register, $idx_base$$Register);
    __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, noreg, $xtmp1$$XMMRegister,
                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
// Sub-word gather, vector <= 8 bytes, with a runtime scalar offset that is
// added to every gathered index before the load.
instruct vgather_subwordLE8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegI rtmp, rFlagsReg cr) %{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
  match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
  effect(TEMP tmp, TEMP rtmp, KILL cr);
  format %{ "vector_gatherLE8_off $dst, $mem, $idx_base, $offset\t! using $tmp and $rtmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ lea($tmp$$Register, $mem$$Address);
    __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, $rtmp$$Register, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
// Sub-word gather, vector > 8 bytes, with runtime scalar offset:
// slice-and-permute loop via C2_MacroAssembler::vgather_subword.
instruct vgather_subwordGT8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegP idx_base_temp,
                                 vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
  match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
  effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
  format %{ "vector_gatherGT8_off $dst, $mem, $idx_base, $offset\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    int vector_len = Matcher::vector_length(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ lea($tmp$$Register, $mem$$Address);
    // vgather_subword advances the index pointer; work on a scratch copy.
    __ movptr($idx_base_temp$$Register, $idx_base$$Register);
    __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, noreg, $xtmp1$$XMMRegister,
                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
#ifdef _LP64
// Masked sub-word gather, AVX512BW path, vector <= 8 bytes. The k-register
// mask is moved into a GPR (kmovql) and consumed bit-by-bit by
// vgather8b_masked_offset, starting at bit 0 (mask_idx cleared).
instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
  predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
  effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
  format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ xorq($mask_idx$$Register, $mask_idx$$Register); // start at mask bit 0
    __ lea($tmp$$Register, $mem$$Address);
    __ kmovql($rtmp2$$Register, $mask$$KRegister); // scalarize the k-mask
    __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
// Masked sub-word gather, AVX512BW path, vector > 8 bytes: scalarized k-mask
// plus the slice-and-permute loop in C2_MacroAssembler::vgather_subword.
instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
                                         vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
  predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
  effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
  format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    int vector_len = Matcher::vector_length(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ xorq($mask_idx$$Register, $mask_idx$$Register); // start at mask bit 0
    __ lea($tmp$$Register, $mem$$Address);
    // vgather_subword advances the index pointer; work on a scratch copy.
    __ movptr($idx_base_temp$$Register, $idx_base$$Register);
    __ kmovql($rtmp2$$Register, $mask$$KRegister); // scalarize the k-mask
    __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
// Masked sub-word gather, AVX512BW path, vector <= 8 bytes, with a runtime
// scalar offset added to every index.
instruct vgather_masked_subwordLE8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
  predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
  effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
  format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ xorq($mask_idx$$Register, $mask_idx$$Register); // start at mask bit 0
    __ lea($tmp$$Register, $mem$$Address);
    __ kmovql($rtmp2$$Register, $mask$$KRegister); // scalarize the k-mask
    __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
                               $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
// Masked sub-word gather, AVX512BW path, vector > 8 bytes, with runtime
// scalar offset: scalarized k-mask plus the vgather_subword loop.
instruct vgather_masked_subwordGT8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
                                             vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
  predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
  effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
  format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    int vector_len = Matcher::vector_length(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ xorq($mask_idx$$Register, $mask_idx$$Register); // start at mask bit 0
    __ lea($tmp$$Register, $mem$$Address);
    // vgather_subword advances the index pointer; work on a scratch copy.
    __ movptr($idx_base_temp$$Register, $idx_base$$Register);
    __ kmovql($rtmp2$$Register, $mask$$KRegister); // scalarize the k-mask
    __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
// Masked sub-word gather, AVX2 path (no AVX512VLBW), vector <= 8 bytes.
// vpmovmskb extracts one mask bit per byte; for shorts, pext with 0x55555555
// compresses out the odd bits so bit i of rtmp2 corresponds to lane i.
instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
  predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
  effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
  format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ lea($tmp$$Register, $mem$$Address);
    __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
    if (elem_bt == T_SHORT) {
      // Keep every other byte-mask bit: one bit per 16-bit lane.
      __ movl($mask_idx$$Register, 0x55555555);
      __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
    }
    // mask_idx doubles as the running bit index; start at lane 0.
    __ xorl($mask_idx$$Register, $mask_idx$$Register);
    __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
// Masked sub-word gather, AVX2 path, vector > 8 bytes: scalarize the vector
// mask (vpmovmskb, pext for shorts), then run the vgather_subword loop.
instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegP tmp, rRegP idx_base_temp,
                                         vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
  predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
  effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
  format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    int vector_len = Matcher::vector_length(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ lea($tmp$$Register, $mem$$Address);
    // vgather_subword advances the index pointer; work on a scratch copy.
    __ movptr($idx_base_temp$$Register, $idx_base$$Register);
    __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
    if (elem_bt == T_SHORT) {
      // Keep every other byte-mask bit: one bit per 16-bit lane.
      __ movl($mask_idx$$Register, 0x55555555);
      __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
    }
    // mask_idx doubles as the running bit index; start at lane 0.
    __ xorl($mask_idx$$Register, $mask_idx$$Register);
    __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
// Masked sub-word gather, AVX2 path, vector <= 8 bytes, with a runtime
// scalar offset added to every index.
instruct vgather_masked_subwordLE8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
  predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
  effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
  format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ lea($tmp$$Register, $mem$$Address);
    __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
    if (elem_bt == T_SHORT) {
      // Keep every other byte-mask bit: one bit per 16-bit lane.
      __ movl($mask_idx$$Register, 0x55555555);
      __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
    }
    // mask_idx doubles as the running bit index; start at lane 0.
    __ xorl($mask_idx$$Register, $mask_idx$$Register);
    __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
                               $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
// Masked sub-word gather, AVX2 path, vector > 8 bytes, with runtime scalar
// offset. The vector mask is scalarized via vpmovmskb (pext for shorts),
// then the slice-and-permute loop in C2_MacroAssembler::vgather_subword runs.
// Fix: dropped the redundant leading xorl of mask_idx — the register was
// unconditionally re-zeroed just before its first use (and clobbered by the
// T_SHORT pext constant in between), matching vgather_masked_subwordGT8B_avx2.
instruct vgather_masked_subwordGT8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegP tmp, rRegP idx_base_temp,
                                             vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
  predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
  effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
  format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    int vector_len = Matcher::vector_length(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ lea($tmp$$Register, $mem$$Address);
    // vgather_subword advances the index pointer; work on a scratch copy.
    __ movptr($idx_base_temp$$Register, $idx_base$$Register);
    __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
    if (elem_bt == T_SHORT) {
      // Keep every other byte-mask bit: one bit per 16-bit lane.
      __ movl($mask_idx$$Register, 0x55555555);
      __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
    }
    // mask_idx doubles as the running bit index; start at lane 0.
    __ xorl($mask_idx$$Register, $mask_idx$$Register);
    __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
#endif
// ====================Scatter=======================================
// Scatter INT, LONG, FLOAT, DOUBLE

View File

@ -1050,6 +1050,8 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
} break;
case Op_CountTrailingZerosV:
case Op_CountLeadingZerosV:
case Op_LoadVectorGather:
case Op_LoadVectorGatherMasked:
case Op_ReverseV:
case Op_RoundVF:
case Op_RoundVD:

View File

@ -2474,7 +2474,22 @@ void Matcher::find_shared_post_visit(Node* n, uint opcode) {
n->del_req(3);
break;
}
case Op_LoadVectorGather:
if (is_subword_type(n->bottom_type()->is_vect()->element_basic_type())) {
Node* pair = new BinaryNode(n->in(MemNode::ValueIn), n->in(MemNode::ValueIn+1));
n->set_req(MemNode::ValueIn, pair);
n->del_req(MemNode::ValueIn+1);
}
break;
case Op_LoadVectorGatherMasked:
if (is_subword_type(n->bottom_type()->is_vect()->element_basic_type())) {
Node* pair2 = new BinaryNode(n->in(MemNode::ValueIn + 1), n->in(MemNode::ValueIn + 2));
Node* pair1 = new BinaryNode(n->in(MemNode::ValueIn), pair2);
n->set_req(MemNode::ValueIn, pair1);
n->del_req(MemNode::ValueIn+2);
n->del_req(MemNode::ValueIn+1);
break;
} // fall-through
case Op_StoreVectorScatter: {
Node* pair = new BinaryNode(n->in(MemNode::ValueIn), n->in(MemNode::ValueIn+1));
n->set_req(MemNode::ValueIn, pair);

View File

@ -302,6 +302,7 @@ bool LibraryCallKit::arch_supports_vector(int sopc, int num_elem, BasicType type
is_supported = Matcher::match_rule_supported_vector_masked(sopc, num_elem, type);
}
}
is_supported |= Matcher::supports_vector_predicate_op_emulation(sopc, num_elem, type);
if (!is_supported) {
#ifndef PRODUCT
@ -1500,8 +1501,8 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) {
}
// Check whether the predicated gather/scatter node is supported by architecture.
if (!arch_supports_vector(is_scatter ? Op_StoreVectorScatterMasked : Op_LoadVectorGatherMasked, num_elem, elem_bt,
(VectorMaskUseType) (VecMaskUseLoad | VecMaskUsePred))) {
VectorMaskUseType mask = (VectorMaskUseType) (VecMaskUseLoad | VecMaskUsePred);
if (!arch_supports_vector(is_scatter ? Op_StoreVectorScatterMasked : Op_LoadVectorGatherMasked, num_elem, elem_bt, mask)) {
if (C->print_intrinsics()) {
tty->print_cr(" ** not supported: arity=%d op=%s vlen=%d etype=%s is_masked_op=1",
is_scatter, is_scatter ? "scatterMasked" : "gatherMasked",
@ -1522,7 +1523,8 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) {
}
// Check that the vector holding indices is supported by architecture
if (!arch_supports_vector(Op_LoadVector, num_elem, T_INT, VecMaskNotUsed)) {
// For sub-word gathers expander receive index array.
if (!is_subword_type(elem_bt) && !arch_supports_vector(Op_LoadVector, num_elem, T_INT, VecMaskNotUsed)) {
if (C->print_intrinsics()) {
tty->print_cr(" ** not supported: arity=%d op=%s/loadindex vlen=%d etype=int is_masked_op=%d",
is_scatter, is_scatter ? "scatter" : "gather",
@ -1564,12 +1566,15 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) {
return false;
}
Node* index_vect = nullptr;
const TypeInstPtr* vbox_idx_type = TypeInstPtr::make_exact(TypePtr::NotNull, vbox_idx_klass);
Node* index_vect = unbox_vector(argument(8), vbox_idx_type, T_INT, num_elem);
if (index_vect == nullptr) {
set_map(old_map);
set_sp(old_sp);
return false;
if (!is_subword_type(elem_bt)) {
index_vect = unbox_vector(argument(8), vbox_idx_type, T_INT, num_elem);
if (index_vect == nullptr) {
set_map(old_map);
set_sp(old_sp);
return false;
}
}
Node* mask = nullptr;
@ -1608,10 +1613,23 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) {
set_memory(vstore, addr_type);
} else {
Node* vload = nullptr;
Node* index = argument(11);
Node* indexMap = argument(12);
Node* indexM = argument(13);
if (mask != nullptr) {
vload = gvn().transform(new LoadVectorGatherMaskedNode(control(), memory(addr), addr, addr_type, vector_type, index_vect, mask));
if (is_subword_type(elem_bt)) {
Node* index_arr_base = array_element_address(indexMap, indexM, T_INT);
vload = gvn().transform(new LoadVectorGatherMaskedNode(control(), memory(addr), addr, addr_type, vector_type, index_arr_base, mask, index));
} else {
vload = gvn().transform(new LoadVectorGatherMaskedNode(control(), memory(addr), addr, addr_type, vector_type, index_vect, mask));
}
} else {
vload = gvn().transform(new LoadVectorGatherNode(control(), memory(addr), addr, addr_type, vector_type, index_vect));
if (is_subword_type(elem_bt)) {
Node* index_arr_base = array_element_address(indexMap, indexM, T_INT);
vload = gvn().transform(new LoadVectorGatherNode(control(), memory(addr), addr, addr_type, vector_type, index_arr_base, index));
} else {
vload = gvn().transform(new LoadVectorGatherNode(control(), memory(addr), addr, addr_type, vector_type, index_vect));
}
}
Node* box = box_vector(vload, vbox_type, elem_bt, num_elem);
set_result(box);

View File

@ -890,16 +890,26 @@ class LoadVectorNode : public LoadNode {
// Load Vector from memory via index map
class LoadVectorGatherNode : public LoadVectorNode {
public:
LoadVectorGatherNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices)
LoadVectorGatherNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices, Node* offset = nullptr)
: LoadVectorNode(c, mem, adr, at, vt) {
init_class_id(Class_LoadVectorGather);
assert(indices->bottom_type()->is_vect(), "indices must be in vector");
add_req(indices);
assert(req() == MemNode::ValueIn + 1, "match_edge expects that last input is in MemNode::ValueIn");
DEBUG_ONLY(bool is_subword = is_subword_type(vt->element_basic_type()));
assert(is_subword || indices->bottom_type()->is_vect(), "indices must be in vector");
assert(is_subword || !offset, "");
assert(req() == MemNode::ValueIn + 1, "match_edge expects that index input is in MemNode::ValueIn");
if (offset) {
add_req(offset);
}
}
virtual int Opcode() const;
virtual uint match_edge(uint idx) const { return idx == MemNode::Address || idx == MemNode::ValueIn; }
virtual uint match_edge(uint idx) const {
return idx == MemNode::Address ||
idx == MemNode::ValueIn ||
((is_subword_type(vect_type()->element_basic_type())) &&
idx == MemNode::ValueIn + 1);
}
};
//------------------------------StoreVectorNode--------------------------------
@ -1003,20 +1013,23 @@ class LoadVectorMaskedNode : public LoadVectorNode {
// Load Vector from memory via index map under the influence of a predicate register(mask).
class LoadVectorGatherMaskedNode : public LoadVectorNode {
public:
LoadVectorGatherMaskedNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices, Node* mask)
LoadVectorGatherMaskedNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices, Node* mask, Node* offset = nullptr)
: LoadVectorNode(c, mem, adr, at, vt) {
init_class_id(Class_LoadVectorGatherMasked);
assert(indices->bottom_type()->is_vect(), "indices must be in vector");
assert(mask->bottom_type()->isa_vectmask(), "sanity");
add_req(indices);
add_req(mask);
assert(req() == MemNode::ValueIn + 2, "match_edge expects that last input is in MemNode::ValueIn+1");
if (is_subword_type(vt->element_basic_type())) {
add_req(offset);
}
}
virtual int Opcode() const;
virtual uint match_edge(uint idx) const { return idx == MemNode::Address ||
idx == MemNode::ValueIn ||
idx == MemNode::ValueIn + 1; }
idx == MemNode::ValueIn + 1 ||
(is_subword_type(vect_type()->is_vect()->element_basic_type()) &&
idx == MemNode::ValueIn + 2); }
};
//------------------------------StoreVectorScatterMaskedNode--------------------------------

View File

@ -893,6 +893,12 @@ final class Byte128Vector extends ByteVector {
return super.fromArray0Template(Byte128Mask.class, a, offset, (Byte128Mask) m, offsetInRange); // specialize
}
@ForceInline
@Override
final
ByteVector fromArray0(byte[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Byte> m) {
return super.fromArray0Template(Byte128Mask.class, a, offset, indexMap, mapOffset, (Byte128Mask) m);
}
@ForceInline

View File

@ -925,6 +925,12 @@ final class Byte256Vector extends ByteVector {
return super.fromArray0Template(Byte256Mask.class, a, offset, (Byte256Mask) m, offsetInRange); // specialize
}
@ForceInline
@Override
final
ByteVector fromArray0(byte[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Byte> m) {
return super.fromArray0Template(Byte256Mask.class, a, offset, indexMap, mapOffset, (Byte256Mask) m);
}
@ForceInline

View File

@ -989,6 +989,12 @@ final class Byte512Vector extends ByteVector {
return super.fromArray0Template(Byte512Mask.class, a, offset, (Byte512Mask) m, offsetInRange); // specialize
}
@ForceInline
@Override
final
ByteVector fromArray0(byte[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Byte> m) {
return super.fromArray0Template(Byte512Mask.class, a, offset, indexMap, mapOffset, (Byte512Mask) m);
}
@ForceInline

View File

@ -877,6 +877,12 @@ final class Byte64Vector extends ByteVector {
return super.fromArray0Template(Byte64Mask.class, a, offset, (Byte64Mask) m, offsetInRange); // specialize
}
@ForceInline
@Override
final
ByteVector fromArray0(byte[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Byte> m) {
return super.fromArray0Template(Byte64Mask.class, a, offset, indexMap, mapOffset, (Byte64Mask) m);
}
@ForceInline

View File

@ -863,6 +863,12 @@ final class ByteMaxVector extends ByteVector {
return super.fromArray0Template(ByteMaxMask.class, a, offset, (ByteMaxMask) m, offsetInRange); // specialize
}
@ForceInline
@Override
final
ByteVector fromArray0(byte[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Byte> m) {
return super.fromArray0Template(ByteMaxMask.class, a, offset, indexMap, mapOffset, (ByteMaxMask) m);
}
@ForceInline

View File

@ -3049,7 +3049,35 @@ public abstract class ByteVector extends AbstractVector<Byte> {
byte[] a, int offset,
int[] indexMap, int mapOffset) {
ByteSpecies vsp = (ByteSpecies) species;
return vsp.vOp(n -> a[offset + indexMap[mapOffset + n]]);
IntVector.IntSpecies isp = IntVector.species(vsp.indexShape());
Objects.requireNonNull(a);
Objects.requireNonNull(indexMap);
Class<? extends ByteVector> vectorType = vsp.vectorType();
// Constant folding should sweep out following conditional logic.
VectorSpecies<Integer> lsp;
if (isp.length() > IntVector.SPECIES_PREFERRED.length()) {
lsp = IntVector.SPECIES_PREFERRED;
} else {
lsp = isp;
}
// Check indices are within array bounds.
for (int i = 0; i < vsp.length(); i += lsp.length()) {
IntVector vix = IntVector
.fromArray(lsp, indexMap, mapOffset + i)
.add(offset);
VectorIntrinsics.checkIndex(vix, a.length);
}
return VectorSupport.loadWithMap(
vectorType, null, byte.class, vsp.laneCount(),
lsp.vectorType(),
a, ARRAY_BASE, null, null,
a, offset, indexMap, mapOffset, vsp,
(c, idx, iMap, idy, s, vm) ->
s.vOp(n -> c[idx + iMap[idy+n]]));
}
/**
@ -3094,8 +3122,13 @@ public abstract class ByteVector extends AbstractVector<Byte> {
byte[] a, int offset,
int[] indexMap, int mapOffset,
VectorMask<Byte> m) {
ByteSpecies vsp = (ByteSpecies) species;
return vsp.vOp(m, n -> a[offset + indexMap[mapOffset + n]]);
if (m.allTrue()) {
return fromArray(species, a, offset, indexMap, mapOffset);
}
else {
ByteSpecies vsp = (ByteSpecies) species;
return vsp.dummyVector().fromArray0(a, offset, indexMap, mapOffset, m);
}
}
@ -3760,6 +3793,49 @@ public abstract class ByteVector extends AbstractVector<Byte> {
(arr_, off_, i) -> arr_[off_ + i]));
}
/*package-private*/
abstract
ByteVector fromArray0(byte[] a, int offset,
int[] indexMap, int mapOffset,
VectorMask<Byte> m);
@ForceInline
final
<M extends VectorMask<Byte>>
ByteVector fromArray0Template(Class<M> maskClass, byte[] a, int offset,
int[] indexMap, int mapOffset, M m) {
ByteSpecies vsp = vspecies();
IntVector.IntSpecies isp = IntVector.species(vsp.indexShape());
Objects.requireNonNull(a);
Objects.requireNonNull(indexMap);
m.check(vsp);
Class<? extends ByteVector> vectorType = vsp.vectorType();
// Constant folding should sweep out following conditional logic.
VectorSpecies<Integer> lsp;
if (isp.length() > IntVector.SPECIES_PREFERRED.length()) {
lsp = IntVector.SPECIES_PREFERRED;
} else {
lsp = isp;
}
// Check indices are within array bounds.
// FIXME: Check indices only for lanes where the mask is set.
for (int i = 0; i < vsp.length(); i += lsp.length()) {
IntVector vix = IntVector
.fromArray(lsp, indexMap, mapOffset + i)
.add(offset);
VectorIntrinsics.checkIndex(vix, a.length);
}
return VectorSupport.loadWithMap(
vectorType, maskClass, byte.class, vsp.laneCount(),
lsp.vectorType(),
a, ARRAY_BASE, null, m,
a, offset, indexMap, mapOffset, vsp,
(c, idx, iMap, idy, s, vm) ->
s.vOp(vm, n -> c[idx + iMap[idy+n]]));
}
/*package-private*/

View File

@ -877,6 +877,12 @@ final class Short128Vector extends ShortVector {
return super.fromArray0Template(Short128Mask.class, a, offset, (Short128Mask) m, offsetInRange); // specialize
}
@ForceInline
@Override
final
ShortVector fromArray0(short[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Short> m) {
return super.fromArray0Template(Short128Mask.class, a, offset, indexMap, mapOffset, (Short128Mask) m);
}
@ForceInline
@Override

View File

@ -893,6 +893,12 @@ final class Short256Vector extends ShortVector {
return super.fromArray0Template(Short256Mask.class, a, offset, (Short256Mask) m, offsetInRange); // specialize
}
@ForceInline
@Override
final
ShortVector fromArray0(short[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Short> m) {
return super.fromArray0Template(Short256Mask.class, a, offset, indexMap, mapOffset, (Short256Mask) m);
}
@ForceInline
@Override

View File

@ -925,6 +925,12 @@ final class Short512Vector extends ShortVector {
return super.fromArray0Template(Short512Mask.class, a, offset, (Short512Mask) m, offsetInRange); // specialize
}
@ForceInline
@Override
final
ShortVector fromArray0(short[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Short> m) {
return super.fromArray0Template(Short512Mask.class, a, offset, indexMap, mapOffset, (Short512Mask) m);
}
@ForceInline
@Override

View File

@ -869,6 +869,12 @@ final class Short64Vector extends ShortVector {
return super.fromArray0Template(Short64Mask.class, a, offset, (Short64Mask) m, offsetInRange); // specialize
}
@ForceInline
@Override
final
ShortVector fromArray0(short[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Short> m) {
return super.fromArray0Template(Short64Mask.class, a, offset, indexMap, mapOffset, (Short64Mask) m);
}
@ForceInline
@Override

View File

@ -863,6 +863,12 @@ final class ShortMaxVector extends ShortVector {
return super.fromArray0Template(ShortMaxMask.class, a, offset, (ShortMaxMask) m, offsetInRange); // specialize
}
@ForceInline
@Override
final
ShortVector fromArray0(short[] a, int offset, int[] indexMap, int mapOffset, VectorMask<Short> m) {
return super.fromArray0Template(ShortMaxMask.class, a, offset, indexMap, mapOffset, (ShortMaxMask) m);
}
@ForceInline
@Override

View File

@ -3050,7 +3050,35 @@ public abstract class ShortVector extends AbstractVector<Short> {
short[] a, int offset,
int[] indexMap, int mapOffset) {
ShortSpecies vsp = (ShortSpecies) species;
return vsp.vOp(n -> a[offset + indexMap[mapOffset + n]]);
IntVector.IntSpecies isp = IntVector.species(vsp.indexShape());
Objects.requireNonNull(a);
Objects.requireNonNull(indexMap);
Class<? extends ShortVector> vectorType = vsp.vectorType();
// Constant folding should sweep out following conditional logic.
VectorSpecies<Integer> lsp;
if (isp.length() > IntVector.SPECIES_PREFERRED.length()) {
lsp = IntVector.SPECIES_PREFERRED;
} else {
lsp = isp;
}
// Check indices are within array bounds.
for (int i = 0; i < vsp.length(); i += lsp.length()) {
IntVector vix = IntVector
.fromArray(lsp, indexMap, mapOffset + i)
.add(offset);
VectorIntrinsics.checkIndex(vix, a.length);
}
return VectorSupport.loadWithMap(
vectorType, null, short.class, vsp.laneCount(),
lsp.vectorType(),
a, ARRAY_BASE, null, null,
a, offset, indexMap, mapOffset, vsp,
(c, idx, iMap, idy, s, vm) ->
s.vOp(n -> c[idx + iMap[idy+n]]));
}
/**
@ -3095,8 +3123,13 @@ public abstract class ShortVector extends AbstractVector<Short> {
short[] a, int offset,
int[] indexMap, int mapOffset,
VectorMask<Short> m) {
ShortSpecies vsp = (ShortSpecies) species;
return vsp.vOp(m, n -> a[offset + indexMap[mapOffset + n]]);
if (m.allTrue()) {
return fromArray(species, a, offset, indexMap, mapOffset);
}
else {
ShortSpecies vsp = (ShortSpecies) species;
return vsp.dummyVector().fromArray0(a, offset, indexMap, mapOffset, m);
}
}
/**
@ -3746,6 +3779,49 @@ public abstract class ShortVector extends AbstractVector<Short> {
(arr_, off_, i) -> arr_[off_ + i]));
}
/*package-private*/
abstract
ShortVector fromArray0(short[] a, int offset,
int[] indexMap, int mapOffset,
VectorMask<Short> m);
@ForceInline
final
<M extends VectorMask<Short>>
ShortVector fromArray0Template(Class<M> maskClass, short[] a, int offset,
int[] indexMap, int mapOffset, M m) {
ShortSpecies vsp = vspecies();
IntVector.IntSpecies isp = IntVector.species(vsp.indexShape());
Objects.requireNonNull(a);
Objects.requireNonNull(indexMap);
m.check(vsp);
Class<? extends ShortVector> vectorType = vsp.vectorType();
// Constant folding should sweep out following conditional logic.
VectorSpecies<Integer> lsp;
if (isp.length() > IntVector.SPECIES_PREFERRED.length()) {
lsp = IntVector.SPECIES_PREFERRED;
} else {
lsp = isp;
}
// Check indices are within array bounds.
// FIXME: Check indices only for lanes where the mask is set.
for (int i = 0; i < vsp.length(); i += lsp.length()) {
IntVector vix = IntVector
.fromArray(lsp, indexMap, mapOffset + i)
.add(offset);
VectorIntrinsics.checkIndex(vix, a.length);
}
return VectorSupport.loadWithMap(
vectorType, maskClass, short.class, vsp.laneCount(),
lsp.vectorType(),
a, ARRAY_BASE, null, m,
a, offset, indexMap, mapOffset, vsp,
(c, idx, iMap, idy, s, vm) ->
s.vOp(vm, n -> c[idx + iMap[idy+n]]));
}
/*package-private*/
abstract

View File

@ -3622,7 +3622,35 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
$type$[] a, int offset,
int[] indexMap, int mapOffset) {
$Type$Species vsp = ($Type$Species) species;
return vsp.vOp(n -> a[offset + indexMap[mapOffset + n]]);
IntVector.IntSpecies isp = IntVector.species(vsp.indexShape());
Objects.requireNonNull(a);
Objects.requireNonNull(indexMap);
Class<? extends $abstractvectortype$> vectorType = vsp.vectorType();
// Constant folding should sweep out following conditional logic.
VectorSpecies<Integer> lsp;
if (isp.length() > IntVector.SPECIES_PREFERRED.length()) {
lsp = IntVector.SPECIES_PREFERRED;
} else {
lsp = isp;
}
// Check indices are within array bounds.
for (int i = 0; i < vsp.length(); i += lsp.length()) {
IntVector vix = IntVector
.fromArray(lsp, indexMap, mapOffset + i)
.add(offset);
VectorIntrinsics.checkIndex(vix, a.length);
}
return VectorSupport.loadWithMap(
vectorType, null, $type$.class, vsp.laneCount(),
lsp.vectorType(),
a, ARRAY_BASE, null, null,
a, offset, indexMap, mapOffset, vsp,
(c, idx, iMap, idy, s, vm) ->
s.vOp(n -> c[idx + iMap[idy+n]]));
}
#else[byteOrShort]
@ForceInline
@ -3714,17 +3742,6 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
* where the mask is set
* @see $abstractvectortype$#toIntArray()
*/
#if[byteOrShort]
@ForceInline
public static
$abstractvectortype$ fromArray(VectorSpecies<$Boxtype$> species,
$type$[] a, int offset,
int[] indexMap, int mapOffset,
VectorMask<$Boxtype$> m) {
$Type$Species vsp = ($Type$Species) species;
return vsp.vOp(m, n -> a[offset + indexMap[mapOffset + n]]);
}
#else[byteOrShort]
@ForceInline
public static
$abstractvectortype$ fromArray(VectorSpecies<$Boxtype$> species,
@ -3739,7 +3756,6 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
return vsp.dummyVector().fromArray0(a, offset, indexMap, mapOffset, m);
}
}
#end[byteOrShort]
#if[short]
/**
@ -4793,12 +4809,51 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
(arr_, off_, i) -> arr_[off_ + i]));
}
#if[!byteOrShort]
/*package-private*/
abstract
$abstractvectortype$ fromArray0($type$[] a, int offset,
int[] indexMap, int mapOffset,
VectorMask<$Boxtype$> m);
#if[byteOrShort]
@ForceInline
final
<M extends VectorMask<$Boxtype$>>
$abstractvectortype$ fromArray0Template(Class<M> maskClass, $type$[] a, int offset,
int[] indexMap, int mapOffset, M m) {
$Type$Species vsp = vspecies();
IntVector.IntSpecies isp = IntVector.species(vsp.indexShape());
Objects.requireNonNull(a);
Objects.requireNonNull(indexMap);
m.check(vsp);
Class<? extends $abstractvectortype$> vectorType = vsp.vectorType();
// Constant folding should sweep out following conditional logic.
VectorSpecies<Integer> lsp;
if (isp.length() > IntVector.SPECIES_PREFERRED.length()) {
lsp = IntVector.SPECIES_PREFERRED;
} else {
lsp = isp;
}
// Check indices are within array bounds.
// FIXME: Check indices only for lanes where the mask is set.
for (int i = 0; i < vsp.length(); i += lsp.length()) {
IntVector vix = IntVector
.fromArray(lsp, indexMap, mapOffset + i)
.add(offset);
VectorIntrinsics.checkIndex(vix, a.length);
}
return VectorSupport.loadWithMap(
vectorType, maskClass, $type$.class, vsp.laneCount(),
lsp.vectorType(),
a, ARRAY_BASE, null, m,
a, offset, indexMap, mapOffset, vsp,
(c, idx, iMap, idy, s, vm) ->
s.vOp(vm, n -> c[idx + iMap[idy+n]]));
}
#else[byteOrShort]
@ForceInline
final
<M extends VectorMask<$Boxtype$>>
@ -4852,7 +4907,7 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
(c, idx, iMap, idy, s, vm) ->
s.vOp(vm, n -> c[idx + iMap[idy+n]]));
}
#end[!byteOrShort]
#end[byteOrShort]
#if[short]
/*package-private*/

View File

@ -1151,14 +1151,12 @@ final class $vectortype$ extends $abstractvectortype$ {
return super.fromArray0Template($masktype$.class, a, offset, ($masktype$) m, offsetInRange); // specialize
}
#if[!byteOrShort]
@ForceInline
@Override
final
$abstractvectortype$ fromArray0($type$[] a, int offset, int[] indexMap, int mapOffset, VectorMask<$Boxtype$> m) {
return super.fromArray0Template($masktype$.class, a, offset, indexMap, mapOffset, ($masktype$) m);
}
#end[!byteOrShort]
#if[short]
@ForceInline

View File

@ -0,0 +1,357 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
package org.openjdk.bench.jdk.incubator.vector;
import jdk.incubator.vector.*;
import java.util.Random;
import java.util.stream.IntStream;
import java.util.concurrent.TimeUnit;
import org.openjdk.jmh.annotations.*;
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Thread)
@Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
public class GatherOperationsBenchmark {
    // JMH micro-benchmarks for subword (byte/short) vector gather operations,
    // i.e. ByteVector.fromArray / ShortVector.fromArray with an int index map.
    //
    // Method-name legend:
    //   microByteGatherNNN / microShortGatherNNN  - gather with species of NNN bits
    //   ..._NZ_OFF                                - gather with a Non-Zero array OFFset (1)
    //   ..._MASK                                  - gather under a mask (every other lane set)
    //   ..._MASK_NZ_OFF                           - masked gather with non-zero offset

    // Number of elements processed per invocation (JMH @Param sweep).
    @Param({"64", "256", "1024", "4096"})
    int SIZE;
    // Byte source/result arrays for the Byte* benchmarks.
    byte [] barr;
    byte [] bres;
    // Short source/result arrays for the Short* benchmarks.
    short [] sarr;
    short [] sres;
    // Index map shared by all benchmarks; values are in [0, SIZE-2] so that
    // the _NZ_OFF variants (offset 1) still stay within array bounds.
    int [] index;

    // Short species for each vector width exercised.
    static final VectorSpecies<Short> S64 = ShortVector.SPECIES_64;
    static final VectorSpecies<Short> S128 = ShortVector.SPECIES_128;
    static final VectorSpecies<Short> S256 = ShortVector.SPECIES_256;
    static final VectorSpecies<Short> S512 = ShortVector.SPECIES_512;

    // Byte species for each vector width exercised.
    static final VectorSpecies<Byte> B64 = ByteVector.SPECIES_64;
    static final VectorSpecies<Byte> B128 = ByteVector.SPECIES_128;
    static final VectorSpecies<Byte> B256 = ByteVector.SPECIES_256;
    static final VectorSpecies<Byte> B512 = ByteVector.SPECIES_512;

    // One-time setup: fixed seed keeps the index map identical across runs,
    // making results comparable between forks and JVMs.
    @Setup(Level.Trial)
    public void BmSetup() {
        Random r = new Random(1245);
        index = new int[SIZE];
        barr = new byte[SIZE];
        bres = new byte[SIZE];
        sarr = new short[SIZE];
        sres = new short[SIZE];

        for (int i = 0; i < SIZE; i++) {
            barr[i] = (byte)i;
            sarr[i] = (short)i;
            // nextInt(SIZE-1) yields [0, SIZE-2]; with the +1 offset variants
            // the highest touched element is SIZE-1, still in bounds.
            index[i] = r.nextInt(SIZE-1);
        }
    }

    // ---- Byte gathers, 64-bit species ----

    @Benchmark
    public void microByteGather64() {
        for (int i = 0; i < SIZE; i += B64.length()) {
            ByteVector.fromArray(B64, barr, 0, index, i)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather64_NZ_OFF() {
        for (int i = 0; i < SIZE; i += B64.length()) {
            ByteVector.fromArray(B64, barr, 1, index, i)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather64_MASK() {
        // 0x55... selects every other lane (lane 0, 2, 4, ...).
        VectorMask<Byte> VMASK = VectorMask.fromLong(B64, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += B64.length()) {
            ByteVector.fromArray(B64, barr, 0, index, i, VMASK)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather64_MASK_NZ_OFF() {
        VectorMask<Byte> VMASK = VectorMask.fromLong(B64, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += B64.length()) {
            ByteVector.fromArray(B64, barr, 1, index, i, VMASK)
                      .intoArray(bres, i);
        }
    }

    // ---- Byte gathers, 128-bit species ----

    @Benchmark
    public void microByteGather128() {
        for (int i = 0; i < SIZE; i += B128.length()) {
            ByteVector.fromArray(B128, barr, 0, index, i)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather128_NZ_OFF() {
        for (int i = 0; i < SIZE; i += B128.length()) {
            ByteVector.fromArray(B128, barr, 1, index, i)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather128_MASK() {
        VectorMask<Byte> VMASK = VectorMask.fromLong(B128, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += B128.length()) {
            ByteVector.fromArray(B128, barr, 0, index, i, VMASK)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather128_MASK_NZ_OFF() {
        VectorMask<Byte> VMASK = VectorMask.fromLong(B128, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += B128.length()) {
            ByteVector.fromArray(B128, barr, 1, index, i, VMASK)
                      .intoArray(bres, i);
        }
    }

    // ---- Byte gathers, 256-bit species ----

    @Benchmark
    public void microByteGather256() {
        for (int i = 0; i < SIZE; i += B256.length()) {
            ByteVector.fromArray(B256, barr, 0, index, i)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather256_NZ_OFF() {
        for (int i = 0; i < SIZE; i += B256.length()) {
            ByteVector.fromArray(B256, barr, 1, index, i)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather256_MASK() {
        VectorMask<Byte> VMASK = VectorMask.fromLong(B256, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += B256.length()) {
            ByteVector.fromArray(B256, barr, 0, index, i, VMASK)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather256_MASK_NZ_OFF() {
        VectorMask<Byte> VMASK = VectorMask.fromLong(B256, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += B256.length()) {
            ByteVector.fromArray(B256, barr, 1, index, i, VMASK)
                      .intoArray(bres, i);
        }
    }

    // ---- Byte gathers, 512-bit species ----

    @Benchmark
    public void microByteGather512() {
        for (int i = 0; i < SIZE; i += B512.length()) {
            ByteVector.fromArray(B512, barr, 0, index, i)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather512_NZ_OFF() {
        for (int i = 0; i < SIZE; i += B512.length()) {
            ByteVector.fromArray(B512, barr, 1, index, i)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather512_MASK() {
        VectorMask<Byte> VMASK = VectorMask.fromLong(B512, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += B512.length()) {
            ByteVector.fromArray(B512, barr, 0, index, i, VMASK)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather512_MASK_NZ_OFF() {
        VectorMask<Byte> VMASK = VectorMask.fromLong(B512, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += B512.length()) {
            ByteVector.fromArray(B512, barr, 1, index, i, VMASK)
                      .intoArray(bres, i);
        }
    }

    // ---- Short gathers, 64-bit species ----

    @Benchmark
    public void microShortGather64() {
        for (int i = 0; i < SIZE; i += S64.length()) {
            ShortVector.fromArray(S64, sarr, 0, index, i)
                      .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather64_NZ_OFF() {
        for (int i = 0; i < SIZE; i += S64.length()) {
            ShortVector.fromArray(S64, sarr, 1, index, i)
                      .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather64_MASK() {
        VectorMask<Short> VMASK = VectorMask.fromLong(S64, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S64.length()) {
            ShortVector.fromArray(S64, sarr, 0, index, i, VMASK)
                      .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather64_MASK_NZ_OFF() {
        VectorMask<Short> VMASK = VectorMask.fromLong(S64, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S64.length()) {
            ShortVector.fromArray(S64, sarr, 1, index, i, VMASK)
                      .intoArray(sres, i);
        }
    }

    // ---- Short gathers, 128-bit species ----

    @Benchmark
    public void microShortGather128() {
        for (int i = 0; i < SIZE; i += S128.length()) {
            ShortVector.fromArray(S128, sarr, 0, index, i)
                      .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather128_NZ_OFF() {
        for (int i = 0; i < SIZE; i += S128.length()) {
            ShortVector.fromArray(S128, sarr, 1, index, i)
                      .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather128_MASK() {
        VectorMask<Short> VMASK = VectorMask.fromLong(S128, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S128.length()) {
            ShortVector.fromArray(S128, sarr, 0, index, i, VMASK)
                      .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather128_MASK_NZ_OFF() {
        VectorMask<Short> VMASK = VectorMask.fromLong(S128, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S128.length()) {
            ShortVector.fromArray(S128, sarr, 1, index, i, VMASK)
                      .intoArray(sres, i);
        }
    }

    // ---- Short gathers, 256-bit species ----

    @Benchmark
    public void microShortGather256() {
        for (int i = 0; i < SIZE; i += S256.length()) {
            ShortVector.fromArray(S256, sarr, 0, index, i)
                      .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather256_NZ_OFF() {
        for (int i = 0; i < SIZE; i += S256.length()) {
            ShortVector.fromArray(S256, sarr, 1, index, i)
                      .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather256_MASK() {
        VectorMask<Short> VMASK = VectorMask.fromLong(S256, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S256.length()) {
            ShortVector.fromArray(S256, sarr, 0, index, i, VMASK)
                      .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather256_MASK_NZ_OFF() {
        VectorMask<Short> VMASK = VectorMask.fromLong(S256, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S256.length()) {
            ShortVector.fromArray(S256, sarr, 1, index, i, VMASK)
                      .intoArray(sres, i);
        }
    }

    // ---- Short gathers, 512-bit species ----

    @Benchmark
    public void microShortGather512() {
        for (int i = 0; i < SIZE; i += S512.length()) {
            ShortVector.fromArray(S512, sarr, 0, index, i)
                      .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather512_NZ_OFF() {
        for (int i = 0; i < SIZE; i += S512.length()) {
            ShortVector.fromArray(S512, sarr, 1, index, i)
                      .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather512_MASK() {
        VectorMask<Short> VMASK = VectorMask.fromLong(S512, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S512.length()) {
            ShortVector.fromArray(S512, sarr, 0, index, i, VMASK)
                      .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather512_MASK_NZ_OFF() {
        VectorMask<Short> VMASK = VectorMask.fromLong(S512, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S512.length()) {
            ShortVector.fromArray(S512, sarr, 1, index, i, VMASK)
                      .intoArray(sres, i);
        }
    }
}