diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index 467d6ec2250..4e8eb47ee5f 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -169,9 +169,7 @@ source %{ case Op_VectorMaskGen: case Op_LoadVectorMasked: case Op_StoreVectorMasked: - case Op_LoadVectorGather: case Op_StoreVectorScatter: - case Op_LoadVectorGatherMasked: case Op_StoreVectorScatterMasked: case Op_PopulateIndex: case Op_CompressM: @@ -180,6 +178,12 @@ source %{ return false; } break; + case Op_LoadVectorGather: + case Op_LoadVectorGatherMasked: + if (UseSVE == 0 || is_subword_type(bt)) { + return false; + } + break; case Op_MulAddVS2VI: if (length_in_bytes != 16) { return false; diff --git a/src/hotspot/cpu/aarch64/matcher_aarch64.hpp b/src/hotspot/cpu/aarch64/matcher_aarch64.hpp index 08bff22d7d0..4d28e5ade66 100644 --- a/src/hotspot/cpu/aarch64/matcher_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/matcher_aarch64.hpp @@ -133,6 +133,11 @@ return true; } + // Does target support predicated operation emulation. + static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) { + return false; + } + // Does the CPU supports vector variable rotate instructions? static constexpr bool supports_vector_variable_rotates(void) { return false; diff --git a/src/hotspot/cpu/arm/matcher_arm.hpp b/src/hotspot/cpu/arm/matcher_arm.hpp index eb26cbcbd7a..716a997a72b 100644 --- a/src/hotspot/cpu/arm/matcher_arm.hpp +++ b/src/hotspot/cpu/arm/matcher_arm.hpp @@ -126,6 +126,11 @@ return VM_Version::has_simd(); } + // Does target support predicated operation emulation. + static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) { + return false; + } + // Does the CPU supports vector variable rotate instructions? 
static constexpr bool supports_vector_variable_rotates(void) { return false; // not supported diff --git a/src/hotspot/cpu/ppc/matcher_ppc.hpp b/src/hotspot/cpu/ppc/matcher_ppc.hpp index b195ba4eeb2..0ee4245f274 100644 --- a/src/hotspot/cpu/ppc/matcher_ppc.hpp +++ b/src/hotspot/cpu/ppc/matcher_ppc.hpp @@ -133,6 +133,11 @@ return false; } + // Does target support predicated operation emulation. + static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) { + return false; + } + // Does the CPU supports vector variable rotate instructions? static constexpr bool supports_vector_variable_rotates(void) { return false; diff --git a/src/hotspot/cpu/riscv/matcher_riscv.hpp b/src/hotspot/cpu/riscv/matcher_riscv.hpp index 08914d4d834..32665f5922c 100644 --- a/src/hotspot/cpu/riscv/matcher_riscv.hpp +++ b/src/hotspot/cpu/riscv/matcher_riscv.hpp @@ -132,6 +132,11 @@ return false; } + // Does target support predicated operation emulation. + static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) { + return false; + } + // Does the CPU supports vector variable rotate instructions? static constexpr bool supports_vector_variable_rotates(void) { return false; diff --git a/src/hotspot/cpu/riscv/riscv_v.ad b/src/hotspot/cpu/riscv/riscv_v.ad index 86309ec9128..01a367a46ea 100644 --- a/src/hotspot/cpu/riscv/riscv_v.ad +++ b/src/hotspot/cpu/riscv/riscv_v.ad @@ -73,6 +73,11 @@ source %{ return false; } break; + case Op_LoadVectorGatherMasked: + if (is_subword_type(bt)) { + return false; + } + break; case Op_VectorCastHF2F: case Op_VectorCastF2HF: return UseZvfh; diff --git a/src/hotspot/cpu/s390/matcher_s390.hpp b/src/hotspot/cpu/s390/matcher_s390.hpp index 450ea35a6cb..6c6cae3c58f 100644 --- a/src/hotspot/cpu/s390/matcher_s390.hpp +++ b/src/hotspot/cpu/s390/matcher_s390.hpp @@ -124,6 +124,11 @@ return false; } + // Does target support predicated operation emulation. 
+ static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) { + return false; + } + // Does the CPU supports vector variable rotate instructions? static constexpr bool supports_vector_variable_rotates(void) { return false; diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp index dd3f9c64e20..6ed04ad5211 100644 --- a/src/hotspot/cpu/x86/assembler_x86.cpp +++ b/src/hotspot/cpu/x86/assembler_x86.cpp @@ -13652,9 +13652,13 @@ void Assembler::notq(Register dst) { emit_int16((unsigned char)0xF7, (0xD0 | encode)); } +void Assembler::btq(Register dst, Register src) { + int encode = prefixq_and_encode(src->encoding(), dst->encoding()); + emit_int24(0x0F, (unsigned char)0xA3, (encode | 0xC0)); +} + void Assembler::btq(Register src, int imm8) { assert(isByte(imm8), "not a byte"); - InstructionMark im(this); int encode = prefixq_and_encode(src->encoding()); emit_int16(0x0f, 0xba); emit_int8(0xe0|encode); diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp index 656b2a97c70..64e8cf99bfc 100644 --- a/src/hotspot/cpu/x86/assembler_x86.hpp +++ b/src/hotspot/cpu/x86/assembler_x86.hpp @@ -1736,6 +1736,7 @@ private: void btrq(Address dst, int imm8); void btq(Register src, int imm8); #endif + void btq(Register dst, Register src); void orw(Register dst, Register src); diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp index c79753618c0..d0eb103d81b 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp @@ -1796,6 +1796,130 @@ void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, } } +#ifdef _LP64 +void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, + XMMRegister dst, Register base, + Register idx_base, + Register offset, Register mask, + Register mask_idx, Register rtmp, + int vlen_enc) { + vpxor(dst, dst, dst, vlen_enc); + if 
(elem_bt == T_SHORT) { + for (int i = 0; i < 4; i++) { + // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 + Label skip_load; + btq(mask, mask_idx); + jccb(Assembler::carryClear, skip_load); + movl(rtmp, Address(idx_base, i * 4)); + if (offset != noreg) { + addl(rtmp, offset); + } + pinsrw(dst, Address(base, rtmp, Address::times_2), i); + bind(skip_load); + incq(mask_idx); + } + } else { + assert(elem_bt == T_BYTE, ""); + for (int i = 0; i < 8; i++) { + // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 + Label skip_load; + btq(mask, mask_idx); + jccb(Assembler::carryClear, skip_load); + movl(rtmp, Address(idx_base, i * 4)); + if (offset != noreg) { + addl(rtmp, offset); + } + pinsrb(dst, Address(base, rtmp), i); + bind(skip_load); + incq(mask_idx); + } + } +} +#endif // _LP64 + +void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, + Register base, Register idx_base, + Register offset, Register rtmp, + int vlen_enc) { + vpxor(dst, dst, dst, vlen_enc); + if (elem_bt == T_SHORT) { + for (int i = 0; i < 4; i++) { + // dst[i] = src[offset + idx_base[i]] + movl(rtmp, Address(idx_base, i * 4)); + if (offset != noreg) { + addl(rtmp, offset); + } + pinsrw(dst, Address(base, rtmp, Address::times_2), i); + } + } else { + assert(elem_bt == T_BYTE, ""); + for (int i = 0; i < 8; i++) { + // dst[i] = src[offset + idx_base[i]] + movl(rtmp, Address(idx_base, i * 4)); + if (offset != noreg) { + addl(rtmp, offset); + } + pinsrb(dst, Address(base, rtmp), i); + } + } +} + +/* + * Gather using hybrid algorithm, first partially unroll scalar loop + * to accumulate values from gather indices into a quad-word(64bit) slice. + * A slice may hold 8 bytes or 4 short values. This is followed by a vector + * permutation to place the slice into appropriate vector lane + * locations in destination vector. 
Following pseudo code describes the + * algorithm in detail: + * + * DST_VEC = ZERO_VEC + * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} + * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} + * FOREACH_ITER: + * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES + * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX + * DST_VEC = DST_VEC OR TEMP_PERM_VEC + * PERM_INDEX = PERM_INDEX - TWO_VEC + * + * With each iteration, doubleword permute indices (0,1) corresponding + * to gathered quadword gets right shifted by two lane positions. + * + */ +void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, + Register base, Register idx_base, + Register offset, Register mask, + XMMRegister xtmp1, XMMRegister xtmp2, + XMMRegister temp_dst, Register rtmp, + Register mask_idx, Register length, + int vector_len, int vlen_enc) { + Label GATHER8_LOOP; + assert(is_subword_type(elem_ty), ""); + movl(length, vector_len); + vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} + vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} + vallones(xtmp2, vlen_enc); + vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); + vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} + load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} + + bind(GATHER8_LOOP); + // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES + if (mask == noreg) { + vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); + } else { + LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc)); + } + // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) + vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? 
vlen_enc : Assembler::AVX_256bit); + // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) + vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); + // DST_VEC = DST_VEC OR TEMP_PERM_VEC + vpor(dst, dst, temp_dst, vlen_enc); + addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); + subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); + jcc(Assembler::notEqual, GATHER8_LOOP); +} + void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { switch(typ) { case T_INT: diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp index 26f7fb44aa9..8c22990892b 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp @@ -500,4 +500,16 @@ public: void vector_rearrange_int_float(BasicType bt, XMMRegister dst, XMMRegister shuffle, XMMRegister src, int vlen_enc); + + void vgather_subword(BasicType elem_ty, XMMRegister dst, Register base, Register idx_base, Register offset, + Register mask, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, + Register midx, Register length, int vector_len, int vlen_enc); + +#ifdef _LP64 + void vgather8b_masked_offset(BasicType elem_bt, XMMRegister dst, Register base, Register idx_base, + Register offset, Register mask, Register midx, Register rtmp, int vlen_enc); +#endif + void vgather8b_offset(BasicType elem_bt, XMMRegister dst, Register base, Register idx_base, + Register offset, Register rtmp, int vlen_enc); + #endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP diff --git a/src/hotspot/cpu/x86/matcher_x86.hpp b/src/hotspot/cpu/x86/matcher_x86.hpp index de844c4be9f..192e959451f 100644 --- a/src/hotspot/cpu/x86/matcher_x86.hpp +++ b/src/hotspot/cpu/x86/matcher_x86.hpp @@ -154,6 +154,16 @@ return (UseAVX >= 2); } + // Does target support predicated operation emulation. 
+ static bool supports_vector_predicate_op_emulation(int vopc, int vlen, BasicType bt) { + switch(vopc) { + case Op_LoadVectorGatherMasked: + return is_subword_type(bt) && VM_Version::supports_avx2(); + default: + return false; + } + } + // Does the CPU supports vector variable rotate instructions? static constexpr bool supports_vector_variable_rotates(void) { return true; @@ -214,6 +224,9 @@ return 7; case Op_MulVL: return VM_Version::supports_avx512vldq() ? 0 : 6; + case Op_LoadVectorGather: + case Op_LoadVectorGatherMasked: + return is_subword_type(ety) ? 50 : 0; case Op_VectorCastF2X: // fall through case Op_VectorCastD2X: return is_floating_point_type(ety) ? 0 : (is_subword_type(ety) ? 35 : 30); diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index 0b262bb9c37..7bbae30c832 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -1569,6 +1569,7 @@ bool Matcher::match_rule_supported(int opcode) { } break; case Op_LoadVectorGather: + case Op_LoadVectorGatherMasked: if (UseAVX < 2) { return false; } @@ -1906,6 +1907,17 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { } break; case Op_LoadVectorGatherMasked: + if (!is_subword_type(bt) && size_in_bits < 512 && !VM_Version::supports_avx512vl()) { + return false; + } + if (is_subword_type(bt) && + (!is_LP64 || + (size_in_bits > 256 && !VM_Version::supports_avx512bw()) || + (size_in_bits < 64) || + (bt == T_SHORT && !VM_Version::supports_bmi2()))) { + return false; + } + break; case Op_StoreVectorScatterMasked: case Op_StoreVectorScatter: if (is_subword_type(bt)) { @@ -1915,7 +1927,10 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { } // fallthrough case Op_LoadVectorGather: - if (size_in_bits == 64 ) { + if (!is_subword_type(bt) && size_in_bits == 64) { + return false; + } + if (is_subword_type(bt) && size_in_bits < 64) { return false; } break; @@ -4024,10 +4039,11 @@ instruct storeV(memory mem, vec src) %{ // 
---------------------------------------- Gather ------------------------------------ -// Gather INT, LONG, FLOAT, DOUBLE +// Gather BYTE, SHORT, INT, LONG, FLOAT, DOUBLE instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{ - predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32); + predicate(!VM_Version::supports_avx512vl() && !is_subword_type(Matcher::vector_element_basic_type(n)) && + Matcher::vector_length_in_bytes(n) <= 32); match(Set dst (LoadVectorGather mem idx)); effect(TEMP dst, TEMP tmp, TEMP mask); format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %} @@ -4044,7 +4060,8 @@ instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{ instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{ - predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64); + predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) && + !is_subword_type(Matcher::vector_element_basic_type(n))); match(Set dst (LoadVectorGather mem idx)); effect(TEMP dst, TEMP tmp, TEMP ktmp); format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and ktmp as TEMP" %} @@ -4059,7 +4076,8 @@ instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{ %} instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{ - predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64); + predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) && + !is_subword_type(Matcher::vector_element_basic_type(n))); match(Set dst (LoadVectorGatherMasked mem (Binary idx mask))); effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp); format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! 
using $tmp and ktmp as TEMP" %} @@ -4077,6 +4095,238 @@ instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRe %} ins_pipe( pipe_slow ); %} + +instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegI rtmp) %{ + predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8); + match(Set dst (LoadVectorGather mem (Binary idx_base offset))); + effect(TEMP tmp, TEMP rtmp); + format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + BasicType elem_bt = Matcher::vector_element_basic_type(this); + __ lea($tmp$$Register, $mem$$Address); + __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp$$Register, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegP idx_base_temp, + vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{ + predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8); + match(Set dst (LoadVectorGather mem (Binary idx_base offset))); + effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr); + format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! 
using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + int vector_len = Matcher::vector_length(this); + BasicType elem_bt = Matcher::vector_element_basic_type(this); + __ lea($tmp$$Register, $mem$$Address); + __ movptr($idx_base_temp$$Register, $idx_base$$Register); + __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, noreg, $xtmp1$$XMMRegister, + $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct vgather_subwordLE8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegI rtmp, rFlagsReg cr) %{ + predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8); + match(Set dst (LoadVectorGather mem (Binary idx_base offset))); + effect(TEMP tmp, TEMP rtmp, KILL cr); + format %{ "vector_gatherLE8_off $dst, $mem, $idx_base, $offset\t! using $tmp and $rtmp as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + BasicType elem_bt = Matcher::vector_element_basic_type(this); + __ lea($tmp$$Register, $mem$$Address); + __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, $rtmp$$Register, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + + +instruct vgather_subwordGT8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegP idx_base_temp, + vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{ + predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8); + match(Set dst (LoadVectorGather mem (Binary idx_base offset))); + effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr); + format %{ "vector_gatherGT8_off $dst, $mem, $idx_base, $offset\t! 
using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + int vector_len = Matcher::vector_length(this); + BasicType elem_bt = Matcher::vector_element_basic_type(this); + __ lea($tmp$$Register, $mem$$Address); + __ movptr($idx_base_temp$$Register, $idx_base$$Register); + __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, noreg, $xtmp1$$XMMRegister, + $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + + +#ifdef _LP64 +instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{ + predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8); + match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); + effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr); + format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! 
using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + BasicType elem_bt = Matcher::vector_element_basic_type(this); + __ xorq($mask_idx$$Register, $mask_idx$$Register); + __ lea($tmp$$Register, $mem$$Address); + __ kmovql($rtmp2$$Register, $mask$$KRegister); + __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegP tmp, rRegP idx_base_temp, + vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{ + predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8); + match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); + effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr); + format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! 
using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + int vector_len = Matcher::vector_length(this); + BasicType elem_bt = Matcher::vector_element_basic_type(this); + __ xorq($mask_idx$$Register, $mask_idx$$Register); + __ lea($tmp$$Register, $mem$$Address); + __ movptr($idx_base_temp$$Register, $idx_base$$Register); + __ kmovql($rtmp2$$Register, $mask$$KRegister); + __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister, + $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct vgather_masked_subwordLE8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{ + predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8); + match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); + effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr); + format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! 
using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + BasicType elem_bt = Matcher::vector_element_basic_type(this); + __ xorq($mask_idx$$Register, $mask_idx$$Register); + __ lea($tmp$$Register, $mem$$Address); + __ kmovql($rtmp2$$Register, $mask$$KRegister); + __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, + $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct vgather_masked_subwordGT8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegP tmp, rRegP idx_base_temp, + vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{ + predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8); + match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); + effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr); + format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! 
using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + int vector_len = Matcher::vector_length(this); + BasicType elem_bt = Matcher::vector_element_basic_type(this); + __ xorq($mask_idx$$Register, $mask_idx$$Register); + __ lea($tmp$$Register, $mem$$Address); + __ movptr($idx_base_temp$$Register, $idx_base$$Register); + __ kmovql($rtmp2$$Register, $mask$$KRegister); + __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister, + $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{ + predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8); + match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); + effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr); + format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! 
using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + BasicType elem_bt = Matcher::vector_element_basic_type(this); + __ lea($tmp$$Register, $mem$$Address); + __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc); + if (elem_bt == T_SHORT) { + __ movl($mask_idx$$Register, 0x55555555); + __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register); + } + __ xorl($mask_idx$$Register, $mask_idx$$Register); + __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegP tmp, rRegP idx_base_temp, + vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{ + predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8); + match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); + effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr); + format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! 
using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + int vector_len = Matcher::vector_length(this); + BasicType elem_bt = Matcher::vector_element_basic_type(this); + __ lea($tmp$$Register, $mem$$Address); + __ movptr($idx_base_temp$$Register, $idx_base$$Register); + __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc); + if (elem_bt == T_SHORT) { + __ movl($mask_idx$$Register, 0x55555555); + __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register); + } + __ xorl($mask_idx$$Register, $mask_idx$$Register); + __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister, + $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct vgather_masked_subwordLE8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{ + predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8); + match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); + effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr); + format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! 
using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + BasicType elem_bt = Matcher::vector_element_basic_type(this); + __ lea($tmp$$Register, $mem$$Address); + __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc); + if (elem_bt == T_SHORT) { + __ movl($mask_idx$$Register, 0x55555555); + __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register); + } + __ xorl($mask_idx$$Register, $mask_idx$$Register); + __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, + $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct vgather_masked_subwordGT8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegP tmp, rRegP idx_base_temp, + vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{ + predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8); + match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); + effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr); + format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! 
using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + int vector_len = Matcher::vector_length(this); + BasicType elem_bt = Matcher::vector_element_basic_type(this); + __ xorl($mask_idx$$Register, $mask_idx$$Register); + __ lea($tmp$$Register, $mem$$Address); + __ movptr($idx_base_temp$$Register, $idx_base$$Register); + __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc); + if (elem_bt == T_SHORT) { + __ movl($mask_idx$$Register, 0x55555555); + __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register); + } + __ xorl($mask_idx$$Register, $mask_idx$$Register); + __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister, + $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} +#endif + // ====================Scatter======================================= // Scatter INT, LONG, FLOAT, DOUBLE diff --git a/src/hotspot/share/opto/loopTransform.cpp b/src/hotspot/share/opto/loopTransform.cpp index d49117beb87..516036d839b 100644 --- a/src/hotspot/share/opto/loopTransform.cpp +++ b/src/hotspot/share/opto/loopTransform.cpp @@ -1050,6 +1050,8 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) { } break; case Op_CountTrailingZerosV: case Op_CountLeadingZerosV: + case Op_LoadVectorGather: + case Op_LoadVectorGatherMasked: case Op_ReverseV: case Op_RoundVF: case Op_RoundVD: diff --git a/src/hotspot/share/opto/matcher.cpp b/src/hotspot/share/opto/matcher.cpp index d14dce77208..74e2ba603ac 100644 --- a/src/hotspot/share/opto/matcher.cpp +++ b/src/hotspot/share/opto/matcher.cpp @@ -2474,7 +2474,22 @@ void Matcher::find_shared_post_visit(Node* n, uint opcode) { n->del_req(3); break; } + case Op_LoadVectorGather: + if 
(is_subword_type(n->bottom_type()->is_vect()->element_basic_type())) { + Node* pair = new BinaryNode(n->in(MemNode::ValueIn), n->in(MemNode::ValueIn+1)); + n->set_req(MemNode::ValueIn, pair); + n->del_req(MemNode::ValueIn+1); + } + break; case Op_LoadVectorGatherMasked: + if (is_subword_type(n->bottom_type()->is_vect()->element_basic_type())) { + Node* pair2 = new BinaryNode(n->in(MemNode::ValueIn + 1), n->in(MemNode::ValueIn + 2)); + Node* pair1 = new BinaryNode(n->in(MemNode::ValueIn), pair2); + n->set_req(MemNode::ValueIn, pair1); + n->del_req(MemNode::ValueIn+2); + n->del_req(MemNode::ValueIn+1); + break; + } // fall-through case Op_StoreVectorScatter: { Node* pair = new BinaryNode(n->in(MemNode::ValueIn), n->in(MemNode::ValueIn+1)); n->set_req(MemNode::ValueIn, pair); diff --git a/src/hotspot/share/opto/vectorIntrinsics.cpp b/src/hotspot/share/opto/vectorIntrinsics.cpp index 5249d8d67af..cee221c8b75 100644 --- a/src/hotspot/share/opto/vectorIntrinsics.cpp +++ b/src/hotspot/share/opto/vectorIntrinsics.cpp @@ -302,6 +302,7 @@ bool LibraryCallKit::arch_supports_vector(int sopc, int num_elem, BasicType type is_supported = Matcher::match_rule_supported_vector_masked(sopc, num_elem, type); } } + is_supported |= Matcher::supports_vector_predicate_op_emulation(sopc, num_elem, type); if (!is_supported) { #ifndef PRODUCT @@ -1500,8 +1501,8 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { } // Check whether the predicated gather/scatter node is supported by architecture. - if (!arch_supports_vector(is_scatter ? Op_StoreVectorScatterMasked : Op_LoadVectorGatherMasked, num_elem, elem_bt, - (VectorMaskUseType) (VecMaskUseLoad | VecMaskUsePred))) { + VectorMaskUseType mask = (VectorMaskUseType) (VecMaskUseLoad | VecMaskUsePred); + if (!arch_supports_vector(is_scatter ? 
Op_StoreVectorScatterMasked : Op_LoadVectorGatherMasked, num_elem, elem_bt, mask)) { if (C->print_intrinsics()) { tty->print_cr(" ** not supported: arity=%d op=%s vlen=%d etype=%s is_masked_op=1", is_scatter, is_scatter ? "scatterMasked" : "gatherMasked", num_elem, type2name(elem_bt)); @@ -1522,7 +1523,8 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { } // Check that the vector holding indices is supported by architecture - if (!arch_supports_vector(Op_LoadVector, num_elem, T_INT, VecMaskNotUsed)) { + // For sub-word gathers the expander receives an index array. + if (!is_subword_type(elem_bt) && !arch_supports_vector(Op_LoadVector, num_elem, T_INT, VecMaskNotUsed)) { if (C->print_intrinsics()) { tty->print_cr(" ** not supported: arity=%d op=%s/loadindex vlen=%d etype=int is_masked_op=%d", is_scatter, is_scatter ? "scatter" : "gather", @@ -1564,12 +1566,15 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { return false; } + Node* index_vect = nullptr; const TypeInstPtr* vbox_idx_type = TypeInstPtr::make_exact(TypePtr::NotNull, vbox_idx_klass); - Node* index_vect = unbox_vector(argument(8), vbox_idx_type, T_INT, num_elem); - if (index_vect == nullptr) { - set_map(old_map); - set_sp(old_sp); - return false; + if (!is_subword_type(elem_bt)) { + index_vect = unbox_vector(argument(8), vbox_idx_type, T_INT, num_elem); + if (index_vect == nullptr) { + set_map(old_map); + set_sp(old_sp); + return false; + } } Node* mask = nullptr; @@ -1608,10 +1613,23 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { set_memory(vstore, addr_type); } else { Node* vload = nullptr; + Node* index = argument(11); + Node* indexMap = argument(12); + Node* indexM = argument(13); if (mask != nullptr) { - vload = gvn().transform(new LoadVectorGatherMaskedNode(control(), memory(addr), addr, addr_type, vector_type, index_vect, mask)); + if (is_subword_type(elem_bt)) { + Node* index_arr_base = array_element_address(indexMap, indexM, T_INT); + vload = gvn().transform(new
LoadVectorGatherMaskedNode(control(), memory(addr), addr, addr_type, vector_type, index_arr_base, mask, index)); + } else { + vload = gvn().transform(new LoadVectorGatherMaskedNode(control(), memory(addr), addr, addr_type, vector_type, index_vect, mask)); + } } else { - vload = gvn().transform(new LoadVectorGatherNode(control(), memory(addr), addr, addr_type, vector_type, index_vect)); + if (is_subword_type(elem_bt)) { + Node* index_arr_base = array_element_address(indexMap, indexM, T_INT); + vload = gvn().transform(new LoadVectorGatherNode(control(), memory(addr), addr, addr_type, vector_type, index_arr_base, index)); + } else { + vload = gvn().transform(new LoadVectorGatherNode(control(), memory(addr), addr, addr_type, vector_type, index_vect)); + } } Node* box = box_vector(vload, vbox_type, elem_bt, num_elem); set_result(box); diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index ae37202cd25..ff31d2a51b8 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -890,16 +890,26 @@ class LoadVectorNode : public LoadNode { // Load Vector from memory via index map class LoadVectorGatherNode : public LoadVectorNode { public: - LoadVectorGatherNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices) + LoadVectorGatherNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices, Node* offset = nullptr) : LoadVectorNode(c, mem, adr, at, vt) { init_class_id(Class_LoadVectorGather); - assert(indices->bottom_type()->is_vect(), "indices must be in vector"); add_req(indices); - assert(req() == MemNode::ValueIn + 1, "match_edge expects that last input is in MemNode::ValueIn"); + DEBUG_ONLY(bool is_subword = is_subword_type(vt->element_basic_type())); + assert(is_subword || indices->bottom_type()->is_vect(), "indices must be in vector"); + assert(is_subword || !offset, ""); + assert(req() == MemNode::ValueIn + 1, "match_edge expects 
that index input is in MemNode::ValueIn"); + if (offset) { + add_req(offset); + } } virtual int Opcode() const; - virtual uint match_edge(uint idx) const { return idx == MemNode::Address || idx == MemNode::ValueIn; } + virtual uint match_edge(uint idx) const { + return idx == MemNode::Address || + idx == MemNode::ValueIn || + ((is_subword_type(vect_type()->element_basic_type())) && + idx == MemNode::ValueIn + 1); + } }; //------------------------------StoreVectorNode-------------------------------- @@ -1003,20 +1013,23 @@ class LoadVectorMaskedNode : public LoadVectorNode { // Load Vector from memory via index map under the influence of a predicate register(mask). class LoadVectorGatherMaskedNode : public LoadVectorNode { public: - LoadVectorGatherMaskedNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices, Node* mask) + LoadVectorGatherMaskedNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices, Node* mask, Node* offset = nullptr) : LoadVectorNode(c, mem, adr, at, vt) { init_class_id(Class_LoadVectorGatherMasked); - assert(indices->bottom_type()->is_vect(), "indices must be in vector"); - assert(mask->bottom_type()->isa_vectmask(), "sanity"); add_req(indices); add_req(mask); assert(req() == MemNode::ValueIn + 2, "match_edge expects that last input is in MemNode::ValueIn+1"); + if (is_subword_type(vt->element_basic_type())) { + add_req(offset); + } } virtual int Opcode() const; virtual uint match_edge(uint idx) const { return idx == MemNode::Address || idx == MemNode::ValueIn || - idx == MemNode::ValueIn + 1; } + idx == MemNode::ValueIn + 1 || + (is_subword_type(vect_type()->is_vect()->element_basic_type()) && + idx == MemNode::ValueIn + 2); } }; //------------------------------StoreVectorScatterMaskedNode-------------------------------- diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Byte128Vector.java 
b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Byte128Vector.java index af60895899f..a889d10fb43 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Byte128Vector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Byte128Vector.java @@ -893,6 +893,12 @@ final class Byte128Vector extends ByteVector { return super.fromArray0Template(Byte128Mask.class, a, offset, (Byte128Mask) m, offsetInRange); // specialize } + @ForceInline + @Override + final + ByteVector fromArray0(byte[] a, int offset, int[] indexMap, int mapOffset, VectorMask m) { + return super.fromArray0Template(Byte128Mask.class, a, offset, indexMap, mapOffset, (Byte128Mask) m); + } @ForceInline diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Byte256Vector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Byte256Vector.java index 1dcbbd26907..7f07c32ab13 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Byte256Vector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Byte256Vector.java @@ -925,6 +925,12 @@ final class Byte256Vector extends ByteVector { return super.fromArray0Template(Byte256Mask.class, a, offset, (Byte256Mask) m, offsetInRange); // specialize } + @ForceInline + @Override + final + ByteVector fromArray0(byte[] a, int offset, int[] indexMap, int mapOffset, VectorMask m) { + return super.fromArray0Template(Byte256Mask.class, a, offset, indexMap, mapOffset, (Byte256Mask) m); + } @ForceInline diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Byte512Vector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Byte512Vector.java index 9e99a1916a7..20bf261999a 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Byte512Vector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Byte512Vector.java @@ -989,6 +989,12 @@ final class Byte512Vector extends ByteVector { return 
super.fromArray0Template(Byte512Mask.class, a, offset, (Byte512Mask) m, offsetInRange); // specialize } + @ForceInline + @Override + final + ByteVector fromArray0(byte[] a, int offset, int[] indexMap, int mapOffset, VectorMask m) { + return super.fromArray0Template(Byte512Mask.class, a, offset, indexMap, mapOffset, (Byte512Mask) m); + } @ForceInline diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Byte64Vector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Byte64Vector.java index 85276b2eb19..2756128b469 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Byte64Vector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Byte64Vector.java @@ -877,6 +877,12 @@ final class Byte64Vector extends ByteVector { return super.fromArray0Template(Byte64Mask.class, a, offset, (Byte64Mask) m, offsetInRange); // specialize } + @ForceInline + @Override + final + ByteVector fromArray0(byte[] a, int offset, int[] indexMap, int mapOffset, VectorMask m) { + return super.fromArray0Template(Byte64Mask.class, a, offset, indexMap, mapOffset, (Byte64Mask) m); + } @ForceInline diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteMaxVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteMaxVector.java index ff035f13294..c2f5e6f85a9 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteMaxVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteMaxVector.java @@ -863,6 +863,12 @@ final class ByteMaxVector extends ByteVector { return super.fromArray0Template(ByteMaxMask.class, a, offset, (ByteMaxMask) m, offsetInRange); // specialize } + @ForceInline + @Override + final + ByteVector fromArray0(byte[] a, int offset, int[] indexMap, int mapOffset, VectorMask m) { + return super.fromArray0Template(ByteMaxMask.class, a, offset, indexMap, mapOffset, (ByteMaxMask) m); + } @ForceInline diff --git 
a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteVector.java index 4fc8626754a..8fae8d71b04 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteVector.java @@ -3049,7 +3049,35 @@ public abstract class ByteVector extends AbstractVector { byte[] a, int offset, int[] indexMap, int mapOffset) { ByteSpecies vsp = (ByteSpecies) species; - return vsp.vOp(n -> a[offset + indexMap[mapOffset + n]]); + IntVector.IntSpecies isp = IntVector.species(vsp.indexShape()); + Objects.requireNonNull(a); + Objects.requireNonNull(indexMap); + Class vectorType = vsp.vectorType(); + + + // Constant folding should sweep out following conditional logic. + VectorSpecies lsp; + if (isp.length() > IntVector.SPECIES_PREFERRED.length()) { + lsp = IntVector.SPECIES_PREFERRED; + } else { + lsp = isp; + } + + // Check indices are within array bounds.
+ for (int i = 0; i < vsp.length(); i += lsp.length()) { + IntVector vix = IntVector + .fromArray(lsp, indexMap, mapOffset + i) + .add(offset); + VectorIntrinsics.checkIndex(vix, a.length); + } + + return VectorSupport.loadWithMap( + vectorType, null, byte.class, vsp.laneCount(), + lsp.vectorType(), + a, ARRAY_BASE, null, null, + a, offset, indexMap, mapOffset, vsp, + (c, idx, iMap, idy, s, vm) -> + s.vOp(n -> c[idx + iMap[idy+n]])); } /** @@ -3094,8 +3122,13 @@ public abstract class ByteVector extends AbstractVector { byte[] a, int offset, int[] indexMap, int mapOffset, VectorMask m) { - ByteSpecies vsp = (ByteSpecies) species; - return vsp.vOp(m, n -> a[offset + indexMap[mapOffset + n]]); + if (m.allTrue()) { + return fromArray(species, a, offset, indexMap, mapOffset); + } + else { + ByteSpecies vsp = (ByteSpecies) species; + return vsp.dummyVector().fromArray0(a, offset, indexMap, mapOffset, m); + } } @@ -3760,6 +3793,49 @@ public abstract class ByteVector extends AbstractVector { (arr_, off_, i) -> arr_[off_ + i])); } + /*package-private*/ + abstract + ByteVector fromArray0(byte[] a, int offset, + int[] indexMap, int mapOffset, + VectorMask m); + @ForceInline + final + > + ByteVector fromArray0Template(Class maskClass, byte[] a, int offset, + int[] indexMap, int mapOffset, M m) { + ByteSpecies vsp = vspecies(); + IntVector.IntSpecies isp = IntVector.species(vsp.indexShape()); + Objects.requireNonNull(a); + Objects.requireNonNull(indexMap); + m.check(vsp); + Class vectorType = vsp.vectorType(); + + + // Constant folding should sweep out following conditonal logic. + VectorSpecies lsp; + if (isp.length() > IntVector.SPECIES_PREFERRED.length()) { + lsp = IntVector.SPECIES_PREFERRED; + } else { + lsp = isp; + } + + // Check indices are within array bounds. + // FIXME: Check index under mask controlling. 
+ for (int i = 0; i < vsp.length(); i += lsp.length()) { + IntVector vix = IntVector + .fromArray(lsp, indexMap, mapOffset + i) + .add(offset); + VectorIntrinsics.checkIndex(vix, a.length); + } + + return VectorSupport.loadWithMap( + vectorType, maskClass, byte.class, vsp.laneCount(), + lsp.vectorType(), + a, ARRAY_BASE, null, m, + a, offset, indexMap, mapOffset, vsp, + (c, idx, iMap, idy, s, vm) -> + s.vOp(vm, n -> c[idx + iMap[idy+n]])); + } /*package-private*/ diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Short128Vector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Short128Vector.java index 8ae0638e4f3..3930826aa09 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Short128Vector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Short128Vector.java @@ -877,6 +877,12 @@ final class Short128Vector extends ShortVector { return super.fromArray0Template(Short128Mask.class, a, offset, (Short128Mask) m, offsetInRange); // specialize } + @ForceInline + @Override + final + ShortVector fromArray0(short[] a, int offset, int[] indexMap, int mapOffset, VectorMask m) { + return super.fromArray0Template(Short128Mask.class, a, offset, indexMap, mapOffset, (Short128Mask) m); + } @ForceInline @Override diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Short256Vector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Short256Vector.java index cd9d8ceb887..e39e89f6137 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Short256Vector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Short256Vector.java @@ -893,6 +893,12 @@ final class Short256Vector extends ShortVector { return super.fromArray0Template(Short256Mask.class, a, offset, (Short256Mask) m, offsetInRange); // specialize } + @ForceInline + @Override + final + ShortVector fromArray0(short[] a, int offset, int[] indexMap, int mapOffset, VectorMask m) { + 
return super.fromArray0Template(Short256Mask.class, a, offset, indexMap, mapOffset, (Short256Mask) m); + } @ForceInline @Override diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Short512Vector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Short512Vector.java index 2a959a8181c..1caea78f748 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Short512Vector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Short512Vector.java @@ -925,6 +925,12 @@ final class Short512Vector extends ShortVector { return super.fromArray0Template(Short512Mask.class, a, offset, (Short512Mask) m, offsetInRange); // specialize } + @ForceInline + @Override + final + ShortVector fromArray0(short[] a, int offset, int[] indexMap, int mapOffset, VectorMask m) { + return super.fromArray0Template(Short512Mask.class, a, offset, indexMap, mapOffset, (Short512Mask) m); + } @ForceInline @Override diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Short64Vector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Short64Vector.java index 6090e9cf0d1..640be746f15 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Short64Vector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Short64Vector.java @@ -869,6 +869,12 @@ final class Short64Vector extends ShortVector { return super.fromArray0Template(Short64Mask.class, a, offset, (Short64Mask) m, offsetInRange); // specialize } + @ForceInline + @Override + final + ShortVector fromArray0(short[] a, int offset, int[] indexMap, int mapOffset, VectorMask m) { + return super.fromArray0Template(Short64Mask.class, a, offset, indexMap, mapOffset, (Short64Mask) m); + } @ForceInline @Override diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortMaxVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortMaxVector.java index d451cd4443f..96683ac53c4 100644 --- 
a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortMaxVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortMaxVector.java @@ -863,6 +863,12 @@ final class ShortMaxVector extends ShortVector { return super.fromArray0Template(ShortMaxMask.class, a, offset, (ShortMaxMask) m, offsetInRange); // specialize } + @ForceInline + @Override + final + ShortVector fromArray0(short[] a, int offset, int[] indexMap, int mapOffset, VectorMask m) { + return super.fromArray0Template(ShortMaxMask.class, a, offset, indexMap, mapOffset, (ShortMaxMask) m); + } @ForceInline @Override diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortVector.java index 84f542f07ff..ba21e8a9e95 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortVector.java @@ -3050,7 +3050,35 @@ public abstract class ShortVector extends AbstractVector { short[] a, int offset, int[] indexMap, int mapOffset) { ShortSpecies vsp = (ShortSpecies) species; - return vsp.vOp(n -> a[offset + indexMap[mapOffset + n]]); + IntVector.IntSpecies isp = IntVector.species(vsp.indexShape()); + Objects.requireNonNull(a); + Objects.requireNonNull(indexMap); + Class vectorType = vsp.vectorType(); + + + // Constant folding should sweep out following conditional logic. + VectorSpecies lsp; + if (isp.length() > IntVector.SPECIES_PREFERRED.length()) { + lsp = IntVector.SPECIES_PREFERRED; + } else { + lsp = isp; + } + + // Check indices are within array bounds.
+ for (int i = 0; i < vsp.length(); i += lsp.length()) { + IntVector vix = IntVector + .fromArray(lsp, indexMap, mapOffset + i) + .add(offset); + VectorIntrinsics.checkIndex(vix, a.length); + } + + return VectorSupport.loadWithMap( + vectorType, null, short.class, vsp.laneCount(), + lsp.vectorType(), + a, ARRAY_BASE, null, null, + a, offset, indexMap, mapOffset, vsp, + (c, idx, iMap, idy, s, vm) -> + s.vOp(n -> c[idx + iMap[idy+n]])); } /** @@ -3095,8 +3123,13 @@ public abstract class ShortVector extends AbstractVector { short[] a, int offset, int[] indexMap, int mapOffset, VectorMask m) { - ShortSpecies vsp = (ShortSpecies) species; - return vsp.vOp(m, n -> a[offset + indexMap[mapOffset + n]]); + if (m.allTrue()) { + return fromArray(species, a, offset, indexMap, mapOffset); + } + else { + ShortSpecies vsp = (ShortSpecies) species; + return vsp.dummyVector().fromArray0(a, offset, indexMap, mapOffset, m); + } } /** @@ -3746,6 +3779,49 @@ public abstract class ShortVector extends AbstractVector { (arr_, off_, i) -> arr_[off_ + i])); } + /*package-private*/ + abstract + ShortVector fromArray0(short[] a, int offset, + int[] indexMap, int mapOffset, + VectorMask m); + @ForceInline + final + > + ShortVector fromArray0Template(Class maskClass, short[] a, int offset, + int[] indexMap, int mapOffset, M m) { + ShortSpecies vsp = vspecies(); + IntVector.IntSpecies isp = IntVector.species(vsp.indexShape()); + Objects.requireNonNull(a); + Objects.requireNonNull(indexMap); + m.check(vsp); + Class vectorType = vsp.vectorType(); + + + // Constant folding should sweep out following conditonal logic. + VectorSpecies lsp; + if (isp.length() > IntVector.SPECIES_PREFERRED.length()) { + lsp = IntVector.SPECIES_PREFERRED; + } else { + lsp = isp; + } + + // Check indices are within array bounds. + // FIXME: Check index under mask controlling. 
+ for (int i = 0; i < vsp.length(); i += lsp.length()) { + IntVector vix = IntVector + .fromArray(lsp, indexMap, mapOffset + i) + .add(offset); + VectorIntrinsics.checkIndex(vix, a.length); + } + + return VectorSupport.loadWithMap( + vectorType, maskClass, short.class, vsp.laneCount(), + lsp.vectorType(), + a, ARRAY_BASE, null, m, + a, offset, indexMap, mapOffset, vsp, + (c, idx, iMap, idy, s, vm) -> + s.vOp(vm, n -> c[idx + iMap[idy+n]])); + } /*package-private*/ abstract diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-Vector.java.template b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-Vector.java.template index ad878268404..d7562bae475 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-Vector.java.template +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-Vector.java.template @@ -3622,7 +3622,35 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> { $type$[] a, int offset, int[] indexMap, int mapOffset) { $Type$Species vsp = ($Type$Species) species; - return vsp.vOp(n -> a[offset + indexMap[mapOffset + n]]); + IntVector.IntSpecies isp = IntVector.species(vsp.indexShape()); + Objects.requireNonNull(a); + Objects.requireNonNull(indexMap); + Class vectorType = vsp.vectorType(); + + + // Constant folding should sweep out following conditonal logic. + VectorSpecies lsp; + if (isp.length() > IntVector.SPECIES_PREFERRED.length()) { + lsp = IntVector.SPECIES_PREFERRED; + } else { + lsp = isp; + } + + // Check indices are within array bounds. 
+ for (int i = 0; i < vsp.length(); i += lsp.length()) { + IntVector vix = IntVector + .fromArray(lsp, indexMap, mapOffset + i) + .add(offset); + VectorIntrinsics.checkIndex(vix, a.length); + } + + return VectorSupport.loadWithMap( + vectorType, null, $type$.class, vsp.laneCount(), + lsp.vectorType(), + a, ARRAY_BASE, null, null, + a, offset, indexMap, mapOffset, vsp, + (c, idx, iMap, idy, s, vm) -> + s.vOp(n -> c[idx + iMap[idy+n]])); } #else[byteOrShort] @ForceInline @@ -3714,17 +3742,6 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> { * where the mask is set * @see $abstractvectortype$#toIntArray() */ -#if[byteOrShort] - @ForceInline - public static - $abstractvectortype$ fromArray(VectorSpecies<$Boxtype$> species, - $type$[] a, int offset, - int[] indexMap, int mapOffset, - VectorMask<$Boxtype$> m) { - $Type$Species vsp = ($Type$Species) species; - return vsp.vOp(m, n -> a[offset + indexMap[mapOffset + n]]); - } -#else[byteOrShort] @ForceInline public static $abstractvectortype$ fromArray(VectorSpecies<$Boxtype$> species, @@ -3739,7 +3756,6 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> { return vsp.dummyVector().fromArray0(a, offset, indexMap, mapOffset, m); } } -#end[byteOrShort] #if[short] /** @@ -4793,12 +4809,51 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> { (arr_, off_, i) -> arr_[off_ + i])); } -#if[!byteOrShort] /*package-private*/ abstract $abstractvectortype$ fromArray0($type$[] a, int offset, int[] indexMap, int mapOffset, VectorMask<$Boxtype$> m); +#if[byteOrShort] + @ForceInline + final + > + $abstractvectortype$ fromArray0Template(Class maskClass, $type$[] a, int offset, + int[] indexMap, int mapOffset, M m) { + $Type$Species vsp = vspecies(); + IntVector.IntSpecies isp = IntVector.species(vsp.indexShape()); + Objects.requireNonNull(a); + Objects.requireNonNull(indexMap); + m.check(vsp); + Class vectorType = vsp.vectorType(); + + + // Constant 
folding should sweep out following conditonal logic. + VectorSpecies lsp; + if (isp.length() > IntVector.SPECIES_PREFERRED.length()) { + lsp = IntVector.SPECIES_PREFERRED; + } else { + lsp = isp; + } + + // Check indices are within array bounds. + // FIXME: Check index under mask controlling. + for (int i = 0; i < vsp.length(); i += lsp.length()) { + IntVector vix = IntVector + .fromArray(lsp, indexMap, mapOffset + i) + .add(offset); + VectorIntrinsics.checkIndex(vix, a.length); + } + + return VectorSupport.loadWithMap( + vectorType, maskClass, $type$.class, vsp.laneCount(), + lsp.vectorType(), + a, ARRAY_BASE, null, m, + a, offset, indexMap, mapOffset, vsp, + (c, idx, iMap, idy, s, vm) -> + s.vOp(vm, n -> c[idx + iMap[idy+n]])); + } +#else[byteOrShort] @ForceInline final > @@ -4852,7 +4907,7 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> { (c, idx, iMap, idy, s, vm) -> s.vOp(vm, n -> c[idx + iMap[idy+n]])); } -#end[!byteOrShort] +#end[byteOrShort] #if[short] /*package-private*/ diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-VectorBits.java.template b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-VectorBits.java.template index f2b36066fa7..cebdc7594d6 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-VectorBits.java.template +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-VectorBits.java.template @@ -1151,14 +1151,12 @@ final class $vectortype$ extends $abstractvectortype$ { return super.fromArray0Template($masktype$.class, a, offset, ($masktype$) m, offsetInRange); // specialize } -#if[!byteOrShort] @ForceInline @Override final $abstractvectortype$ fromArray0($type$[] a, int offset, int[] indexMap, int mapOffset, VectorMask<$Boxtype$> m) { return super.fromArray0Template($masktype$.class, a, offset, indexMap, mapOffset, ($masktype$) m); } -#end[!byteOrShort] #if[short] @ForceInline diff --git 
/*
 * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

package org.openjdk.bench.jdk.incubator.vector;

import jdk.incubator.vector.*;
import java.util.Random;
import java.util.stream.IntStream;
import java.util.concurrent.TimeUnit;
import org.openjdk.jmh.annotations.*;

/**
 * Measures the throughput of Vector API gather loads ({@code fromArray} with
 * an index map) for subword element types (byte and short) across vector
 * species widths of 64/128/256/512 bits.
 *
 * Benchmark naming scheme:
 * <ul>
 *   <li>{@code _NZ_OFF}  — gather with a non-zero base offset (1) into the source array</li>
 *   <li>{@code _MASK}    — masked gather with alternating lanes enabled</li>
 *   <li>{@code _MASK_NZ_OFF} — masked gather combined with the non-zero offset</li>
 * </ul>
 */
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Thread)
@Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
public class GatherOperationsBenchmark {
    /** Number of elements gathered per benchmark invocation. */
    @Param({"64", "256", "1024", "4096"})
    int SIZE;
    byte[] barr;     // byte gather source
    byte[] bres;     // byte gather destination
    short[] sarr;    // short gather source
    short[] sres;    // short gather destination
    int[] index;     // random index map shared by all benchmarks

    // Short species, one per vector width under test.
    static final VectorSpecies<Short> S64  = ShortVector.SPECIES_64;
    static final VectorSpecies<Short> S128 = ShortVector.SPECIES_128;
    static final VectorSpecies<Short> S256 = ShortVector.SPECIES_256;
    static final VectorSpecies<Short> S512 = ShortVector.SPECIES_512;
    // Byte species, one per vector width under test.
    static final VectorSpecies<Byte> B64  = ByteVector.SPECIES_64;
    static final VectorSpecies<Byte> B128 = ByteVector.SPECIES_128;
    static final VectorSpecies<Byte> B256 = ByteVector.SPECIES_256;
    static final VectorSpecies<Byte> B512 = ByteVector.SPECIES_512;

    @Setup(Level.Trial)
    public void BmSetup() {
        Random r = new Random(1245);  // fixed seed for reproducible index maps
        index = new int[SIZE];
        barr = new byte[SIZE];
        bres = new byte[SIZE];
        sarr = new short[SIZE];
        sres = new short[SIZE];
        for (int i = 0; i < SIZE; i++) {
            barr[i] = (byte) i;
            sarr[i] = (short) i;
            // Bound is SIZE-1 (exclusive), i.e. indices in [0, SIZE-2], so the
            // _NZ_OFF variants (base offset 1) stay within array bounds.
            index[i] = r.nextInt(SIZE - 1);
        }
    }

    // ---------------- byte gathers, 64-bit species ----------------

    @Benchmark
    public void microByteGather64() {
        for (int i = 0; i < SIZE; i += B64.length()) {
            ByteVector.fromArray(B64, barr, 0, index, i)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather64_NZ_OFF() {
        for (int i = 0; i < SIZE; i += B64.length()) {
            ByteVector.fromArray(B64, barr, 1, index, i)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather64_MASK() {
        // 0x5555... enables every even-numbered lane (alternating mask).
        VectorMask<Byte> VMASK = VectorMask.fromLong(B64, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += B64.length()) {
            ByteVector.fromArray(B64, barr, 0, index, i, VMASK)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather64_MASK_NZ_OFF() {
        VectorMask<Byte> VMASK = VectorMask.fromLong(B64, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += B64.length()) {
            ByteVector.fromArray(B64, barr, 1, index, i, VMASK)
                      .intoArray(bres, i);
        }
    }

    // ---------------- byte gathers, 128-bit species ----------------

    @Benchmark
    public void microByteGather128() {
        for (int i = 0; i < SIZE; i += B128.length()) {
            ByteVector.fromArray(B128, barr, 0, index, i)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather128_NZ_OFF() {
        for (int i = 0; i < SIZE; i += B128.length()) {
            ByteVector.fromArray(B128, barr, 1, index, i)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather128_MASK() {
        VectorMask<Byte> VMASK = VectorMask.fromLong(B128, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += B128.length()) {
            ByteVector.fromArray(B128, barr, 0, index, i, VMASK)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather128_MASK_NZ_OFF() {
        VectorMask<Byte> VMASK = VectorMask.fromLong(B128, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += B128.length()) {
            ByteVector.fromArray(B128, barr, 1, index, i, VMASK)
                      .intoArray(bres, i);
        }
    }

    // ---------------- byte gathers, 256-bit species ----------------

    @Benchmark
    public void microByteGather256() {
        for (int i = 0; i < SIZE; i += B256.length()) {
            ByteVector.fromArray(B256, barr, 0, index, i)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather256_NZ_OFF() {
        for (int i = 0; i < SIZE; i += B256.length()) {
            ByteVector.fromArray(B256, barr, 1, index, i)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather256_MASK() {
        VectorMask<Byte> VMASK = VectorMask.fromLong(B256, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += B256.length()) {
            ByteVector.fromArray(B256, barr, 0, index, i, VMASK)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather256_MASK_NZ_OFF() {
        VectorMask<Byte> VMASK = VectorMask.fromLong(B256, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += B256.length()) {
            ByteVector.fromArray(B256, barr, 1, index, i, VMASK)
                      .intoArray(bres, i);
        }
    }

    // ---------------- byte gathers, 512-bit species ----------------

    @Benchmark
    public void microByteGather512() {
        for (int i = 0; i < SIZE; i += B512.length()) {
            ByteVector.fromArray(B512, barr, 0, index, i)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather512_NZ_OFF() {
        for (int i = 0; i < SIZE; i += B512.length()) {
            ByteVector.fromArray(B512, barr, 1, index, i)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather512_MASK() {
        VectorMask<Byte> VMASK = VectorMask.fromLong(B512, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += B512.length()) {
            ByteVector.fromArray(B512, barr, 0, index, i, VMASK)
                      .intoArray(bres, i);
        }
    }

    @Benchmark
    public void microByteGather512_MASK_NZ_OFF() {
        VectorMask<Byte> VMASK = VectorMask.fromLong(B512, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += B512.length()) {
            ByteVector.fromArray(B512, barr, 1, index, i, VMASK)
                      .intoArray(bres, i);
        }
    }

    // ---------------- short gathers, 64-bit species ----------------

    @Benchmark
    public void microShortGather64() {
        for (int i = 0; i < SIZE; i += S64.length()) {
            ShortVector.fromArray(S64, sarr, 0, index, i)
                       .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather64_NZ_OFF() {
        for (int i = 0; i < SIZE; i += S64.length()) {
            ShortVector.fromArray(S64, sarr, 1, index, i)
                       .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather64_MASK() {
        VectorMask<Short> VMASK = VectorMask.fromLong(S64, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S64.length()) {
            ShortVector.fromArray(S64, sarr, 0, index, i, VMASK)
                       .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather64_MASK_NZ_OFF() {
        VectorMask<Short> VMASK = VectorMask.fromLong(S64, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S64.length()) {
            ShortVector.fromArray(S64, sarr, 1, index, i, VMASK)
                       .intoArray(sres, i);
        }
    }

    // ---------------- short gathers, 128-bit species ----------------

    @Benchmark
    public void microShortGather128() {
        for (int i = 0; i < SIZE; i += S128.length()) {
            ShortVector.fromArray(S128, sarr, 0, index, i)
                       .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather128_NZ_OFF() {
        for (int i = 0; i < SIZE; i += S128.length()) {
            ShortVector.fromArray(S128, sarr, 1, index, i)
                       .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather128_MASK() {
        VectorMask<Short> VMASK = VectorMask.fromLong(S128, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S128.length()) {
            ShortVector.fromArray(S128, sarr, 0, index, i, VMASK)
                       .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather128_MASK_NZ_OFF() {
        VectorMask<Short> VMASK = VectorMask.fromLong(S128, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S128.length()) {
            ShortVector.fromArray(S128, sarr, 1, index, i, VMASK)
                       .intoArray(sres, i);
        }
    }

    // ---------------- short gathers, 256-bit species ----------------

    @Benchmark
    public void microShortGather256() {
        for (int i = 0; i < SIZE; i += S256.length()) {
            ShortVector.fromArray(S256, sarr, 0, index, i)
                       .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather256_NZ_OFF() {
        for (int i = 0; i < SIZE; i += S256.length()) {
            ShortVector.fromArray(S256, sarr, 1, index, i)
                       .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather256_MASK() {
        VectorMask<Short> VMASK = VectorMask.fromLong(S256, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S256.length()) {
            ShortVector.fromArray(S256, sarr, 0, index, i, VMASK)
                       .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather256_MASK_NZ_OFF() {
        VectorMask<Short> VMASK = VectorMask.fromLong(S256, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S256.length()) {
            ShortVector.fromArray(S256, sarr, 1, index, i, VMASK)
                       .intoArray(sres, i);
        }
    }

    // ---------------- short gathers, 512-bit species ----------------

    @Benchmark
    public void microShortGather512() {
        for (int i = 0; i < SIZE; i += S512.length()) {
            ShortVector.fromArray(S512, sarr, 0, index, i)
                       .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather512_NZ_OFF() {
        for (int i = 0; i < SIZE; i += S512.length()) {
            ShortVector.fromArray(S512, sarr, 1, index, i)
                       .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather512_MASK() {
        VectorMask<Short> VMASK = VectorMask.fromLong(S512, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S512.length()) {
            ShortVector.fromArray(S512, sarr, 0, index, i, VMASK)
                       .intoArray(sres, i);
        }
    }

    @Benchmark
    public void microShortGather512_MASK_NZ_OFF() {
        VectorMask<Short> VMASK = VectorMask.fromLong(S512, 0x5555555555555555L);
        for (int i = 0; i < SIZE; i += S512.length()) {
            ShortVector.fromArray(S512, sarr, 1, index, i, VMASK)
                       .intoArray(sres, i);
        }
    }
}