From 2ba8a06f0c0a598a6ca7f74e75bab4208e6fa689 Mon Sep 17 00:00:00 2001 From: Bhavana Kilambi Date: Fri, 1 Aug 2025 13:11:21 +0000 Subject: [PATCH] 8348868: AArch64: Add backend support for SelectFromTwoVector Co-authored-by: Jatin Bhateja Reviewed-by: haosun, aph, sviswanathan, xgong --- src/hotspot/cpu/aarch64/aarch64.ad | 120 +++++ src/hotspot/cpu/aarch64/aarch64_vector.ad | 90 ++++ src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 | 53 ++ src/hotspot/cpu/aarch64/assembler_aarch64.hpp | 23 +- .../cpu/aarch64/c2_MacroAssembler_aarch64.cpp | 121 +++++ .../cpu/aarch64/c2_MacroAssembler_aarch64.hpp | 14 + src/hotspot/cpu/x86/x86.ad | 5 +- src/hotspot/share/opto/vectorIntrinsics.cpp | 5 +- test/hotspot/gtest/aarch64/aarch64-asmtest.py | 4 + test/hotspot/gtest/aarch64/asmtest.out.h | 47 +- .../compiler/lib/ir_framework/IRNode.java | 30 ++ .../ir_framework/test/IREncodingPrinter.java | 1 + .../vectorapi/TestSelectFromTwoVectorOp.java | 486 ++++++++++++++++++ 13 files changed, 973 insertions(+), 26 deletions(-) create mode 100644 test/hotspot/jtreg/compiler/vectorapi/TestSelectFromTwoVectorOp.java diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad index 404ab8d9ba4..517da8066de 100644 --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -881,6 +881,46 @@ reg_class vectorx_reg( V31, V31_H, V31_J, V31_K ); +// Class for vector register V10 +reg_class v10_veca_reg( + V10, V10_H, V10_J, V10_K +); + +// Class for vector register V11 +reg_class v11_veca_reg( + V11, V11_H, V11_J, V11_K +); + +// Class for vector register V12 +reg_class v12_veca_reg( + V12, V12_H, V12_J, V12_K +); + +// Class for vector register V13 +reg_class v13_veca_reg( + V13, V13_H, V13_J, V13_K +); + +// Class for vector register V17 +reg_class v17_veca_reg( + V17, V17_H, V17_J, V17_K +); + +// Class for vector register V18 +reg_class v18_veca_reg( + V18, V18_H, V18_J, V18_K +); + +// Class for vector register V23 +reg_class v23_veca_reg( + V23, 
V23_H, V23_J, V23_K +); + +// Class for vector register V24 +reg_class v24_veca_reg( + V24, V24_H, V24_J, V24_K +); + // Class for 128 bit register v0 reg_class v0_reg( V0, V0_H @@ -4969,6 +5009,86 @@ operand vReg() interface(REG_INTER); %} +operand vReg_V10() +%{ + constraint(ALLOC_IN_RC(v10_veca_reg)); + match(vReg); + + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vReg_V11() +%{ + constraint(ALLOC_IN_RC(v11_veca_reg)); + match(vReg); + + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vReg_V12() +%{ + constraint(ALLOC_IN_RC(v12_veca_reg)); + match(vReg); + + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vReg_V13() +%{ + constraint(ALLOC_IN_RC(v13_veca_reg)); + match(vReg); + + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vReg_V17() +%{ + constraint(ALLOC_IN_RC(v17_veca_reg)); + match(vReg); + + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vReg_V18() +%{ + constraint(ALLOC_IN_RC(v18_veca_reg)); + match(vReg); + + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vReg_V23() +%{ + constraint(ALLOC_IN_RC(v23_veca_reg)); + match(vReg); + + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vReg_V24() +%{ + constraint(ALLOC_IN_RC(v24_veca_reg)); + match(vReg); + + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + operand vecA() %{ constraint(ALLOC_IN_RC(vectora_reg)); diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index 1b6296ddd8b..58300992c2a 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -257,6 +257,28 @@ source %{ return false; } break; + case Op_SelectFromTwoVector: + // The "tbl" instruction for two vector table is supported only in Neon and SVE2. Return + // false if vector length > 16B but supported SVE version < 2. 
+ // For vector length of 16B, generate SVE2 "tbl" instruction if SVE2 is supported, else + // generate Neon "tbl" instruction to select from two vectors. + // This operation is disabled for doubles and longs on machines with SVE < 2 and instead + // the default VectorRearrange + VectorBlend is generated because the performance of the default + // implementation was better than or equal to the implementation for SelectFromTwoVector. + if (UseSVE < 2 && (type2aelembytes(bt) == 8 || length_in_bytes > 16)) { + return false; + } + + // Because the SVE2 "tbl" instruction is unpredicated and partial operations cannot be generated + // using masks, we disable this operation on machines where length_in_bytes < MaxVectorSize + // on that machine with the only exception of 8B vector length. This is because at the time of + // writing this, there is no SVE2 machine available with length_in_bytes > 8 and + // length_in_bytes < MaxVectorSize to test this operation on (for example - there isn't an + // SVE2 machine available with MaxVectorSize = 32 to test a case with length_in_bytes = 16). + if (UseSVE == 2 && length_in_bytes > 8 && length_in_bytes < MaxVectorSize) { + return false; + } + break; default: break; } @@ -7172,3 +7194,71 @@ instruct vexpandBits(vReg dst, vReg src1, vReg src2) %{ %} ins_pipe(pipe_slow); %} + +// ------------------------------------- SelectFromTwoVector ------------------------------------ +// The Neon and SVE2 tbl instruction for two vector lookup requires both the source vectors to be +// consecutive. The match rules for SelectFromTwoVector reserve two consecutive vector registers +// for src1 and src2. +// Four combinations of vector registers for vselect_from_two_vectors are chosen at random +// (two from volatile and two from non-volatile set) which gives more freedom to the register +// allocator to choose the best pair of source registers at that point. 
+ +instruct vselect_from_two_vectors_10_11(vReg dst, vReg_V10 src1, vReg_V11 src2, + vReg index, vReg tmp) %{ + effect(TEMP_DEF dst, TEMP tmp); + match(Set dst (SelectFromTwoVector (Binary index src1) src2)); + format %{ "vselect_from_two_vectors_10_11 $dst, $src1, $src2, $index\t# KILL $tmp" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + __ select_from_two_vectors($dst$$FloatRegister, $src1$$FloatRegister, + $src2$$FloatRegister, $index$$FloatRegister, + $tmp$$FloatRegister, bt, length_in_bytes); + %} + ins_pipe(pipe_slow); +%} + +instruct vselect_from_two_vectors_12_13(vReg dst, vReg_V12 src1, vReg_V13 src2, + vReg index, vReg tmp) %{ + effect(TEMP_DEF dst, TEMP tmp); + match(Set dst (SelectFromTwoVector (Binary index src1) src2)); + format %{ "vselect_from_two_vectors_12_13 $dst, $src1, $src2, $index\t# KILL $tmp" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + __ select_from_two_vectors($dst$$FloatRegister, $src1$$FloatRegister, + $src2$$FloatRegister, $index$$FloatRegister, + $tmp$$FloatRegister, bt, length_in_bytes); + %} + ins_pipe(pipe_slow); +%} + +instruct vselect_from_two_vectors_17_18(vReg dst, vReg_V17 src1, vReg_V18 src2, + vReg index, vReg tmp) %{ + effect(TEMP_DEF dst, TEMP tmp); + match(Set dst (SelectFromTwoVector (Binary index src1) src2)); + format %{ "vselect_from_two_vectors_17_18 $dst, $src1, $src2, $index\t# KILL $tmp" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + __ select_from_two_vectors($dst$$FloatRegister, $src1$$FloatRegister, + $src2$$FloatRegister, $index$$FloatRegister, + $tmp$$FloatRegister, bt, length_in_bytes); + %} + ins_pipe(pipe_slow); +%} + +instruct vselect_from_two_vectors_23_24(vReg dst, vReg_V23 src1, vReg_V24 src2, + vReg index, vReg 
tmp) %{ + effect(TEMP_DEF dst, TEMP tmp); + match(Set dst (SelectFromTwoVector (Binary index src1) src2)); + format %{ "vselect_from_two_vectors_23_24 $dst, $src1, $src2, $index\t# KILL $tmp" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + __ select_from_two_vectors($dst$$FloatRegister, $src1$$FloatRegister, + $src2$$FloatRegister, $index$$FloatRegister, + $tmp$$FloatRegister, bt, length_in_bytes); + %} + ins_pipe(pipe_slow); +%} diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index efefbf692bd..4d91e04dc21 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -247,6 +247,28 @@ source %{ return false; } break; + case Op_SelectFromTwoVector: + // The "tbl" instruction for two vector table is supported only in Neon and SVE2. Return + // false if vector length > 16B but supported SVE version < 2. + // For vector length of 16B, generate SVE2 "tbl" instruction if SVE2 is supported, else + // generate Neon "tbl" instruction to select from two vectors. + // This operation is disabled for doubles and longs on machines with SVE < 2 and instead + // the default VectorRearrange + VectorBlend is generated because the performance of the default + // implementation was better than or equal to the implementation for SelectFromTwoVector. + if (UseSVE < 2 && (type2aelembytes(bt) == 8 || length_in_bytes > 16)) { + return false; + } + + // Because the SVE2 "tbl" instruction is unpredicated and partial operations cannot be generated + // using masks, we disable this operation on machines where length_in_bytes < MaxVectorSize + // on that machine with the only exception of 8B vector length. 
This is because at the time of + // writing this, there is no SVE2 machine available with length_in_bytes > 8 and + // length_in_bytes < MaxVectorSize to test this operation on (for example - there isn't an + // SVE2 machine available with MaxVectorSize = 32 to test a case with length_in_bytes = 16). + if (UseSVE == 2 && length_in_bytes > 8 && length_in_bytes < MaxVectorSize) { + return false; + } + break; default: break; } @@ -5154,3 +5176,34 @@ BITPERM(vcompressBits, CompressBitsV, sve_bext) // ----------------------------------- ExpandBitsV --------------------------------- BITPERM(vexpandBits, ExpandBitsV, sve_bdep) + +// ------------------------------------- SelectFromTwoVector ------------------------------------ +// The Neon and SVE2 tbl instruction for two vector lookup requires both the source vectors to be +// consecutive. The match rules for SelectFromTwoVector reserve two consecutive vector registers +// for src1 and src2. +// Four combinations of vector registers for vselect_from_two_vectors are chosen at random +// (two from volatile and two from non-volatile set) which gives more freedom to the register +// allocator to choose the best pair of source registers at that point. 
+dnl +dnl SELECT_FROM_TWO_VECTORS($1, $2 ) +dnl SELECT_FROM_TWO_VECTORS(first_reg, second_reg) +define(`SELECT_FROM_TWO_VECTORS', ` +instruct vselect_from_two_vectors_$1_$2(vReg dst, vReg_V$1 src1, vReg_V$2 src2, + vReg index, vReg tmp) %{ + effect(TEMP_DEF dst, TEMP tmp); + match(Set dst (SelectFromTwoVector (Binary index src1) src2)); + format %{ "vselect_from_two_vectors_$1_$2 $dst, $src1, $src2, $index\t# KILL $tmp" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + __ select_from_two_vectors($dst$$FloatRegister, $src1$$FloatRegister, + $src2$$FloatRegister, $index$$FloatRegister, + $tmp$$FloatRegister, bt, length_in_bytes); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +SELECT_FROM_TWO_VECTORS(10, 11) +SELECT_FROM_TWO_VECTORS(12, 13) +SELECT_FROM_TWO_VECTORS(17, 18) +SELECT_FROM_TWO_VECTORS(23, 24) diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp index 2e35763aa43..11d302e9026 100644 --- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp @@ -4231,12 +4231,29 @@ public: sf(imm1, 9, 5), rf(Zd, 0); } - // SVE programmable table lookup/permute using vector of element indices - void sve_tbl(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) { +private: + void _sve_tbl(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, unsigned reg_count, FloatRegister Zm) { starti; assert(T != Q, "invalid size"); + // Only supports one or two vector lookup. One vector lookup was introduced in SVE1 + // and two vector lookup in SVE2 + assert(0 < reg_count && reg_count <= 2, "invalid number of registers"); + + int op11 = (reg_count == 1) ? 
0b10 : 0b01; + + f(0b00000101, 31, 24), f(T, 23, 22), f(0b1, 21), rf(Zm, 16); - f(0b001100, 15, 10), rf(Zn, 5), rf(Zd, 0); + f(0b001, 15, 13), f(op11, 12, 11), f(0b0, 10), rf(Zn, 5), rf(Zd, 0); + } + +public: + // SVE/SVE2 Programmable table lookup in one or two vector table (zeroing) + void sve_tbl(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) { + _sve_tbl(Zd, T, Zn, 1, Zm); + } + + void sve_tbl(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn1, FloatRegister Zn2, FloatRegister Zm) { + assert(Zn1->successor() == Zn2, "invalid order of registers"); + _sve_tbl(Zd, T, Zn1, 2, Zm); } // Shuffle active elements of vector to the right and fill with zero diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp index a4ecd56af08..e87cb478c8f 100644 --- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp @@ -2858,3 +2858,124 @@ void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) { add(rfp, sp, framesize - 2 * wordSize); } } + +// Selects elements from two source vectors (src1, src2) based on index values in the index register +// using Neon instructions and places it in the destination vector element corresponding to the +// index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM), +// where NUM_ELEM is the number of BasicType elements per vector. +// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register) +// Otherwise, selects src2[idx - NUM_ELEM] +void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1, + FloatRegister src2, FloatRegister index, + FloatRegister tmp, unsigned vector_length_in_bytes) { + assert_different_registers(dst, src1, src2, tmp); + SIMD_Arrangement size = vector_length_in_bytes == 16 ? 
T16B : T8B; + + if (vector_length_in_bytes == 16) { + assert(UseSVE <= 1, "sve must be <= 1"); + assert(src1->successor() == src2, "Source registers must be ordered"); + // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table + tbl(dst, size, src1, 2, index); + } else { // vector length == 8 + assert(UseSVE == 0, "must be Neon only"); + // We need to fit both the source vectors (src1, src2) in a 128-bit register because the + // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl" + // instruction with one vector lookup + ins(tmp, D, src1, 0, 0); + ins(tmp, D, src2, 1, 0); + tbl(dst, size, tmp, 1, index); + } +} + +// Selects elements from two source vectors (src1, src2) based on index values in the index register +// using SVE/SVE2 instructions and places it in the destination vector element corresponding to the +// index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM), +// where NUM_ELEM is the number of BasicType elements per vector. +// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register) +// Otherwise, selects src2[idx - NUM_ELEM] +void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1, + FloatRegister src2, FloatRegister index, + FloatRegister tmp, SIMD_RegVariant T, + unsigned vector_length_in_bytes) { + assert_different_registers(dst, src1, src2, index, tmp); + + if (vector_length_in_bytes == 8) { + // We need to fit both the source vectors (src1, src2) in a single vector register because the + // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to + // incorrect results if each source vector is only partially filled. 
We then use the SVE "tbl" + // instruction with one vector lookup + assert(UseSVE >= 1, "sve must be >= 1"); + ins(tmp, D, src1, 0, 0); + ins(tmp, D, src2, 1, 0); + sve_tbl(dst, T, tmp, index); + } else { // UseSVE == 2 and vector_length_in_bytes > 8 + // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table. + // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation + // is not executed on machines where vector_length_in_bytes < MaxVectorSize + // with the only exception of 8B vector length. + assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be"); + assert(src1->successor() == src2, "Source registers must be ordered"); + sve_tbl(dst, T, src1, src2, index); + } +} + +void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1, + FloatRegister src2, FloatRegister index, + FloatRegister tmp, BasicType bt, + unsigned vector_length_in_bytes) { + + assert_different_registers(dst, src1, src2, index, tmp); + + // The cases that can reach this method are - + // - UseSVE = 0, vector_length_in_bytes = 8 or 16 + // - UseSVE = 1, vector_length_in_bytes = 8 or 16 + // - UseSVE = 2, vector_length_in_bytes >= 8 + // + // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8 + // and UseSVE = 2 with vector_length_in_bytes >= 8 + // + // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and + // UseSVE = 1 with vector_length_in_bytes = 16 + + if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) { + SIMD_RegVariant T = elemType_to_regVariant(bt); + select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes); + return; + } + + // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT + assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type"); + assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16"); + + bool isQ = 
vector_length_in_bytes == 16; + + SIMD_Arrangement size1 = isQ ? T16B : T8B; + SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ); + + // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of + // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table. + // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM + // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length, + // the indices can range from [0, 8). + // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0] + // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202] + // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000] + // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100] + // Add the multiplied result to the vector in tmp to obtain the byte level + // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100] + // Use these offsets in the "tbl" instruction to select chunks of 2B. + + if (bt == T_BYTE) { + select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes); + } else { + int elem_size = (bt == T_SHORT) ? 2 : 4; + uint64_t tbl_offset = (bt == T_SHORT) ? 
0x0100u : 0x03020100u; + + mov(tmp, size1, elem_size); + mulv(dst, size2, index, tmp); + mov(tmp, size2, tbl_offset); + addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements + // to select a set of 2B/4B + select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes); + } +} diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp index 70e4265c7cc..233f600cb14 100644 --- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp @@ -34,6 +34,15 @@ void neon_reduce_logical_helper(int opc, bool sf, Register Rd, Register Rn, Register Rm, enum shift_kind kind = Assembler::LSL, unsigned shift = 0); + void select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1, + FloatRegister src2, FloatRegister index, + FloatRegister tmp, unsigned vector_length_in_bytes); + + void select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1, + FloatRegister src2, FloatRegister index, + FloatRegister tmp, SIMD_RegVariant T, + unsigned vector_length_in_bytes); + public: // jdk.internal.util.ArraysSupport.vectorizedHashCode address arrays_hashcode(Register ary, Register cnt, Register result, FloatRegister vdata0, @@ -193,4 +202,9 @@ void reconstruct_frame_pointer(Register rtmp); + // Select from a table of two vectors + void select_from_two_vectors(FloatRegister dst, FloatRegister src1, FloatRegister src2, + FloatRegister index, FloatRegister tmp, BasicType bt, + unsigned vector_length_in_bytes); + #endif // CPU_AARCH64_C2_MACROASSEMBLER_AARCH64_HPP diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index 933be1667c2..2eb748e350c 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -1831,7 +1831,10 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { } break; case Op_SelectFromTwoVector: - if (size_in_bits < 128 || (size_in_bits < 512 && 
!VM_Version::supports_avx512vl())) { + if (size_in_bits < 128) { + return false; + } + if ((size_in_bits < 512 && !VM_Version::supports_avx512vl())) { return false; } if (bt == T_SHORT && !VM_Version::supports_avx512bw()) { diff --git a/src/hotspot/share/opto/vectorIntrinsics.cpp b/src/hotspot/share/opto/vectorIntrinsics.cpp index 97c5dbe03ef..42bf1e20e24 100644 --- a/src/hotspot/share/opto/vectorIntrinsics.cpp +++ b/src/hotspot/share/opto/vectorIntrinsics.cpp @@ -2706,6 +2706,9 @@ bool LibraryCallKit::inline_vector_select_from_two_vectors() { index_elem_bt = T_LONG; } + // Check if the platform requires a VectorLoadShuffle node to be generated + bool need_load_shuffle = Matcher::vector_rearrange_requires_load_shuffle(index_elem_bt, num_elem); + bool lowerSelectFromOp = false; if (!arch_supports_vector(Op_SelectFromTwoVector, num_elem, elem_bt, VecMaskNotUsed)) { int cast_vopc = VectorCastNode::opcode(-1, elem_bt, true); @@ -2715,7 +2718,7 @@ bool LibraryCallKit::inline_vector_select_from_two_vectors() { !arch_supports_vector(Op_VectorMaskCast, num_elem, elem_bt, VecMaskNotUsed) || !arch_supports_vector(Op_VectorBlend, num_elem, elem_bt, VecMaskUseLoad) || !arch_supports_vector(Op_VectorRearrange, num_elem, elem_bt, VecMaskNotUsed) || - !arch_supports_vector(Op_VectorLoadShuffle, num_elem, index_elem_bt, VecMaskNotUsed) || + (need_load_shuffle && !arch_supports_vector(Op_VectorLoadShuffle, num_elem, index_elem_bt, VecMaskNotUsed)) || !arch_supports_vector(Op_Replicate, num_elem, index_elem_bt, VecMaskNotUsed)) { log_if_needed(" ** not supported: opc=%d vlen=%d etype=%s ismask=useload", Op_SelectFromTwoVector, num_elem, type2name(elem_bt)); diff --git a/test/hotspot/gtest/aarch64/aarch64-asmtest.py b/test/hotspot/gtest/aarch64/aarch64-asmtest.py index 5b2c18b0a2b..62274e2c10f 100644 --- a/test/hotspot/gtest/aarch64/aarch64-asmtest.py +++ b/test/hotspot/gtest/aarch64/aarch64-asmtest.py @@ -2087,6 +2087,10 @@ generate(SpecialCases, [["ccmn", "__ ccmn(zr, zr, 3u, 
Assembler::LE);", ["index", "__ sve_index(z7, __ D, r5, 5);", "index\tz7.d, x5, #5"], ["cpy", "__ sve_cpy(z7, __ H, p3, r5);", "cpy\tz7.h, p3/m, w5"], ["tbl", "__ sve_tbl(z16, __ S, z17, z18);", "tbl\tz16.s, {z17.s}, z18.s"], + ["tbl", "__ sve_tbl(z16, __ B, z17, z18, z16);", "tbl\tz16.b, {z17.b, z18.b}, z16.b"], + ["tbl", "__ sve_tbl(z16, __ H, z17, z18, z16);", "tbl\tz16.h, {z17.h, z18.h}, z16.h"], + ["tbl", "__ sve_tbl(z16, __ S, z17, z18, z16);", "tbl\tz16.s, {z17.s, z18.s}, z16.s"], + ["tbl", "__ sve_tbl(z16, __ D, z17, z18, z16);", "tbl\tz16.d, {z17.d, z18.d}, z16.d"], ["ld1w", "__ sve_ld1w_gather(z15, p0, r5, z16);", "ld1w\t{z15.s}, p0/z, [x5, z16.s, uxtw #2]"], ["ld1d", "__ sve_ld1d_gather(z15, p0, r5, z16);", "ld1d\t{z15.d}, p0/z, [x5, z16.d, uxtw #3]"], ["st1w", "__ sve_st1w_scatter(z15, p0, r5, z16);", "st1w\t{z15.s}, p0, [x5, z16.s, uxtw #2]"], diff --git a/test/hotspot/gtest/aarch64/asmtest.out.h b/test/hotspot/gtest/aarch64/asmtest.out.h index d90c2479995..f08c69a27dd 100644 --- a/test/hotspot/gtest/aarch64/asmtest.out.h +++ b/test/hotspot/gtest/aarch64/asmtest.out.h @@ -1100,6 +1100,10 @@ __ sve_index(z7, __ D, r5, 5); // index z7.d, x5, #5 __ sve_cpy(z7, __ H, p3, r5); // cpy z7.h, p3/m, w5 __ sve_tbl(z16, __ S, z17, z18); // tbl z16.s, {z17.s}, z18.s + __ sve_tbl(z16, __ B, z17, z18, z16); // tbl z16.b, {z17.b, z18.b}, z16.b + __ sve_tbl(z16, __ H, z17, z18, z16); // tbl z16.h, {z17.h, z18.h}, z16.h + __ sve_tbl(z16, __ S, z17, z18, z16); // tbl z16.s, {z17.s, z18.s}, z16.s + __ sve_tbl(z16, __ D, z17, z18, z16); // tbl z16.d, {z17.d, z18.d}, z16.d __ sve_ld1w_gather(z15, p0, r5, z16); // ld1w {z15.s}, p0/z, [x5, z16.s, uxtw #2] __ sve_ld1d_gather(z15, p0, r5, z16); // ld1d {z15.d}, p0/z, [x5, z16.d, uxtw #3] __ sve_st1w_scatter(z15, p0, r5, z16); // st1w {z15.s}, p0, [x5, z16.s, uxtw #2] @@ -1438,30 +1442,30 @@ 0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061, 0x120cb166, 0x321764bc, 0x52174681, 0x720c0227, 0x9241018e, 0xb25a2969, 0xd278b411, 
0xf26aad01, - 0x14000000, 0x17ffffd7, 0x140004b0, 0x94000000, - 0x97ffffd4, 0x940004ad, 0x3400000a, 0x34fffa2a, - 0x3400954a, 0x35000008, 0x35fff9c8, 0x350094e8, - 0xb400000b, 0xb4fff96b, 0xb400948b, 0xb500001d, - 0xb5fff91d, 0xb500943d, 0x10000013, 0x10fff8b3, - 0x100093d3, 0x90000013, 0x36300016, 0x3637f836, - 0x36309356, 0x3758000c, 0x375ff7cc, 0x375892ec, + 0x14000000, 0x17ffffd7, 0x140004b4, 0x94000000, + 0x97ffffd4, 0x940004b1, 0x3400000a, 0x34fffa2a, + 0x340095ca, 0x35000008, 0x35fff9c8, 0x35009568, + 0xb400000b, 0xb4fff96b, 0xb400950b, 0xb500001d, + 0xb5fff91d, 0xb50094bd, 0x10000013, 0x10fff8b3, + 0x10009453, 0x90000013, 0x36300016, 0x3637f836, + 0x363093d6, 0x3758000c, 0x375ff7cc, 0x3758936c, 0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc, 0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f, 0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016, 0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0, - 0x540090c0, 0x54000001, 0x54fff541, 0x54009061, - 0x54000002, 0x54fff4e2, 0x54009002, 0x54000002, - 0x54fff482, 0x54008fa2, 0x54000003, 0x54fff423, - 0x54008f43, 0x54000003, 0x54fff3c3, 0x54008ee3, - 0x54000004, 0x54fff364, 0x54008e84, 0x54000005, - 0x54fff305, 0x54008e25, 0x54000006, 0x54fff2a6, - 0x54008dc6, 0x54000007, 0x54fff247, 0x54008d67, - 0x54000008, 0x54fff1e8, 0x54008d08, 0x54000009, - 0x54fff189, 0x54008ca9, 0x5400000a, 0x54fff12a, - 0x54008c4a, 0x5400000b, 0x54fff0cb, 0x54008beb, - 0x5400000c, 0x54fff06c, 0x54008b8c, 0x5400000d, - 0x54fff00d, 0x54008b2d, 0x5400000e, 0x54ffefae, - 0x54008ace, 0x5400000f, 0x54ffef4f, 0x54008a6f, + 0x54009140, 0x54000001, 0x54fff541, 0x540090e1, + 0x54000002, 0x54fff4e2, 0x54009082, 0x54000002, + 0x54fff482, 0x54009022, 0x54000003, 0x54fff423, + 0x54008fc3, 0x54000003, 0x54fff3c3, 0x54008f63, + 0x54000004, 0x54fff364, 0x54008f04, 0x54000005, + 0x54fff305, 0x54008ea5, 0x54000006, 0x54fff2a6, + 0x54008e46, 0x54000007, 0x54fff247, 0x54008de7, + 0x54000008, 0x54fff1e8, 0x54008d88, 0x54000009, + 0x54fff189, 0x54008d29, 0x5400000a, 
0x54fff12a, + 0x54008cca, 0x5400000b, 0x54fff0cb, 0x54008c6b, + 0x5400000c, 0x54fff06c, 0x54008c0c, 0x5400000d, + 0x54fff00d, 0x54008bad, 0x5400000e, 0x54ffefae, + 0x54008b4e, 0x5400000f, 0x54ffef4f, 0x54008aef, 0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60, 0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f, 0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf, @@ -1668,7 +1672,8 @@ 0x65d8a801, 0x65dcac01, 0x655cb241, 0x0520a1e0, 0x0521a601, 0x052281e0, 0x05238601, 0x04a14026, 0x042244a6, 0x046344a6, 0x04a444a6, 0x04e544a7, - 0x0568aca7, 0x05b23230, 0x853040af, 0xc5b040af, + 0x0568aca7, 0x05b23230, 0x05302a30, 0x05702a30, + 0x05b02a30, 0x05f02a30, 0x853040af, 0xc5b040af, 0xe57080af, 0xe5b080af, 0x25034440, 0x254054c4, 0x25034640, 0x25415a05, 0x25834440, 0x25c54489, 0x250b5d3a, 0x2550dc20, 0x2518e3e1, 0x2518e021, diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java index a231f04b1e8..dd285777e2c 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java @@ -2851,6 +2851,36 @@ public class IRNode { fromBeforeRemoveUselessToFinalCode(BLACKHOLE, "Blackhole"); } + public static final String SELECT_FROM_TWO_VECTOR_VB = VECTOR_PREFIX + "SELECT_FROM_TWO_VECTOR_VB" + POSTFIX; + static { + vectorNode(SELECT_FROM_TWO_VECTOR_VB, "SelectFromTwoVector", TYPE_BYTE); + } + + public static final String SELECT_FROM_TWO_VECTOR_VS = VECTOR_PREFIX + "SELECT_FROM_TWO_VECTOR_VS" + POSTFIX; + static { + vectorNode(SELECT_FROM_TWO_VECTOR_VS, "SelectFromTwoVector", TYPE_SHORT); + } + + public static final String SELECT_FROM_TWO_VECTOR_VI = VECTOR_PREFIX + "SELECT_FROM_TWO_VECTOR_VI" + POSTFIX; + static { + vectorNode(SELECT_FROM_TWO_VECTOR_VI, "SelectFromTwoVector", TYPE_INT); + } + + public static final String SELECT_FROM_TWO_VECTOR_VF = VECTOR_PREFIX + "SELECT_FROM_TWO_VECTOR_VF" + POSTFIX; + static { + vectorNode(SELECT_FROM_TWO_VECTOR_VF, 
"SelectFromTwoVector", TYPE_FLOAT); + } + + public static final String SELECT_FROM_TWO_VECTOR_VD = VECTOR_PREFIX + "SELECT_FROM_TWO_VECTOR_VD" + POSTFIX; + static { + vectorNode(SELECT_FROM_TWO_VECTOR_VD, "SelectFromTwoVector", TYPE_DOUBLE); + } + + public static final String SELECT_FROM_TWO_VECTOR_VL = VECTOR_PREFIX + "SELECT_FROM_TWO_VECTOR_VL" + POSTFIX; + static { + vectorNode(SELECT_FROM_TWO_VECTOR_VL, "SelectFromTwoVector", TYPE_LONG); + } + /* * Utility methods to set up IR_NODE_MAPPINGS. */ diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java b/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java index 6662acf8e9e..16b4654013a 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java @@ -105,6 +105,7 @@ public class IREncodingPrinter { "avx512f", "avx512_fp16", "avx512_vnni", + "avx512_vbmi", "bmi2", // AArch64 "sha3", diff --git a/test/hotspot/jtreg/compiler/vectorapi/TestSelectFromTwoVectorOp.java b/test/hotspot/jtreg/compiler/vectorapi/TestSelectFromTwoVectorOp.java new file mode 100644 index 00000000000..3746578266c --- /dev/null +++ b/test/hotspot/jtreg/compiler/vectorapi/TestSelectFromTwoVectorOp.java @@ -0,0 +1,486 @@ +/* + * Copyright (c) 2025, Arm Limited. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.vectorapi; + +import compiler.lib.generators.*; +import compiler.lib.ir_framework.*; + +import jdk.incubator.vector.*; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; + +import java.util.Random; + +import jdk.test.lib.Asserts; +import jdk.test.lib.Utils; +/** + * @test + * @bug 8348868 + * @library /test/lib / + * @summary Verify that SelectFromTwoVector IR node is correctly being + * generated on aarch64 and x86 + * @modules jdk.incubator.vector + * @run driver compiler.vectorapi.TestSelectFromTwoVectorOp + */ + +public class TestSelectFromTwoVectorOp { + private static final int SIZE = 1024; + private static final Generators random = Generators.G; + + private static byte[] ba; + private static byte[] bb; + private static byte[] bres; + private static byte[][] bindex; + + private static short[] sa; + private static short[] sb; + private static short[] sres; + private static short[][] sindex; + + private static int[] ia; + private static int[] ib; + private static int[] ires; + private static int[][] iindex; + + private static float[] fa; + private static float[] fb; + private static float[] fres; + private static float[][] findex; + + private static long[] la; + private static long[] lb; + private static long[] lres; + private static long[][] lindex; + + private static double[] da; + private static double[] db; + private static double[] dres; + private static double[][] dindex; + + // Stores the possible number of elements that can be + // held in various vector sizes/shapes + private static int [] nums = {2, 4, 
8, 16, 32, 64}; + + static { + ba = new byte[SIZE]; + bb = new byte[SIZE]; + bres = new byte[SIZE]; + bindex = new byte[4][SIZE]; + + sa = new short[SIZE]; + sb = new short[SIZE]; + sres = new short[SIZE]; + sindex = new short[4][SIZE]; + + ia = new int[SIZE]; + ib = new int[SIZE]; + ires = new int[SIZE]; + iindex = new int[4][SIZE]; + + fa = new float[SIZE]; + fb = new float[SIZE]; + fres = new float[SIZE]; + findex = new float[4][SIZE]; + + la = new long[SIZE]; + lb = new long[SIZE]; + lres = new long[SIZE]; + lindex = new long[3][SIZE]; + + da = new double[SIZE]; + db = new double[SIZE]; + dres = new double[SIZE]; + dindex = new double[3][SIZE]; + + // Populate the indices + for (int i = 0; i < bindex.length; i++) { + bindex[i] = new byte[SIZE]; + sindex[i] = new short[SIZE]; + iindex[i] = new int[SIZE]; + findex[i] = new float[SIZE]; + + // The index array contains indices in the range of [0, vector_length * 2) + Generator byteGen1 = random.uniformInts(0, (nums[i + 2] * 2) - 1); + Generator shortGen1 = random.uniformInts(0, (nums[i + 1] * 2) - 1); + + for (int j = 0; j < SIZE; j++) { + bindex[i][j] = byteGen1.next().byteValue(); + sindex[i][j] = shortGen1.next().shortValue(); + } + + if (i < dindex.length) { + dindex[i] = new double[SIZE]; + lindex[i] = new long[SIZE]; + + random.fill(random.uniformDoubles(0, (double) ((nums[i] * 2) - 1)), dindex[i]); + random.fill(random.uniformLongs(0, (long) ((nums[i] * 2) - 1)), lindex[i]); + } + + random.fill(random.uniformInts(0, (nums[i] * 2) - 1), iindex[i]); + random.fill(random.uniformFloats(0, (float)((nums[i] * 2) - 1)), findex[i]); + } + + // Populate the sources + Generator byteGen = random.uniformInts(Byte.MIN_VALUE, Byte.MAX_VALUE); + Generator shortGen = random.uniformInts(Short.MIN_VALUE, Short.MAX_VALUE); + + for (int i = 0; i < SIZE; i++) { + ba[i] = byteGen.next().byteValue(); + bb[i] = byteGen.next().byteValue(); + + sa[i] = shortGen.next().shortValue(); + sb[i] = shortGen.next().shortValue(); + } + + 
random.fill(random.ints(), ia); + random.fill(random.ints(), ib); + random.fill(random.floats(), fa); + random.fill(random.floats(), fb); + random.fill(random.longs(), la); + random.fill(random.longs(), lb); + random.fill(random.doubles(), da); + random.fill(random.doubles(), db); + } + + // Test SelectFromTwoVector operation for Bytes + @ForceInline + public static void ByteSelectFromTwoVectorKernel(VectorSpecies SPECIES, byte[] ba, + byte[] bb, byte[] bindex) { + for (int i = 0; i < SPECIES.loopBound(ba.length); i += SPECIES.length()) { + ByteVector.fromArray(SPECIES, bindex, i) + .selectFrom(ByteVector.fromArray(SPECIES, ba, i), + ByteVector.fromArray(SPECIES, bb, i)) + .intoArray(bres, i); + } + } + + @Test + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VB, IRNode.VECTOR_SIZE_8, ">0"}, + applyIfCPUFeature = {"asimd", "true"}, + applyIf = {"MaxVectorSize", ">=8"}) + public static void selectFromTwoVector_Byte64() { + ByteSelectFromTwoVectorKernel(ByteVector.SPECIES_64, ba, bb, bindex[0]); + } + + @Test + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VB, IRNode.VECTOR_SIZE_16, ">0"}, + applyIfCPUFeature = {"asimd", "true"}, + applyIf = {"MaxVectorSize", ">=16"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VB, IRNode.VECTOR_SIZE_16, ">0"}, + applyIfCPUFeatureAnd = {"avx512_vbmi", "true", "avx512vl", "true"}, + applyIf = {"MaxVectorSize", ">=16"}) + public static void selectFromTwoVector_Byte128() { + ByteSelectFromTwoVectorKernel(ByteVector.SPECIES_128, ba, bb, bindex[1]); + } + + @Test + @IR(failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VB, IRNode.VECTOR_SIZE_32}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}, + applyIf = {"MaxVectorSize", ">=32"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VB, IRNode.VECTOR_SIZE_32, ">0"}, + applyIfCPUFeature = {"sve2", "true"}, + applyIf = {"MaxVectorSize", ">=32"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VB, IRNode.VECTOR_SIZE_32, ">0"}, + applyIfCPUFeatureAnd = {"avx512_vbmi", "true", "avx512vl", "true"}, + 
applyIf = {"MaxVectorSize", ">=32"}) + public static void selectFromTwoVector_Byte256() { + ByteSelectFromTwoVectorKernel(ByteVector.SPECIES_256, ba, bb, bindex[2]); + } + + @Test + @IR(failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VB, IRNode.VECTOR_SIZE_64}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}, + applyIf = {"MaxVectorSize", ">=64"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VB, IRNode.VECTOR_SIZE_64, ">0"}, + applyIfCPUFeature = {"sve2", "true"}, + applyIf = {"MaxVectorSize", ">=64"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VB, IRNode.VECTOR_SIZE_64, ">0"}, + applyIfCPUFeatureAnd = {"avx512_vbmi", "true", "avx512f", "true"}, + applyIf = {"MaxVectorSize", ">=64"}) + public static void selectFromTwoVector_Byte512() { + ByteSelectFromTwoVectorKernel(ByteVector.SPECIES_512, ba, bb, bindex[3]); + } + + // Test SelectFromTwoVector operation for Shorts + @ForceInline + public static void ShortSelectFromTwoVectorKernel(VectorSpecies SPECIES, short[] sa, + short[] sb, short[] sindex) { + for (int i = 0; i < SPECIES.loopBound(sa.length); i += SPECIES.length()) { + ShortVector.fromArray(SPECIES, sindex, i) + .selectFrom(ShortVector.fromArray(SPECIES, sa, i), + ShortVector.fromArray(SPECIES, sb, i)) + .intoArray(sres, i); + } + } + + @Test + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_4, ">0"}, + applyIfCPUFeature = {"asimd", "true"}, + applyIf = {"MaxVectorSize", ">=8"}) + public static void selectFromTwoVector_Short64() { + ShortSelectFromTwoVectorKernel(ShortVector.SPECIES_64, sa, sb, sindex[0]); + } + + @Test + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_8, ">0"}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}, + applyIf = {"MaxVectorSize", ">=16"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_8, ">0"}, + applyIfCPUFeature = {"sve2", "true"}, + applyIf = {"MaxVectorSize", ">=16"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_8, ">0"}, + 
applyIfCPUFeatureAnd = {"avx512bw", "true", "avx512vl", "true"}, + applyIf = {"MaxVectorSize", ">=16"}) + public static void selectFromTwoVector_Short128() { + ShortSelectFromTwoVectorKernel(ShortVector.SPECIES_128, sa, sb, sindex[1]); + } + + @Test + @IR(failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_16}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}, + applyIf = {"MaxVectorSize", ">=32"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_16, ">0"}, + applyIfCPUFeature = {"sve2", "true"}, + applyIf = {"MaxVectorSize", ">=32"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_16, ">0"}, + applyIfCPUFeatureAnd = {"avx512bw", "true", "avx512vl", "true"}, + applyIf = {"MaxVectorSize", ">=32"}) + public static void selectFromTwoVector_Short256() { + ShortSelectFromTwoVectorKernel(ShortVector.SPECIES_256, sa, sb, sindex[2]); + } + + @Test + @IR(failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_32}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}, + applyIf = {"MaxVectorSize", ">=64"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_32, ">0"}, + applyIfCPUFeature = {"sve2", "true"}, + applyIf = {"MaxVectorSize", ">=64"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VS, IRNode.VECTOR_SIZE_32, ">0"}, + applyIfCPUFeatureAnd = {"avx512bw", "true", "avx512f", "true"}, + applyIf = {"MaxVectorSize", ">=64"}) + public static void selectFromTwoVector_Short512() { + ShortSelectFromTwoVectorKernel(ShortVector.SPECIES_512, sa, sb, sindex[3]); + } + + // Test SelectFromTwoVector operation for Ints + @ForceInline + public static void IntSelectFromTwoVectorKernel(VectorSpecies SPECIES, int[] ia, + int[] ib, int[] iindex) { + for (int i = 0; i < SPECIES.loopBound(ia.length); i += SPECIES.length()) { + IntVector.fromArray(SPECIES, iindex, i) + .selectFrom(IntVector.fromArray(SPECIES, ia, i), + IntVector.fromArray(SPECIES, ib, i)) + .intoArray(ires, i); + } + } + + @Test + 
@IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VI, IRNode.VECTOR_SIZE_2, ">0"}, + applyIfCPUFeatureOr = {"asimd", "true"}, + applyIf = {"MaxVectorSize", ">=8"}) + public static void selectFromTwoVector_Int64() { + IntSelectFromTwoVectorKernel(IntVector.SPECIES_64, ia, ib, iindex[0]); + } + + @Test + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VI, IRNode.VECTOR_SIZE_4, ">0"}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}, + applyIf = {"MaxVectorSize", ">=16"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VI, IRNode.VECTOR_SIZE_4, ">0"}, + applyIfCPUFeatureOr = {"sve2", "true", "avx512vl", "true"}, + applyIf = {"MaxVectorSize", ">=16"}) + public static void selectFromTwoVector_Int128() { + IntSelectFromTwoVectorKernel(IntVector.SPECIES_128, ia, ib, iindex[1]); + } + + @Test + @IR(failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VI, IRNode.VECTOR_SIZE_8}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}, + applyIf = {"MaxVectorSize", ">=32"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VI, IRNode.VECTOR_SIZE_8, ">0"}, + applyIfCPUFeatureOr = {"sve2", "true", "avx512vl", "true"}, + applyIf = {"MaxVectorSize", ">=32"}) + public static void selectFromTwoVector_Int256() { + IntSelectFromTwoVectorKernel(IntVector.SPECIES_256, ia, ib, iindex[2]); + } + + @Test + @IR(failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VI, IRNode.VECTOR_SIZE_16}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}, + applyIf = {"MaxVectorSize", ">=64"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VI, IRNode.VECTOR_SIZE_16, ">0"}, + applyIfCPUFeatureOr = {"sve2", "true", "avx512f", "true"}, + applyIf = {"MaxVectorSize", ">=64"}) + public static void selectFromTwoVector_Int512() { + IntSelectFromTwoVectorKernel(IntVector.SPECIES_512, ia, ib, iindex[3]); + } + + // Test SelectFromTwoVector operation for Floats + @ForceInline + public static void FloatSelectFromTwoVectorKernel(VectorSpecies SPECIES, float[] fa, + float[] fb, float[] findex) { + for (int i = 0; i < 
SPECIES.loopBound(ia.length); i += SPECIES.length()) { + FloatVector.fromArray(SPECIES, findex, i) + .selectFrom(FloatVector.fromArray(SPECIES, fa, i), + FloatVector.fromArray(SPECIES, fb, i)) + .intoArray(fres, i); + } + } + + @Test + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VF, IRNode.VECTOR_SIZE_2, ">0"}, + applyIfCPUFeatureOr = {"asimd", "true"}, + applyIf = {"MaxVectorSize", ">=8"}) + public static void selectFromTwoVector_Float64() { + FloatSelectFromTwoVectorKernel(FloatVector.SPECIES_64, fa, fb, findex[0]); + } + + @Test + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VF, IRNode.VECTOR_SIZE_4, ">0"}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}, + applyIf = {"MaxVectorSize", ">=16"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VF, IRNode.VECTOR_SIZE_4, ">0"}, + applyIfCPUFeatureOr = {"sve2", "true", "avx512vl", "true"}, + applyIf = {"MaxVectorSize", ">=16"}) + public static void selectFromTwoVector_Float128() { + FloatSelectFromTwoVectorKernel(FloatVector.SPECIES_128, fa, fb, findex[1]); + } + + @Test + @IR(failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VF, IRNode.VECTOR_SIZE_8}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}, + applyIf = {"MaxVectorSize", ">=32"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VF, IRNode.VECTOR_SIZE_8, ">0"}, + applyIfCPUFeatureOr = {"sve2", "true", "avx512vl", "true"}, + applyIf = {"MaxVectorSize", ">=32"}) + public static void selectFromTwoVector_Float256() { + FloatSelectFromTwoVectorKernel(FloatVector.SPECIES_256, fa, fb, findex[2]); + } + + @Test + @IR(failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VF, IRNode.VECTOR_SIZE_16}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}, + applyIf = {"MaxVectorSize", ">=64"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VF, IRNode.VECTOR_SIZE_16, ">0"}, + applyIfCPUFeatureOr = {"sve2", "true", "avx512f", "true"}, + applyIf = {"MaxVectorSize", ">=64"}) + public static void selectFromTwoVector_Float512() { + 
FloatSelectFromTwoVectorKernel(FloatVector.SPECIES_512, fa, fb, findex[3]); + } + + // Test SelectFromTwoVector operation for Doubles + @ForceInline + public static void DoubleSelectFromTwoVectorKernel(VectorSpecies SPECIES, double[] da, + double[] db, double[] dindex) { + for (int i = 0; i < SPECIES.loopBound(ia.length); i += SPECIES.length()) { + DoubleVector.fromArray(SPECIES, dindex, i) + .selectFrom(DoubleVector.fromArray(SPECIES, da, i), + DoubleVector.fromArray(SPECIES, db, i)) + .intoArray(dres, i); + } + } + + @Test + @IR(failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VD, IRNode.VECTOR_SIZE_2}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}, + applyIf = {"MaxVectorSize", ">=16"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VD, IRNode.VECTOR_SIZE_2, ">0"}, + applyIfCPUFeatureOr = {"sve2", "true", "avx512vl", "true"}, + applyIf = {"MaxVectorSize", ">=16"}) + public static void selectFromTwoVector_Double128() { + DoubleSelectFromTwoVectorKernel(DoubleVector.SPECIES_128, da, db, dindex[0]); + } + + @Test + @IR(failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VD, IRNode.VECTOR_SIZE_4}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}, + applyIf = {"MaxVectorSize", ">=32"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VD, IRNode.VECTOR_SIZE_4, ">0"}, + applyIfCPUFeatureOr = {"sve2", "true", "avx512vl", "true"}, + applyIf = {"MaxVectorSize", ">=32"}) + public static void selectFromTwoVector_Double256() { + DoubleSelectFromTwoVectorKernel(DoubleVector.SPECIES_256, da, db, dindex[1]); + } + + @Test + @IR(failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VD, IRNode.VECTOR_SIZE_8}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}, + applyIf = {"MaxVectorSize", ">=64"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VD, IRNode.VECTOR_SIZE_8, ">0"}, + applyIfCPUFeatureOr = {"sve2", "true", "avx512f", "true"}, + applyIf = {"MaxVectorSize", ">=64"}) + public static void selectFromTwoVector_Double512() { + 
DoubleSelectFromTwoVectorKernel(DoubleVector.SPECIES_512, da, db, dindex[2]); + } + + // Test SelectFromTwoVector operation for Longs + @ForceInline + public static void LongSelectFromTwoVectorKernel(VectorSpecies SPECIES, long[] la, + long[] lb, long[] lindex) { + for (int i = 0; i < SPECIES.loopBound(ia.length); i += SPECIES.length()) { + LongVector.fromArray(SPECIES, lindex, i) + .selectFrom(LongVector.fromArray(SPECIES, la, i), + LongVector.fromArray(SPECIES, lb, i)) + .intoArray(lres, i); + } + } + + @Test + @IR(failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VL, IRNode.VECTOR_SIZE_2}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}, + applyIf = {"MaxVectorSize", ">=16"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VL, IRNode.VECTOR_SIZE_2, ">0"}, + applyIfCPUFeatureOr = {"sve2", "true", "avx512vl", "true"}, + applyIf = {"MaxVectorSize", ">=16"}) + public static void selectFromTwoVector_Long128() { + LongSelectFromTwoVectorKernel(LongVector.SPECIES_128, la, lb, lindex[0]); + } + + @Test + @IR(failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VL, IRNode.VECTOR_SIZE_4}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}, + applyIf = {"MaxVectorSize", ">=32"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VL, IRNode.VECTOR_SIZE_4, ">0"}, + applyIfCPUFeatureOr = {"sve2", "true", "avx512vl", "true"}, + applyIf = {"MaxVectorSize", ">=32"}) + public static void selectFromTwoVector_Long256() { + LongSelectFromTwoVectorKernel(LongVector.SPECIES_256, la, lb, lindex[1]); + } + + @Test + @IR(failOn = {IRNode.SELECT_FROM_TWO_VECTOR_VL, IRNode.VECTOR_SIZE_8}, + applyIfCPUFeatureAnd = {"asimd", "true", "sve2", "false"}, + applyIf = {"MaxVectorSize", ">=64"}) + @IR(counts = {IRNode.SELECT_FROM_TWO_VECTOR_VL, IRNode.VECTOR_SIZE_8, ">0"}, + applyIfCPUFeatureOr = {"sve2", "true", "avx512f", "true"}, + applyIf = {"MaxVectorSize", ">=64"}) + public static void selectFromTwoVector_Long512() { + LongSelectFromTwoVectorKernel(LongVector.SPECIES_512, la, lb, lindex[2]); + } + + 
public static void main(String[] args) { + TestFramework.runWithFlags("--add-modules=jdk.incubator.vector"); + } +}