From 676e6fd8d5152f4e0d14ae59ddd7aa0a7127ea58 Mon Sep 17 00:00:00 2001 From: Xiaohong Gong Date: Thu, 13 Nov 2025 01:33:21 +0000 Subject: [PATCH] 8367292: VectorAPI: Optimize VectorMask.fromLong/toLong() for SVE Reviewed-by: epeter, psandoz, haosun, sviswanathan --- src/hotspot/cpu/aarch64/aarch64_vector.ad | 67 ++++-- src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 | 67 ++++-- .../cpu/aarch64/c2_MacroAssembler_aarch64.cpp | 218 +++++++++--------- .../cpu/aarch64/c2_MacroAssembler_aarch64.hpp | 20 +- src/hotspot/cpu/arm/arm.ad | 4 + src/hotspot/cpu/ppc/ppc.ad | 4 + src/hotspot/cpu/riscv/riscv_v.ad | 5 + src/hotspot/cpu/s390/s390.ad | 4 + src/hotspot/cpu/x86/x86.ad | 5 + src/hotspot/share/opto/matcher.hpp | 6 + src/hotspot/share/opto/vectorIntrinsics.cpp | 6 +- src/hotspot/share/opto/vectornode.cpp | 16 +- .../compiler/lib/ir_framework/IRNode.java | 10 + .../ir_framework/test/IREncodingPrinter.java | 1 + .../vectorapi/VectorMaskFromLongTest.java | 147 ++++++------ .../vectorapi/VectorMaskToLongTest.java | 146 ++++++++++-- 16 files changed, 470 insertions(+), 256 deletions(-) diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index 9809d096233..842784d1a29 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -393,6 +393,32 @@ source %{ return false; } + bool Matcher::mask_op_prefers_predicate(int opcode, const TypeVect* vt) { + // Only SVE supports the predicate feature. + if (UseSVE == 0) { + // On architectures that do not support predicate, masks are stored in + // general vector registers (TypeVect) with sizes ranging from TypeVectA + // to TypeVectX based on the vector size in bytes. + assert(vt->isa_vectmask() == nullptr, "mask type is not matched"); + return false; + } + + assert(vt->isa_vectmask() != nullptr, "expected TypeVectMask on SVE"); + switch (opcode) { + case Op_VectorMaskToLong: + case Op_VectorLongToMask: + // These operations lack native SVE predicate instructions and are + // implemented using general vector instructions instead. Use vector + // registers rather than predicate registers to save the mask for + // better performance. + return false; + default: + // By default, the mask operations are implemented with predicate + // instructions with a predicate input/output. + return true; + } + } + // Assert that the given node is not a variable shift. bool assert_not_var_shift(const Node* n) { assert(!n->as_ShiftV()->is_var_shift(), "illegal variable shift"); @@ -6249,31 +6275,44 @@ instruct vmask_tolong_neon(iRegLNoSp dst, vReg src) %{ ins_pipe(pipe_slow); %} -instruct vmask_tolong_sve(iRegLNoSp dst, pReg src, vReg tmp1, vReg tmp2) %{ - predicate(UseSVE > 0); +instruct vmask_tolong_sve(iRegLNoSp dst, vReg src, vReg tmp) %{ + predicate(UseSVE > 0 && !VM_Version::supports_svebitperm()); + match(Set dst (VectorMaskToLong src)); + effect(TEMP tmp); + format %{ "vmask_tolong_sve $dst, $src\t# KILL $tmp" %} + ins_encode %{ + // Input "src" is a vector of boolean represented as + // bytes with 0x00/0x01 as element values. 
+ __ sve_vmask_tolong($dst$$Register, $src$$FloatRegister, + $tmp$$FloatRegister, Matcher::vector_length(this, $src)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmask_tolong_sve2(iRegLNoSp dst, vReg src, vReg tmp1, vReg tmp2) %{ + predicate(VM_Version::supports_svebitperm()); match(Set dst (VectorMaskToLong src)); effect(TEMP tmp1, TEMP tmp2); - format %{ "vmask_tolong_sve $dst, $src\t# KILL $tmp1, $tmp2" %} + format %{ "vmask_tolong_sve2 $dst, $src\t# KILL $tmp1, $tmp2" %} ins_encode %{ - __ sve_vmask_tolong($dst$$Register, $src$$PRegister, - Matcher::vector_element_basic_type(this, $src), - Matcher::vector_length(this, $src), - $tmp1$$FloatRegister, $tmp2$$FloatRegister); + // Input "src" is a vector of boolean represented as + // bytes with 0x00/0x01 as element values. + __ sve2_vmask_tolong($dst$$Register, $src$$FloatRegister, + $tmp1$$FloatRegister, $tmp2$$FloatRegister, + Matcher::vector_length(this, $src)); %} ins_pipe(pipe_slow); %} // fromlong -instruct vmask_fromlong(pReg dst, iRegL src, vReg tmp1, vReg tmp2) %{ +instruct vmask_fromlong(vReg dst, iRegL src, vReg tmp) %{ match(Set dst (VectorLongToMask src)); - effect(TEMP tmp1, TEMP tmp2); - format %{ "vmask_fromlong $dst, $src\t# vector (sve2). KILL $tmp1, $tmp2" %} + effect(TEMP_DEF dst, TEMP tmp); + format %{ "vmask_fromlong $dst, $src\t# vector (sve2). KILL $tmp" %} ins_encode %{ - __ sve_vmask_fromlong($dst$$PRegister, $src$$Register, - Matcher::vector_element_basic_type(this), - Matcher::vector_length(this), - $tmp1$$FloatRegister, $tmp2$$FloatRegister); + __ sve_vmask_fromlong($dst$$FloatRegister, $src$$Register, + $tmp$$FloatRegister, Matcher::vector_length(this)); %} ins_pipe(pipe_slow); %} diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index a9f42e1bc08..dff82ce95ac 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -383,6 +383,32 @@ source %{ return false; } + bool Matcher::mask_op_prefers_predicate(int opcode, const TypeVect* vt) { + // Only SVE supports the predicate feature. + if (UseSVE == 0) { + // On architectures that do not support predicate, masks are stored in + // general vector registers (TypeVect) with sizes ranging from TypeVectA + // to TypeVectX based on the vector size in bytes. + assert(vt->isa_vectmask() == nullptr, "mask type is not matched"); + return false; + } + + assert(vt->isa_vectmask() != nullptr, "expected TypeVectMask on SVE"); + switch (opcode) { + case Op_VectorMaskToLong: + case Op_VectorLongToMask: + // These operations lack native SVE predicate instructions and are + // implemented using general vector instructions instead. Use vector + // registers rather than predicate registers to save the mask for + // better performance. + return false; + default: + // By default, the mask operations are implemented with predicate + // instructions with a predicate input/output. + return true; + } + } + // Assert that the given node is not a variable shift. 
bool assert_not_var_shift(const Node* n) { assert(!n->as_ShiftV()->is_var_shift(), "illegal variable shift"); @@ -4303,31 +4329,44 @@ instruct vmask_tolong_neon(iRegLNoSp dst, vReg src) %{ ins_pipe(pipe_slow); %} -instruct vmask_tolong_sve(iRegLNoSp dst, pReg src, vReg tmp1, vReg tmp2) %{ - predicate(UseSVE > 0); +instruct vmask_tolong_sve(iRegLNoSp dst, vReg src, vReg tmp) %{ + predicate(UseSVE > 0 && !VM_Version::supports_svebitperm()); + match(Set dst (VectorMaskToLong src)); + effect(TEMP tmp); + format %{ "vmask_tolong_sve $dst, $src\t# KILL $tmp" %} + ins_encode %{ + // Input "src" is a vector of boolean represented as + // bytes with 0x00/0x01 as element values. + __ sve_vmask_tolong($dst$$Register, $src$$FloatRegister, + $tmp$$FloatRegister, Matcher::vector_length(this, $src)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmask_tolong_sve2(iRegLNoSp dst, vReg src, vReg tmp1, vReg tmp2) %{ + predicate(VM_Version::supports_svebitperm()); match(Set dst (VectorMaskToLong src)); effect(TEMP tmp1, TEMP tmp2); - format %{ "vmask_tolong_sve $dst, $src\t# KILL $tmp1, $tmp2" %} + format %{ "vmask_tolong_sve2 $dst, $src\t# KILL $tmp1, $tmp2" %} ins_encode %{ - __ sve_vmask_tolong($dst$$Register, $src$$PRegister, - Matcher::vector_element_basic_type(this, $src), - Matcher::vector_length(this, $src), - $tmp1$$FloatRegister, $tmp2$$FloatRegister); + // Input "src" is a vector of boolean represented as + // bytes with 0x00/0x01 as element values. + __ sve2_vmask_tolong($dst$$Register, $src$$FloatRegister, + $tmp1$$FloatRegister, $tmp2$$FloatRegister, + Matcher::vector_length(this, $src)); %} ins_pipe(pipe_slow); %} // fromlong -instruct vmask_fromlong(pReg dst, iRegL src, vReg tmp1, vReg tmp2) %{ +instruct vmask_fromlong(vReg dst, iRegL src, vReg tmp) %{ match(Set dst (VectorLongToMask src)); - effect(TEMP tmp1, TEMP tmp2); - format %{ "vmask_fromlong $dst, $src\t# vector (sve2). KILL $tmp1, $tmp2" %} + effect(TEMP_DEF dst, TEMP tmp); + format %{ "vmask_fromlong $dst, $src\t# vector (sve2). KILL $tmp" %} ins_encode %{ - __ sve_vmask_fromlong($dst$$PRegister, $src$$Register, - Matcher::vector_element_basic_type(this), - Matcher::vector_length(this), - $tmp1$$FloatRegister, $tmp2$$FloatRegister); + __ sve_vmask_fromlong($dst$$FloatRegister, $src$$Register, + $tmp$$FloatRegister, Matcher::vector_length(this)); %} ins_pipe(pipe_slow); %} diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp index ebb4a897906..5e57044dcba 100644 --- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp @@ -1399,137 +1399,125 @@ void C2_MacroAssembler::bytemask_compress(Register dst) { andr(dst, dst, 0xff); // dst = 0x8D } -// Pack the lowest-numbered bit of each mask element in src into a long value -// in dst, at most the first 64 lane elements. -// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. -void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, - FloatRegister vtmp1, FloatRegister vtmp2) { +// Pack the value of each mask element in "src" into a long value in "dst", at most +// the first 64 lane elements. The input "src" is a vector of boolean represented as +// bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into +// one bit in "dst". 
+// +// Example: src = 0x0001010000010001 0100000001010001, lane_cnt = 16 +// Expected: dst = 0x658D +// +// Clobbers: rscratch1 +void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src, + FloatRegister vtmp, int lane_cnt) { assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); assert_different_registers(dst, rscratch1); - assert_different_registers(vtmp1, vtmp2); + assert_different_registers(src, vtmp); + assert(UseSVE > 0, "must be"); - Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); - // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 - // Expected: dst = 0x658D + // Compress the lowest 8 bytes. + fmovd(dst, src); + bytemask_compress(dst); + if (lane_cnt <= 8) return; - // Convert the mask into vector with sequential bytes. - // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 - sve_cpy(vtmp1, size, src, 1, false); - if (bt != T_BYTE) { - sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); - } - - if (UseSVE > 1 && VM_Version::supports_svebitperm()) { - // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea - // is to compress each significant bit of the byte in a cross-lane way. Due - // to the lack of a cross-lane bit-compress instruction, we use BEXT - // (bit-compress in each lane) with the biggest lane size (T = D) then - // concatenate the results. - - // The second source input of BEXT, initialized with 0x01 in each byte. - // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 - sve_dup(vtmp2, B, 1); - - // BEXT vtmp1.D, vtmp1.D, vtmp2.D - // vtmp1 = 0x0001010000010001 | 0x0100000001010001 - // vtmp2 = 0x0101010101010101 | 0x0101010101010101 - // --------------------------------------- - // vtmp1 = 0x0000000000000065 | 0x000000000000008D - sve_bext(vtmp1, D, vtmp1, vtmp2); - - // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the - // result to dst. - // vtmp1 = 0x0000000000000000 | 0x000000000000658D - // dst = 0x658D - if (lane_cnt <= 8) { - // No need to concatenate. - umov(dst, vtmp1, B, 0); - } else if (lane_cnt <= 16) { - ins(vtmp1, B, vtmp1, 1, 8); - umov(dst, vtmp1, H, 0); - } else { - // As the lane count is 64 at most, the final expected value must be in - // the lowest 64 bits after narrowing vtmp1 from D to B. - sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2); - umov(dst, vtmp1, D, 0); - } - } else if (UseSVE > 0) { - // Compress the lowest 8 bytes. - fmovd(dst, vtmp1); - bytemask_compress(dst); - if (lane_cnt <= 8) return; - - // Repeat on higher bytes and join the results. - // Compress 8 bytes in each iteration. - for (int idx = 1; idx < (lane_cnt / 8); idx++) { - sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2); - bytemask_compress(rscratch1); - orr(dst, dst, rscratch1, Assembler::LSL, idx << 3); - } - } else { - assert(false, "unsupported"); - ShouldNotReachHere(); + // Repeat on higher bytes and join the results. + // Compress 8 bytes in each iteration. + for (int idx = 1; idx < (lane_cnt / 8); idx++) { + sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp); + bytemask_compress(rscratch1); + orr(dst, dst, rscratch1, Assembler::LSL, idx << 3); } } -// Unpack the mask, a long value in src, into predicate register dst based on the -// corresponding data type. Note that dst can support at most 64 lanes. -// Below example gives the expected dst predicate register in different types, with -// a valid src(0x658D) on a 1024-bit vector size machine. 
-// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
-// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
-// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
-// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
-//
-// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
-// has 24 significant bits would be an invalid input if dst predicate register refers to
-// a LONG type 1024-bit vector, which has at most 16 lanes.
-void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
-                                           FloatRegister vtmp1, FloatRegister vtmp2) {
-  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
-         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
-  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
-  // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
-  // Expected: dst = 0b01101001 10001101
+// This function is the same as "sve_vmask_tolong" above, but it uses SVE2's BEXT
+// instruction, which requires the FEAT_BITPERM feature.
+void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
+                                          FloatRegister vtmp1, FloatRegister vtmp2,
+                                          int lane_cnt) {
+  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
+  assert_different_registers(src, vtmp1, vtmp2);
+  assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");
 
-  // Put long value from general purpose register into the first lane of vector.
-  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
-  sve_dup(vtmp1, B, 0);
-  mov(vtmp1, D, 0, src);
+  // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
+  // is to compress each significant bit of the byte in a cross-lane way. Due
+  // to the lack of a cross-lane bit-compress instruction, we use BEXT
+  // (bit-compress in each lane) with the biggest lane size (T = D) then
+  // concatenate the results.
 
-  // As sve_cmp generates mask value with the minimum unit in byte, we should
-  // transform the value in the first lane which is mask in bit now to the
-  // mask in byte, which can be done by SVE2's BDEP instruction.
-
-  // The first source input of BDEP instruction. Deposite each byte in every 8 bytes.
-  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
-  if (lane_cnt <= 8) {
-    // Nothing. As only one byte exsits.
-  } else if (lane_cnt <= 16) {
-    ins(vtmp1, B, vtmp1, 8, 1);
-    mov(vtmp1, B, 1, zr);
-  } else {
-    sve_vector_extend(vtmp1, D, vtmp1, B);
-  }
-
-  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
+  // The second source input of BEXT, initialized with 0x01 in each byte.
   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
   sve_dup(vtmp2, B, 1);
 
-  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
-  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
+  // BEXT vtmp1.D, src.D, vtmp2.D
+  // src   = 0x0001010000010001 | 0x0100000001010001
   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
   // ---------------------------------------
-  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
-  sve_bdep(vtmp1, D, vtmp1, vtmp2);
+  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
+  sve_bext(vtmp1, D, src, vtmp2);
 
-  if (bt != T_BYTE) {
-    sve_vector_extend(vtmp1, size, vtmp1, B);
+  // Concatenate the least significant 8 bits of each 8-byte group, and extract
+  // the result to dst.
+  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
+  // dst   = 0x658D
+  if (lane_cnt <= 8) {
+    // No need to concatenate.
+    umov(dst, vtmp1, B, 0);
+  } else if (lane_cnt <= 16) {
+    ins(vtmp1, B, vtmp1, 1, 8);
+    umov(dst, vtmp1, H, 0);
+  } else {
+    // As the lane count is 64 at most, the final expected value must be in
+    // the lowest 64 bits after narrowing vtmp1 from D to B.
+    sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
+    umov(dst, vtmp1, D, 0);
   }
+}
+
+// Unpack the mask, a long value in "src", into a vector register of boolean
+// represented as bytes with 0x00/0x01 as element values in "dst". Each bit in
+// "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
+// most 64 lanes.
+//
+// The example below gives the expected dst vector register, with a valid
+// src (0x658D) on a 128-bit vector size machine.
+// dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
+void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
+                                           FloatRegister vtmp, int lane_cnt) {
+  assert_different_registers(dst, vtmp);
+  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
+         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
+
+  // Example:  src = 0x658D, lane_cnt = 16
+  // Expected: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
+
+  // Put the long value from the general purpose register into the first lane
+  // of the vector.
+  // vtmp = 0x0000000000000000 | 0x000000000000658D
+  sve_dup(vtmp, B, 0);
+  mov(vtmp, D, 0, src);
+
+  // Transform the value in the first lane from a mask in bits to a mask in
+  // bytes, which can be done with SVE2's BDEP instruction.
+
+  // The first source input of the BDEP instruction. Deposit one byte into each
+  // group of 8 bytes.
+  // vtmp = 0x0000000000000065 | 0x000000000000008D
+  if (lane_cnt <= 8) {
+    // Nothing to do, as only one byte exists.
+  } else if (lane_cnt <= 16) {
+    ins(vtmp, B, vtmp, 8, 1);
+  } else {
+    sve_vector_extend(vtmp, D, vtmp, B);
+  }
+
+  // The second source input of the BDEP instruction, initialized with 0x01 for
+  // each byte.
+  // dst = 0x01010101 0x01010101 0x01010101 0x01010101
+  sve_dup(dst, B, 1);
+
+  // BDEP dst.D, vtmp.D, dst.D
+  // vtmp = 0x0000000000000065 | 0x000000000000008D
+  // dst  = 0x0101010101010101 | 0x0101010101010101
+  // ---------------------------------------
+  // dst  = 0x0001010000010001 | 0x0100000001010001
+  sve_bdep(dst, D, vtmp, dst);
 }
 
 // Clobbers: rflags
diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
index ccd091938a3..412f0f37e9e 100644
--- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
@@ -85,15 +85,19 @@
   // the higher garbage bits.
   void bytemask_compress(Register dst);
 
-  // Pack the lowest-numbered bit of each mask element in src into a long value
-  // in dst, at most the first 64 lane elements.
-  void sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
-                        FloatRegister vtmp1, FloatRegister vtmp2);
+  // Pack the value of each mask element in "src" into a long value in "dst", at most the
+  // first 64 lane elements. The input "src" is a vector of boolean represented as bytes
+  // with 0x00/0x01 as element values. Each lane value from "src" is packed into one bit in
+  // "dst".
+ void sve_vmask_tolong(Register dst, FloatRegister src, FloatRegister vtmp, int lane_cnt); - // Unpack the mask, a long value in src, into predicate register dst based on the - // corresponding data type. Note that dst can support at most 64 lanes. - void sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt, - FloatRegister vtmp1, FloatRegister vtmp2); + void sve2_vmask_tolong(Register dst, FloatRegister src, FloatRegister vtmp1, + FloatRegister vtmp2, int lane_cnt); + + // Unpack the mask, a long value in "src", into vector register "dst" with boolean type. + // Each bit in "src" is unpacked into one byte lane in "dst". Note that "dst" can support + // at most 64 lanes. + void sve_vmask_fromlong(FloatRegister dst, Register src, FloatRegister vtmp, int lane_cnt); // SIMD&FP comparison void neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, diff --git a/src/hotspot/cpu/arm/arm.ad b/src/hotspot/cpu/arm/arm.ad index 31a442be624..92c0df68deb 100644 --- a/src/hotspot/cpu/arm/arm.ad +++ b/src/hotspot/cpu/arm/arm.ad @@ -1003,6 +1003,10 @@ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen return false; } +bool Matcher::mask_op_prefers_predicate(int opcode, const TypeVect* vt) { + return false; +} + const RegMask* Matcher::predicate_reg_mask(void) { return nullptr; } diff --git a/src/hotspot/cpu/ppc/ppc.ad b/src/hotspot/cpu/ppc/ppc.ad index 36326e5fdb7..7fcd096d2ad 100644 --- a/src/hotspot/cpu/ppc/ppc.ad +++ b/src/hotspot/cpu/ppc/ppc.ad @@ -2292,6 +2292,10 @@ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen return false; } +bool Matcher::mask_op_prefers_predicate(int opcode, const TypeVect* vt) { + return false; +} + const RegMask* Matcher::predicate_reg_mask(void) { return nullptr; } diff --git a/src/hotspot/cpu/riscv/riscv_v.ad b/src/hotspot/cpu/riscv/riscv_v.ad index fe323474d60..d162280106a 100644 --- a/src/hotspot/cpu/riscv/riscv_v.ad +++ b/src/hotspot/cpu/riscv/riscv_v.ad @@ -164,6 +164,11 @@ source %{ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen) { return false; } + + bool Matcher::mask_op_prefers_predicate(int opcode, const TypeVect* vt) { + // Prefer predicate if the mask type is "TypeVectMask". + return vt->isa_vectmask() != nullptr; + } %} // All VEC instructions diff --git a/src/hotspot/cpu/s390/s390.ad b/src/hotspot/cpu/s390/s390.ad index 2b2ce713491..cab3965ecfa 100644 --- a/src/hotspot/cpu/s390/s390.ad +++ b/src/hotspot/cpu/s390/s390.ad @@ -1809,6 +1809,10 @@ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen return false; } +bool Matcher::mask_op_prefers_predicate(int opcode, const TypeVect* vt) { + return false; +} + const RegMask* Matcher::predicate_reg_mask(void) { return nullptr; } diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index 9a0bbdc27a0..a9748617e1f 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -3736,6 +3736,11 @@ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen } } +bool Matcher::mask_op_prefers_predicate(int opcode, const TypeVect* vt) { + // Prefer predicate if the mask type is "TypeVectMask". 
+ return vt->isa_vectmask() != nullptr; +} + MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) { assert(Matcher::is_generic_vector(generic_opnd), "not generic"); bool legacy = (generic_opnd->opcode() == LEGVEC); diff --git a/src/hotspot/share/opto/matcher.hpp b/src/hotspot/share/opto/matcher.hpp index e4396b423ac..01f11b1fdc9 100644 --- a/src/hotspot/share/opto/matcher.hpp +++ b/src/hotspot/share/opto/matcher.hpp @@ -333,6 +333,12 @@ public: static bool vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen); + // Identify if a vector mask operation prefers the input/output mask to be + // saved with a predicate type or not. + // - Return true if it prefers a predicate type (i.e. TypeVectMask). + // - Return false if it prefers a general vector type (i.e. TypeVectA to TypeVectZ). + static bool mask_op_prefers_predicate(int opcode, const TypeVect* vt); + static const RegMask* predicate_reg_mask(void); // Vector width in bytes diff --git a/src/hotspot/share/opto/vectorIntrinsics.cpp b/src/hotspot/share/opto/vectorIntrinsics.cpp index 85d9790c0eb..b48b5f2cd05 100644 --- a/src/hotspot/share/opto/vectorIntrinsics.cpp +++ b/src/hotspot/share/opto/vectorIntrinsics.cpp @@ -622,7 +622,7 @@ bool LibraryCallKit::inline_vector_mask_operation() { return false; } - if (mask_vec->bottom_type()->isa_vectmask() == nullptr) { + if (!Matcher::mask_op_prefers_predicate(mopc, mask_vec->bottom_type()->is_vect())) { mask_vec = gvn().transform(VectorStoreMaskNode::make(gvn(), mask_vec, elem_bt, num_elem)); } const Type* maskoper_ty = mopc == Op_VectorMaskToLong ? (const Type*)TypeLong::LONG : (const Type*)TypeInt::INT; @@ -708,7 +708,7 @@ bool LibraryCallKit::inline_vector_frombits_coerced() { if (opc == Op_VectorLongToMask) { const TypeVect* vt = TypeVect::makemask(elem_bt, num_elem); - if (vt->isa_vectmask()) { + if (Matcher::mask_op_prefers_predicate(opc, vt)) { broadcast = gvn().transform(new VectorLongToMaskNode(elem, vt)); } else { const TypeVect* mvt = TypeVect::make(T_BOOLEAN, num_elem); @@ -2545,7 +2545,7 @@ bool LibraryCallKit::inline_vector_extract() { return false; } // VectorMaskToLongNode requires the input is either a mask or a vector with BOOLEAN type. - if (opd->bottom_type()->isa_vectmask() == nullptr) { + if (!Matcher::mask_op_prefers_predicate(Op_VectorMaskToLong, opd->bottom_type()->is_vect())) { opd = gvn().transform(VectorStoreMaskNode::make(gvn(), opd, elem_bt, num_elem)); } // ((toLong() >>> pos) & 1L diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 6ae8bbe8aa0..a49f3d24fd4 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -1403,7 +1403,7 @@ Node* ReductionNode::Ideal(PhaseGVN* phase, bool can_reshape) { } // Convert fromLong to maskAll if the input sets or unsets all lanes. -Node* convertFromLongToMaskAll(PhaseGVN* phase, const TypeLong* bits_type, bool is_mask, const TypeVect* vt) { +static Node* convertFromLongToMaskAll(PhaseGVN* phase, const TypeLong* bits_type, const TypeVect* vt) { uint vlen = vt->length(); BasicType bt = vt->element_basic_type(); // The "maskAll" API uses the corresponding integer types for floating-point data. 
@@ -1418,7 +1418,7 @@ Node* convertFromLongToMaskAll(PhaseGVN* phase, const TypeLong* bits_type, bool } else { con = phase->intcon(con_value); } - Node* res = VectorNode::scalar2vector(con, vlen, maskall_bt, is_mask); + Node* res = VectorNode::scalar2vector(con, vlen, maskall_bt, vt->isa_vectmask() != nullptr); // Convert back to the original floating-point data type. if (is_floating_point_type(bt)) { res = new VectorMaskCastNode(phase->transform(res), vt); @@ -1432,7 +1432,7 @@ Node* VectorLoadMaskNode::Ideal(PhaseGVN* phase, bool can_reshape) { // VectorLoadMask(VectorLongToMask(-1/0)) => Replicate(-1/0) if (in(1)->Opcode() == Op_VectorLongToMask) { const TypeVect* vt = bottom_type()->is_vect(); - Node* res = convertFromLongToMaskAll(phase, in(1)->in(1)->bottom_type()->isa_long(), false, vt); + Node* res = convertFromLongToMaskAll(phase, in(1)->in(1)->bottom_type()->isa_long(), vt); if (res != nullptr) { return res; } @@ -1900,10 +1900,12 @@ Node* VectorMaskCastNode::Identity(PhaseGVN* phase) { // l is -1 or 0. Node* VectorMaskToLongNode::Ideal_MaskAll(PhaseGVN* phase) { Node* in1 = in(1); - // VectorMaskToLong follows a VectorStoreMask if predicate is not supported. + // VectorMaskToLong follows a VectorStoreMask if it doesn't require the mask + // saved with a predicate type. if (in1->Opcode() == Op_VectorStoreMask) { - assert(!in1->in(1)->bottom_type()->isa_vectmask(), "sanity"); - in1 = in1->in(1); + Node* mask = in1->in(1); + assert(!Matcher::mask_op_prefers_predicate(Opcode(), mask->bottom_type()->is_vect()), "sanity"); + in1 = mask; } if (VectorNode::is_all_ones_vector(in1)) { int vlen = in1->bottom_type()->is_vect()->length(); @@ -1960,7 +1962,7 @@ Node* VectorLongToMaskNode::Ideal(PhaseGVN* phase, bool can_reshape) { // VectorLongToMask(-1/0) => MaskAll(-1/0) const TypeLong* bits_type = in(1)->bottom_type()->isa_long(); if (bits_type && is_mask) { - Node* res = convertFromLongToMaskAll(phase, bits_type, true, dst_type); + Node* res = convertFromLongToMaskAll(phase, bits_type, dst_type); if (res != nullptr) { return res; } diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java index 80429ad868a..25ebcc94844 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java @@ -2060,6 +2060,16 @@ public class IRNode { beforeMatchingNameRegex(STORE_VECTOR_SCATTER_MASKED, "StoreVectorScatterMasked"); } + public static final String VECTOR_LOAD_MASK = PREFIX + "VECTOR_LOAD_MASK" + POSTFIX; + static { + beforeMatchingNameRegex(VECTOR_LOAD_MASK, "VectorLoadMask"); + } + + public static final String VECTOR_STORE_MASK = PREFIX + "VECTOR_STORE_MASK" + POSTFIX; + static { + beforeMatchingNameRegex(VECTOR_STORE_MASK, "VectorStoreMask"); + } + public static final String SUB = PREFIX + "SUB" + POSTFIX; static { beforeMatchingNameRegex(SUB, "Sub(I|L|F|D|HF)"); diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java b/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java index daa2b9765f8..a24cfbd3e37 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java @@ -114,6 +114,7 @@ public class IREncodingPrinter { "asimd", "sve", "sve2", + "svebitperm", "fphp", "asimdhp", // RISCV64 diff --git a/test/hotspot/jtreg/compiler/vectorapi/VectorMaskFromLongTest.java 
b/test/hotspot/jtreg/compiler/vectorapi/VectorMaskFromLongTest.java index eaa6211efc5..c4feb97ebf3 100644 --- a/test/hotspot/jtreg/compiler/vectorapi/VectorMaskFromLongTest.java +++ b/test/hotspot/jtreg/compiler/vectorapi/VectorMaskFromLongTest.java @@ -22,14 +22,14 @@ */ /* -* @test -* @bug 8356760 8367391 -* @library /test/lib / -* @summary Optimize VectorMask.fromLong for all-true/all-false cases -* @modules jdk.incubator.vector -* -* @run driver compiler.vectorapi.VectorMaskFromLongTest -*/ + * @test + * @bug 8356760 8367391 8367292 + * @library /test/lib / + * @summary IR test for VectorMask.fromLong() + * @modules jdk.incubator.vector + * + * @run driver compiler.vectorapi.VectorMaskFromLongTest + */ package compiler.vectorapi; @@ -47,11 +47,6 @@ public class VectorMaskFromLongTest { static boolean[] mr = new boolean[B_SPECIES.length()]; - @ForceInline - public static void maskFromLongKernel(VectorSpecies species, long inputLong) { - VectorMask.fromLong(species, inputLong).intoArray(mr, 0); - } - @DontInline public static void verifyMaskFromLong(VectorSpecies species, long inputLong) { for (int i = 0; i < species.length(); i++) { @@ -63,9 +58,11 @@ public class VectorMaskFromLongTest { } } + // Tests for "VectorLongToMask(-1/0) => MaskAll(-1/0)" + @ForceInline - public static void testMaskFromLong(VectorSpecies species, long inputLong ) { - maskFromLongKernel(species, inputLong); + public static void fromLongMaskAllKernel(VectorSpecies species, long inputLong ) { + VectorMask.fromLong(species, inputLong).intoArray(mr, 0); verifyMaskFromLong(species, inputLong); } @@ -73,16 +70,16 @@ public class VectorMaskFromLongTest { public static void testMaskFromLongMaskAll(VectorSpecies species) { int vlen = species.length(); long inputLong = 0L; - testMaskFromLong(species, inputLong); + fromLongMaskAllKernel(species, inputLong); inputLong = vlen >= 64 ? 0L : (0x1L << vlen); - testMaskFromLong(species, inputLong); + fromLongMaskAllKernel(species, inputLong); inputLong = -1L; - testMaskFromLong(species, inputLong); + fromLongMaskAllKernel(species, inputLong); inputLong = (-1L >>> (64 - vlen)); - testMaskFromLong(species, inputLong); + fromLongMaskAllKernel(species, inputLong); } @Test @@ -169,102 +166,104 @@ public class VectorMaskFromLongTest { testMaskFromLongMaskAll(D_SPECIES); } - // Tests for general input long values + // Tests for general input long values. The purpose is to test the IRs + // for API VectorMask.fromLong(). To avoid any IR being optimized out by + // compiler, we insert a VectorMask.not() after fromLong(). 
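For reference, the expected lane values checked by the kernel below can be modeled with plain scalar code. This is a minimal sketch, not part of the patch, and the helper name is illustrative only:

    // Scalar model of VectorMask.fromLong(species, x).not(): lane i holds
    // bit i of the bitwise complement of x, for 0 <= i < species.length().
    static boolean expectedLane(long inputLong, int i) {
        long complement = inputLong ^ -1L;      // what not() applies lane-wise
        return ((complement >>> i) & 1L) == 1L; // lane i <- bit i
    }

This is why the kernel below verifies against "inputLong ^ -1L".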
+ + @ForceInline + public static void fromLongGeneralKernel(VectorSpecies species, long inputLong) { + VectorMask.fromLong(species, inputLong).not().intoArray(mr, 0); + verifyMaskFromLong(species, inputLong ^ -1L); + } + + @ForceInline + public static void testMaskFromLongGeneral(VectorSpecies species) { + fromLongGeneralKernel(species, (-1L >>> (64 - species.length())) - 1); + fromLongGeneralKernel(species, (-1L >>> (64 - species.length())) >>> 1); + } @Test - @IR(counts = { IRNode.MASK_ALL, "= 0", + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 0", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx512", "true", "rvv", "true" }) - @IR(counts = { IRNode.REPLICATE_B, "= 0", - IRNode.VECTOR_LONG_TO_MASK, "= 0" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) - @IR(counts = { IRNode.REPLICATE_B, "= 0", + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", + IRNode.VECTOR_LONG_TO_MASK, "= 2" }, + applyIfCPUFeature = { "svebitperm", "true" }) public static void testMaskFromLongByte() { - // Test cases where some but not all bits are set. - testMaskFromLong(B_SPECIES, (-1L >>> (64 - B_SPECIES.length())) - 1); - testMaskFromLong(B_SPECIES, (-1L >>> (64 - B_SPECIES.length())) >>> 1); + testMaskFromLongGeneral(B_SPECIES); } @Test - @IR(counts = { IRNode.MASK_ALL, "= 0", + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 0", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx512", "true", "rvv", "true" }) - @IR(counts = { IRNode.REPLICATE_S, "= 0", - IRNode.VECTOR_LONG_TO_MASK, "= 0" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) - @IR(counts = { IRNode.REPLICATE_S, "= 0", + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", + IRNode.VECTOR_LONG_TO_MASK, "= 2" }, + applyIfCPUFeature = { "svebitperm", "true" }) public static void testMaskFromLongShort() { - // Test cases where some but not all bits are set. - testMaskFromLong(S_SPECIES, (-1L >>> (64 - S_SPECIES.length())) - 1); - testMaskFromLong(S_SPECIES, (-1L >>> (64 - S_SPECIES.length())) >>> 1); + testMaskFromLongGeneral(S_SPECIES); } @Test - @IR(counts = { IRNode.MASK_ALL, "= 0", + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 0", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx512", "true", "rvv", "true" }) - @IR(counts = { IRNode.REPLICATE_I, "= 0", - IRNode.VECTOR_LONG_TO_MASK, "= 0" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) - @IR(counts = { IRNode.REPLICATE_I, "= 0", + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", + IRNode.VECTOR_LONG_TO_MASK, "= 2" }, + applyIfCPUFeature = { "svebitperm", "true" }) public static void testMaskFromLongInt() { - // Test cases where some but not all bits are set. 
- testMaskFromLong(I_SPECIES, (-1L >>> (64 - I_SPECIES.length())) - 1); - testMaskFromLong(I_SPECIES, (-1L >>> (64 - I_SPECIES.length())) >>> 1); + testMaskFromLongGeneral(I_SPECIES); } @Test - @IR(counts = { IRNode.MASK_ALL, "= 0", + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 0", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx512", "true", "rvv", "true" }) - @IR(counts = { IRNode.REPLICATE_L, "= 0", - IRNode.VECTOR_LONG_TO_MASK, "= 0" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) - @IR(counts = { IRNode.REPLICATE_L, "= 0", + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", + IRNode.VECTOR_LONG_TO_MASK, "= 2" }, + applyIfCPUFeature = { "svebitperm", "true" }) public static void testMaskFromLongLong() { - // Test cases where some but not all bits are set. - testMaskFromLong(L_SPECIES, (-1L >>> (64 - L_SPECIES.length())) - 1); - testMaskFromLong(L_SPECIES, (-1L >>> (64 - L_SPECIES.length())) >>> 1); + testMaskFromLongGeneral(L_SPECIES); } @Test - @IR(counts = { IRNode.MASK_ALL, "= 0", + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 0", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx512", "true", "rvv", "true" }) - @IR(counts = { IRNode.REPLICATE_I, "= 0", - IRNode.VECTOR_LONG_TO_MASK, "= 0" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) - @IR(counts = { IRNode.REPLICATE_I, "= 0", + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", + IRNode.VECTOR_LONG_TO_MASK, "= 2" }, + applyIfCPUFeature = { "svebitperm", "true" }) public static void testMaskFromLongFloat() { - // Test cases where some but not all bits are set. - testMaskFromLong(F_SPECIES, (-1L >>> (64 - F_SPECIES.length())) - 1); - testMaskFromLong(F_SPECIES, (-1L >>> (64 - F_SPECIES.length())) >>> 1); + testMaskFromLongGeneral(F_SPECIES); } @Test - @IR(counts = { IRNode.MASK_ALL, "= 0", + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 0", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx512", "true", "rvv", "true" }) - @IR(counts = { IRNode.REPLICATE_L, "= 0", - IRNode.VECTOR_LONG_TO_MASK, "= 0" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) - @IR(counts = { IRNode.REPLICATE_L, "= 0", + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", + IRNode.VECTOR_LONG_TO_MASK, "= 2" }, + applyIfCPUFeature = { "svebitperm", "true" }) public static void testMaskFromLongDouble() { - // Test cases where some but not all bits are set. 
- testMaskFromLong(D_SPECIES, (-1L >>> (64 - D_SPECIES.length())) - 1); - testMaskFromLong(D_SPECIES, (-1L >>> (64 - D_SPECIES.length())) >>> 1); + testMaskFromLongGeneral(D_SPECIES); } public static void main(String[] args) { diff --git a/test/hotspot/jtreg/compiler/vectorapi/VectorMaskToLongTest.java b/test/hotspot/jtreg/compiler/vectorapi/VectorMaskToLongTest.java index 3201d593efe..35a5aca966a 100644 --- a/test/hotspot/jtreg/compiler/vectorapi/VectorMaskToLongTest.java +++ b/test/hotspot/jtreg/compiler/vectorapi/VectorMaskToLongTest.java @@ -22,18 +22,19 @@ */ /* -* @test -* @bug 8356760 -* @library /test/lib / -* @summary Optimize VectorMask.fromLong for all-true/all-false cases -* @modules jdk.incubator.vector -* -* @run driver compiler.vectorapi.VectorMaskToLongTest -*/ + * @test + * @bug 8356760 8367292 + * @library /test/lib / + * @summary IR test for VectorMask.toLong() + * @modules jdk.incubator.vector + * + * @run driver compiler.vectorapi.VectorMaskToLongTest + */ package compiler.vectorapi; import compiler.lib.ir_framework.*; +import java.util.Arrays; import jdk.incubator.vector.*; import jdk.test.lib.Asserts; @@ -45,12 +46,21 @@ public class VectorMaskToLongTest { static final VectorSpecies L_SPECIES = LongVector.SPECIES_MAX; static final VectorSpecies D_SPECIES = DoubleVector.SPECIES_MAX; + private static boolean[] m; + + static { + m = new boolean[B_SPECIES.length()]; + Arrays.fill(m, true); + } + @DontInline public static void verifyMaskToLong(VectorSpecies species, long inputLong, long got) { long expected = inputLong & (-1L >>> (64 - species.length())); Asserts.assertEquals(expected, got, "for input long " + inputLong); } + // Tests for "VectorMaskToLong(MaskAll(0/-1)) => ((0/-1) & (-1ULL >> (64 - vlen)))" + @ForceInline public static void testMaskAllToLong(VectorSpecies species) { int vlen = species.length(); @@ -173,12 +183,12 @@ public class VectorMaskToLongTest { @Test @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 0" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx2", "true", "rvv", "true" }) + applyIfCPUFeatureOr = { "svebitperm", "true", "avx2", "true", "rvv", "true" }) @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 1" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) + applyIfCPUFeatureAnd = { "asimd", "true", "svebitperm", "false" }) public static void testFromLongToLongByte() { - // Test the case where some but not all bits are set. + // Test the case where some but not all bits are set. long inputLong = (-1L >>> (64 - B_SPECIES.length()))-1; long got = VectorMask.fromLong(B_SPECIES, inputLong).toLong(); verifyMaskToLong(B_SPECIES, inputLong, got); @@ -187,10 +197,10 @@ public class VectorMaskToLongTest { @Test @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 0" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx2", "true", "rvv", "true" }) + applyIfCPUFeatureOr = { "svebitperm", "true", "avx2", "true", "rvv", "true" }) @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 1" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) + applyIfCPUFeatureAnd = { "asimd", "true", "svebitperm", "false" }) public static void testFromLongToLongShort() { // Test the case where some but not all bits are set. 
long inputLong = (-1L >>> (64 - S_SPECIES.length()))-1; @@ -201,10 +211,10 @@ public class VectorMaskToLongTest { @Test @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 0" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx2", "true", "rvv", "true" }) + applyIfCPUFeatureOr = { "svebitperm", "true", "avx2", "true", "rvv", "true" }) @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 1" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) + applyIfCPUFeatureAnd = { "asimd", "true", "svebitperm", "false" }) public static void testFromLongToLongInt() { // Test the case where some but not all bits are set. long inputLong = (-1L >>> (64 - I_SPECIES.length()))-1; @@ -215,10 +225,10 @@ public class VectorMaskToLongTest { @Test @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 0" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx2", "true", "rvv", "true" }) + applyIfCPUFeatureOr = { "svebitperm", "true", "avx2", "true", "rvv", "true" }) @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 1" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) + applyIfCPUFeatureAnd = { "asimd", "true", "svebitperm", "false" }) public static void testFromLongToLongLong() { // Test the case where some but not all bits are set. long inputLong = (-1L >>> (64 - L_SPECIES.length()))-1; @@ -229,10 +239,10 @@ public class VectorMaskToLongTest { @Test @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 1", IRNode.VECTOR_MASK_TO_LONG, "= 1" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx2", "true", "rvv", "true" }) + applyIfCPUFeatureOr = { "svebitperm", "true", "avx2", "true", "rvv", "true" }) @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 1" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) + applyIfCPUFeatureAnd = { "asimd", "true", "svebitperm", "false" }) public static void testFromLongToLongFloat() { // Test the case where some but not all bits are set. long inputLong = (-1L >>> (64 - F_SPECIES.length()))-1; @@ -243,10 +253,10 @@ public class VectorMaskToLongTest { @Test @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 1", IRNode.VECTOR_MASK_TO_LONG, "= 1" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx2", "true", "rvv", "true" }) + applyIfCPUFeatureOr = { "svebitperm", "true", "avx2", "true", "rvv", "true" }) @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 1" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) + applyIfCPUFeatureAnd = { "asimd", "true", "svebitperm", "false" }) public static void testFromLongToLongDouble() { // Test the case where some but not all bits are set. long inputLong = (-1L >>> (64 - D_SPECIES.length()))-1; @@ -254,6 +264,100 @@ public class VectorMaskToLongTest { verifyMaskToLong(D_SPECIES, inputLong, got); } + // General cases for VectorMask.toLong(). The main purpose is to test the IRs + // for API VectorMask.toLong(). To avoid the IRs being optimized out by compiler, + // we insert a VectorMask.not() before toLong(). 
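Because the shared array "m" is filled with true, an all-true mask is complemented to an all-false one, so the packed result must be zero for every species. A minimal scalar sketch of that arithmetic (not part of the patch; the method name is illustrative only):

    // verifyMaskToLong computes: expected = inputLong & (-1L >>> (64 - vlen)).
    // With inputLong == 0 this is 0 for any vlen in [1, 64], which is why the
    // kernel below passes 0 as the expected input.
    static long expectedToLong(int vlen) {
        long allTrue = -1L >>> (64 - vlen);      // toLong() of an all-true mask
        return ~allTrue & (-1L >>> (64 - vlen)); // after not(): always 0
    }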
+ + @ForceInline + public static void testToLongGeneral(VectorSpecies species) { + long got = VectorMask.fromArray(species, m, 0).not().toLong(); + verifyMaskToLong(species, 0, got); + } + + @Test + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 0", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeature = { "asimd", "true" }) + public static void testToLongByte() { + testToLongGeneral(B_SPECIES); + } + + @Test + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 0", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeature = { "asimd", "true" }) + public static void testToLongShort() { + testToLongGeneral(S_SPECIES); + } + + @Test + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 0", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeature = { "asimd", "true" }) + public static void testToLongInt() { + testToLongGeneral(I_SPECIES); + } + + @Test + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 0", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeature = { "asimd", "true" }) + public static void testToLongLong() { + testToLongGeneral(L_SPECIES); + } + + @Test + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 0", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeature = { "asimd", "true" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + public static void testToLongFloat() { + testToLongGeneral(F_SPECIES); + } + + @Test + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 0", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeature = { "asimd", "true" }) + public static void testToLongDouble() { + testToLongGeneral(D_SPECIES); + } + public static void main(String[] args) { TestFramework testFramework = new TestFramework(); testFramework.setDefaultWarmup(10000)
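For readers less familiar with the SVE2 BEXT/BDEP instructions used in this change: with an all-0x01 selector they behave, per 64-bit lane, like java.lang.Long.compress and Long.expand (available since JDK 19). A minimal scalar sketch of the byte-mask/bit-mask conversion that sve2_vmask_tolong and sve_vmask_fromlong perform on each 8-byte group (method names are illustrative only):

    // Pack eight 0x00/0x01 boolean bytes into the low 8 bits of the result,
    // mirroring BEXT with the 0x0101010101010101 selector in one 64-bit lane.
    static long packBooleanBytes(long eightBytes) {
        return Long.compress(eightBytes, 0x0101010101010101L);
    }

    // Unpack the low 8 bits into eight 0x00/0x01 bytes, mirroring BDEP with
    // the same selector in one 64-bit lane.
    static long unpackToBooleanBytes(long lowBits) {
        return Long.expand(lowBits, 0x0101010101010101L);
    }

Plugging in the values from the patch comments: packBooleanBytes(0x0100000001010001L) returns 0x8D, packBooleanBytes(0x0001010000010001L) returns 0x65, and unpackToBooleanBytes(0x8DL) returns 0x0100000001010001L.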