diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index ef35b66003d..3379041b2cc 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -7081,29 +7081,31 @@ instruct vcompress(vReg dst, vReg src, pRegGov pg) %{ %} instruct vcompressB(vReg dst, vReg src, pReg pg, vReg tmp1, vReg tmp2, - vReg tmp3, vReg tmp4, pReg ptmp, pRegGov pgtmp) %{ + vReg tmp3, pReg ptmp, pRegGov pgtmp) %{ predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_BYTE); - effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP ptmp, TEMP pgtmp); + effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP ptmp, TEMP pgtmp); match(Set dst (CompressV src pg)); - format %{ "vcompressB $dst, $src, $pg\t# KILL $tmp1, $tmp2, $tmp3, tmp4, $ptmp, $pgtmp" %} + format %{ "vcompressB $dst, $src, $pg\t# KILL $tmp1, $tmp2, $tmp3, $ptmp, $pgtmp" %} ins_encode %{ + uint length_in_bytes = Matcher::vector_length_in_bytes(this); __ sve_compress_byte($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister, - $tmp1$$FloatRegister,$tmp2$$FloatRegister, - $tmp3$$FloatRegister,$tmp4$$FloatRegister, - $ptmp$$PRegister, $pgtmp$$PRegister); + $tmp1$$FloatRegister, $tmp2$$FloatRegister, $tmp3$$FloatRegister, + $ptmp$$PRegister, $pgtmp$$PRegister, length_in_bytes); %} ins_pipe(pipe_slow); %} -instruct vcompressS(vReg dst, vReg src, pReg pg, - vReg tmp1, vReg tmp2, pRegGov pgtmp) %{ +instruct vcompressS(vReg dst, vReg src, pReg pg, vReg tmp1, vReg tmp2, pRegGov pgtmp) %{ predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_SHORT); effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP pgtmp); match(Set dst (CompressV src pg)); format %{ "vcompressS $dst, $src, $pg\t# KILL $tmp1, $tmp2, $pgtmp" %} ins_encode %{ + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + __ sve_dup($tmp1$$FloatRegister, __ H, 0); __ sve_compress_short($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister, - $tmp1$$FloatRegister,$tmp2$$FloatRegister, $pgtmp$$PRegister); + $tmp1$$FloatRegister, $tmp2$$FloatRegister, $pgtmp$$PRegister, + length_in_bytes); %} ins_pipe(pipe_slow); %} diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index 012de7e46d8..6d296cbdb3a 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -5069,29 +5069,31 @@ instruct vcompress(vReg dst, vReg src, pRegGov pg) %{ %} instruct vcompressB(vReg dst, vReg src, pReg pg, vReg tmp1, vReg tmp2, - vReg tmp3, vReg tmp4, pReg ptmp, pRegGov pgtmp) %{ + vReg tmp3, pReg ptmp, pRegGov pgtmp) %{ predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_BYTE); - effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP ptmp, TEMP pgtmp); + effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP ptmp, TEMP pgtmp); match(Set dst (CompressV src pg)); - format %{ "vcompressB $dst, $src, $pg\t# KILL $tmp1, $tmp2, $tmp3, tmp4, $ptmp, $pgtmp" %} + format %{ "vcompressB $dst, $src, $pg\t# KILL $tmp1, $tmp2, $tmp3, $ptmp, $pgtmp" %} ins_encode %{ + uint length_in_bytes = Matcher::vector_length_in_bytes(this); __ sve_compress_byte($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister, - $tmp1$$FloatRegister,$tmp2$$FloatRegister, - $tmp3$$FloatRegister,$tmp4$$FloatRegister, - $ptmp$$PRegister, $pgtmp$$PRegister); + $tmp1$$FloatRegister, $tmp2$$FloatRegister, $tmp3$$FloatRegister, + $ptmp$$PRegister, $pgtmp$$PRegister, length_in_bytes); %} ins_pipe(pipe_slow); %} -instruct vcompressS(vReg dst, vReg src, pReg pg, - vReg tmp1, vReg tmp2, pRegGov pgtmp) %{ +instruct vcompressS(vReg dst, vReg src, pReg pg, vReg tmp1, vReg tmp2, pRegGov pgtmp) %{ predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_SHORT); effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP pgtmp); match(Set dst (CompressV src pg)); format %{ "vcompressS $dst, $src, $pg\t# KILL $tmp1, $tmp2, $pgtmp" %} ins_encode %{ + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + __ sve_dup($tmp1$$FloatRegister, __ H, 0); __ sve_compress_short($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister, - $tmp1$$FloatRegister,$tmp2$$FloatRegister, $pgtmp$$PRegister); + $tmp1$$FloatRegister, $tmp2$$FloatRegister, $pgtmp$$PRegister, + length_in_bytes); %} ins_pipe(pipe_slow); %} diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp index 4c4251fbe9f..a8f378e524f 100644 --- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp @@ -3486,6 +3486,7 @@ public: INSN(sve_smaxv, 0b00000100, 0b001000001); // signed maximum reduction to scalar INSN(sve_smin, 0b00000100, 0b001010000); // signed minimum vectors INSN(sve_sminv, 0b00000100, 0b001010001); // signed minimum reduction to scalar + INSN(sve_splice,0b00000101, 0b101100100); // splice two vectors under predicate control, destructive INSN(sve_sub, 0b00000100, 0b000001000); // vector sub INSN(sve_uaddv, 0b00000100, 0b000001001); // unsigned add reduction to scalar INSN(sve_umax, 0b00000100, 0b001001000); // unsigned maximum vectors diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp index b61a0e4e378..328ef0c53e6 100644 --- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp @@ -2203,114 +2203,117 @@ void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t l // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst. // Any remaining elements of dst will be filled with zero. // Clobbers: rscratch1 -// Preserves: src, mask +// Preserves: mask, vzr void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask, - FloatRegister vtmp1, FloatRegister vtmp2, - PRegister pgtmp) { + FloatRegister vzr, FloatRegister vtmp, + PRegister pgtmp, unsigned vector_length_in_bytes) { assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); - assert_different_registers(dst, src, vtmp1, vtmp2); + // When called by sve_compress_byte, src and vtmp may be the same register. + assert_different_registers(dst, src, vzr); + assert_different_registers(dst, vtmp, vzr); assert_different_registers(mask, pgtmp); - - // Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111 - // mask = 0001 0000 0000 0001 0001 0000 0001 0001 - // Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111 - sve_dup(vtmp2, H, 0); + // high <-- low + // Example input: src = hh gg ff ee dd cc bb aa, one character is 8 bits. + // mask = 01 00 00 01 01 00 01 01, one character is 1 bit. + // Expected result: dst = 00 00 00 hh ee dd bb aa // Extend lowest half to type INT. - // dst = 00004444 00003333 00002222 00001111 + // dst = 00dd 00cc 00bb 00aa sve_uunpklo(dst, S, src); - // pgtmp = 00000001 00000000 00000001 00000001 + // pgtmp = 0001 0000 0001 0001 sve_punpklo(pgtmp, mask); // Pack the active elements in size of type INT to the right, // and fill the remainings with zero. - // dst = 00000000 00004444 00002222 00001111 + // dst = 0000 00dd 00bb 00aa sve_compact(dst, S, dst, pgtmp); // Narrow the result back to type SHORT. - // dst = 0000 0000 0000 0000 0000 4444 2222 1111 - sve_uzp1(dst, H, dst, vtmp2); + // dst = 00 00 00 00 00 dd bb aa + sve_uzp1(dst, H, dst, vzr); + + // Return if the vector length is no more than MaxVectorSize/2, since the + // highest half is invalid. + if (vector_length_in_bytes <= (MaxVectorSize >> 1)) { + return; + } + // Count the active elements of lowest half. // rscratch1 = 3 sve_cntp(rscratch1, S, ptrue, pgtmp); // Repeat to the highest half. - // pgtmp = 00000001 00000000 00000000 00000001 + // pgtmp = 0001 0000 0000 0001 sve_punpkhi(pgtmp, mask); - // vtmp1 = 00008888 00007777 00006666 00005555 - sve_uunpkhi(vtmp1, S, src); - // vtmp1 = 00000000 00000000 00008888 00005555 - sve_compact(vtmp1, S, vtmp1, pgtmp); - // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555 - sve_uzp1(vtmp1, H, vtmp1, vtmp2); + // vtmp = 00hh 00gg 00ff 00ee + sve_uunpkhi(vtmp, S, src); + // vtmp = 0000 0000 00hh 00ee + sve_compact(vtmp, S, vtmp, pgtmp); + // vtmp = 00 00 00 00 00 00 hh ee + sve_uzp1(vtmp, H, vtmp, vzr); - // Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111 - // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555 - // Left shift(cross lane) compressed high with TRUE_CNT lanes, - // TRUE_CNT is the number of active elements in the compressed low. - neg(rscratch1, rscratch1); - // vtmp2 = {4 3 2 1 0 -1 -2 -3} - sve_index(vtmp2, H, rscratch1, 1); - // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000 - sve_tbl(vtmp1, H, vtmp1, vtmp2); - - // Combine the compressed high(after shifted) with the compressed low. - // dst = 0000 0000 0000 8888 5555 4444 2222 1111 - sve_orr(dst, dst, vtmp1); + // pgtmp = 00 00 00 00 00 01 01 01 + sve_whilelt(pgtmp, H, zr, rscratch1); + // Compressed low: dst = 00 00 00 00 00 dd bb aa + // Compressed high: vtmp = 00 00 00 00 00 00 hh ee + // Combine the compressed low with the compressed high: + // dst = 00 00 00 hh ee dd bb aa + sve_splice(dst, H, pgtmp, vtmp); } // Clobbers: rscratch1, rscratch2 // Preserves: src, mask void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask, - FloatRegister vtmp1, FloatRegister vtmp2, - FloatRegister vtmp3, FloatRegister vtmp4, - PRegister ptmp, PRegister pgtmp) { + FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, + PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) { assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); - assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4); + assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3); assert_different_registers(mask, ptmp, pgtmp); - // Example input: src = 88 77 66 55 44 33 22 11 - // mask = 01 00 00 01 01 00 01 01 - // Expected result: dst = 00 00 00 88 55 44 22 11 + // high <-- low + // Example input: src = q p n m l k j i h g f e d c b a, one character is 8 bits. + // mask = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit. + // Expected result: dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a + FloatRegister vzr = vtmp3; + sve_dup(vzr, B, 0); - sve_dup(vtmp4, B, 0); // Extend lowest half to type SHORT. - // vtmp1 = 0044 0033 0022 0011 + // vtmp1 = 0h 0g 0f 0e 0d 0c 0b 0a sve_uunpklo(vtmp1, H, src); - // ptmp = 0001 0000 0001 0001 + // ptmp = 00 01 00 00 00 01 00 01 sve_punpklo(ptmp, mask); + // Pack the active elements in size of type SHORT to the right, + // and fill the remainings with zero. + // dst = 00 00 00 00 00 0g 0c 0a + unsigned extended_size = vector_length_in_bytes << 1; + sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size); + // Narrow the result back to type BYTE. + // dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a + sve_uzp1(dst, B, dst, vzr); + + // Return if the vector length is no more than MaxVectorSize/2, since the + // highest half is invalid. + if (vector_length_in_bytes <= (MaxVectorSize >> 1)) { + return; + } // Count the active elements of lowest half. // rscratch2 = 3 sve_cntp(rscratch2, H, ptrue, ptmp); - // Pack the active elements in size of type SHORT to the right, - // and fill the remainings with zero. - // dst = 0000 0044 0022 0011 - sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp); - // Narrow the result back to type BYTE. - // dst = 00 00 00 00 00 44 22 11 - sve_uzp1(dst, B, dst, vtmp4); // Repeat to the highest half. - // ptmp = 0001 0000 0000 0001 + // ptmp = 00 01 00 00 00 00 00 01 sve_punpkhi(ptmp, mask); - // vtmp1 = 0088 0077 0066 0055 + // vtmp2 = 0q 0p 0n 0m 0l 0k 0j 0i sve_uunpkhi(vtmp2, H, src); - // vtmp1 = 0000 0000 0088 0055 - sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp); + // vtmp1 = 00 00 00 00 00 00 0p 0i + sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize); + // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i + sve_uzp1(vtmp1, B, vtmp1, vzr); - sve_dup(vtmp4, B, 0); - // vtmp1 = 00 00 00 00 00 00 88 55 - sve_uzp1(vtmp1, B, vtmp1, vtmp4); - - // Compressed low: dst = 00 00 00 00 00 44 22 11 - // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55 - // Left shift(cross lane) compressed high with TRUE_CNT lanes, - // TRUE_CNT is the number of active elements in the compressed low. - neg(rscratch2, rscratch2); - // vtmp2 = {4 3 2 1 0 -1 -2 -3} - sve_index(vtmp2, B, rscratch2, 1); - // vtmp1 = 00 00 00 88 55 00 00 00 - sve_tbl(vtmp1, B, vtmp1, vtmp2); - // Combine the compressed high(after shifted) with the compressed low. - // dst = 00 00 00 88 55 44 22 11 - sve_orr(dst, dst, vtmp1); + // ptmp = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 + sve_whilelt(ptmp, B, zr, rscratch2); + // Compressed low: dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a + // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i + // Combine the compressed low with the compressed high: + // dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a + sve_splice(dst, B, ptmp, vtmp1); } void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp index cb8ded142f4..09850a60c64 100644 --- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp @@ -173,13 +173,12 @@ // lowest-numbered elements of dst. Any remaining elements of dst will // be filled with zero. void sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask, - FloatRegister vtmp1, FloatRegister vtmp2, - FloatRegister vtmp3, FloatRegister vtmp4, - PRegister ptmp, PRegister pgtmp); + FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, + PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes); void sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask, - FloatRegister vtmp1, FloatRegister vtmp2, - PRegister pgtmp); + FloatRegister vzr, FloatRegister vtmp, + PRegister pgtmp, unsigned vector_length_in_bytes); void neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ); diff --git a/test/hotspot/gtest/aarch64/aarch64-asmtest.py b/test/hotspot/gtest/aarch64/aarch64-asmtest.py index bf4f2111999..48b19acaa05 100644 --- a/test/hotspot/gtest/aarch64/aarch64-asmtest.py +++ b/test/hotspot/gtest/aarch64/aarch64-asmtest.py @@ -2143,6 +2143,10 @@ generate(SpecialCases, [["ccmn", "__ ccmn(zr, zr, 3u, Assembler::LE);", ["facge", "__ sve_fac(Assembler::GE, p1, __ H, p2, z4, z5);", "facge\tp1.h, p2/z, z4.h, z5.h"], ["facge", "__ sve_fac(Assembler::GE, p1, __ S, p2, z4, z5);", "facge\tp1.s, p2/z, z4.s, z5.s"], ["facge", "__ sve_fac(Assembler::GE, p1, __ D, p2, z4, z5);", "facge\tp1.d, p2/z, z4.d, z5.d"], + ["splice", "__ sve_splice(z0, __ B, p0, z1);", "splice\tz0.b, p0, z0.b, z1.b"], + ["splice", "__ sve_splice(z0, __ H, p0, z1);", "splice\tz0.h, p0, z0.h, z1.h"], + ["splice", "__ sve_splice(z0, __ S, p0, z1);", "splice\tz0.s, p0, z0.s, z1.s"], + ["splice", "__ sve_splice(z0, __ D, p0, z1);", "splice\tz0.d, p0, z0.d, z1.d"], # SVE2 instructions ["histcnt", "__ sve_histcnt(z16, __ S, p0, z16, z16);", "histcnt\tz16.s, p0/z, z16.s, z16.s"], ["histcnt", "__ sve_histcnt(z17, __ D, p0, z17, z17);", "histcnt\tz17.d, p0/z, z17.d, z17.d"], diff --git a/test/hotspot/gtest/aarch64/asmtest.out.h b/test/hotspot/gtest/aarch64/asmtest.out.h index 352ea33750e..34a5f8ca68e 100644 --- a/test/hotspot/gtest/aarch64/asmtest.out.h +++ b/test/hotspot/gtest/aarch64/asmtest.out.h @@ -1156,6 +1156,10 @@ __ sve_fac(Assembler::GE, p1, __ H, p2, z4, z5); // facge p1.h, p2/z, z4.h, z5.h __ sve_fac(Assembler::GE, p1, __ S, p2, z4, z5); // facge p1.s, p2/z, z4.s, z5.s __ sve_fac(Assembler::GE, p1, __ D, p2, z4, z5); // facge p1.d, p2/z, z4.d, z5.d + __ sve_splice(z0, __ B, p0, z1); // splice z0.b, p0, z0.b, z1.b + __ sve_splice(z0, __ H, p0, z1); // splice z0.h, p0, z0.h, z1.h + __ sve_splice(z0, __ S, p0, z1); // splice z0.s, p0, z0.s, z1.s + __ sve_splice(z0, __ D, p0, z1); // splice z0.d, p0, z0.d, z1.d __ sve_histcnt(z16, __ S, p0, z16, z16); // histcnt z16.s, p0/z, z16.s, z16.s __ sve_histcnt(z17, __ D, p0, z17, z17); // histcnt z17.d, p0/z, z17.d, z17.d @@ -1445,30 +1449,30 @@ 0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061, 0x120cb166, 0x321764bc, 0x52174681, 0x720c0227, 0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01, - 0x14000000, 0x17ffffd7, 0x140004b7, 0x94000000, - 0x97ffffd4, 0x940004b4, 0x3400000a, 0x34fffa2a, - 0x3400962a, 0x35000008, 0x35fff9c8, 0x350095c8, - 0xb400000b, 0xb4fff96b, 0xb400956b, 0xb500001d, - 0xb5fff91d, 0xb500951d, 0x10000013, 0x10fff8b3, - 0x100094b3, 0x90000013, 0x36300016, 0x3637f836, - 0x36309436, 0x3758000c, 0x375ff7cc, 0x375893cc, + 0x14000000, 0x17ffffd7, 0x140004bb, 0x94000000, + 0x97ffffd4, 0x940004b8, 0x3400000a, 0x34fffa2a, + 0x340096aa, 0x35000008, 0x35fff9c8, 0x35009648, + 0xb400000b, 0xb4fff96b, 0xb40095eb, 0xb500001d, + 0xb5fff91d, 0xb500959d, 0x10000013, 0x10fff8b3, + 0x10009533, 0x90000013, 0x36300016, 0x3637f836, + 0x363094b6, 0x3758000c, 0x375ff7cc, 0x3758944c, 0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc, 0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f, 0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016, 0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0, - 0x540091a0, 0x54000001, 0x54fff541, 0x54009141, - 0x54000002, 0x54fff4e2, 0x540090e2, 0x54000002, - 0x54fff482, 0x54009082, 0x54000003, 0x54fff423, - 0x54009023, 0x54000003, 0x54fff3c3, 0x54008fc3, - 0x54000004, 0x54fff364, 0x54008f64, 0x54000005, - 0x54fff305, 0x54008f05, 0x54000006, 0x54fff2a6, - 0x54008ea6, 0x54000007, 0x54fff247, 0x54008e47, - 0x54000008, 0x54fff1e8, 0x54008de8, 0x54000009, - 0x54fff189, 0x54008d89, 0x5400000a, 0x54fff12a, - 0x54008d2a, 0x5400000b, 0x54fff0cb, 0x54008ccb, - 0x5400000c, 0x54fff06c, 0x54008c6c, 0x5400000d, - 0x54fff00d, 0x54008c0d, 0x5400000e, 0x54ffefae, - 0x54008bae, 0x5400000f, 0x54ffef4f, 0x54008b4f, + 0x54009220, 0x54000001, 0x54fff541, 0x540091c1, + 0x54000002, 0x54fff4e2, 0x54009162, 0x54000002, + 0x54fff482, 0x54009102, 0x54000003, 0x54fff423, + 0x540090a3, 0x54000003, 0x54fff3c3, 0x54009043, + 0x54000004, 0x54fff364, 0x54008fe4, 0x54000005, + 0x54fff305, 0x54008f85, 0x54000006, 0x54fff2a6, + 0x54008f26, 0x54000007, 0x54fff247, 0x54008ec7, + 0x54000008, 0x54fff1e8, 0x54008e68, 0x54000009, + 0x54fff189, 0x54008e09, 0x5400000a, 0x54fff12a, + 0x54008daa, 0x5400000b, 0x54fff0cb, 0x54008d4b, + 0x5400000c, 0x54fff06c, 0x54008cec, 0x5400000d, + 0x54fff00d, 0x54008c8d, 0x5400000e, 0x54ffefae, + 0x54008c2e, 0x5400000f, 0x54ffef4f, 0x54008bcf, 0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60, 0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f, 0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf, @@ -1689,7 +1693,8 @@ 0x05a14c00, 0x05e14c00, 0x05304001, 0x05314001, 0x05a18610, 0x05e18610, 0x0420bc31, 0x05271e11, 0x6545e891, 0x6585e891, 0x65c5e891, 0x6545c891, - 0x6585c891, 0x65c5c891, 0x45b0c210, 0x45f1c231, + 0x6585c891, 0x65c5c891, 0x052c8020, 0x056c8020, + 0x05ac8020, 0x05ec8020, 0x45b0c210, 0x45f1c231, 0x1e601000, 0x1e603000, 0x1e621000, 0x1e623000, 0x1e641000, 0x1e643000, 0x1e661000, 0x1e663000, 0x1e681000, 0x1e683000, 0x1e6a1000, 0x1e6a3000, diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java index 99a289476ec..f0f7aaf3836 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java @@ -2840,6 +2840,36 @@ public class IRNode { vectorNode(EXPAND_BITS_VL, "ExpandBitsV", TYPE_LONG); } + public static final String COMPRESS_VB = VECTOR_PREFIX + "COMPRESS_VB" + POSTFIX; + static { + vectorNode(COMPRESS_VB, "CompressV", TYPE_BYTE); + } + + public static final String COMPRESS_VS = VECTOR_PREFIX + "COMPRESS_VS" + POSTFIX; + static { + vectorNode(COMPRESS_VS, "CompressV", TYPE_SHORT); + } + + public static final String COMPRESS_VI = VECTOR_PREFIX + "COMPRESS_VI" + POSTFIX; + static { + vectorNode(COMPRESS_VI, "CompressV", TYPE_INT); + } + + public static final String COMPRESS_VL = VECTOR_PREFIX + "COMPRESS_VL" + POSTFIX; + static { + vectorNode(COMPRESS_VL, "CompressV", TYPE_LONG); + } + + public static final String COMPRESS_VF = VECTOR_PREFIX + "COMPRESS_VF" + POSTFIX; + static { + vectorNode(COMPRESS_VF, "CompressV", TYPE_FLOAT); + } + + public static final String COMPRESS_VD = VECTOR_PREFIX + "COMPRESS_VD" + POSTFIX; + static { + vectorNode(COMPRESS_VD, "CompressV", TYPE_DOUBLE); + } + public static final String EXPAND_VB = VECTOR_PREFIX + "EXPAND_VB" + POSTFIX; static { vectorNode(EXPAND_VB, "ExpandV", TYPE_BYTE); diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java b/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java index c05124edcd7..daa2b9765f8 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java @@ -106,6 +106,7 @@ public class IREncodingPrinter { "avx512_fp16", "avx512_vnni", "avx512_vbmi", + "avx512_vbmi2", "avx10_2", "bmi2", // AArch64 diff --git a/test/hotspot/jtreg/compiler/vectorapi/VectorCompressTest.java b/test/hotspot/jtreg/compiler/vectorapi/VectorCompressTest.java new file mode 100644 index 00000000000..7ab60885ad2 --- /dev/null +++ b/test/hotspot/jtreg/compiler/vectorapi/VectorCompressTest.java @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.vectorapi; + +import compiler.lib.generators.*; +import compiler.lib.ir_framework.*; +import jdk.incubator.vector.*; +import jdk.test.lib.Asserts; + +/** + * @test + * @bug 8366333 + * @key randomness + * @library /test/lib / + * @summary IR test for VectorAPI compress + * @modules jdk.incubator.vector + * + * @run driver compiler.vectorapi.VectorCompressTest + */ + +public class VectorCompressTest { + static final VectorSpecies B_SPECIES = ByteVector.SPECIES_MAX; + static final VectorSpecies S_SPECIES = ShortVector.SPECIES_MAX; + static final VectorSpecies I_SPECIES = IntVector.SPECIES_MAX; + static final VectorSpecies F_SPECIES = FloatVector.SPECIES_MAX; + static final VectorSpecies L_SPECIES = LongVector.SPECIES_MAX; + static final VectorSpecies D_SPECIES = DoubleVector.SPECIES_MAX; + static final int LENGTH = 512; + static final Generators RD = Generators.G; + static byte[] ba, bb; + static short[] sa, sb; + static int[] ia, ib; + static long[] la, lb; + static float[] fa, fb; + static double[] da, db; + static boolean[] ma; + + static { + ba = new byte[LENGTH]; + bb = new byte[LENGTH]; + sa = new short[LENGTH]; + sb = new short[LENGTH]; + ia = new int[LENGTH]; + ib = new int[LENGTH]; + la = new long[LENGTH]; + lb = new long[LENGTH]; + fa = new float[LENGTH]; + fb = new float[LENGTH]; + da = new double[LENGTH]; + db = new double[LENGTH]; + ma = new boolean[LENGTH]; + + Generator iGen = RD.ints(); + Generator lGen = RD.longs(); + Generator fGen = RD.floats(); + Generator dGen = RD.doubles(); + + for (int i = 0; i < LENGTH; i++) { + ba[i] = iGen.next().byteValue(); + sa[i] = iGen.next().shortValue(); + ma[i] = iGen.next() % 2 == 0; + } + RD.fill(iGen, ia); + RD.fill(lGen, la); + RD.fill(fGen, fa); + RD.fill(dGen, da); + } + + @DontInline + static void verifyVectorCompressByte(int vlen) { + int index = 0; + for (int i = 0; i < vlen; i++) { + if (ma[i]) { + Asserts.assertEquals(ba[i], bb[index++]); + } + } + for (int i = index; i < vlen; i++) { + Asserts.assertEquals((byte)0, bb[i]); + } + } + + @DontInline + static void verifyVectorCompressShort(int vlen) { + int index = 0; + for (int i = 0; i < vlen; i++) { + if (ma[i]) { + Asserts.assertEquals(sa[i], sb[index++]); + } + } + for (int i = index; i < vlen; i++) { + Asserts.assertEquals((short)0, sb[i]); + } + } + + @DontInline + static void verifyVectorCompressInteger(int vlen) { + int index = 0; + for (int i = 0; i < vlen; i++) { + if (ma[i]) { + Asserts.assertEquals(ia[i], ib[index++]); + } + } + for (int i = index; i < vlen; i++) { + Asserts.assertEquals(0, ib[i]); + } + } + + @DontInline + static void verifyVectorCompressLong(int vlen) { + int index = 0; + for (int i = 0; i < vlen; i++) { + if (ma[i]) { + Asserts.assertEquals(la[i], lb[index++]); + } + } + for (int i = index; i < vlen; i++) { + Asserts.assertEquals(0L, lb[i]); + } + } + + @DontInline + static void verifyVectorCompressFloat(int vlen) { + int index = 0; + for (int i = 0; i < vlen; i++) { + if (ma[i]) { + Asserts.assertEquals(fa[i], fb[index++]); + } + } + for (int i = index; i < vlen; i++) { + Asserts.assertEquals(0.0f, fb[i]); + } + } + + @DontInline + static void verifyVectorCompressDouble(int vlen) { + int index = 0; + for (int i = 0; i < vlen; i++) { + if (ma[i]) { + Asserts.assertEquals(da[i], db[index++]); + } + } + for (int i = index; i < vlen; i++) { + Asserts.assertEquals(0.0, db[i]); + } + } + + @Test + @IR(counts = { IRNode.COMPRESS_VB, "= 1" }, + applyIfCPUFeature = { "sve", "true" }) + @IR(counts = { IRNode.COMPRESS_VB, "= 1" }, + applyIfCPUFeatureAnd = {"avx512_vbmi2", "true", "avx512vl", "true"}) + public static void testVectorCompressByte() { + ByteVector av = ByteVector.fromArray(B_SPECIES, ba, 0); + VectorMask m = VectorMask.fromArray(B_SPECIES, ma, 0); + av.compress(m).intoArray(bb, 0); + verifyVectorCompressByte(B_SPECIES.length()); + } + + @Test + @IR(counts = { IRNode.COMPRESS_VS, "= 1" }, + applyIfCPUFeature = { "sve", "true" }) + @IR(counts = { IRNode.COMPRESS_VS, "= 1" }, + applyIfCPUFeatureAnd = {"avx512_vbmi2", "true", "avx512vl", "true"}) + public static void testVectorCompressShort() { + ShortVector av = ShortVector.fromArray(S_SPECIES, sa, 0); + VectorMask m = VectorMask.fromArray(S_SPECIES, ma, 0); + av.compress(m).intoArray(sb, 0); + verifyVectorCompressShort(S_SPECIES.length()); + } + + @Test + @IR(counts = { IRNode.COMPRESS_VI, "= 1" }, + applyIfCPUFeature = { "sve", "true" }) + @IR(counts = { IRNode.COMPRESS_VI, "= 1" }, + applyIfCPUFeatureAnd = {"avx512f", "true", "avx512vl", "true"}) + public static void testVectorCompressInt() { + IntVector av = IntVector.fromArray(I_SPECIES, ia, 0); + VectorMask m = VectorMask.fromArray(I_SPECIES, ma, 0); + av.compress(m).intoArray(ib, 0); + verifyVectorCompressInteger(I_SPECIES.length()); + } + + @Test + @IR(counts = { IRNode.COMPRESS_VL, "= 1" }, + applyIfCPUFeature = { "sve", "true" }) + @IR(counts = { IRNode.COMPRESS_VL, "= 1" }, + applyIfCPUFeatureAnd = {"avx512f", "true", "avx512vl", "true"}) + public static void testVectorCompressLong() { + LongVector av = LongVector.fromArray(L_SPECIES, la, 0); + VectorMask m = VectorMask.fromArray(L_SPECIES, ma, 0); + av.compress(m).intoArray(lb, 0); + verifyVectorCompressLong(L_SPECIES.length()); + } + + @Test + @IR(counts = { IRNode.COMPRESS_VF, "= 1" }, + applyIfCPUFeature = { "sve", "true" }) + @IR(counts = { IRNode.COMPRESS_VF, "= 1" }, + applyIfCPUFeatureAnd = {"avx512f", "true", "avx512vl", "true"}) + public static void testVectorCompressFloat() { + FloatVector av = FloatVector.fromArray(F_SPECIES, fa, 0); + VectorMask m = VectorMask.fromArray(F_SPECIES, ma, 0); + av.compress(m).intoArray(fb, 0); + verifyVectorCompressFloat(F_SPECIES.length()); + } + + @Test + @IR(counts = { IRNode.COMPRESS_VD, "= 1" }, + applyIfCPUFeature = { "sve", "true" }) + @IR(counts = { IRNode.COMPRESS_VD, "= 1" }, + applyIfCPUFeatureAnd = {"avx512f", "true", "avx512vl", "true"}) + public static void testVectorCompressDouble() { + DoubleVector av = DoubleVector.fromArray(D_SPECIES, da, 0); + VectorMask m = VectorMask.fromArray(D_SPECIES, ma, 0); + av.compress(m).intoArray(db, 0); + verifyVectorCompressDouble(D_SPECIES.length()); + } + + public static void main(String[] args) { + TestFramework testFramework = new TestFramework(); + testFramework.setDefaultWarmup(10000) + .addFlags("--add-modules=jdk.incubator.vector") + .start(); + } +}