8366333: AArch64: Enhance SVE subword type implementation of vector compress

Co-authored-by: Jatin Bhateja <jbhateja@openjdk.org>
Reviewed-by: jbhateja, xgong, galder, vlivanov
This commit is contained in:
erifan 2025-10-21 01:20:38 +00:00 committed by Xiaohong Gong
parent 0522cf2ed9
commit 2de8d58552
10 changed files with 408 additions and 115 deletions

View File

@ -7081,29 +7081,31 @@ instruct vcompress(vReg dst, vReg src, pRegGov pg) %{
%}
// Byte vector compress (CompressV, T_BYTE). SVE COMPACT is used on widened
// elements in the macro assembler (sve_compress_byte widens, compacts,
// narrows and splices the halves), so byte compress needs three vector
// temps plus two predicate temps.
instruct vcompressB(vReg dst, vReg src, pReg pg, vReg tmp1, vReg tmp2,
                    vReg tmp3, pReg ptmp, pRegGov pgtmp) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_BYTE);
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP ptmp, TEMP pgtmp);
  match(Set dst (CompressV src pg));
  format %{ "vcompressB $dst, $src, $pg\t# KILL $tmp1, $tmp2, $tmp3, $ptmp, $pgtmp" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    __ sve_compress_byte($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister,
                         $tmp1$$FloatRegister, $tmp2$$FloatRegister, $tmp3$$FloatRegister,
                         $ptmp$$PRegister, $pgtmp$$PRegister, length_in_bytes);
  %}
  ins_pipe(pipe_slow);
%}
// Short vector compress (CompressV, T_SHORT). tmp1 is pre-zeroed here and
// passed as the "vzr" (all-zero lanes) argument of sve_compress_short.
instruct vcompressS(vReg dst, vReg src, pReg pg, vReg tmp1, vReg tmp2, pRegGov pgtmp) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_SHORT);
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP pgtmp);
  match(Set dst (CompressV src pg));
  format %{ "vcompressS $dst, $src, $pg\t# KILL $tmp1, $tmp2, $pgtmp" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    __ sve_dup($tmp1$$FloatRegister, __ H, 0);
    __ sve_compress_short($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister,
                          $tmp1$$FloatRegister, $tmp2$$FloatRegister, $pgtmp$$PRegister,
                          length_in_bytes);
  %}
  ins_pipe(pipe_slow);
%}

View File

@ -5069,29 +5069,31 @@ instruct vcompress(vReg dst, vReg src, pRegGov pg) %{
%}
// Byte vector compress (CompressV, T_BYTE). SVE COMPACT is used on widened
// elements in the macro assembler (sve_compress_byte widens, compacts,
// narrows and splices the halves), so byte compress needs three vector
// temps plus two predicate temps.
instruct vcompressB(vReg dst, vReg src, pReg pg, vReg tmp1, vReg tmp2,
                    vReg tmp3, pReg ptmp, pRegGov pgtmp) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_BYTE);
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP ptmp, TEMP pgtmp);
  match(Set dst (CompressV src pg));
  format %{ "vcompressB $dst, $src, $pg\t# KILL $tmp1, $tmp2, $tmp3, $ptmp, $pgtmp" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    __ sve_compress_byte($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister,
                         $tmp1$$FloatRegister, $tmp2$$FloatRegister, $tmp3$$FloatRegister,
                         $ptmp$$PRegister, $pgtmp$$PRegister, length_in_bytes);
  %}
  ins_pipe(pipe_slow);
%}
// Short vector compress (CompressV, T_SHORT). tmp1 is pre-zeroed here and
// passed as the "vzr" (all-zero lanes) argument of sve_compress_short.
instruct vcompressS(vReg dst, vReg src, pReg pg, vReg tmp1, vReg tmp2, pRegGov pgtmp) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_SHORT);
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP pgtmp);
  match(Set dst (CompressV src pg));
  format %{ "vcompressS $dst, $src, $pg\t# KILL $tmp1, $tmp2, $pgtmp" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    __ sve_dup($tmp1$$FloatRegister, __ H, 0);
    __ sve_compress_short($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister,
                          $tmp1$$FloatRegister, $tmp2$$FloatRegister, $pgtmp$$PRegister,
                          length_in_bytes);
  %}
  ins_pipe(pipe_slow);
%}

View File

@ -3486,6 +3486,7 @@ public:
INSN(sve_smaxv, 0b00000100, 0b001000001); // signed maximum reduction to scalar
INSN(sve_smin, 0b00000100, 0b001010000); // signed minimum vectors
INSN(sve_sminv, 0b00000100, 0b001010001); // signed minimum reduction to scalar
INSN(sve_splice,0b00000101, 0b101100100); // splice two vectors under predicate control, destructive
INSN(sve_sub, 0b00000100, 0b000001000); // vector sub
INSN(sve_uaddv, 0b00000100, 0b000001001); // unsigned add reduction to scalar
INSN(sve_umax, 0b00000100, 0b001001000); // unsigned maximum vectors

View File

@ -2203,114 +2203,117 @@ void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t l
// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
// Preserves: mask, vzr
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
FloatRegister vtmp1, FloatRegister vtmp2,
PRegister pgtmp) {
FloatRegister vzr, FloatRegister vtmp,
PRegister pgtmp, unsigned vector_length_in_bytes) {
assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
assert_different_registers(dst, src, vtmp1, vtmp2);
// When called by sve_compress_byte, src and vtmp may be the same register.
assert_different_registers(dst, src, vzr);
assert_different_registers(dst, vtmp, vzr);
assert_different_registers(mask, pgtmp);
// Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111
// mask = 0001 0000 0000 0001 0001 0000 0001 0001
// Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111
sve_dup(vtmp2, H, 0);
// high <-- low
// Example input: src = hh gg ff ee dd cc bb aa, one character is 8 bits.
// mask = 01 00 00 01 01 00 01 01, one character is 1 bit.
// Expected result: dst = 00 00 00 hh ee dd bb aa
// Extend lowest half to type INT.
// dst = 00004444 00003333 00002222 00001111
// dst = 00dd 00cc 00bb 00aa
sve_uunpklo(dst, S, src);
// pgtmp = 00000001 00000000 00000001 00000001
// pgtmp = 0001 0000 0001 0001
sve_punpklo(pgtmp, mask);
// Pack the active elements in size of type INT to the right,
// and fill the remainings with zero.
// dst = 00000000 00004444 00002222 00001111
// dst = 0000 00dd 00bb 00aa
sve_compact(dst, S, dst, pgtmp);
// Narrow the result back to type SHORT.
// dst = 0000 0000 0000 0000 0000 4444 2222 1111
sve_uzp1(dst, H, dst, vtmp2);
// dst = 00 00 00 00 00 dd bb aa
sve_uzp1(dst, H, dst, vzr);
// Return if the vector length is no more than MaxVectorSize/2, since the
// highest half is invalid.
if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
return;
}
// Count the active elements of lowest half.
// rscratch1 = 3
sve_cntp(rscratch1, S, ptrue, pgtmp);
// Repeat to the highest half.
// pgtmp = 00000001 00000000 00000000 00000001
// pgtmp = 0001 0000 0000 0001
sve_punpkhi(pgtmp, mask);
// vtmp1 = 00008888 00007777 00006666 00005555
sve_uunpkhi(vtmp1, S, src);
// vtmp1 = 00000000 00000000 00008888 00005555
sve_compact(vtmp1, S, vtmp1, pgtmp);
// vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
sve_uzp1(vtmp1, H, vtmp1, vtmp2);
// vtmp = 00hh 00gg 00ff 00ee
sve_uunpkhi(vtmp, S, src);
// vtmp = 0000 0000 00hh 00ee
sve_compact(vtmp, S, vtmp, pgtmp);
// vtmp = 00 00 00 00 00 00 hh ee
sve_uzp1(vtmp, H, vtmp, vzr);
// Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111
// Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
// Left shift(cross lane) compressed high with TRUE_CNT lanes,
// TRUE_CNT is the number of active elements in the compressed low.
neg(rscratch1, rscratch1);
// vtmp2 = {4 3 2 1 0 -1 -2 -3}
sve_index(vtmp2, H, rscratch1, 1);
// vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
sve_tbl(vtmp1, H, vtmp1, vtmp2);
// Combine the compressed high(after shifted) with the compressed low.
// dst = 0000 0000 0000 8888 5555 4444 2222 1111
sve_orr(dst, dst, vtmp1);
// pgtmp = 00 00 00 00 00 01 01 01
sve_whilelt(pgtmp, H, zr, rscratch1);
// Compressed low: dst = 00 00 00 00 00 dd bb aa
// Compressed high: vtmp = 00 00 00 00 00 00 hh ee
// Combine the compressed low with the compressed high:
// dst = 00 00 00 hh ee dd bb aa
sve_splice(dst, H, pgtmp, vtmp);
}
// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
FloatRegister vtmp1, FloatRegister vtmp2,
FloatRegister vtmp3, FloatRegister vtmp4,
PRegister ptmp, PRegister pgtmp) {
FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
assert_different_registers(mask, ptmp, pgtmp);
// Example input: src = 88 77 66 55 44 33 22 11
// mask = 01 00 00 01 01 00 01 01
// Expected result: dst = 00 00 00 88 55 44 22 11
// high <-- low
// Example input: src = q p n m l k j i h g f e d c b a, one character is 8 bits.
// mask = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
// Expected result: dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
FloatRegister vzr = vtmp3;
sve_dup(vzr, B, 0);
sve_dup(vtmp4, B, 0);
// Extend lowest half to type SHORT.
// vtmp1 = 0044 0033 0022 0011
// vtmp1 = 0h 0g 0f 0e 0d 0c 0b 0a
sve_uunpklo(vtmp1, H, src);
// ptmp = 0001 0000 0001 0001
// ptmp = 00 01 00 00 00 01 00 01
sve_punpklo(ptmp, mask);
// Pack the active elements in size of type SHORT to the right,
// and fill the remainings with zero.
// dst = 00 00 00 00 00 0g 0c 0a
unsigned extended_size = vector_length_in_bytes << 1;
sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
// Narrow the result back to type BYTE.
// dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
sve_uzp1(dst, B, dst, vzr);
// Return if the vector length is no more than MaxVectorSize/2, since the
// highest half is invalid.
if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
return;
}
// Count the active elements of lowest half.
// rscratch2 = 3
sve_cntp(rscratch2, H, ptrue, ptmp);
// Pack the active elements in size of type SHORT to the right,
// and fill the remainings with zero.
// dst = 0000 0044 0022 0011
sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
// Narrow the result back to type BYTE.
// dst = 00 00 00 00 00 44 22 11
sve_uzp1(dst, B, dst, vtmp4);
// Repeat to the highest half.
// ptmp = 0001 0000 0000 0001
// ptmp = 00 01 00 00 00 00 00 01
sve_punpkhi(ptmp, mask);
// vtmp1 = 0088 0077 0066 0055
// vtmp2 = 0q 0p 0n 0m 0l 0k 0j 0i
sve_uunpkhi(vtmp2, H, src);
// vtmp1 = 0000 0000 0088 0055
sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
// vtmp1 = 00 00 00 00 00 00 0p 0i
sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
// vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
sve_uzp1(vtmp1, B, vtmp1, vzr);
sve_dup(vtmp4, B, 0);
// vtmp1 = 00 00 00 00 00 00 88 55
sve_uzp1(vtmp1, B, vtmp1, vtmp4);
// Compressed low: dst = 00 00 00 00 00 44 22 11
// Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
// Left shift(cross lane) compressed high with TRUE_CNT lanes,
// TRUE_CNT is the number of active elements in the compressed low.
neg(rscratch2, rscratch2);
// vtmp2 = {4 3 2 1 0 -1 -2 -3}
sve_index(vtmp2, B, rscratch2, 1);
// vtmp1 = 00 00 00 88 55 00 00 00
sve_tbl(vtmp1, B, vtmp1, vtmp2);
// Combine the compressed high(after shifted) with the compressed low.
// dst = 00 00 00 88 55 44 22 11
sve_orr(dst, dst, vtmp1);
// ptmp = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
sve_whilelt(ptmp, B, zr, rscratch2);
// Compressed low: dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
// Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
// Combine the compressed low with the compressed high:
// dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
sve_splice(dst, B, ptmp, vtmp1);
}
void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {

View File

@ -173,13 +173,12 @@
// lowest-numbered elements of dst. Any remaining elements of dst will
// be filled with zero.
void sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
FloatRegister vtmp1, FloatRegister vtmp2,
FloatRegister vtmp3, FloatRegister vtmp4,
PRegister ptmp, PRegister pgtmp);
FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes);
void sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
FloatRegister vtmp1, FloatRegister vtmp2,
PRegister pgtmp);
FloatRegister vzr, FloatRegister vtmp,
PRegister pgtmp, unsigned vector_length_in_bytes);
void neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ);

View File

@ -2143,6 +2143,10 @@ generate(SpecialCases, [["ccmn", "__ ccmn(zr, zr, 3u, Assembler::LE);",
["facge", "__ sve_fac(Assembler::GE, p1, __ H, p2, z4, z5);", "facge\tp1.h, p2/z, z4.h, z5.h"],
["facge", "__ sve_fac(Assembler::GE, p1, __ S, p2, z4, z5);", "facge\tp1.s, p2/z, z4.s, z5.s"],
["facge", "__ sve_fac(Assembler::GE, p1, __ D, p2, z4, z5);", "facge\tp1.d, p2/z, z4.d, z5.d"],
["splice", "__ sve_splice(z0, __ B, p0, z1);", "splice\tz0.b, p0, z0.b, z1.b"],
["splice", "__ sve_splice(z0, __ H, p0, z1);", "splice\tz0.h, p0, z0.h, z1.h"],
["splice", "__ sve_splice(z0, __ S, p0, z1);", "splice\tz0.s, p0, z0.s, z1.s"],
["splice", "__ sve_splice(z0, __ D, p0, z1);", "splice\tz0.d, p0, z0.d, z1.d"],
# SVE2 instructions
["histcnt", "__ sve_histcnt(z16, __ S, p0, z16, z16);", "histcnt\tz16.s, p0/z, z16.s, z16.s"],
["histcnt", "__ sve_histcnt(z17, __ D, p0, z17, z17);", "histcnt\tz17.d, p0/z, z17.d, z17.d"],

View File

@ -1156,6 +1156,10 @@
__ sve_fac(Assembler::GE, p1, __ H, p2, z4, z5); // facge p1.h, p2/z, z4.h, z5.h
__ sve_fac(Assembler::GE, p1, __ S, p2, z4, z5); // facge p1.s, p2/z, z4.s, z5.s
__ sve_fac(Assembler::GE, p1, __ D, p2, z4, z5); // facge p1.d, p2/z, z4.d, z5.d
__ sve_splice(z0, __ B, p0, z1); // splice z0.b, p0, z0.b, z1.b
__ sve_splice(z0, __ H, p0, z1); // splice z0.h, p0, z0.h, z1.h
__ sve_splice(z0, __ S, p0, z1); // splice z0.s, p0, z0.s, z1.s
__ sve_splice(z0, __ D, p0, z1); // splice z0.d, p0, z0.d, z1.d
__ sve_histcnt(z16, __ S, p0, z16, z16); // histcnt z16.s, p0/z, z16.s, z16.s
__ sve_histcnt(z17, __ D, p0, z17, z17); // histcnt z17.d, p0/z, z17.d, z17.d
@ -1445,30 +1449,30 @@
0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061,
0x120cb166, 0x321764bc, 0x52174681, 0x720c0227,
0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01,
0x14000000, 0x17ffffd7, 0x140004b7, 0x94000000,
0x97ffffd4, 0x940004b4, 0x3400000a, 0x34fffa2a,
0x3400962a, 0x35000008, 0x35fff9c8, 0x350095c8,
0xb400000b, 0xb4fff96b, 0xb400956b, 0xb500001d,
0xb5fff91d, 0xb500951d, 0x10000013, 0x10fff8b3,
0x100094b3, 0x90000013, 0x36300016, 0x3637f836,
0x36309436, 0x3758000c, 0x375ff7cc, 0x375893cc,
0x14000000, 0x17ffffd7, 0x140004bb, 0x94000000,
0x97ffffd4, 0x940004b8, 0x3400000a, 0x34fffa2a,
0x340096aa, 0x35000008, 0x35fff9c8, 0x35009648,
0xb400000b, 0xb4fff96b, 0xb40095eb, 0xb500001d,
0xb5fff91d, 0xb500959d, 0x10000013, 0x10fff8b3,
0x10009533, 0x90000013, 0x36300016, 0x3637f836,
0x363094b6, 0x3758000c, 0x375ff7cc, 0x3758944c,
0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc,
0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f,
0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016,
0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0,
0x540091a0, 0x54000001, 0x54fff541, 0x54009141,
0x54000002, 0x54fff4e2, 0x540090e2, 0x54000002,
0x54fff482, 0x54009082, 0x54000003, 0x54fff423,
0x54009023, 0x54000003, 0x54fff3c3, 0x54008fc3,
0x54000004, 0x54fff364, 0x54008f64, 0x54000005,
0x54fff305, 0x54008f05, 0x54000006, 0x54fff2a6,
0x54008ea6, 0x54000007, 0x54fff247, 0x54008e47,
0x54000008, 0x54fff1e8, 0x54008de8, 0x54000009,
0x54fff189, 0x54008d89, 0x5400000a, 0x54fff12a,
0x54008d2a, 0x5400000b, 0x54fff0cb, 0x54008ccb,
0x5400000c, 0x54fff06c, 0x54008c6c, 0x5400000d,
0x54fff00d, 0x54008c0d, 0x5400000e, 0x54ffefae,
0x54008bae, 0x5400000f, 0x54ffef4f, 0x54008b4f,
0x54009220, 0x54000001, 0x54fff541, 0x540091c1,
0x54000002, 0x54fff4e2, 0x54009162, 0x54000002,
0x54fff482, 0x54009102, 0x54000003, 0x54fff423,
0x540090a3, 0x54000003, 0x54fff3c3, 0x54009043,
0x54000004, 0x54fff364, 0x54008fe4, 0x54000005,
0x54fff305, 0x54008f85, 0x54000006, 0x54fff2a6,
0x54008f26, 0x54000007, 0x54fff247, 0x54008ec7,
0x54000008, 0x54fff1e8, 0x54008e68, 0x54000009,
0x54fff189, 0x54008e09, 0x5400000a, 0x54fff12a,
0x54008daa, 0x5400000b, 0x54fff0cb, 0x54008d4b,
0x5400000c, 0x54fff06c, 0x54008cec, 0x5400000d,
0x54fff00d, 0x54008c8d, 0x5400000e, 0x54ffefae,
0x54008c2e, 0x5400000f, 0x54ffef4f, 0x54008bcf,
0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60,
0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f,
0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf,
@ -1689,7 +1693,8 @@
0x05a14c00, 0x05e14c00, 0x05304001, 0x05314001,
0x05a18610, 0x05e18610, 0x0420bc31, 0x05271e11,
0x6545e891, 0x6585e891, 0x65c5e891, 0x6545c891,
0x6585c891, 0x65c5c891, 0x45b0c210, 0x45f1c231,
0x6585c891, 0x65c5c891, 0x052c8020, 0x056c8020,
0x05ac8020, 0x05ec8020, 0x45b0c210, 0x45f1c231,
0x1e601000, 0x1e603000, 0x1e621000, 0x1e623000,
0x1e641000, 0x1e643000, 0x1e661000, 0x1e663000,
0x1e681000, 0x1e683000, 0x1e6a1000, 0x1e6a3000,

View File

@ -2840,6 +2840,36 @@ public class IRNode {
vectorNode(EXPAND_BITS_VL, "ExpandBitsV", TYPE_LONG);
}
// IR node name constants for the vector "CompressV" node, one per element
// type. Each is registered via vectorNode(...) so @IR rules can match
// compress operations of a specific element type in C2's ideal graph.
public static final String COMPRESS_VB = VECTOR_PREFIX + "COMPRESS_VB" + POSTFIX;
static {
    vectorNode(COMPRESS_VB, "CompressV", TYPE_BYTE);
}

public static final String COMPRESS_VS = VECTOR_PREFIX + "COMPRESS_VS" + POSTFIX;
static {
    vectorNode(COMPRESS_VS, "CompressV", TYPE_SHORT);
}

public static final String COMPRESS_VI = VECTOR_PREFIX + "COMPRESS_VI" + POSTFIX;
static {
    vectorNode(COMPRESS_VI, "CompressV", TYPE_INT);
}

public static final String COMPRESS_VL = VECTOR_PREFIX + "COMPRESS_VL" + POSTFIX;
static {
    vectorNode(COMPRESS_VL, "CompressV", TYPE_LONG);
}

public static final String COMPRESS_VF = VECTOR_PREFIX + "COMPRESS_VF" + POSTFIX;
static {
    vectorNode(COMPRESS_VF, "CompressV", TYPE_FLOAT);
}

public static final String COMPRESS_VD = VECTOR_PREFIX + "COMPRESS_VD" + POSTFIX;
static {
    vectorNode(COMPRESS_VD, "CompressV", TYPE_DOUBLE);
}
public static final String EXPAND_VB = VECTOR_PREFIX + "EXPAND_VB" + POSTFIX;
static {
vectorNode(EXPAND_VB, "ExpandV", TYPE_BYTE);

View File

@ -106,6 +106,7 @@ public class IREncodingPrinter {
"avx512_fp16",
"avx512_vnni",
"avx512_vbmi",
"avx512_vbmi2",
"avx10_2",
"bmi2",
// AArch64

View File

@ -0,0 +1,246 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package compiler.vectorapi;
import compiler.lib.generators.*;
import compiler.lib.ir_framework.*;
import jdk.incubator.vector.*;
import jdk.test.lib.Asserts;
/**
* @test
* @bug 8366333
* @key randomness
* @library /test/lib /
* @summary IR test for VectorAPI compress
* @modules jdk.incubator.vector
*
* @run driver compiler.vectorapi.VectorCompressTest
*/
// IR and correctness test for the Vector API compress() operation.
// Each @Test method compresses one MAX-species vector under a shared
// boolean mask; the matching @DontInline verifier checks that the active
// lanes were packed to the lowest indices and the remaining lanes are zero.
// @IR rules assert that exactly one CompressV node of the right element
// type is generated on SVE and on the relevant AVX-512 feature sets.
public class VectorCompressTest {
    // MAX-length species so the test exercises the platform's full
    // vector width (full-width SVE / AVX-512 code paths).
    static final VectorSpecies<Byte> B_SPECIES = ByteVector.SPECIES_MAX;
    static final VectorSpecies<Short> S_SPECIES = ShortVector.SPECIES_MAX;
    static final VectorSpecies<Integer> I_SPECIES = IntVector.SPECIES_MAX;
    static final VectorSpecies<Float> F_SPECIES = FloatVector.SPECIES_MAX;
    static final VectorSpecies<Long> L_SPECIES = LongVector.SPECIES_MAX;
    static final VectorSpecies<Double> D_SPECIES = DoubleVector.SPECIES_MAX;

    // Backing-array length; large enough for any species length.
    static final int LENGTH = 512;
    static final Generators RD = Generators.G;

    // Per element type: xa = random input data, xb = compressed output.
    static byte[] ba, bb;
    static short[] sa, sb;
    static int[] ia, ib;
    static long[] la, lb;
    static float[] fa, fb;
    static double[] da, db;
    // Mask lanes shared by all element types.
    static boolean[] ma;

    static {
        ba = new byte[LENGTH];
        bb = new byte[LENGTH];
        sa = new short[LENGTH];
        sb = new short[LENGTH];
        ia = new int[LENGTH];
        ib = new int[LENGTH];
        la = new long[LENGTH];
        lb = new long[LENGTH];
        fa = new float[LENGTH];
        fb = new float[LENGTH];
        da = new double[LENGTH];
        db = new double[LENGTH];
        ma = new boolean[LENGTH];

        Generator<Integer> iGen = RD.ints();
        Generator<Long> lGen = RD.longs();
        Generator<Float> fGen = RD.floats();
        Generator<Double> dGen = RD.doubles();
        // byte/short inputs and the mask are derived from the int generator;
        // the mask is true for (roughly half of) the generated even values.
        for (int i = 0; i < LENGTH; i++) {
            ba[i] = iGen.next().byteValue();
            sa[i] = iGen.next().shortValue();
            ma[i] = iGen.next() % 2 == 0;
        }
        RD.fill(iGen, ia);
        RD.fill(lGen, la);
        RD.fill(fGen, fa);
        RD.fill(dGen, da);
    }

    // Checks bb[0..vlen): active lanes of ba packed left, rest zero.
    @DontInline
    static void verifyVectorCompressByte(int vlen) {
        int index = 0;
        for (int i = 0; i < vlen; i++) {
            if (ma[i]) {
                Asserts.assertEquals(ba[i], bb[index++]);
            }
        }
        for (int i = index; i < vlen; i++) {
            Asserts.assertEquals((byte)0, bb[i]);
        }
    }

    // Checks sb[0..vlen): active lanes of sa packed left, rest zero.
    @DontInline
    static void verifyVectorCompressShort(int vlen) {
        int index = 0;
        for (int i = 0; i < vlen; i++) {
            if (ma[i]) {
                Asserts.assertEquals(sa[i], sb[index++]);
            }
        }
        for (int i = index; i < vlen; i++) {
            Asserts.assertEquals((short)0, sb[i]);
        }
    }

    // Checks ib[0..vlen): active lanes of ia packed left, rest zero.
    @DontInline
    static void verifyVectorCompressInteger(int vlen) {
        int index = 0;
        for (int i = 0; i < vlen; i++) {
            if (ma[i]) {
                Asserts.assertEquals(ia[i], ib[index++]);
            }
        }
        for (int i = index; i < vlen; i++) {
            Asserts.assertEquals(0, ib[i]);
        }
    }

    // Checks lb[0..vlen): active lanes of la packed left, rest zero.
    @DontInline
    static void verifyVectorCompressLong(int vlen) {
        int index = 0;
        for (int i = 0; i < vlen; i++) {
            if (ma[i]) {
                Asserts.assertEquals(la[i], lb[index++]);
            }
        }
        for (int i = index; i < vlen; i++) {
            Asserts.assertEquals(0L, lb[i]);
        }
    }

    // Checks fb[0..vlen): active lanes of fa packed left, rest zero.
    @DontInline
    static void verifyVectorCompressFloat(int vlen) {
        int index = 0;
        for (int i = 0; i < vlen; i++) {
            if (ma[i]) {
                Asserts.assertEquals(fa[i], fb[index++]);
            }
        }
        for (int i = index; i < vlen; i++) {
            Asserts.assertEquals(0.0f, fb[i]);
        }
    }

    // Checks db[0..vlen): active lanes of da packed left, rest zero.
    @DontInline
    static void verifyVectorCompressDouble(int vlen) {
        int index = 0;
        for (int i = 0; i < vlen; i++) {
            if (ma[i]) {
                Asserts.assertEquals(da[i], db[index++]);
            }
        }
        for (int i = index; i < vlen; i++) {
            Asserts.assertEquals(0.0, db[i]);
        }
    }

    // Byte compress: on x86 this needs AVX-512 VBMI2 (byte/short compress).
    @Test
    @IR(counts = { IRNode.COMPRESS_VB, "= 1" },
        applyIfCPUFeature = { "sve", "true" })
    @IR(counts = { IRNode.COMPRESS_VB, "= 1" },
        applyIfCPUFeatureAnd = {"avx512_vbmi2", "true", "avx512vl", "true"})
    public static void testVectorCompressByte() {
        ByteVector av = ByteVector.fromArray(B_SPECIES, ba, 0);
        VectorMask<Byte> m = VectorMask.fromArray(B_SPECIES, ma, 0);
        av.compress(m).intoArray(bb, 0);
        verifyVectorCompressByte(B_SPECIES.length());
    }

    // Short compress: on x86 this needs AVX-512 VBMI2 (byte/short compress).
    @Test
    @IR(counts = { IRNode.COMPRESS_VS, "= 1" },
        applyIfCPUFeature = { "sve", "true" })
    @IR(counts = { IRNode.COMPRESS_VS, "= 1" },
        applyIfCPUFeatureAnd = {"avx512_vbmi2", "true", "avx512vl", "true"})
    public static void testVectorCompressShort() {
        ShortVector av = ShortVector.fromArray(S_SPECIES, sa, 0);
        VectorMask<Short> m = VectorMask.fromArray(S_SPECIES, ma, 0);
        av.compress(m).intoArray(sb, 0);
        verifyVectorCompressShort(S_SPECIES.length());
    }

    @Test
    @IR(counts = { IRNode.COMPRESS_VI, "= 1" },
        applyIfCPUFeature = { "sve", "true" })
    @IR(counts = { IRNode.COMPRESS_VI, "= 1" },
        applyIfCPUFeatureAnd = {"avx512f", "true", "avx512vl", "true"})
    public static void testVectorCompressInt() {
        IntVector av = IntVector.fromArray(I_SPECIES, ia, 0);
        VectorMask<Integer> m = VectorMask.fromArray(I_SPECIES, ma, 0);
        av.compress(m).intoArray(ib, 0);
        verifyVectorCompressInteger(I_SPECIES.length());
    }

    @Test
    @IR(counts = { IRNode.COMPRESS_VL, "= 1" },
        applyIfCPUFeature = { "sve", "true" })
    @IR(counts = { IRNode.COMPRESS_VL, "= 1" },
        applyIfCPUFeatureAnd = {"avx512f", "true", "avx512vl", "true"})
    public static void testVectorCompressLong() {
        LongVector av = LongVector.fromArray(L_SPECIES, la, 0);
        VectorMask<Long> m = VectorMask.fromArray(L_SPECIES, ma, 0);
        av.compress(m).intoArray(lb, 0);
        verifyVectorCompressLong(L_SPECIES.length());
    }

    @Test
    @IR(counts = { IRNode.COMPRESS_VF, "= 1" },
        applyIfCPUFeature = { "sve", "true" })
    @IR(counts = { IRNode.COMPRESS_VF, "= 1" },
        applyIfCPUFeatureAnd = {"avx512f", "true", "avx512vl", "true"})
    public static void testVectorCompressFloat() {
        FloatVector av = FloatVector.fromArray(F_SPECIES, fa, 0);
        VectorMask<Float> m = VectorMask.fromArray(F_SPECIES, ma, 0);
        av.compress(m).intoArray(fb, 0);
        verifyVectorCompressFloat(F_SPECIES.length());
    }

    @Test
    @IR(counts = { IRNode.COMPRESS_VD, "= 1" },
        applyIfCPUFeature = { "sve", "true" })
    @IR(counts = { IRNode.COMPRESS_VD, "= 1" },
        applyIfCPUFeatureAnd = {"avx512f", "true", "avx512vl", "true"})
    public static void testVectorCompressDouble() {
        DoubleVector av = DoubleVector.fromArray(D_SPECIES, da, 0);
        VectorMask<Double> m = VectorMask.fromArray(D_SPECIES, ma, 0);
        av.compress(m).intoArray(db, 0);
        verifyVectorCompressDouble(D_SPECIES.length());
    }

    public static void main(String[] args) {
        // Large warmup so C2 compiles the @Test methods and the @IR rules
        // can be checked against the generated ideal graph.
        TestFramework testFramework = new TestFramework();
        testFramework.setDefaultWarmup(10000)
                     .addFlags("--add-modules=jdk.incubator.vector")
                     .start();
    }
}