From 5fbce068bd782cafa005133fcdce24c3be67f2df Mon Sep 17 00:00:00 2001 From: Eric Fang Date: Wed, 17 Jun 2026 06:30:18 +0000 Subject: [PATCH] 8382052: VectorAPI: Optimize the lanewise BITWISE_BLEND for AArch64 Reviewed-by: xgong, epeter, aph --- src/hotspot/cpu/aarch64/aarch64_vector.ad | 33 +++ src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 | 33 +++ src/hotspot/cpu/aarch64/assembler_aarch64.hpp | 13 +- .../cpu/aarch64/macroAssembler_aarch64.hpp | 18 +- src/hotspot/share/opto/classes.hpp | 1 + src/hotspot/share/opto/matcher.cpp | 3 +- src/hotspot/share/opto/vectornode.cpp | 70 ++++++ src/hotspot/share/opto/vectornode.hpp | 19 ++ test/hotspot/gtest/aarch64/aarch64-asmtest.py | 5 +- test/hotspot/gtest/aarch64/asmtest.out.h | 115 +++++----- .../compiler/lib/ir_framework/IRNode.java | 6 + .../vectorapi/VectorBitwiseBlendTest.java | 217 ++++++++++++++++++ .../jdk/incubator/vector/MaskedLogicOpts.java | 19 +- 13 files changed, 477 insertions(+), 75 deletions(-) create mode 100644 test/hotspot/jtreg/compiler/vectorapi/VectorBitwiseBlendTest.java diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index b9899995531..5ba549df480 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -317,6 +317,13 @@ source %{ return false; // NEON only, since SLI/USHR are not available in SVE } break; + case Op_VectorBitwiseBlend: + // Use NEON BSL when UseSVE < 2; SVE1 has no BSL so larger vectors are + // not supported on UseSVE == 1 machines. + if (UseSVE < 2 && length_in_bytes > 16) { + return false; + } + break; default: break; } @@ -340,6 +347,7 @@ source %{ case Op_MulReductionVL: case Op_CompressBitsV: case Op_ExpandBitsV: + case Op_VectorBitwiseBlend: return false; case Op_SaturatingAddV: case Op_SaturatingSubV: @@ -7051,6 +7059,31 @@ instruct vblend_sve(vReg dst, vReg src1, vReg src2, pReg pg) %{ ins_pipe(pipe_slow); %} +// ------------------------------ Vector bitwise blend ------------------------- + +instruct vbitwise_blend_neon_sve1(vReg src1, vReg src2, vReg dst_src3) %{ + predicate(UseSVE < 2 && + VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst_src3 (VectorBitwiseBlend (Binary src1 src2) dst_src3)); + format %{ "vbitwise_blend_neon_sve1 $src1, $src2, $dst_src3" %} + ins_encode %{ + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + Assembler::SIMD_Arrangement T = length_in_bytes == 16 ? __ T16B : __ T8B; + __ bsl($dst_src3$$FloatRegister, T, $src2$$FloatRegister, $src1$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + +instruct vbitwise_blend_sve2(vReg src1, vReg dst_src2, vReg src3) %{ + predicate(UseSVE == 2); + match(Set dst_src2 (VectorBitwiseBlend (Binary src1 dst_src2) src3)); + format %{ "vbitwise_blend_sve2 $src1, $dst_src2, $src3" %} + ins_encode %{ + __ sve_bsl($dst_src2$$FloatRegister, $src1$$FloatRegister, $src3$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + // ------------------------------ Vector round --------------------------------- // vector Math.round diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index febd084d78a..68c407bc9af 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -307,6 +307,13 @@ source %{ return false; // NEON only, since SLI/USHR are not available in SVE } break; + case Op_VectorBitwiseBlend: + // Use NEON BSL when UseSVE < 2; SVE1 has no BSL so larger vectors are + // not supported on UseSVE == 1 machines. + if (UseSVE < 2 && length_in_bytes > 16) { + return false; + } + break; default: break; } @@ -330,6 +337,7 @@ source %{ case Op_MulReductionVL: case Op_CompressBitsV: case Op_ExpandBitsV: + case Op_VectorBitwiseBlend: return false; case Op_SaturatingAddV: case Op_SaturatingSubV: @@ -4754,6 +4762,31 @@ instruct vblend_sve(vReg dst, vReg src1, vReg src2, pReg pg) %{ ins_pipe(pipe_slow); %} +// ------------------------------ Vector bitwise blend ------------------------- + +instruct vbitwise_blend_neon_sve1(vReg src1, vReg src2, vReg dst_src3) %{ + predicate(UseSVE < 2 && + VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst_src3 (VectorBitwiseBlend (Binary src1 src2) dst_src3)); + format %{ "vbitwise_blend_neon_sve1 $src1, $src2, $dst_src3" %} + ins_encode %{ + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + Assembler::SIMD_Arrangement T = length_in_bytes == 16 ? __ T16B : __ T8B; + __ bsl($dst_src3$$FloatRegister, T, $src2$$FloatRegister, $src1$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + +instruct vbitwise_blend_sve2(vReg src1, vReg dst_src2, vReg src3) %{ + predicate(UseSVE == 2); + match(Set dst_src2 (VectorBitwiseBlend (Binary src1 dst_src2) src3)); + format %{ "vbitwise_blend_sve2 $src1, $dst_src2, $src3" %} + ins_encode %{ + __ sve_bsl($dst_src2$$FloatRegister, $src1$$FloatRegister, $src3$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + // ------------------------------ Vector round --------------------------------- // vector Math.round diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp index c8d5ee2eaeb..4eb2f6010c0 100644 --- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp @@ -4292,14 +4292,15 @@ public: #undef INSN // SVE2 bitwise ternary operations -#define INSN(NAME, opc) \ - void NAME(FloatRegister Zdn, FloatRegister Zm, FloatRegister Zk) { \ - starti; \ - f(0b00000100, 31, 24), f(opc, 23, 21), rf(Zm, 16); \ - f(0b001110, 15, 10), rf(Zk, 5), rf(Zdn, 0); \ +#define INSN(NAME, op1, op2) \ + void NAME(FloatRegister Zdn, FloatRegister Zm, FloatRegister Zk) { \ + starti; \ + f(0b00000100, 31, 24), f(op1, 23, 21), rf(Zm, 16); \ + f(0b00111, 15, 11), f(op2, 10), rf(Zk, 5), rf(Zdn, 0); \ } - INSN(sve_eor3, 0b001); // Bitwise exclusive OR of three vectors + INSN(sve_eor3, 0b001, 0b0); // Bitwise exclusive OR of three vectors + INSN(sve_bsl, 0b001, 0b1); // Bitwise select #undef INSN // SVE2 saturating operations - predicate diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp index a15b0630610..8f1e662765e 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp @@ -1843,13 +1843,19 @@ public: #undef SVE_DESTRUCTIVE_TERNARY_INS - using Assembler::sve_eor3; - void sve_eor3(FloatRegister Zd, FloatRegister Zm, FloatRegister Zk) { - if (Zd != Zm && Zd != Zk) { - try_to_replace_prev_vector_copy_with_movprfx(Zd); - } - Assembler::sve_eor3(Zd, Zm, Zk); +#define SVE_DESTRUCTIVE_TERNARY_UNPRED_INS(NAME) \ + using Assembler::NAME; \ + void NAME(FloatRegister Zd, FloatRegister Zm, FloatRegister Zk) { \ + if (Zd != Zm && Zd != Zk) { \ + try_to_replace_prev_vector_copy_with_movprfx(Zd); \ + } \ + Assembler::NAME(Zd, Zm, Zk); \ } + + SVE_DESTRUCTIVE_TERNARY_UNPRED_INS(sve_bsl); + SVE_DESTRUCTIVE_TERNARY_UNPRED_INS(sve_eor3); + +#undef SVE_DESTRUCTIVE_TERNARY_UNPRED_INS }; #ifdef ASSERT diff --git a/src/hotspot/share/opto/classes.hpp b/src/hotspot/share/opto/classes.hpp index 4ac5c31789f..7033dad211c 100644 --- a/src/hotspot/share/opto/classes.hpp +++ b/src/hotspot/share/opto/classes.hpp @@ -512,6 +512,7 @@ macro(VectorMaskWrapper) macro(VectorMaskCmp) macro(VectorMaskCast) macro(VectorTest) +macro(VectorBitwiseBlend) macro(VectorBlend) macro(VectorRearrange) macro(VectorLoadMask) diff --git a/src/hotspot/share/opto/matcher.cpp b/src/hotspot/share/opto/matcher.cpp index d2a9250b3ee..57fa2de05df 100644 --- a/src/hotspot/share/opto/matcher.cpp +++ b/src/hotspot/share/opto/matcher.cpp @@ -2385,7 +2385,8 @@ void Matcher::find_shared_post_visit(Node* n, uint opcode) { break; } case Op_VectorBlend: - case Op_VectorInsert: { + case Op_VectorInsert: + case Op_VectorBitwiseBlend: { Node* pair = new BinaryNode(n->in(1), n->in(2)); n->set_req(1, pair); n->set_req(2, n->in(3)); diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 61a19d5837b..ebacde2f5d6 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -874,6 +874,7 @@ VectorNode* VectorNode::make(int vopc, Node* n1, Node* n2, Node* n3, const TypeV case Op_SignumVD: return new SignumVDNode(n1, n2, n3, vt); case Op_SignumVF: return new SignumVFNode(n1, n2, n3, vt); case Op_VectorBlend: return new VectorBlendNode(n1, n2, n3); + case Op_VectorBitwiseBlend: return new VectorBitwiseBlendNode(n1, n2, n3, vt); default: fatal("Missed vector creation for '%s'", NodeClassNames[vopc]); return nullptr; @@ -2768,6 +2769,70 @@ Node* XorVNode::Ideal_XorV_VectorMaskCmp(PhaseGVN* phase, bool can_reshape) { return res; } +// XorV(a, AndV(sel, XorV(a, b))) => VectorBitwiseBlend(a, b, sel) +// XorV(a, AndV(sel, XorV(a, b)), mask) => +// VectorBlend(a, VectorBitwiseBlend(a, b, sel), mask) +Node* XorVNode::Ideal_XorV_to_VectorBitwiseBlend(PhaseGVN* phase, bool can_reshape) { + const TypeVect* vt = vect_type(); + BasicType bt = vt->element_basic_type(); + uint vlen = vt->length(); + if (!Matcher::match_rule_supported_vector(Op_VectorBitwiseBlend, vlen, bt)) { + return nullptr; + } + + bool is_masked = is_predicated_vector(); + if (is_masked && + !Matcher::match_rule_supported_vector(Op_VectorBlend, vlen, bt)) { + return nullptr; + } + + // For the predicated case in(1) is fixed as the merge source. Otherwise the + // outer XorV is commutative. + Node* a = nullptr; + Node* andv = nullptr; + if (is_masked || in(2)->Opcode() == Op_AndV) { + andv = in(2); + a = in(1); + } else { + andv = in(1); + a = in(2); + } + if (andv->Opcode() != Op_AndV || andv->is_predicated_vector()) { + return nullptr; + } + + Node* sel = nullptr; + Node* inner_xor = nullptr; + if (andv->in(2)->Opcode() == Op_XorV) { + inner_xor = andv->in(2); + sel = andv->in(1); + } else if (andv->in(1)->Opcode() == Op_XorV) { + inner_xor = andv->in(1); + sel = andv->in(2); + } else { + return nullptr; + } + if (inner_xor->is_predicated_vector()) { + return nullptr; + } + + Node* b = nullptr; + if (inner_xor->in(1) == a) { + b = inner_xor->in(2); + } else if (inner_xor->in(2) == a) { + b = inner_xor->in(1); + } else { + return nullptr; + } + + Node* blend = new VectorBitwiseBlendNode(a, b, sel, vt); + if (!is_masked) { + return blend; + } + blend = phase->transform(blend); + return new VectorBlendNode(a, blend, in(3)); +} + Node* XorVNode::Ideal(PhaseGVN* phase, bool can_reshape) { // (XorV src src) => (Replicate zero) // (XorVMask src src) => (MaskAll zero) @@ -2786,6 +2851,11 @@ Node* XorVNode::Ideal(PhaseGVN* phase, bool can_reshape) { if (res != nullptr) { return res; } + + res = Ideal_XorV_to_VectorBitwiseBlend(phase, can_reshape); + if (res != nullptr) { + return res; + } return VectorNode::Ideal(phase, can_reshape); } diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index de077015bca..73181bce256 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -1075,6 +1075,7 @@ class XorVNode : public VectorNode { virtual int Opcode() const; virtual Node* Ideal(PhaseGVN* phase, bool can_reshape); Node* Ideal_XorV_VectorMaskCmp(PhaseGVN* phase, bool can_reshape); + Node* Ideal_XorV_to_VectorBitwiseBlend(PhaseGVN* phase, bool can_reshape); }; // Vector xor byte, short, int, long as a reduction @@ -1802,6 +1803,24 @@ class VectorBlendNode : public VectorNode { Node* vec_mask() const { return in(3); } }; +// Vector bitwise blend (bit-select): (sel & vec_true) | (~sel & vec_false). +class VectorBitwiseBlendNode : public VectorNode { + public: + VectorBitwiseBlendNode(Node* vec_false, Node* vec_true, Node* sel, const TypeVect* vt) + : VectorNode(vec_false, vec_true, sel, vt) { + assert(vec_false->bottom_type()->isa_vect() != nullptr && + vec_true->bottom_type()->isa_vect() != nullptr && + sel->bottom_type()->isa_vect() != nullptr, + "inputs must all be vectors"); + uint vlen = vt->length(); + assert(vec_false->bottom_type()->is_vect()->length() == vlen && + vec_true->bottom_type()->is_vect()->length() == vlen && + sel->bottom_type()->is_vect()->length() == vlen, + "mismatched vector length"); + } + virtual int Opcode() const; +}; + // Rearrange lane elements from a source vector under the control of a shuffle // (indexes) vector. Each lane in the shuffle vector specifies which lane from // the source vector to select for the corresponding output lane. All indexes diff --git a/test/hotspot/gtest/aarch64/aarch64-asmtest.py b/test/hotspot/gtest/aarch64/aarch64-asmtest.py index bcf786d6f1f..04088bb0dc8 100644 --- a/test/hotspot/gtest/aarch64/aarch64-asmtest.py +++ b/test/hotspot/gtest/aarch64/aarch64-asmtest.py @@ -1121,7 +1121,7 @@ class SVEVectorOp(Instruction): self._bitwiseop = False if name[0] == 'f': self._width = RegVariant(2, 3) - elif not self._isPredicated and (name in ["and", "eor", "orr", "bic", "eor3"]): + elif not self._isPredicated and (name in ["and", "bic", "bsl", "eor", "eor3", "orr"]): self._width = RegVariant(3, 3) self._bitwiseop = True elif name == "revb": @@ -1150,7 +1150,7 @@ class SVEVectorOp(Instruction): width + [str(self.reg[i]) for i in range(1, self.numRegs)])) def astr(self): - firstArg = 0 if self._name == "eor3" else 1 + firstArg = 0 if self._name in ["bsl", "eor3"] else 1 formatStr = "%s%s" + ''.join([", %s" for i in range(firstArg, self.numRegs)]) if self._dnm == 'dn': formatStr += ", %s" @@ -2258,6 +2258,7 @@ generate(SVEVectorOp, [["add", "ZZZ"], # SVE2 instructions ["bext", "ZZZ"], ["bdep", "ZZZ"], + ["bsl", "ZZZ"], ["eor3", "ZZZ"], ["sqadd", "ZPZ", "m", "dn"], ["sqsub", "ZPZ", "m", "dn"], diff --git a/test/hotspot/gtest/aarch64/asmtest.out.h b/test/hotspot/gtest/aarch64/asmtest.out.h index cd9fd4cfe9a..bad9825af9b 100644 --- a/test/hotspot/gtest/aarch64/asmtest.out.h +++ b/test/hotspot/gtest/aarch64/asmtest.out.h @@ -1419,38 +1419,39 @@ __ sve_fabd(z14, __ S, p5, z22); // fabd z14.s, p5/m, z14.s, z22.s __ sve_bext(z5, __ H, z18, z0); // bext z5.h, z18.h, z0.h __ sve_bdep(z9, __ D, z2, z3); // bdep z9.d, z2.d, z3.d - __ sve_eor3(z14, z4, z29); // eor3 z14.d, z14.d, z4.d, z29.d - __ sve_sqadd(z14, __ D, p5, z4); // sqadd z14.d, p5/m, z14.d, z4.d - __ sve_sqsub(z27, __ S, p3, z22); // sqsub z27.s, p3/m, z27.s, z22.s - __ sve_uqadd(z31, __ S, p6, z11); // uqadd z31.s, p6/m, z31.s, z11.s - __ sve_uqsub(z12, __ B, p4, z28); // uqsub z12.b, p4/m, z12.b, z28.b + __ sve_bsl(z14, z4, z29); // bsl z14.d, z14.d, z4.d, z29.d + __ sve_eor3(z14, z22, z4); // eor3 z14.d, z14.d, z22.d, z4.d + __ sve_sqadd(z27, __ S, p3, z22); // sqadd z27.s, p3/m, z27.s, z22.s + __ sve_sqsub(z31, __ S, p6, z11); // sqsub z31.s, p6/m, z31.s, z11.s + __ sve_uqadd(z12, __ B, p4, z28); // uqadd z12.b, p4/m, z12.b, z28.b + __ sve_uqsub(z28, __ D, p4, z4); // uqsub z28.d, p4/m, z28.d, z4.d // SVEReductionOp - __ sve_andv(v28, __ D, p4, z4); // andv d28, p4, z4.d - __ sve_orv(v6, __ S, p0, z15); // orv s6, p0, z15.s - __ sve_eorv(v1, __ S, p5, z18); // eorv s1, p5, z18.s - __ sve_smaxv(v2, __ H, p2, z4); // smaxv h2, p2, z4.h - __ sve_sminv(v11, __ S, p2, z28); // sminv s11, p2, z28.s - __ sve_umaxv(v3, __ H, p5, z31); // umaxv h3, p5, z31.h - __ sve_uminv(v24, __ H, p5, z15); // uminv h24, p5, z15.h - __ sve_fminv(v6, __ S, p3, z8); // fminv s6, p3, z8.s - __ sve_fmaxv(v21, __ D, p7, z4); // fmaxv d21, p7, z4.d - __ sve_fadda(v24, __ S, p5, z6); // fadda s24, p5, s24, z6.s - __ sve_uaddv(v4, __ D, p2, z9); // uaddv d4, p2, z9.d + __ sve_andv(v6, __ S, p0, z15); // andv s6, p0, z15.s + __ sve_orv(v1, __ S, p5, z18); // orv s1, p5, z18.s + __ sve_eorv(v2, __ H, p2, z4); // eorv h2, p2, z4.h + __ sve_smaxv(v11, __ S, p2, z28); // smaxv s11, p2, z28.s + __ sve_sminv(v3, __ H, p5, z31); // sminv h3, p5, z31.h + __ sve_umaxv(v24, __ H, p5, z15); // umaxv h24, p5, z15.h + __ sve_uminv(v6, __ H, p3, z8); // uminv h6, p3, z8.h + __ sve_fminv(v21, __ D, p7, z4); // fminv d21, p7, z4.d + __ sve_fmaxv(v24, __ S, p5, z6); // fmaxv s24, p5, z6.s + __ sve_fadda(v4, __ D, p2, z9); // fadda d4, p2, d4, z9.d + __ sve_uaddv(v10, __ S, p1, z31); // uaddv d10, p1, z31.s // AddWideNEONOp - __ saddwv(v10, v11, __ T8H, v12, __ T8B); // saddw v10.8H, v11.8H, v12.8B - __ saddwv2(v5, v6, __ T8H, v7, __ T16B); // saddw2 v5.8H, v6.8H, v7.16B - __ saddwv(v31, v0, __ T4S, v1, __ T4H); // saddw v31.4S, v0.4S, v1.4H - __ saddwv2(v22, v23, __ T4S, v24, __ T8H); // saddw2 v22.4S, v23.4S, v24.8H - __ saddwv(v25, v26, __ T2D, v27, __ T2S); // saddw v25.2D, v26.2D, v27.2S - __ saddwv2(v15, v16, __ T2D, v17, __ T4S); // saddw2 v15.2D, v16.2D, v17.4S - __ uaddwv(v3, v4, __ T8H, v5, __ T8B); // uaddw v3.8H, v4.8H, v5.8B - __ uaddwv2(v18, v19, __ T8H, v20, __ T16B); // uaddw2 v18.8H, v19.8H, v20.16B - __ uaddwv(v14, v15, __ T4S, v16, __ T4H); // uaddw v14.4S, v15.4S, v16.4H - __ uaddwv2(v10, v11, __ T4S, v12, __ T8H); // uaddw2 v10.4S, v11.4S, v12.8H - __ uaddwv(v2, v3, __ T2D, v4, __ T2S); // uaddw v2.2D, v3.2D, v4.2S - __ uaddwv2(v10, v11, __ T2D, v12, __ T4S); // uaddw2 v10.2D, v11.2D, v12.4S + __ saddwv(v25, v26, __ T8H, v27, __ T8B); // saddw v25.8H, v26.8H, v27.8B + __ saddwv2(v15, v16, __ T8H, v17, __ T16B); // saddw2 v15.8H, v16.8H, v17.16B + __ saddwv(v3, v4, __ T4S, v5, __ T4H); // saddw v3.4S, v4.4S, v5.4H + __ saddwv2(v18, v19, __ T4S, v20, __ T8H); // saddw2 v18.4S, v19.4S, v20.8H + __ saddwv(v14, v15, __ T2D, v16, __ T2S); // saddw v14.2D, v15.2D, v16.2S + __ saddwv2(v10, v11, __ T2D, v12, __ T4S); // saddw2 v10.2D, v11.2D, v12.4S + __ uaddwv(v2, v3, __ T8H, v4, __ T8B); // uaddw v2.8H, v3.8H, v4.8B + __ uaddwv2(v10, v11, __ T8H, v12, __ T16B); // uaddw2 v10.8H, v11.8H, v12.16B + __ uaddwv(v8, v9, __ T4S, v10, __ T4H); // uaddw v8.4S, v9.4S, v10.4H + __ uaddwv2(v11, v12, __ T4S, v13, __ T8H); // uaddw2 v11.4S, v12.4S, v13.8H + __ uaddwv(v22, v23, __ T2D, v24, __ T2S); // uaddw v22.2D, v23.2D, v24.2S + __ uaddwv2(v3, v4, __ T2D, v5, __ T4S); // uaddw2 v3.2D, v4.2D, v5.4S __ bind(forth); @@ -1469,30 +1470,30 @@ 0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061, 0x120cb166, 0x321764bc, 0x52174681, 0x720c0227, 0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01, - 0x14000000, 0x17ffffd7, 0x140004cb, 0x94000000, - 0x97ffffd4, 0x940004c8, 0x3400000a, 0x34fffa2a, - 0x340098aa, 0x35000008, 0x35fff9c8, 0x35009848, - 0xb400000b, 0xb4fff96b, 0xb40097eb, 0xb500001d, - 0xb5fff91d, 0xb500979d, 0x10000013, 0x10fff8b3, - 0x10009733, 0x90000013, 0x36300016, 0x3637f836, - 0x363096b6, 0x3758000c, 0x375ff7cc, 0x3758964c, + 0x14000000, 0x17ffffd7, 0x140004cc, 0x94000000, + 0x97ffffd4, 0x940004c9, 0x3400000a, 0x34fffa2a, + 0x340098ca, 0x35000008, 0x35fff9c8, 0x35009868, + 0xb400000b, 0xb4fff96b, 0xb400980b, 0xb500001d, + 0xb5fff91d, 0xb50097bd, 0x10000013, 0x10fff8b3, + 0x10009753, 0x90000013, 0x36300016, 0x3637f836, + 0x363096d6, 0x3758000c, 0x375ff7cc, 0x3758966c, 0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc, 0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f, 0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016, 0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0, - 0x54009420, 0x54000001, 0x54fff541, 0x540093c1, - 0x54000002, 0x54fff4e2, 0x54009362, 0x54000002, - 0x54fff482, 0x54009302, 0x54000003, 0x54fff423, - 0x540092a3, 0x54000003, 0x54fff3c3, 0x54009243, - 0x54000004, 0x54fff364, 0x540091e4, 0x54000005, - 0x54fff305, 0x54009185, 0x54000006, 0x54fff2a6, - 0x54009126, 0x54000007, 0x54fff247, 0x540090c7, - 0x54000008, 0x54fff1e8, 0x54009068, 0x54000009, - 0x54fff189, 0x54009009, 0x5400000a, 0x54fff12a, - 0x54008faa, 0x5400000b, 0x54fff0cb, 0x54008f4b, - 0x5400000c, 0x54fff06c, 0x54008eec, 0x5400000d, - 0x54fff00d, 0x54008e8d, 0x5400000e, 0x54ffefae, - 0x54008e2e, 0x5400000f, 0x54ffef4f, 0x54008dcf, + 0x54009440, 0x54000001, 0x54fff541, 0x540093e1, + 0x54000002, 0x54fff4e2, 0x54009382, 0x54000002, + 0x54fff482, 0x54009322, 0x54000003, 0x54fff423, + 0x540092c3, 0x54000003, 0x54fff3c3, 0x54009263, + 0x54000004, 0x54fff364, 0x54009204, 0x54000005, + 0x54fff305, 0x540091a5, 0x54000006, 0x54fff2a6, + 0x54009146, 0x54000007, 0x54fff247, 0x540090e7, + 0x54000008, 0x54fff1e8, 0x54009088, 0x54000009, + 0x54fff189, 0x54009029, 0x5400000a, 0x54fff12a, + 0x54008fca, 0x5400000b, 0x54fff0cb, 0x54008f6b, + 0x5400000c, 0x54fff06c, 0x54008f0c, 0x5400000d, + 0x54fff00d, 0x54008ead, 0x5400000e, 0x54ffefae, + 0x54008e4e, 0x5400000f, 0x54ffef4f, 0x54008def, 0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60, 0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f, 0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf, @@ -1535,7 +1536,7 @@ 0x39598921, 0x795d3077, 0x399d0675, 0x7998d8f3, 0x79dbd02a, 0xb99d068a, 0xfd5d11a0, 0xbd58d76b, 0xfd1ac72d, 0xbd1d9c14, 0x5800001a, 0x18ffda33, - 0xf8991100, 0xd8007880, 0xf8a758e0, 0xf9989d80, + 0xf8991100, 0xd80078a0, 0xf8a758e0, 0xf9989d80, 0x1a0b0298, 0x3a1c01a0, 0x5a0400ea, 0x7a02020f, 0x9a1d028c, 0xba0e01ad, 0xda140186, 0xfa19022c, 0x0b2b877e, 0x2b21c8ee, 0xcb3ba47d, 0x6b3ae9a0, @@ -1769,13 +1770,13 @@ 0x65b45aff, 0x65e07fa2, 0x04454097, 0x044d6e3c, 0x04283148, 0x04bd3013, 0x047731b0, 0x04ed33d7, 0x05606ad9, 0x056b6fd9, 0x658896ce, 0x4540b245, - 0x45c3b449, 0x04243bae, 0x44d8948e, 0x449a8edb, - 0x4499997f, 0x441b938c, 0x04da309c, 0x049821e6, - 0x04993641, 0x04482882, 0x048a2b8b, 0x044937e3, - 0x044b35f8, 0x65872d06, 0x65c63c95, 0x659834d8, - 0x04c12924, 0x0e2c116a, 0x4e2710c5, 0x0e61101f, - 0x4e7812f6, 0x0ebb1359, 0x4eb1120f, 0x2e251083, - 0x6e341272, 0x2e7011ee, 0x6e6c116a, 0x2ea41062, - 0x6eac116a, + 0x45c3b449, 0x04243fae, 0x0436388e, 0x44988edb, + 0x449a997f, 0x4419938c, 0x44db909c, 0x049a21e6, + 0x04983641, 0x04592882, 0x04882b8b, 0x044a37e3, + 0x044935f8, 0x044b2d06, 0x65c73c95, 0x658634d8, + 0x65d82924, 0x048127ea, 0x0e3b1359, 0x4e31120f, + 0x0e651083, 0x4e741272, 0x0eb011ee, 0x4eac116a, + 0x2e241062, 0x6e2c116a, 0x2e6a1128, 0x6e6d118b, + 0x2eb812f6, 0x6ea51083, }; // END Generated code -- do not edit diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java index 6f7d20246bb..a76853016d9 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java @@ -2442,6 +2442,12 @@ public class IRNode { vectorNode(VECTOR_BLEND_D, "VectorBlend", TYPE_DOUBLE); } + public static final String VECTOR_BITWISE_BLEND = PREFIX + "VECTOR_BITWISE_BLEND" + POSTFIX; + static { + String regex = START + "VectorBitwiseBlend" + MID + END; + afterBarrierExpansionToBeforeMatching(VECTOR_BITWISE_BLEND, regex); + } + public static final String VECTOR_MASK_CMP_I = VECTOR_PREFIX + "VECTOR_MASK_CMP_I" + POSTFIX; static { vectorNode(VECTOR_MASK_CMP_I, "VectorMaskCmp", TYPE_INT); diff --git a/test/hotspot/jtreg/compiler/vectorapi/VectorBitwiseBlendTest.java b/test/hotspot/jtreg/compiler/vectorapi/VectorBitwiseBlendTest.java new file mode 100644 index 00000000000..feeb79254b4 --- /dev/null +++ b/test/hotspot/jtreg/compiler/vectorapi/VectorBitwiseBlendTest.java @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * @test + * @bug 8382052 + * @key randomness + * @library /test/lib / + * @summary IR tests for Vector BITWISE_BLEND optimization + * @modules jdk.incubator.vector + * + * @run driver ${test.main.class} + */ + +package compiler.vectorapi; + +import compiler.lib.generators.*; +import compiler.lib.ir_framework.*; +import jdk.incubator.vector.*; + +public class VectorBitwiseBlendTest { + + private static final Generators RD = Generators.G; + + private static final VectorSpecies B_SPECIES = ByteVector.SPECIES_MAX; + private static final VectorSpecies S_SPECIES = ShortVector.SPECIES_MAX; + private static final VectorSpecies I_SPECIES = IntVector.SPECIES_MAX; + private static final VectorSpecies L_SPECIES = LongVector.SPECIES_MAX; + + private static final int BUF_LEN = 256; + + private static final byte[] ba = new byte[BUF_LEN]; + private static final byte[] bb = new byte[BUF_LEN]; + private static final byte[] bc = new byte[BUF_LEN]; + private static final byte[] br = new byte[BUF_LEN]; + + private static final short[] sa = new short[BUF_LEN]; + private static final short[] sb = new short[BUF_LEN]; + private static final short[] sc = new short[BUF_LEN]; + private static final short[] sr = new short[BUF_LEN]; + + private static final int[] ia = new int[BUF_LEN]; + private static final int[] ib = new int[BUF_LEN]; + private static final int[] ic = new int[BUF_LEN]; + private static final int[] ir = new int[BUF_LEN]; + + private static final long[] la = new long[BUF_LEN]; + private static final long[] lb = new long[BUF_LEN]; + private static final long[] lc = new long[BUF_LEN]; + private static final long[] lr = new long[BUF_LEN]; + + private static final boolean[] mask_arr = new boolean[BUF_LEN]; + + static { + Generator iGen = RD.ints(); + Generator lGen = RD.longs(); + + for (int i = 0; i < BUF_LEN; i++) { + mask_arr[i] = (i & 1) != 0; + ba[i] = iGen.next().byteValue(); + bb[i] = iGen.next().byteValue(); + bc[i] = iGen.next().byteValue(); + sa[i] = iGen.next().shortValue(); + sb[i] = iGen.next().shortValue(); + sc[i] = iGen.next().shortValue(); + } + RD.fill(iGen, ia); + RD.fill(iGen, ib); + RD.fill(iGen, ic); + RD.fill(lGen, la); + RD.fill(lGen, lb); + RD.fill(lGen, lc); + } + + @Test + @IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" }, + applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" }, + applyIf = { "MaxVectorSize", "<= 16" }) + @IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" }, + applyIfCPUFeature = { "sve2", "true" }) + public static void testUnmaskedBlendByte() { + ByteVector va = ByteVector.fromArray(B_SPECIES, ba, 0); + ByteVector vb = ByteVector.fromArray(B_SPECIES, bb, 0); + ByteVector vc = ByteVector.fromArray(B_SPECIES, bc, 0); + va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc).intoArray(br, 0); + } + + @Test + @IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" }, + applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" }, + applyIf = { "MaxVectorSize", "<= 16" }) + @IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" }, + applyIfCPUFeature = { "sve2", "true" }) + public static void testUnmaskedBlendShort() { + ShortVector va = ShortVector.fromArray(S_SPECIES, sa, 0); + ShortVector vb = ShortVector.fromArray(S_SPECIES, sb, 0); + ShortVector vc = ShortVector.fromArray(S_SPECIES, sc, 0); + va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc).intoArray(sr, 0); + } + + @Test + @IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" }, + applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" }, + applyIf = { "MaxVectorSize", "<= 16" }) + @IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" }, + applyIfCPUFeature = { "sve2", "true" }) + public static void testUnmaskedBlendInt() { + IntVector va = IntVector.fromArray(I_SPECIES, ia, 0); + IntVector vb = IntVector.fromArray(I_SPECIES, ib, 0); + IntVector vc = IntVector.fromArray(I_SPECIES, ic, 0); + va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc).intoArray(ir, 0); + } + + @Test + @IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" }, + applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" }, + applyIf = { "MaxVectorSize", "<= 16" }) + @IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" }, + applyIfCPUFeature = { "sve2", "true" }) + public static void testUnmaskedBlendLong() { + LongVector va = LongVector.fromArray(L_SPECIES, la, 0); + LongVector vb = LongVector.fromArray(L_SPECIES, lb, 0); + LongVector vc = LongVector.fromArray(L_SPECIES, lc, 0); + va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc).intoArray(lr, 0); + } + + @Test + @IR(counts = { IRNode.VECTOR_BLEND_B, "= 1", + IRNode.VECTOR_BITWISE_BLEND, "= 1" }, + applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" }, + applyIf = { "MaxVectorSize", "<= 16" }) + @IR(counts = { IRNode.VECTOR_BLEND_B, "= 1", + IRNode.VECTOR_BITWISE_BLEND, "= 1" }, + applyIfCPUFeature = { "sve2", "true" }) + public static void testMaskedBlendByte() { + VectorMask mask = VectorMask.fromArray(B_SPECIES, mask_arr, 0); + ByteVector va = ByteVector.fromArray(B_SPECIES, ba, 0); + ByteVector vb = ByteVector.fromArray(B_SPECIES, bb, 0); + ByteVector vc = ByteVector.fromArray(B_SPECIES, bc, 0); + va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc, mask).intoArray(br, 0); + } + + @Test + @IR(counts = { IRNode.VECTOR_BLEND_S, "= 1", + IRNode.VECTOR_BITWISE_BLEND, "= 1" }, + applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" }, + applyIf = { "MaxVectorSize", "<= 16" }) + @IR(counts = { IRNode.VECTOR_BLEND_S, "= 1", + IRNode.VECTOR_BITWISE_BLEND, "= 1" }, + applyIfCPUFeature = { "sve2", "true" }) + public static void testMaskedBlendShort() { + VectorMask mask = VectorMask.fromArray(S_SPECIES, mask_arr, 0); + ShortVector va = ShortVector.fromArray(S_SPECIES, sa, 0); + ShortVector vb = ShortVector.fromArray(S_SPECIES, sb, 0); + ShortVector vc = ShortVector.fromArray(S_SPECIES, sc, 0); + va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc, mask).intoArray(sr, 0); + } + + @Test + @IR(counts = { IRNode.VECTOR_BLEND_I, "= 1", + IRNode.VECTOR_BITWISE_BLEND, "= 1" }, + applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" }, + applyIf = { "MaxVectorSize", "<= 16" }) + @IR(counts = { IRNode.VECTOR_BLEND_I, "= 1", + IRNode.VECTOR_BITWISE_BLEND, "= 1" }, + applyIfCPUFeature = { "sve2", "true" }) + public static void testMaskedBlendInt() { + VectorMask mask = VectorMask.fromArray(I_SPECIES, mask_arr, 0); + IntVector va = IntVector.fromArray(I_SPECIES, ia, 0); + IntVector vb = IntVector.fromArray(I_SPECIES, ib, 0); + IntVector vc = IntVector.fromArray(I_SPECIES, ic, 0); + va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc, mask).intoArray(ir, 0); + } + + @Test + @IR(counts = { IRNode.VECTOR_BLEND_L, "= 1", + IRNode.VECTOR_BITWISE_BLEND, "= 1" }, + applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" }, + applyIf = { "MaxVectorSize", "<= 16" }) + @IR(counts = { IRNode.VECTOR_BLEND_L, "= 1", + IRNode.VECTOR_BITWISE_BLEND, "= 1" }, + applyIfCPUFeature = { "sve2", "true" }) + public static void testMaskedBlendLong() { + VectorMask mask = VectorMask.fromArray(L_SPECIES, mask_arr, 0); + LongVector va = LongVector.fromArray(L_SPECIES, la, 0); + LongVector vb = LongVector.fromArray(L_SPECIES, lb, 0); + LongVector vc = LongVector.fromArray(L_SPECIES, lc, 0); + va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc, mask).intoArray(lr, 0); + } + + public static void main(String[] args) { + TestFramework testFramework = new TestFramework(); + testFramework.setDefaultWarmup(10000) + .addFlags("--add-modules=jdk.incubator.vector") + .start(); + } +} diff --git a/test/micro/org/openjdk/bench/jdk/incubator/vector/MaskedLogicOpts.java b/test/micro/org/openjdk/bench/jdk/incubator/vector/MaskedLogicOpts.java index 7fc1fbce2e4..3d8d1265c69 100644 --- a/test/micro/org/openjdk/bench/jdk/incubator/vector/MaskedLogicOpts.java +++ b/test/micro/org/openjdk/bench/jdk/incubator/vector/MaskedLogicOpts.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2022, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -55,8 +55,9 @@ public class MaskedLogicOpts { int int512_arr_idx; int int256_arr_idx; int int128_arr_idx; - int long256_arr_idx; int long512_arr_idx; + int long256_arr_idx; + int long128_arr_idx; private Random r = new Random(1024); @@ -65,8 +66,9 @@ public class MaskedLogicOpts { int512_arr_idx = -16; int256_arr_idx = -8; int128_arr_idx = -4; - long256_arr_idx = -4; long512_arr_idx = -8; + long256_arr_idx = -4; + long128_arr_idx = -2; mask_arr = new boolean[ARRAYLEN]; i1 = new int[ARRAYLEN]; @@ -106,6 +108,7 @@ public class MaskedLogicOpts { int128_arr_idx = (((ARRAYLEN & ~3) - int128_arr_idx) <= 4) ? 0 : int128_arr_idx + 4; long512_arr_idx = (((ARRAYLEN & ~7) - long512_arr_idx) <= 8) ? 0 : long512_arr_idx + 8; long256_arr_idx = (((ARRAYLEN & ~3) - long256_arr_idx) <= 4) ? 0 : long256_arr_idx + 4; + long128_arr_idx = (((ARRAYLEN & ~1) - long128_arr_idx) <= 2) ? 0 : long128_arr_idx + 2; } @CompilerControl(CompilerControl.Mode.INLINE) @@ -278,6 +281,11 @@ public class MaskedLogicOpts { partiallyMaskedLogicOperationsLongKernel(LongVector.SPECIES_256, long256_arr_idx); } + @Benchmark + public void partiallyMaskedLogicOperationsLong128() { + partiallyMaskedLogicOperationsLongKernel(LongVector.SPECIES_128, long128_arr_idx); + } + @CompilerControl(CompilerControl.Mode.INLINE) public void bitwiseBlendOperationLongKernel(VectorSpecies SPECIES, int index) { VectorMask lmask = VectorMask.fromArray(SPECIES, mask_arr, index); @@ -305,4 +313,9 @@ public class MaskedLogicOpts { public void bitwiseBlendOperationLong256() { bitwiseBlendOperationLongKernel(LongVector.SPECIES_256, long256_arr_idx); } + + @Benchmark + public void bitwiseBlendOperationLong128() { + bitwiseBlendOperationLongKernel(LongVector.SPECIES_128, long128_arr_idx); + } }