8382052: VectorAPI: Optimize the lanewise BITWISE_BLEND for AArch64

Reviewed-by: xgong, epeter, aph
This commit is contained in:
Eric Fang 2026-06-17 06:30:18 +00:00 committed by Xiaohong Gong
parent 2d65ea61d9
commit 5fbce068bd
13 changed files with 477 additions and 75 deletions

View File

@ -317,6 +317,13 @@ source %{
return false; // NEON only, since SLI/USHR are not available in SVE
}
break;
case Op_VectorBitwiseBlend:
// Use NEON BSL when UseSVE < 2; SVE1 has no BSL so larger vectors are
// not supported on UseSVE == 1 machines.
if (UseSVE < 2 && length_in_bytes > 16) {
return false;
}
break;
default:
break;
}
@ -340,6 +347,7 @@ source %{
case Op_MulReductionVL:
case Op_CompressBitsV:
case Op_ExpandBitsV:
case Op_VectorBitwiseBlend:
return false;
case Op_SaturatingAddV:
case Op_SaturatingSubV:
@ -7051,6 +7059,31 @@ instruct vblend_sve(vReg dst, vReg src1, vReg src2, pReg pg) %{
ins_pipe(pipe_slow);
%}
// ------------------------------ Vector bitwise blend -------------------------
// Bitwise blend for vectors handled by NEON while UseSVE < 2.
// Matches VectorBitwiseBlend with its first two inputs paired as
// (Binary src1 src2) and dst_src3 as the third input. dst_src3 is also the
// result register, so the BSL emitted below overwrites it.
instruct vbitwise_blend_neon_sve1(vReg src1, vReg src2, vReg dst_src3) %{
predicate(UseSVE < 2 &&
VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
match(Set dst_src3 (VectorBitwiseBlend (Binary src1 src2) dst_src3));
format %{ "vbitwise_blend_neon_sve1 $src1, $src2, $dst_src3" %}
ins_encode %{
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
// 16-byte vectors use the 16B arrangement; every other length uses 8B.
Assembler::SIMD_Arrangement T = length_in_bytes == 16 ? __ T16B : __ T8B;
// NEON BSL keeps, per bit, src2 where the incoming dst_src3 bit is 1 and
// src1 where it is 0. Note that src2 is passed before src1.
__ bsl($dst_src3$$FloatRegister, T, $src2$$FloatRegister, $src1$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}
// Bitwise blend using the SVE2 BSL instruction; selected when UseSVE == 2.
// Matches VectorBitwiseBlend with its first two inputs paired as
// (Binary src1 dst_src2) and src3 as the third input. The second input
// dst_src2 is also the result register.
instruct vbitwise_blend_sve2(vReg src1, vReg dst_src2, vReg src3) %{
predicate(UseSVE == 2);
match(Set dst_src2 (VectorBitwiseBlend (Binary src1 dst_src2) src3));
format %{ "vbitwise_blend_sve2 $src1, $dst_src2, $src3" %}
ins_encode %{
// sve_bsl takes (Zdn, Zm, Zk): dst_src2 is Zdn, src1 is Zm and src3 is Zk.
__ sve_bsl($dst_src2$$FloatRegister, $src1$$FloatRegister, $src3$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}
// ------------------------------ Vector round ---------------------------------
// vector Math.round

View File

@ -307,6 +307,13 @@ source %{
return false; // NEON only, since SLI/USHR are not available in SVE
}
break;
case Op_VectorBitwiseBlend:
// Use NEON BSL when UseSVE < 2; SVE1 has no BSL so larger vectors are
// not supported on UseSVE == 1 machines.
if (UseSVE < 2 && length_in_bytes > 16) {
return false;
}
break;
default:
break;
}
@ -330,6 +337,7 @@ source %{
case Op_MulReductionVL:
case Op_CompressBitsV:
case Op_ExpandBitsV:
case Op_VectorBitwiseBlend:
return false;
case Op_SaturatingAddV:
case Op_SaturatingSubV:
@ -4754,6 +4762,31 @@ instruct vblend_sve(vReg dst, vReg src1, vReg src2, pReg pg) %{
ins_pipe(pipe_slow);
%}
// ------------------------------ Vector bitwise blend -------------------------
// Bitwise blend for vectors handled by NEON while UseSVE < 2.
// Matches VectorBitwiseBlend with its first two inputs paired as
// (Binary src1 src2) and dst_src3 as the third input. dst_src3 is also the
// result register, so the BSL emitted below overwrites it.
instruct vbitwise_blend_neon_sve1(vReg src1, vReg src2, vReg dst_src3) %{
predicate(UseSVE < 2 &&
VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
match(Set dst_src3 (VectorBitwiseBlend (Binary src1 src2) dst_src3));
format %{ "vbitwise_blend_neon_sve1 $src1, $src2, $dst_src3" %}
ins_encode %{
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
// 16-byte vectors use the 16B arrangement; every other length uses 8B.
Assembler::SIMD_Arrangement T = length_in_bytes == 16 ? __ T16B : __ T8B;
// NEON BSL keeps, per bit, src2 where the incoming dst_src3 bit is 1 and
// src1 where it is 0. Note that src2 is passed before src1.
__ bsl($dst_src3$$FloatRegister, T, $src2$$FloatRegister, $src1$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}
// Bitwise blend using the SVE2 BSL instruction; selected when UseSVE == 2.
// Matches VectorBitwiseBlend with its first two inputs paired as
// (Binary src1 dst_src2) and src3 as the third input. The second input
// dst_src2 is also the result register.
instruct vbitwise_blend_sve2(vReg src1, vReg dst_src2, vReg src3) %{
predicate(UseSVE == 2);
match(Set dst_src2 (VectorBitwiseBlend (Binary src1 dst_src2) src3));
format %{ "vbitwise_blend_sve2 $src1, $dst_src2, $src3" %}
ins_encode %{
// sve_bsl takes (Zdn, Zm, Zk): dst_src2 is Zdn, src1 is Zm and src3 is Zk.
__ sve_bsl($dst_src2$$FloatRegister, $src1$$FloatRegister, $src3$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}
// ------------------------------ Vector round ---------------------------------
// vector Math.round

View File

@ -4292,14 +4292,15 @@ public:
#undef INSN
// SVE2 bitwise ternary operations
#define INSN(NAME, opc) \
void NAME(FloatRegister Zdn, FloatRegister Zm, FloatRegister Zk) { \
starti; \
f(0b00000100, 31, 24), f(opc, 23, 21), rf(Zm, 16); \
f(0b001110, 15, 10), rf(Zk, 5), rf(Zdn, 0); \
#define INSN(NAME, op1, op2) \
void NAME(FloatRegister Zdn, FloatRegister Zm, FloatRegister Zk) { \
starti; \
f(0b00000100, 31, 24), f(op1, 23, 21), rf(Zm, 16); \
f(0b00111, 15, 11), f(op2, 10), rf(Zk, 5), rf(Zdn, 0); \
}
INSN(sve_eor3, 0b001); // Bitwise exclusive OR of three vectors
INSN(sve_eor3, 0b001, 0b0); // Bitwise exclusive OR of three vectors
INSN(sve_bsl, 0b001, 0b1); // Bitwise select
#undef INSN
// SVE2 saturating operations - predicate

View File

@ -1843,13 +1843,19 @@ public:
#undef SVE_DESTRUCTIVE_TERNARY_INS
using Assembler::sve_eor3;
void sve_eor3(FloatRegister Zd, FloatRegister Zm, FloatRegister Zk) {
if (Zd != Zm && Zd != Zk) {
try_to_replace_prev_vector_copy_with_movprfx(Zd);
}
Assembler::sve_eor3(Zd, Zm, Zk);
#define SVE_DESTRUCTIVE_TERNARY_UNPRED_INS(NAME) \
using Assembler::NAME; \
void NAME(FloatRegister Zd, FloatRegister Zm, FloatRegister Zk) { \
if (Zd != Zm && Zd != Zk) { \
try_to_replace_prev_vector_copy_with_movprfx(Zd); \
} \
Assembler::NAME(Zd, Zm, Zk); \
}
SVE_DESTRUCTIVE_TERNARY_UNPRED_INS(sve_bsl);
SVE_DESTRUCTIVE_TERNARY_UNPRED_INS(sve_eor3);
#undef SVE_DESTRUCTIVE_TERNARY_UNPRED_INS
};
#ifdef ASSERT

View File

@ -512,6 +512,7 @@ macro(VectorMaskWrapper)
macro(VectorMaskCmp)
macro(VectorMaskCast)
macro(VectorTest)
macro(VectorBitwiseBlend)
macro(VectorBlend)
macro(VectorRearrange)
macro(VectorLoadMask)

View File

@ -2385,7 +2385,8 @@ void Matcher::find_shared_post_visit(Node* n, uint opcode) {
break;
}
case Op_VectorBlend:
case Op_VectorInsert: {
case Op_VectorInsert:
case Op_VectorBitwiseBlend: {
Node* pair = new BinaryNode(n->in(1), n->in(2));
n->set_req(1, pair);
n->set_req(2, n->in(3));

View File

@ -874,6 +874,7 @@ VectorNode* VectorNode::make(int vopc, Node* n1, Node* n2, Node* n3, const TypeV
case Op_SignumVD: return new SignumVDNode(n1, n2, n3, vt);
case Op_SignumVF: return new SignumVFNode(n1, n2, n3, vt);
case Op_VectorBlend: return new VectorBlendNode(n1, n2, n3);
case Op_VectorBitwiseBlend: return new VectorBitwiseBlendNode(n1, n2, n3, vt);
default:
fatal("Missed vector creation for '%s'", NodeClassNames[vopc]);
return nullptr;
@ -2768,6 +2769,70 @@ Node* XorVNode::Ideal_XorV_VectorMaskCmp(PhaseGVN* phase, bool can_reshape) {
return res;
}
// Collapse the xor/and/xor spelling of a bit-select into a dedicated node:
//
//   XorV(a, AndV(sel, XorV(a, b)))       => VectorBitwiseBlend(a, b, sel)
//   XorV(a, AndV(sel, XorV(a, b)), mask) =>
//       VectorBlend(a, VectorBitwiseBlend(a, b, sel), mask)
//
// a ^ (sel & (a ^ b)) yields b in every bit position where sel is set and a
// everywhere else, so the expression is a per-bit select between a and b.
//
// Returns the replacement node, or nullptr when this XorV does not have the
// expected shape or a required match rule is not supported.
Node* XorVNode::Ideal_XorV_to_VectorBitwiseBlend(PhaseGVN* phase, bool can_reshape) {
  const TypeVect* vect = vect_type();
  BasicType elem_bt = vect->element_basic_type();
  uint lanes = vect->length();
  if (!Matcher::match_rule_supported_vector(Op_VectorBitwiseBlend, lanes, elem_bt)) {
    return nullptr;
  }

  // A predicated XorV additionally needs a VectorBlend to merge under its mask.
  const bool predicated = is_predicated_vector();
  if (predicated && !Matcher::match_rule_supported_vector(Op_VectorBlend, lanes, elem_bt)) {
    return nullptr;
  }

  // Find the AndV operand. When predicated, in(1) is the merge source and has
  // to stay in place, so only in(2) is considered. Otherwise the outer XorV is
  // commutative and either operand may be the AndV.
  const bool and_is_second = predicated || in(2)->Opcode() == Op_AndV;
  Node* and_node = and_is_second ? in(2) : in(1);
  Node* keep     = and_is_second ? in(1) : in(2);
  if (and_node->Opcode() != Op_AndV || and_node->is_predicated_vector()) {
    return nullptr;
  }

  // One AndV operand must be the inner XorV; the other one is the selector.
  // The second operand is preferred when both are XorV nodes.
  const bool xor_is_second = and_node->in(2)->Opcode() == Op_XorV;
  if (!xor_is_second && and_node->in(1)->Opcode() != Op_XorV) {
    return nullptr;
  }
  Node* xor_node = xor_is_second ? and_node->in(2) : and_node->in(1);
  Node* selector = xor_is_second ? and_node->in(1) : and_node->in(2);
  if (xor_node->is_predicated_vector()) {
    return nullptr;
  }

  // The inner XorV has to combine the kept value with exactly one other value.
  const bool keep_is_first = xor_node->in(1) == keep;
  if (!keep_is_first && xor_node->in(2) != keep) {
    return nullptr;
  }
  Node* other = keep_is_first ? xor_node->in(2) : xor_node->in(1);

  Node* select = new VectorBitwiseBlendNode(keep, other, selector, vect);
  if (predicated) {
    // Lanes that are off in the mask keep the original first operand.
    Node* selected = phase->transform(select);
    return new VectorBlendNode(keep, selected, in(3));
  }
  return select;
}
Node* XorVNode::Ideal(PhaseGVN* phase, bool can_reshape) {
// (XorV src src) => (Replicate zero)
// (XorVMask src src) => (MaskAll zero)
@ -2786,6 +2851,11 @@ Node* XorVNode::Ideal(PhaseGVN* phase, bool can_reshape) {
if (res != nullptr) {
return res;
}
res = Ideal_XorV_to_VectorBitwiseBlend(phase, can_reshape);
if (res != nullptr) {
return res;
}
return VectorNode::Ideal(phase, can_reshape);
}

View File

@ -1075,6 +1075,7 @@ class XorVNode : public VectorNode {
virtual int Opcode() const;
virtual Node* Ideal(PhaseGVN* phase, bool can_reshape);
Node* Ideal_XorV_VectorMaskCmp(PhaseGVN* phase, bool can_reshape);
Node* Ideal_XorV_to_VectorBitwiseBlend(PhaseGVN* phase, bool can_reshape);
};
// Vector xor byte, short, int, long as a reduction
@ -1802,6 +1803,24 @@ class VectorBlendNode : public VectorNode {
Node* vec_mask() const { return in(3); }
};
// Vector bitwise blend (bit-select): (sel & vec_true) | (~sel & vec_false).
// Each result bit comes from vec_true where the matching sel bit is set and
// from vec_false where it is clear. Inputs are stored in the order
// (vec_false, vec_true, sel).
class VectorBitwiseBlendNode : public VectorNode {
 public:
  VectorBitwiseBlendNode(Node* vec_false, Node* vec_true, Node* sel, const TypeVect* vt)
    : VectorNode(vec_false, vec_true, sel, vt) {
#ifdef ASSERT
    // Debug-only sanity checks: all three inputs must be vectors, and all of
    // them must have the same lane count as the result type.
    Node* const operands[] = { vec_false, vec_true, sel };
    for (Node* operand : operands) {
      assert(operand->bottom_type()->isa_vect() != nullptr,
             "inputs must all be vectors");
    }
    for (Node* operand : operands) {
      assert(operand->bottom_type()->is_vect()->length() == vt->length(),
             "mismatched vector length");
    }
#endif
  }

  virtual int Opcode() const;
};
// Rearrange lane elements from a source vector under the control of a shuffle
// (indexes) vector. Each lane in the shuffle vector specifies which lane from
// the source vector to select for the corresponding output lane. All indexes

View File

@ -1121,7 +1121,7 @@ class SVEVectorOp(Instruction):
self._bitwiseop = False
if name[0] == 'f':
self._width = RegVariant(2, 3)
elif not self._isPredicated and (name in ["and", "eor", "orr", "bic", "eor3"]):
elif not self._isPredicated and (name in ["and", "bic", "bsl", "eor", "eor3", "orr"]):
self._width = RegVariant(3, 3)
self._bitwiseop = True
elif name == "revb":
@ -1150,7 +1150,7 @@ class SVEVectorOp(Instruction):
width +
[str(self.reg[i]) for i in range(1, self.numRegs)]))
def astr(self):
firstArg = 0 if self._name == "eor3" else 1
firstArg = 0 if self._name in ["bsl", "eor3"] else 1
formatStr = "%s%s" + ''.join([", %s" for i in range(firstArg, self.numRegs)])
if self._dnm == 'dn':
formatStr += ", %s"
@ -2258,6 +2258,7 @@ generate(SVEVectorOp, [["add", "ZZZ"],
# SVE2 instructions
["bext", "ZZZ"],
["bdep", "ZZZ"],
["bsl", "ZZZ"],
["eor3", "ZZZ"],
["sqadd", "ZPZ", "m", "dn"],
["sqsub", "ZPZ", "m", "dn"],

View File

@ -1419,38 +1419,39 @@
__ sve_fabd(z14, __ S, p5, z22); // fabd z14.s, p5/m, z14.s, z22.s
__ sve_bext(z5, __ H, z18, z0); // bext z5.h, z18.h, z0.h
__ sve_bdep(z9, __ D, z2, z3); // bdep z9.d, z2.d, z3.d
__ sve_eor3(z14, z4, z29); // eor3 z14.d, z14.d, z4.d, z29.d
__ sve_sqadd(z14, __ D, p5, z4); // sqadd z14.d, p5/m, z14.d, z4.d
__ sve_sqsub(z27, __ S, p3, z22); // sqsub z27.s, p3/m, z27.s, z22.s
__ sve_uqadd(z31, __ S, p6, z11); // uqadd z31.s, p6/m, z31.s, z11.s
__ sve_uqsub(z12, __ B, p4, z28); // uqsub z12.b, p4/m, z12.b, z28.b
__ sve_bsl(z14, z4, z29); // bsl z14.d, z14.d, z4.d, z29.d
__ sve_eor3(z14, z22, z4); // eor3 z14.d, z14.d, z22.d, z4.d
__ sve_sqadd(z27, __ S, p3, z22); // sqadd z27.s, p3/m, z27.s, z22.s
__ sve_sqsub(z31, __ S, p6, z11); // sqsub z31.s, p6/m, z31.s, z11.s
__ sve_uqadd(z12, __ B, p4, z28); // uqadd z12.b, p4/m, z12.b, z28.b
__ sve_uqsub(z28, __ D, p4, z4); // uqsub z28.d, p4/m, z28.d, z4.d
// SVEReductionOp
__ sve_andv(v28, __ D, p4, z4); // andv d28, p4, z4.d
__ sve_orv(v6, __ S, p0, z15); // orv s6, p0, z15.s
__ sve_eorv(v1, __ S, p5, z18); // eorv s1, p5, z18.s
__ sve_smaxv(v2, __ H, p2, z4); // smaxv h2, p2, z4.h
__ sve_sminv(v11, __ S, p2, z28); // sminv s11, p2, z28.s
__ sve_umaxv(v3, __ H, p5, z31); // umaxv h3, p5, z31.h
__ sve_uminv(v24, __ H, p5, z15); // uminv h24, p5, z15.h
__ sve_fminv(v6, __ S, p3, z8); // fminv s6, p3, z8.s
__ sve_fmaxv(v21, __ D, p7, z4); // fmaxv d21, p7, z4.d
__ sve_fadda(v24, __ S, p5, z6); // fadda s24, p5, s24, z6.s
__ sve_uaddv(v4, __ D, p2, z9); // uaddv d4, p2, z9.d
__ sve_andv(v6, __ S, p0, z15); // andv s6, p0, z15.s
__ sve_orv(v1, __ S, p5, z18); // orv s1, p5, z18.s
__ sve_eorv(v2, __ H, p2, z4); // eorv h2, p2, z4.h
__ sve_smaxv(v11, __ S, p2, z28); // smaxv s11, p2, z28.s
__ sve_sminv(v3, __ H, p5, z31); // sminv h3, p5, z31.h
__ sve_umaxv(v24, __ H, p5, z15); // umaxv h24, p5, z15.h
__ sve_uminv(v6, __ H, p3, z8); // uminv h6, p3, z8.h
__ sve_fminv(v21, __ D, p7, z4); // fminv d21, p7, z4.d
__ sve_fmaxv(v24, __ S, p5, z6); // fmaxv s24, p5, z6.s
__ sve_fadda(v4, __ D, p2, z9); // fadda d4, p2, d4, z9.d
__ sve_uaddv(v10, __ S, p1, z31); // uaddv d10, p1, z31.s
// AddWideNEONOp
__ saddwv(v10, v11, __ T8H, v12, __ T8B); // saddw v10.8H, v11.8H, v12.8B
__ saddwv2(v5, v6, __ T8H, v7, __ T16B); // saddw2 v5.8H, v6.8H, v7.16B
__ saddwv(v31, v0, __ T4S, v1, __ T4H); // saddw v31.4S, v0.4S, v1.4H
__ saddwv2(v22, v23, __ T4S, v24, __ T8H); // saddw2 v22.4S, v23.4S, v24.8H
__ saddwv(v25, v26, __ T2D, v27, __ T2S); // saddw v25.2D, v26.2D, v27.2S
__ saddwv2(v15, v16, __ T2D, v17, __ T4S); // saddw2 v15.2D, v16.2D, v17.4S
__ uaddwv(v3, v4, __ T8H, v5, __ T8B); // uaddw v3.8H, v4.8H, v5.8B
__ uaddwv2(v18, v19, __ T8H, v20, __ T16B); // uaddw2 v18.8H, v19.8H, v20.16B
__ uaddwv(v14, v15, __ T4S, v16, __ T4H); // uaddw v14.4S, v15.4S, v16.4H
__ uaddwv2(v10, v11, __ T4S, v12, __ T8H); // uaddw2 v10.4S, v11.4S, v12.8H
__ uaddwv(v2, v3, __ T2D, v4, __ T2S); // uaddw v2.2D, v3.2D, v4.2S
__ uaddwv2(v10, v11, __ T2D, v12, __ T4S); // uaddw2 v10.2D, v11.2D, v12.4S
__ saddwv(v25, v26, __ T8H, v27, __ T8B); // saddw v25.8H, v26.8H, v27.8B
__ saddwv2(v15, v16, __ T8H, v17, __ T16B); // saddw2 v15.8H, v16.8H, v17.16B
__ saddwv(v3, v4, __ T4S, v5, __ T4H); // saddw v3.4S, v4.4S, v5.4H
__ saddwv2(v18, v19, __ T4S, v20, __ T8H); // saddw2 v18.4S, v19.4S, v20.8H
__ saddwv(v14, v15, __ T2D, v16, __ T2S); // saddw v14.2D, v15.2D, v16.2S
__ saddwv2(v10, v11, __ T2D, v12, __ T4S); // saddw2 v10.2D, v11.2D, v12.4S
__ uaddwv(v2, v3, __ T8H, v4, __ T8B); // uaddw v2.8H, v3.8H, v4.8B
__ uaddwv2(v10, v11, __ T8H, v12, __ T16B); // uaddw2 v10.8H, v11.8H, v12.16B
__ uaddwv(v8, v9, __ T4S, v10, __ T4H); // uaddw v8.4S, v9.4S, v10.4H
__ uaddwv2(v11, v12, __ T4S, v13, __ T8H); // uaddw2 v11.4S, v12.4S, v13.8H
__ uaddwv(v22, v23, __ T2D, v24, __ T2S); // uaddw v22.2D, v23.2D, v24.2S
__ uaddwv2(v3, v4, __ T2D, v5, __ T4S); // uaddw2 v3.2D, v4.2D, v5.4S
__ bind(forth);
@ -1469,30 +1470,30 @@
0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061,
0x120cb166, 0x321764bc, 0x52174681, 0x720c0227,
0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01,
0x14000000, 0x17ffffd7, 0x140004cb, 0x94000000,
0x97ffffd4, 0x940004c8, 0x3400000a, 0x34fffa2a,
0x340098aa, 0x35000008, 0x35fff9c8, 0x35009848,
0xb400000b, 0xb4fff96b, 0xb40097eb, 0xb500001d,
0xb5fff91d, 0xb500979d, 0x10000013, 0x10fff8b3,
0x10009733, 0x90000013, 0x36300016, 0x3637f836,
0x363096b6, 0x3758000c, 0x375ff7cc, 0x3758964c,
0x14000000, 0x17ffffd7, 0x140004cc, 0x94000000,
0x97ffffd4, 0x940004c9, 0x3400000a, 0x34fffa2a,
0x340098ca, 0x35000008, 0x35fff9c8, 0x35009868,
0xb400000b, 0xb4fff96b, 0xb400980b, 0xb500001d,
0xb5fff91d, 0xb50097bd, 0x10000013, 0x10fff8b3,
0x10009753, 0x90000013, 0x36300016, 0x3637f836,
0x363096d6, 0x3758000c, 0x375ff7cc, 0x3758966c,
0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc,
0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f,
0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016,
0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0,
0x54009420, 0x54000001, 0x54fff541, 0x540093c1,
0x54000002, 0x54fff4e2, 0x54009362, 0x54000002,
0x54fff482, 0x54009302, 0x54000003, 0x54fff423,
0x540092a3, 0x54000003, 0x54fff3c3, 0x54009243,
0x54000004, 0x54fff364, 0x540091e4, 0x54000005,
0x54fff305, 0x54009185, 0x54000006, 0x54fff2a6,
0x54009126, 0x54000007, 0x54fff247, 0x540090c7,
0x54000008, 0x54fff1e8, 0x54009068, 0x54000009,
0x54fff189, 0x54009009, 0x5400000a, 0x54fff12a,
0x54008faa, 0x5400000b, 0x54fff0cb, 0x54008f4b,
0x5400000c, 0x54fff06c, 0x54008eec, 0x5400000d,
0x54fff00d, 0x54008e8d, 0x5400000e, 0x54ffefae,
0x54008e2e, 0x5400000f, 0x54ffef4f, 0x54008dcf,
0x54009440, 0x54000001, 0x54fff541, 0x540093e1,
0x54000002, 0x54fff4e2, 0x54009382, 0x54000002,
0x54fff482, 0x54009322, 0x54000003, 0x54fff423,
0x540092c3, 0x54000003, 0x54fff3c3, 0x54009263,
0x54000004, 0x54fff364, 0x54009204, 0x54000005,
0x54fff305, 0x540091a5, 0x54000006, 0x54fff2a6,
0x54009146, 0x54000007, 0x54fff247, 0x540090e7,
0x54000008, 0x54fff1e8, 0x54009088, 0x54000009,
0x54fff189, 0x54009029, 0x5400000a, 0x54fff12a,
0x54008fca, 0x5400000b, 0x54fff0cb, 0x54008f6b,
0x5400000c, 0x54fff06c, 0x54008f0c, 0x5400000d,
0x54fff00d, 0x54008ead, 0x5400000e, 0x54ffefae,
0x54008e4e, 0x5400000f, 0x54ffef4f, 0x54008def,
0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60,
0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f,
0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf,
@ -1535,7 +1536,7 @@
0x39598921, 0x795d3077, 0x399d0675, 0x7998d8f3,
0x79dbd02a, 0xb99d068a, 0xfd5d11a0, 0xbd58d76b,
0xfd1ac72d, 0xbd1d9c14, 0x5800001a, 0x18ffda33,
0xf8991100, 0xd8007880, 0xf8a758e0, 0xf9989d80,
0xf8991100, 0xd80078a0, 0xf8a758e0, 0xf9989d80,
0x1a0b0298, 0x3a1c01a0, 0x5a0400ea, 0x7a02020f,
0x9a1d028c, 0xba0e01ad, 0xda140186, 0xfa19022c,
0x0b2b877e, 0x2b21c8ee, 0xcb3ba47d, 0x6b3ae9a0,
@ -1769,13 +1770,13 @@
0x65b45aff, 0x65e07fa2, 0x04454097, 0x044d6e3c,
0x04283148, 0x04bd3013, 0x047731b0, 0x04ed33d7,
0x05606ad9, 0x056b6fd9, 0x658896ce, 0x4540b245,
0x45c3b449, 0x04243bae, 0x44d8948e, 0x449a8edb,
0x4499997f, 0x441b938c, 0x04da309c, 0x049821e6,
0x04993641, 0x04482882, 0x048a2b8b, 0x044937e3,
0x044b35f8, 0x65872d06, 0x65c63c95, 0x659834d8,
0x04c12924, 0x0e2c116a, 0x4e2710c5, 0x0e61101f,
0x4e7812f6, 0x0ebb1359, 0x4eb1120f, 0x2e251083,
0x6e341272, 0x2e7011ee, 0x6e6c116a, 0x2ea41062,
0x6eac116a,
0x45c3b449, 0x04243fae, 0x0436388e, 0x44988edb,
0x449a997f, 0x4419938c, 0x44db909c, 0x049a21e6,
0x04983641, 0x04592882, 0x04882b8b, 0x044a37e3,
0x044935f8, 0x044b2d06, 0x65c73c95, 0x658634d8,
0x65d82924, 0x048127ea, 0x0e3b1359, 0x4e31120f,
0x0e651083, 0x4e741272, 0x0eb011ee, 0x4eac116a,
0x2e241062, 0x6e2c116a, 0x2e6a1128, 0x6e6d118b,
0x2eb812f6, 0x6ea51083,
};
// END Generated code -- do not edit

View File

@ -2442,6 +2442,12 @@ public class IRNode {
vectorNode(VECTOR_BLEND_D, "VectorBlend", TYPE_DOUBLE);
}
// IR node name for C2's VectorBitwiseBlend. It uses the plain PREFIX and an
// explicit START/MID/END regex instead of the vectorNode(...) helper used by
// the neighbouring VECTOR_BLEND_* and VECTOR_MASK_CMP_* entries, so no element
// type is passed here.
// NOTE(review): judging by the helper name, matching is presumably limited to
// the compile phases from after barrier expansion up to before matching -
// confirm against afterBarrierExpansionToBeforeMatching.
public static final String VECTOR_BITWISE_BLEND = PREFIX + "VECTOR_BITWISE_BLEND" + POSTFIX;
static {
String regex = START + "VectorBitwiseBlend" + MID + END;
afterBarrierExpansionToBeforeMatching(VECTOR_BITWISE_BLEND, regex);
}
public static final String VECTOR_MASK_CMP_I = VECTOR_PREFIX + "VECTOR_MASK_CMP_I" + POSTFIX;
static {
vectorNode(VECTOR_MASK_CMP_I, "VectorMaskCmp", TYPE_INT);

View File

@ -0,0 +1,217 @@
/*
* Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* @test
* @bug 8382052
* @key randomness
* @library /test/lib /
* @summary IR tests for Vector BITWISE_BLEND optimization
* @modules jdk.incubator.vector
*
* @run driver ${test.main.class}
*/
package compiler.vectorapi;
import compiler.lib.generators.*;
import compiler.lib.ir_framework.*;
import jdk.incubator.vector.*;
/**
 * IR-shape tests for {@code VectorOperators.BITWISE_BLEND}.
 *
 * Every test loads three vectors from index 0 of its input arrays, applies the
 * lanewise BITWISE_BLEND (optionally under a mask) and stores the result. The
 * {@code @IR} rules then require exactly one VectorBitwiseBlend node in either
 * of two configurations:
 * <ul>
 *   <li>asimd present, sve2 absent, and MaxVectorSize &lt;= 16;</li>
 *   <li>sve2 present.</li>
 * </ul>
 * The masked variants additionally require exactly one VectorBlend node of the
 * matching lane type.
 *
 * Only the IR shape is checked; no test in this class compares the stored
 * results against expected values.
 */
public class VectorBitwiseBlendTest {
// Generator factory used to populate the input arrays in the static initializer.
private static final Generators RD = Generators.G;
// Each lane type is tested with its SPECIES_MAX species.
private static final VectorSpecies<Byte> B_SPECIES = ByteVector.SPECIES_MAX;
private static final VectorSpecies<Short> S_SPECIES = ShortVector.SPECIES_MAX;
private static final VectorSpecies<Integer> I_SPECIES = IntVector.SPECIES_MAX;
private static final VectorSpecies<Long> L_SPECIES = LongVector.SPECIES_MAX;
// Length of every array below; the tests only access them from index 0.
private static final int BUF_LEN = 256;
// Per lane type: *a, *b and *c are the three operands, *r receives the result.
private static final byte[] ba = new byte[BUF_LEN];
private static final byte[] bb = new byte[BUF_LEN];
private static final byte[] bc = new byte[BUF_LEN];
private static final byte[] br = new byte[BUF_LEN];
private static final short[] sa = new short[BUF_LEN];
private static final short[] sb = new short[BUF_LEN];
private static final short[] sc = new short[BUF_LEN];
private static final short[] sr = new short[BUF_LEN];
private static final int[] ia = new int[BUF_LEN];
private static final int[] ib = new int[BUF_LEN];
private static final int[] ic = new int[BUF_LEN];
private static final int[] ir = new int[BUF_LEN];
private static final long[] la = new long[BUF_LEN];
private static final long[] lb = new long[BUF_LEN];
private static final long[] lc = new long[BUF_LEN];
private static final long[] lr = new long[BUF_LEN];
// Mask source shared by all masked tests, whatever the lane type.
private static final boolean[] mask_arr = new boolean[BUF_LEN];
// Fill the operand arrays with generated values and build the mask pattern.
static {
Generator<Integer> iGen = RD.ints();
Generator<Long> lGen = RD.longs();
for (int i = 0; i < BUF_LEN; i++) {
// Alternating pattern: odd indices are set, even indices are clear.
mask_arr[i] = (i & 1) != 0;
// Byte and short operands are narrowed from the int generator.
ba[i] = iGen.next().byteValue();
bb[i] = iGen.next().byteValue();
bc[i] = iGen.next().byteValue();
sa[i] = iGen.next().shortValue();
sb[i] = iGen.next().shortValue();
sc[i] = iGen.next().shortValue();
}
RD.fill(iGen, ia);
RD.fill(iGen, ib);
RD.fill(iGen, ic);
RD.fill(lGen, la);
RD.fill(lGen, lb);
RD.fill(lGen, lc);
}
// Unmasked byte blend: expects exactly one VectorBitwiseBlend node.
@Test
@IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" },
applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
applyIf = { "MaxVectorSize", "<= 16" })
@IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" },
applyIfCPUFeature = { "sve2", "true" })
public static void testUnmaskedBlendByte() {
ByteVector va = ByteVector.fromArray(B_SPECIES, ba, 0);
ByteVector vb = ByteVector.fromArray(B_SPECIES, bb, 0);
ByteVector vc = ByteVector.fromArray(B_SPECIES, bc, 0);
va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc).intoArray(br, 0);
}
// Unmasked short blend: expects exactly one VectorBitwiseBlend node.
@Test
@IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" },
applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
applyIf = { "MaxVectorSize", "<= 16" })
@IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" },
applyIfCPUFeature = { "sve2", "true" })
public static void testUnmaskedBlendShort() {
ShortVector va = ShortVector.fromArray(S_SPECIES, sa, 0);
ShortVector vb = ShortVector.fromArray(S_SPECIES, sb, 0);
ShortVector vc = ShortVector.fromArray(S_SPECIES, sc, 0);
va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc).intoArray(sr, 0);
}
// Unmasked int blend: expects exactly one VectorBitwiseBlend node.
@Test
@IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" },
applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
applyIf = { "MaxVectorSize", "<= 16" })
@IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" },
applyIfCPUFeature = { "sve2", "true" })
public static void testUnmaskedBlendInt() {
IntVector va = IntVector.fromArray(I_SPECIES, ia, 0);
IntVector vb = IntVector.fromArray(I_SPECIES, ib, 0);
IntVector vc = IntVector.fromArray(I_SPECIES, ic, 0);
va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc).intoArray(ir, 0);
}
// Unmasked long blend: expects exactly one VectorBitwiseBlend node.
@Test
@IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" },
applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
applyIf = { "MaxVectorSize", "<= 16" })
@IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" },
applyIfCPUFeature = { "sve2", "true" })
public static void testUnmaskedBlendLong() {
LongVector va = LongVector.fromArray(L_SPECIES, la, 0);
LongVector vb = LongVector.fromArray(L_SPECIES, lb, 0);
LongVector vc = LongVector.fromArray(L_SPECIES, lc, 0);
va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc).intoArray(lr, 0);
}
// Masked byte blend: expects one byte VectorBlend plus one VectorBitwiseBlend.
@Test
@IR(counts = { IRNode.VECTOR_BLEND_B, "= 1",
IRNode.VECTOR_BITWISE_BLEND, "= 1" },
applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
applyIf = { "MaxVectorSize", "<= 16" })
@IR(counts = { IRNode.VECTOR_BLEND_B, "= 1",
IRNode.VECTOR_BITWISE_BLEND, "= 1" },
applyIfCPUFeature = { "sve2", "true" })
public static void testMaskedBlendByte() {
VectorMask<Byte> mask = VectorMask.fromArray(B_SPECIES, mask_arr, 0);
ByteVector va = ByteVector.fromArray(B_SPECIES, ba, 0);
ByteVector vb = ByteVector.fromArray(B_SPECIES, bb, 0);
ByteVector vc = ByteVector.fromArray(B_SPECIES, bc, 0);
va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc, mask).intoArray(br, 0);
}
// Masked short blend: expects one short VectorBlend plus one VectorBitwiseBlend.
@Test
@IR(counts = { IRNode.VECTOR_BLEND_S, "= 1",
IRNode.VECTOR_BITWISE_BLEND, "= 1" },
applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
applyIf = { "MaxVectorSize", "<= 16" })
@IR(counts = { IRNode.VECTOR_BLEND_S, "= 1",
IRNode.VECTOR_BITWISE_BLEND, "= 1" },
applyIfCPUFeature = { "sve2", "true" })
public static void testMaskedBlendShort() {
VectorMask<Short> mask = VectorMask.fromArray(S_SPECIES, mask_arr, 0);
ShortVector va = ShortVector.fromArray(S_SPECIES, sa, 0);
ShortVector vb = ShortVector.fromArray(S_SPECIES, sb, 0);
ShortVector vc = ShortVector.fromArray(S_SPECIES, sc, 0);
va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc, mask).intoArray(sr, 0);
}
// Masked int blend: expects one int VectorBlend plus one VectorBitwiseBlend.
@Test
@IR(counts = { IRNode.VECTOR_BLEND_I, "= 1",
IRNode.VECTOR_BITWISE_BLEND, "= 1" },
applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
applyIf = { "MaxVectorSize", "<= 16" })
@IR(counts = { IRNode.VECTOR_BLEND_I, "= 1",
IRNode.VECTOR_BITWISE_BLEND, "= 1" },
applyIfCPUFeature = { "sve2", "true" })
public static void testMaskedBlendInt() {
VectorMask<Integer> mask = VectorMask.fromArray(I_SPECIES, mask_arr, 0);
IntVector va = IntVector.fromArray(I_SPECIES, ia, 0);
IntVector vb = IntVector.fromArray(I_SPECIES, ib, 0);
IntVector vc = IntVector.fromArray(I_SPECIES, ic, 0);
va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc, mask).intoArray(ir, 0);
}
// Masked long blend: expects one long VectorBlend plus one VectorBitwiseBlend.
@Test
@IR(counts = { IRNode.VECTOR_BLEND_L, "= 1",
IRNode.VECTOR_BITWISE_BLEND, "= 1" },
applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
applyIf = { "MaxVectorSize", "<= 16" })
@IR(counts = { IRNode.VECTOR_BLEND_L, "= 1",
IRNode.VECTOR_BITWISE_BLEND, "= 1" },
applyIfCPUFeature = { "sve2", "true" })
public static void testMaskedBlendLong() {
VectorMask<Long> mask = VectorMask.fromArray(L_SPECIES, mask_arr, 0);
LongVector va = LongVector.fromArray(L_SPECIES, la, 0);
LongVector vb = LongVector.fromArray(L_SPECIES, lb, 0);
LongVector vc = LongVector.fromArray(L_SPECIES, lc, 0);
va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc, mask).intoArray(lr, 0);
}
// Entry point: runs the tests through the IR test framework with a default
// warmup of 10000 and the incubator vector module added.
public static void main(String[] args) {
TestFramework testFramework = new TestFramework();
testFramework.setDefaultWarmup(10000)
.addFlags("--add-modules=jdk.incubator.vector")
.start();
}
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2022, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -55,8 +55,9 @@ public class MaskedLogicOpts {
int int512_arr_idx;
int int256_arr_idx;
int int128_arr_idx;
int long256_arr_idx;
int long512_arr_idx;
int long256_arr_idx;
int long128_arr_idx;
private Random r = new Random(1024);
@ -65,8 +66,9 @@ public class MaskedLogicOpts {
int512_arr_idx = -16;
int256_arr_idx = -8;
int128_arr_idx = -4;
long256_arr_idx = -4;
long512_arr_idx = -8;
long256_arr_idx = -4;
long128_arr_idx = -2;
mask_arr = new boolean[ARRAYLEN];
i1 = new int[ARRAYLEN];
@ -106,6 +108,7 @@ public class MaskedLogicOpts {
int128_arr_idx = (((ARRAYLEN & ~3) - int128_arr_idx) <= 4) ? 0 : int128_arr_idx + 4;
long512_arr_idx = (((ARRAYLEN & ~7) - long512_arr_idx) <= 8) ? 0 : long512_arr_idx + 8;
long256_arr_idx = (((ARRAYLEN & ~3) - long256_arr_idx) <= 4) ? 0 : long256_arr_idx + 4;
long128_arr_idx = (((ARRAYLEN & ~1) - long128_arr_idx) <= 2) ? 0 : long128_arr_idx + 2;
}
@CompilerControl(CompilerControl.Mode.INLINE)
@ -278,6 +281,11 @@ public class MaskedLogicOpts {
partiallyMaskedLogicOperationsLongKernel(LongVector.SPECIES_256, long256_arr_idx);
}
// 128-bit long variant of the partially masked logic benchmark: forwards to
// the shared long kernel with LongVector.SPECIES_128 and long128_arr_idx.
@Benchmark
public void partiallyMaskedLogicOperationsLong128() {
partiallyMaskedLogicOperationsLongKernel(LongVector.SPECIES_128, long128_arr_idx);
}
@CompilerControl(CompilerControl.Mode.INLINE)
public void bitwiseBlendOperationLongKernel(VectorSpecies<Long> SPECIES, int index) {
VectorMask<Long> lmask = VectorMask.fromArray(SPECIES, mask_arr, index);
@ -305,4 +313,9 @@ public class MaskedLogicOpts {
public void bitwiseBlendOperationLong256() {
bitwiseBlendOperationLongKernel(LongVector.SPECIES_256, long256_arr_idx);
}
// 128-bit long variant of the bitwise blend benchmark: forwards to the shared
// long kernel with LongVector.SPECIES_128 and long128_arr_idx.
@Benchmark
public void bitwiseBlendOperationLong128() {
bitwiseBlendOperationLongKernel(LongVector.SPECIES_128, long128_arr_idx);
}
}