mirror of
https://github.com/openjdk/jdk.git
synced 2026-06-11 04:57:12 +00:00
Vector API `lanewise BITWISE_BLEND` on AArch64 is currently lowered to a generic vector sequence built from `(XorV(AndV(XorV)))` nodes. AArch64 provides a more efficient mapping for this operation through the NEON `BSL` and SVE `BSL` (bitwise select) instructions. This change teaches C2 to recognize the `BITWISE_BLEND` patterns and lower them to the dedicated AArch64 instructions for better performance. The change includes the AArch64 match rules and assembler support, updates the AArch64 asm tests, adds IR framework nodes for the new mach instructions, introduces a new jtreg IR test and extends the MaskedLogicOpts JMH benchmark for 128-bit long type. JMH results show **11% - 54%** performance improvements for the optimized cases, and all jtreg tests (tier1, tier2 and tier3) pass on SVE2, SVE1, and NEON configurations. On an Nvidia Grace (Neoverse-V2) machine with 128-bit SVE2: ``` Benchmark Unit ARRAYLEN Before Error After Error Uplift bitwiseBlendOperationInt128 ops/s 256.00 3787.49 5.29 4277.64 8.89 1.13 bitwiseBlendOperationInt128 ops/s 512.00 1888.24 11.02 2143.21 6.32 1.14 bitwiseBlendOperationInt128 ops/s 1024.00 938.22 6.24 1053.45 14.68 1.12 bitwiseBlendOperationLong128 ops/s 256.00 1895.45 13.68 2140.31 3.68 1.13 bitwiseBlendOperationLong128 ops/s 512.00 938.71 5.32 1052.16 14.07 1.12 bitwiseBlendOperationLong128 ops/s 1024.00 474.15 2.33 526.49 2.62 1.11 ``` On an AWS Graviton3 (Neoverse-V1) machine with 256-bit SVE1: ``` Benchmark Unit ARRAYLEN Before Error After Error Uplift bitwiseBlendOperationInt128 ops/s 256.00 2051.52 13.85 2481.44 0.27 1.21 bitwiseBlendOperationInt128 ops/s 512.00 995.47 20.77 1235.10 5.70 1.24 bitwiseBlendOperationInt128 ops/s 1024.00 507.73 9.83 617.59 2.43 1.22 bitwiseBlendOperationLong128 ops/s 256.00 1000.99 21.50 1235.39 5.48 1.23 bitwiseBlendOperationLong128 ops/s 512.00 507.73 9.74 617.67 2.32 1.22 bitwiseBlendOperationLong128 ops/s 1024.00 258.86 0.01 310.70 0.04 1.20 ``` On an Nvidia Grace (Neoverse-V2) machine with 
128-bit NEON: ``` Benchmark Unit ARRAYLEN Before Error After Error Uplift bitwiseBlendOperationInt128 ops/s 256.00 2336.17 13.18 3505.19 19.61 1.50 bitwiseBlendOperationInt128 ops/s 512.00 1145.50 12.40 1735.24 10.43 1.51 bitwiseBlendOperationInt128 ops/s 1024.00 571.41 6.51 866.01 3.34 1.52 bitwiseBlendOperationLong128 ops/s 256.00 1140.38 13.77 1740.28 11.16 1.53 bitwiseBlendOperationLong128 ops/s 512.00 570.20 7.58 865.67 3.33 1.52 bitwiseBlendOperationLong128 ops/s 1024.00 280.94 2.58 432.78 0.19 1.54 ```
222 lines
9.5 KiB
Java
222 lines
9.5 KiB
Java
/*
|
|
* Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
|
*
|
|
* This code is free software; you can redistribute it and/or modify it
|
|
* under the terms of the GNU General Public License version 2 only, as
|
|
* published by the Free Software Foundation.
|
|
*
|
|
* This code is distributed in the hope that it will be useful, but WITHOUT
|
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
* version 2 for more details (a copy is included in the LICENSE file that
|
|
* accompanied this code).
|
|
*
|
|
* You should have received a copy of the GNU General Public License version
|
|
* 2 along with this work; if not, write to the Free Software Foundation,
|
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
*
|
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
|
* or visit www.oracle.com if you need additional information or have any
|
|
* questions.
|
|
*/
|
|
|
|
/*
|
|
* @test
|
|
* @bug 8382052
|
|
* @key randomness
|
|
* @library /test/lib /
|
|
* @summary IR tests for AArch64 BITWISE_BLEND optimization match rules
|
|
* @modules jdk.incubator.vector
|
|
*
|
|
* @run driver ${test.main.class}
|
|
*/
|
|
|
|
package compiler.vectorapi;
|
|
|
|
import compiler.lib.generators.*;
|
|
import compiler.lib.ir_framework.*;
|
|
import jdk.incubator.vector.*;
|
|
|
|
public class VectorBitwiseBlendTest {
|
|
|
|
private static final Generators RD = Generators.G;
|
|
|
|
private static final VectorSpecies<Byte> B_SPECIES = ByteVector.SPECIES_MAX;
|
|
private static final VectorSpecies<Short> S_SPECIES = ShortVector.SPECIES_MAX;
|
|
private static final VectorSpecies<Integer> I_SPECIES = IntVector.SPECIES_MAX;
|
|
private static final VectorSpecies<Long> L_SPECIES = LongVector.SPECIES_MAX;
|
|
|
|
private static final int BUF_LEN = 256;
|
|
|
|
private static final byte[] ba = new byte[BUF_LEN];
|
|
private static final byte[] bb = new byte[BUF_LEN];
|
|
private static final byte[] bc = new byte[BUF_LEN];
|
|
private static final byte[] br = new byte[BUF_LEN];
|
|
|
|
private static final short[] sa = new short[BUF_LEN];
|
|
private static final short[] sb = new short[BUF_LEN];
|
|
private static final short[] sc = new short[BUF_LEN];
|
|
private static final short[] sr = new short[BUF_LEN];
|
|
|
|
private static final int[] ia = new int[BUF_LEN];
|
|
private static final int[] ib = new int[BUF_LEN];
|
|
private static final int[] ic = new int[BUF_LEN];
|
|
private static final int[] ir = new int[BUF_LEN];
|
|
|
|
private static final long[] la = new long[BUF_LEN];
|
|
private static final long[] lb = new long[BUF_LEN];
|
|
private static final long[] lc = new long[BUF_LEN];
|
|
private static final long[] lr = new long[BUF_LEN];
|
|
|
|
private static final boolean[] mask_arr = new boolean[BUF_LEN];
|
|
|
|
static {
|
|
Generator<Integer> iGen = RD.ints();
|
|
Generator<Long> lGen = RD.longs();
|
|
|
|
for (int i = 0; i < BUF_LEN; i++) {
|
|
mask_arr[i] = (i & 1) != 0;
|
|
ba[i] = iGen.next().byteValue();
|
|
bb[i] = iGen.next().byteValue();
|
|
bc[i] = iGen.next().byteValue();
|
|
sa[i] = iGen.next().shortValue();
|
|
sb[i] = iGen.next().shortValue();
|
|
sc[i] = iGen.next().shortValue();
|
|
}
|
|
RD.fill(iGen, ia);
|
|
RD.fill(iGen, ib);
|
|
RD.fill(iGen, ic);
|
|
RD.fill(lGen, la);
|
|
RD.fill(lGen, lb);
|
|
RD.fill(lGen, lc);
|
|
}
|
|
|
|
@Test
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_NEON_SVE1, "= 1" },
|
|
applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
|
|
applyIf = { "MaxVectorSize", "<= 16" })
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_SVE2, "= 1" },
|
|
applyIfCPUFeature = { "sve2", "true" })
|
|
public static void testUnmaskedBlendByte() {
|
|
ByteVector va = ByteVector.fromArray(B_SPECIES, ba, 0);
|
|
ByteVector vb = ByteVector.fromArray(B_SPECIES, bb, 0);
|
|
ByteVector vc = ByteVector.fromArray(B_SPECIES, bc, 0);
|
|
va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc).intoArray(br, 0);
|
|
}
|
|
|
|
@Test
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_NEON_SVE1, "= 1" },
|
|
applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
|
|
applyIf = { "MaxVectorSize", "<= 16" })
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_SVE2, "= 1" },
|
|
applyIfCPUFeature = { "sve2", "true" })
|
|
public static void testUnmaskedBlendShort() {
|
|
ShortVector va = ShortVector.fromArray(S_SPECIES, sa, 0);
|
|
ShortVector vb = ShortVector.fromArray(S_SPECIES, sb, 0);
|
|
ShortVector vc = ShortVector.fromArray(S_SPECIES, sc, 0);
|
|
va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc).intoArray(sr, 0);
|
|
}
|
|
|
|
@Test
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_NEON_SVE1, "= 1" },
|
|
applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
|
|
applyIf = { "MaxVectorSize", "<= 16" })
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_SVE2, "= 1" },
|
|
applyIfCPUFeature = { "sve2", "true" })
|
|
public static void testUnmaskedBlendInt() {
|
|
IntVector va = IntVector.fromArray(I_SPECIES, ia, 0);
|
|
IntVector vb = IntVector.fromArray(I_SPECIES, ib, 0);
|
|
IntVector vc = IntVector.fromArray(I_SPECIES, ic, 0);
|
|
va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc).intoArray(ir, 0);
|
|
}
|
|
|
|
@Test
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_NEON_SVE1, "= 1" },
|
|
applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
|
|
applyIf = { "MaxVectorSize", "<= 16" })
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_SVE2, "= 1" },
|
|
applyIfCPUFeature = { "sve2", "true" })
|
|
public static void testUnmaskedBlendLong() {
|
|
LongVector va = LongVector.fromArray(L_SPECIES, la, 0);
|
|
LongVector vb = LongVector.fromArray(L_SPECIES, lb, 0);
|
|
LongVector vc = LongVector.fromArray(L_SPECIES, lc, 0);
|
|
va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc).intoArray(lr, 0);
|
|
}
|
|
|
|
@Test
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_NEON_SVE1, "= 1" },
|
|
applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" },
|
|
applyIf = { "MaxVectorSize", "<= 16" })
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_MASKED_SVE1, "= 1" },
|
|
applyIfCPUFeatureAnd = { "sve", "true", "sve2", "false" },
|
|
applyIf = { "MaxVectorSize", "<= 16" })
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_MASKED_SVE2, "= 1" },
|
|
applyIfCPUFeature = { "sve2", "true" })
|
|
public static void testMaskedBlendByte() {
|
|
VectorMask<Byte> mask = VectorMask.fromArray(B_SPECIES, mask_arr, 0);
|
|
ByteVector va = ByteVector.fromArray(B_SPECIES, ba, 0);
|
|
ByteVector vb = ByteVector.fromArray(B_SPECIES, bb, 0);
|
|
ByteVector vc = ByteVector.fromArray(B_SPECIES, bc, 0);
|
|
va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc, mask).intoArray(br, 0);
|
|
}
|
|
|
|
@Test
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_NEON_SVE1, "= 1" },
|
|
applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" },
|
|
applyIf = { "MaxVectorSize", "<= 16" })
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_MASKED_SVE1, "= 1" },
|
|
applyIfCPUFeatureAnd = { "sve", "true", "sve2", "false" },
|
|
applyIf = { "MaxVectorSize", "<= 16" })
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_MASKED_SVE2, "= 1" },
|
|
applyIfCPUFeature = { "sve2", "true" })
|
|
public static void testMaskedBlendShort() {
|
|
VectorMask<Short> mask = VectorMask.fromArray(S_SPECIES, mask_arr, 0);
|
|
ShortVector va = ShortVector.fromArray(S_SPECIES, sa, 0);
|
|
ShortVector vb = ShortVector.fromArray(S_SPECIES, sb, 0);
|
|
ShortVector vc = ShortVector.fromArray(S_SPECIES, sc, 0);
|
|
va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc, mask).intoArray(sr, 0);
|
|
}
|
|
|
|
@Test
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_NEON_SVE1, "= 1" },
|
|
applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" },
|
|
applyIf = { "MaxVectorSize", "<= 16" })
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_MASKED_SVE1, "= 1" },
|
|
applyIfCPUFeatureAnd = { "sve", "true", "sve2", "false" },
|
|
applyIf = { "MaxVectorSize", "<= 16" })
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_MASKED_SVE2, "= 1" },
|
|
applyIfCPUFeature = { "sve2", "true" })
|
|
public static void testMaskedBlendInt() {
|
|
VectorMask<Integer> mask = VectorMask.fromArray(I_SPECIES, mask_arr, 0);
|
|
IntVector va = IntVector.fromArray(I_SPECIES, ia, 0);
|
|
IntVector vb = IntVector.fromArray(I_SPECIES, ib, 0);
|
|
IntVector vc = IntVector.fromArray(I_SPECIES, ic, 0);
|
|
va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc, mask).intoArray(ir, 0);
|
|
}
|
|
|
|
@Test
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_NEON_SVE1, "= 1" },
|
|
applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" },
|
|
applyIf = { "MaxVectorSize", "<= 16" })
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_MASKED_SVE1, "= 1" },
|
|
applyIfCPUFeatureAnd = { "sve", "true", "sve2", "false" },
|
|
applyIf = { "MaxVectorSize", "<= 16" })
|
|
@IR(counts = { IRNode.VBITWISE_BLEND_MASKED_SVE2, "= 1" },
|
|
applyIfCPUFeature = { "sve2", "true" })
|
|
public static void testMaskedBlendLong() {
|
|
VectorMask<Long> mask = VectorMask.fromArray(L_SPECIES, mask_arr, 0);
|
|
LongVector va = LongVector.fromArray(L_SPECIES, la, 0);
|
|
LongVector vb = LongVector.fromArray(L_SPECIES, lb, 0);
|
|
LongVector vc = LongVector.fromArray(L_SPECIES, lc, 0);
|
|
va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc, mask).intoArray(lr, 0);
|
|
}
|
|
|
|
public static void main(String[] args) {
|
|
TestFramework testFramework = new TestFramework();
|
|
testFramework.setDefaultWarmup(10000)
|
|
.addFlags("--add-modules=jdk.incubator.vector")
|
|
.start();
|
|
}
|
|
}
|