diff --git a/test/hotspot/jtreg/compiler/vectorization/TestVectorAlgorithms.java b/test/hotspot/jtreg/compiler/vectorization/TestVectorAlgorithms.java index a0c20f58c26..af040b49c65 100644 --- a/test/hotspot/jtreg/compiler/vectorization/TestVectorAlgorithms.java +++ b/test/hotspot/jtreg/compiler/vectorization/TestVectorAlgorithms.java @@ -31,13 +31,12 @@ * @run driver ${test.main.class} */ -package compiler.loopopts.superword; +package compiler.vectorization; import java.util.Map; import java.util.HashMap; import jdk.test.lib.Utils; import java.util.Random; -import jdk.incubator.vector.*; import compiler.lib.ir_framework.*; import compiler.lib.generators.*; @@ -55,8 +54,6 @@ public class TestVectorAlgorithms { private static final Random RANDOM = Utils.getRandomInstance(); private static final RestrictableGenerator INT_GEN = Generators.G.ints(); - private static final VectorSpecies SPECIES_I = IntVector.SPECIES_PREFERRED; - interface TestFunction { Object run(); } @@ -67,7 +64,7 @@ public class TestVectorAlgorithms { public static void main(String[] args) { TestFramework framework = new TestFramework(); - framework.addFlags("--add-modules=jdk.incubator.vector"); + framework.addFlags("--add-modules=jdk.incubator.vector", "-XX:CompileCommand=inline,*VectorAlgorithmsImpl::*"); framework.start(); } @@ -131,60 +128,21 @@ public class TestVectorAlgorithms { IRNode.ADD_VI, "> 0"}, applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) public int reduceAddI_loop(int[] a) { - int sum = 0; - for (int i = 0; i < a.length; i++) { - // Relying on simple reduction loop should vectorize since JDK26. - sum += a[i]; - } - return sum; + return VectorAlgorithmsImpl.reduceAddI_loop(a); } @Test public int reduceAddI_reassociate(int[] a) { - int sum = 0; - int i; - for (i = 0; i < a.length - 3; i+=4) { - // Unroll 4x, reassociate inside. 
- sum += a[i] + a[i + 1] + a[i + 2] + a[i + 3]; - } - for (; i < a.length; i++) { - // Tail - sum += a[i]; - } - return sum; + return VectorAlgorithmsImpl.reduceAddI_reassociate(a); } @Test public int reduceAddI_VectorAPI_naive(int[] a) { - var sum = 0; - int i; - for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) { - IntVector v = IntVector.fromArray(SPECIES_I, a, i); - // reduceLanes in loop is better than scalar performance, but still - // relatively slow. - sum += v.reduceLanes(VectorOperators.ADD); - } - for (; i < a.length; i++) { - sum += a[i]; - } - return sum; + return VectorAlgorithmsImpl.reduceAddI_VectorAPI_naive(a); } @Test public int reduceAddI_VectorAPI_reduction_after_loop(int[] a) { - var acc = IntVector.broadcast(SPECIES_I, 0); - int i; - for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) { - IntVector v = IntVector.fromArray(SPECIES_I, a, i); - // Element-wide addition into a vector of partial sums is much faster. - // Now, we only need to do a reduceLanes after the loop. - // This works because int-addition is associative and commutative. - acc = acc.add(v); - } - int sum = acc.reduceLanes(VectorOperators.ADD); - for (; i < a.length; i++) { - sum += a[i]; - } - return sum; + return VectorAlgorithmsImpl.reduceAddI_VectorAPI_reduction_after_loop(a); } } diff --git a/test/hotspot/jtreg/compiler/vectorization/VectorAlgorithmsImpl.java b/test/hotspot/jtreg/compiler/vectorization/VectorAlgorithmsImpl.java new file mode 100644 index 00000000000..6037496a2a1 --- /dev/null +++ b/test/hotspot/jtreg/compiler/vectorization/VectorAlgorithmsImpl.java @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. 
+ * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +package compiler.vectorization; + +import jdk.incubator.vector.*; + +/** + * The code below is supposed to be an exact copy of: + * micro/org/openjdk/bench/vm/compiler/VectorAlgorithmsImpl.java + */ +public class VectorAlgorithmsImpl { + private static final VectorSpecies SPECIES_I = IntVector.SPECIES_PREFERRED; + private static final VectorSpecies SPECIES_I512 = IntVector.SPECIES_512; + + public static int reduceAddI_loop(int[] a) { + int sum = 0; + for (int i = 0; i < a.length; i++) { + // Relying on simple reduction loop should vectorize since JDK26. + sum += a[i]; + } + return sum; + } + + public static int reduceAddI_reassociate(int[] a) { + int sum = 0; + int i; + for (i = 0; i < a.length - 3; i+=4) { + // Unroll 4x, reassociate inside. + sum += a[i] + a[i + 1] + a[i + 2] + a[i + 3]; + } + for (; i < a.length; i++) { + // Tail + sum += a[i]; + } + return sum; + } + + public static int reduceAddI_VectorAPI_naive(int[] a) { + var sum = 0; + int i; + for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) { + IntVector v = IntVector.fromArray(SPECIES_I, a, i); + // reduceLanes in loop is better than scalar performance, but still + // relatively slow. 
+ sum += v.reduceLanes(VectorOperators.ADD); + } + for (; i < a.length; i++) { + sum += a[i]; + } + return sum; + } + + public static int reduceAddI_VectorAPI_reduction_after_loop(int[] a) { + var acc = IntVector.broadcast(SPECIES_I, 0); + int i; + for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) { + IntVector v = IntVector.fromArray(SPECIES_I, a, i); + // Element-wide addition into a vector of partial sums is much faster. + // Now, we only need to do a reduceLanes after the loop. + // This works because int-addition is associative and commutative. + acc = acc.add(v); + } + int sum = acc.reduceLanes(VectorOperators.ADD); + for (; i < a.length; i++) { + sum += a[i]; + } + return sum; + } + + //@Benchmark + //public void scanAddI_loop() { + // int sum = 0; + // for (int i = 0; i < AI.length; i++) { + // sum += AI[i]; + // RI[i] = sum; + // } + //} + + //@Benchmark + //public void scanAddI_loop_reassociate() { + // int sum = 0; + // for (int i = 0; i < AI.length; i+=4) { + // // We cut the latency by a factor of 4, but increase the number of additions. 
+ // int old_sum = sum; + // int v0 = AI[i + 0]; + // int v1 = AI[i + 1]; + // int v2 = AI[i + 2]; + // int v3 = AI[i + 3]; + // int v01 = v0 + v1; + // int v23 = v2 + v3; + // int v0123 = v01 + v23; + // sum += v0123; + // RI[i + 0] = old_sum + v0; + // RI[i + 1] = old_sum + v01; + // RI[i + 2] = old_sum + v01 + v2; + // RI[i + 3] = old_sum + v0123; + // } + //} + + //@Benchmark + //public void scanAddI_VectorAPI_shift_blend_add() { + // // Using Naive Parallel Algorithm: Hills and Steele + // int sum = 0; + // for (int i = 0; i < SPECIES_I512.loopBound(AI.length); i += SPECIES_I512.length()) { + // IntVector v = IntVector.fromArray(SPECIES_I512, AI, i); + // v = v.add(v.lanewise(VectorOperators.LSHL, 1 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111111110))); + // v = v.add(v.lanewise(VectorOperators.LSHL, 2 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111111100))); + // v = v.add(v.lanewise(VectorOperators.LSHL, 4 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111110000))); + // v = v.add(v.lanewise(VectorOperators.LSHL, 8 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111100000000))); + // v = v.add(sum); + // v.intoArray(RI, i); + // sum = v.lane(SPECIES_I512.length() - 1); + // } + //} + + //@Benchmark + //public void scanAddI_VectorAPI_permute_add() { + // // Using Naive Parallel Algorithm: Hills and Steele + // int sum = 0; + // var shf1 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, 0); + // var shf2 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}, 0); + // var shf3 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, -1, -1, 0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12}, 0); + // var shf4 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 6, 7, 8}, 0); + // var mask1 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111110); + // var mask2 = 
VectorMask.fromLong(SPECIES_I512, 0b1111111111111100); + // var mask3 = VectorMask.fromLong(SPECIES_I512, 0b1111111111110000); + // var mask4 = VectorMask.fromLong(SPECIES_I512, 0b1111111100000000); + // for (int i = 0; i < SPECIES_I512.loopBound(AI.length); i += SPECIES_I512.length()) { + // IntVector v = IntVector.fromArray(SPECIES_I512, AI, i); + // v = v.add(v.rearrange(shf1), mask1); + // v = v.add(v.rearrange(shf2), mask2); + // v = v.add(v.rearrange(shf3), mask3); + // v = v.add(v.rearrange(shf4), mask4); + // v = v.add(sum); + // v.intoArray(RI, i); + // sum = v.lane(SPECIES_I512.length() - 1); + // } + //} +} diff --git a/test/micro/org/openjdk/bench/vm/compiler/VectorAlgorithms.java b/test/micro/org/openjdk/bench/vm/compiler/VectorAlgorithms.java index b17d6a2e18c..a827b894875 100644 --- a/test/micro/org/openjdk/bench/vm/compiler/VectorAlgorithms.java +++ b/test/micro/org/openjdk/bench/vm/compiler/VectorAlgorithms.java @@ -24,7 +24,6 @@ package org.openjdk.bench.vm.compiler; -import jdk.incubator.vector.*; import java.util.concurrent.TimeUnit; import org.openjdk.jmh.annotations.*; @@ -46,11 +45,8 @@ import org.openjdk.jmh.annotations.*; @State(Scope.Thread) @Warmup(iterations = 2, time = 1) @Measurement(iterations = 3, time = 1) -@Fork(value = 1, jvmArgs = {"--add-modules=jdk.incubator.vector"}) +@Fork(value = 1, jvmArgs = {"--add-modules=jdk.incubator.vector", "-XX:CompileCommand=inline,*VectorAlgorithmsImpl::*"}) public class VectorAlgorithms { - private static final VectorSpecies SPECIES_I = IntVector.SPECIES_PREFERRED; - private static final VectorSpecies SPECIES_I512 = IntVector.SPECIES_512; - @Param({"640000"}) public int SIZE; @@ -66,152 +62,22 @@ public class VectorAlgorithms { // ------------------------------------------------------------------------------------------ @Benchmark - public int bench_reduceAddI_loop() { - return reduceAddI_loop(aI); + public int reduceAddI_loop() { + return VectorAlgorithmsImpl.reduceAddI_loop(aI); } @Benchmark - 
public int bench_reduceAddI_reassociate() { - return reduceAddI_reassociate(aI); + public int reduceAddI_reassociate() { + return VectorAlgorithmsImpl.reduceAddI_reassociate(aI); } @Benchmark - public int bench_reduceAddI_VectorAPI_naive() { - return reduceAddI_VectorAPI_naive(aI); + public int reduceAddI_VectorAPI_naive() { + return VectorAlgorithmsImpl.reduceAddI_VectorAPI_naive(aI); } @Benchmark - public int bench_reduceAddI_VectorAPI_reduction_after_loop() { - return reduceAddI_VectorAPI_reduction_after_loop(aI); + public int reduceAddI_VectorAPI_reduction_after_loop() { + return VectorAlgorithmsImpl.reduceAddI_VectorAPI_reduction_after_loop(aI); } - - // ------------------------------------------------------------------------------------------ - // Below: just copied from TestVectorAlgorithms.java - // Only stripped @Test and @IR annotations. - // ------------------------------------------------------------------------------------------ - - public int reduceAddI_loop(int[] a) { - int sum = 0; - for (int i = 0; i < a.length; i++) { - // Relying on simple reduction loop should vectorize since JDK26. - sum += a[i]; - } - return sum; - } - - public int reduceAddI_reassociate(int[] a) { - int sum = 0; - int i; - for (i = 0; i < a.length - 3; i+=4) { - // Unroll 4x, reassociate inside. - sum += a[i] + a[i + 1] + a[i + 2] + a[i + 3]; - } - for (; i < a.length; i++) { - // Tail - sum += a[i]; - } - return sum; - } - - public int reduceAddI_VectorAPI_naive(int[] a) { - var sum = 0; - int i; - for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) { - IntVector v = IntVector.fromArray(SPECIES_I, a, i); - // reduceLanes in loop is better than scalar performance, but still - // relatively slow. 
- sum += v.reduceLanes(VectorOperators.ADD); - } - for (; i < a.length; i++) { - sum += a[i]; - } - return sum; - } - - public int reduceAddI_VectorAPI_reduction_after_loop(int[] a) { - var acc = IntVector.broadcast(SPECIES_I, 0); - int i; - for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) { - IntVector v = IntVector.fromArray(SPECIES_I, a, i); - // Element-wide addition into a vector of partial sums is much faster. - // Now, we only need to do a reduceLanes after the loop. - // This works because int-addition is associative and commutative. - acc = acc.add(v); - } - int sum = acc.reduceLanes(VectorOperators.ADD); - for (; i < a.length; i++) { - sum += a[i]; - } - return sum; - } - - //@Benchmark - //public void scanAddI_loop() { - // int sum = 0; - // for (int i = 0; i < AI.length; i++) { - // sum += AI[i]; - // RI[i] = sum; - // } - //} - - //@Benchmark - //public void scanAddI_loop_reassociate() { - // int sum = 0; - // for (int i = 0; i < AI.length; i+=4) { - // // We cut the latency by a factor of 4, but increase the number of additions. 
- // int old_sum = sum; - // int v0 = AI[i + 0]; - // int v1 = AI[i + 1]; - // int v2 = AI[i + 2]; - // int v3 = AI[i + 3]; - // int v01 = v0 + v1; - // int v23 = v2 + v3; - // int v0123 = v01 + v23; - // sum += v0123; - // RI[i + 0] = old_sum + v0; - // RI[i + 1] = old_sum + v01; - // RI[i + 2] = old_sum + v01 + v2; - // RI[i + 3] = old_sum + v0123; - // } - //} - - //@Benchmark - //public void scanAddI_VectorAPI_shift_blend_add() { - // // Using Naive Parallel Algorithm: Hills and Steele - // int sum = 0; - // for (int i = 0; i < SPECIES_I512.loopBound(AI.length); i += SPECIES_I512.length()) { - // IntVector v = IntVector.fromArray(SPECIES_I512, AI, i); - // v = v.add(v.lanewise(VectorOperators.LSHL, 1 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111111110))); - // v = v.add(v.lanewise(VectorOperators.LSHL, 2 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111111100))); - // v = v.add(v.lanewise(VectorOperators.LSHL, 4 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111110000))); - // v = v.add(v.lanewise(VectorOperators.LSHL, 8 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111100000000))); - // v = v.add(sum); - // v.intoArray(RI, i); - // sum = v.lane(SPECIES_I512.length() - 1); - // } - //} - - //@Benchmark - //public void scanAddI_VectorAPI_permute_add() { - // // Using Naive Parallel Algorithm: Hills and Steele - // int sum = 0; - // var shf1 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, 0); - // var shf2 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}, 0); - // var shf3 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, -1, -1, 0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12}, 0); - // var shf4 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 6, 7, 8}, 0); - // var mask1 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111110); - // var mask2 = 
VectorMask.fromLong(SPECIES_I512, 0b1111111111111100); - // var mask3 = VectorMask.fromLong(SPECIES_I512, 0b1111111111110000); - // var mask4 = VectorMask.fromLong(SPECIES_I512, 0b1111111100000000); - // for (int i = 0; i < SPECIES_I512.loopBound(AI.length); i += SPECIES_I512.length()) { - // IntVector v = IntVector.fromArray(SPECIES_I512, AI, i); - // v = v.add(v.rearrange(shf1), mask1); - // v = v.add(v.rearrange(shf2), mask2); - // v = v.add(v.rearrange(shf3), mask3); - // v = v.add(v.rearrange(shf4), mask4); - // v = v.add(sum); - // v.intoArray(RI, i); - // sum = v.lane(SPECIES_I512.length() - 1); - // } - //} } diff --git a/test/micro/org/openjdk/bench/vm/compiler/VectorAlgorithmsImpl.java b/test/micro/org/openjdk/bench/vm/compiler/VectorAlgorithmsImpl.java new file mode 100644 index 00000000000..6fbf659d0f6 --- /dev/null +++ b/test/micro/org/openjdk/bench/vm/compiler/VectorAlgorithmsImpl.java @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +package org.openjdk.bench.vm.compiler; + +import jdk.incubator.vector.*; + +/** + * The code below is supposed to be an exact copy of: + * test/hotspot/jtreg/compiler/vectorization/VectorAlgorithmsImpl.java + */ +public class VectorAlgorithmsImpl { + private static final VectorSpecies SPECIES_I = IntVector.SPECIES_PREFERRED; + private static final VectorSpecies SPECIES_I512 = IntVector.SPECIES_512; + + public static int reduceAddI_loop(int[] a) { + int sum = 0; + for (int i = 0; i < a.length; i++) { + // Relying on simple reduction loop should vectorize since JDK26. + sum += a[i]; + } + return sum; + } + + public static int reduceAddI_reassociate(int[] a) { + int sum = 0; + int i; + for (i = 0; i < a.length - 3; i+=4) { + // Unroll 4x, reassociate inside. + sum += a[i] + a[i + 1] + a[i + 2] + a[i + 3]; + } + for (; i < a.length; i++) { + // Tail + sum += a[i]; + } + return sum; + } + + public static int reduceAddI_VectorAPI_naive(int[] a) { + var sum = 0; + int i; + for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) { + IntVector v = IntVector.fromArray(SPECIES_I, a, i); + // reduceLanes in loop is better than scalar performance, but still + // relatively slow. + sum += v.reduceLanes(VectorOperators.ADD); + } + for (; i < a.length; i++) { + sum += a[i]; + } + return sum; + } + + public static int reduceAddI_VectorAPI_reduction_after_loop(int[] a) { + var acc = IntVector.broadcast(SPECIES_I, 0); + int i; + for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) { + IntVector v = IntVector.fromArray(SPECIES_I, a, i); + // Element-wide addition into a vector of partial sums is much faster. + // Now, we only need to do a reduceLanes after the loop. + // This works because int-addition is associative and commutative. 
+ acc = acc.add(v); + } + int sum = acc.reduceLanes(VectorOperators.ADD); + for (; i < a.length; i++) { + sum += a[i]; + } + return sum; + } + + //@Benchmark + //public void scanAddI_loop() { + // int sum = 0; + // for (int i = 0; i < AI.length; i++) { + // sum += AI[i]; + // RI[i] = sum; + // } + //} + + //@Benchmark + //public void scanAddI_loop_reassociate() { + // int sum = 0; + // for (int i = 0; i < AI.length; i+=4) { + // // We cut the latency by a factor of 4, but increase the number of additions. + // int old_sum = sum; + // int v0 = AI[i + 0]; + // int v1 = AI[i + 1]; + // int v2 = AI[i + 2]; + // int v3 = AI[i + 3]; + // int v01 = v0 + v1; + // int v23 = v2 + v3; + // int v0123 = v01 + v23; + // sum += v0123; + // RI[i + 0] = old_sum + v0; + // RI[i + 1] = old_sum + v01; + // RI[i + 2] = old_sum + v01 + v2; + // RI[i + 3] = old_sum + v0123; + // } + //} + + //@Benchmark + //public void scanAddI_VectorAPI_shift_blend_add() { + // // Using Naive Parallel Algorithm: Hills and Steele + // int sum = 0; + // for (int i = 0; i < SPECIES_I512.loopBound(AI.length); i += SPECIES_I512.length()) { + // IntVector v = IntVector.fromArray(SPECIES_I512, AI, i); + // v = v.add(v.lanewise(VectorOperators.LSHL, 1 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111111110))); + // v = v.add(v.lanewise(VectorOperators.LSHL, 2 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111111100))); + // v = v.add(v.lanewise(VectorOperators.LSHL, 4 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111110000))); + // v = v.add(v.lanewise(VectorOperators.LSHL, 8 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111100000000))); + // v = v.add(sum); + // v.intoArray(RI, i); + // sum = v.lane(SPECIES_I512.length() - 1); + // } + //} + + //@Benchmark + //public void scanAddI_VectorAPI_permute_add() { + // // Using Naive Parallel Algorithm: Hills and Steele + // int sum = 0; + // var shf1 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
11, 12, 13, 14}, 0); + // var shf2 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}, 0); + // var shf3 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, -1, -1, 0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12}, 0); + // var shf4 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 6, 7, 8}, 0); + // var mask1 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111110); + // var mask2 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111100); + // var mask3 = VectorMask.fromLong(SPECIES_I512, 0b1111111111110000); + // var mask4 = VectorMask.fromLong(SPECIES_I512, 0b1111111100000000); + // for (int i = 0; i < SPECIES_I512.loopBound(AI.length); i += SPECIES_I512.length()) { + // IntVector v = IntVector.fromArray(SPECIES_I512, AI, i); + // v = v.add(v.rearrange(shf1), mask1); + // v = v.add(v.rearrange(shf2), mask2); + // v = v.add(v.rearrange(shf3), mask3); + // v = v.add(v.rearrange(shf4), mask4); + // v = v.add(sum); + // v.intoArray(RI, i); + // sum = v.lane(SPECIES_I512.length() - 1); + // } + //} +}