mirror of
https://github.com/openjdk/jdk.git
synced 2026-01-28 12:09:14 +00:00
refactor into separate file that is identical for test and benchmark
This commit is contained in:
parent
7020b32393
commit
480c7bcc65
@ -31,13 +31,12 @@
|
||||
* @run driver ${test.main.class}
|
||||
*/
|
||||
|
||||
package compiler.loopopts.superword;
|
||||
package compiler.vectorization;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
import jdk.test.lib.Utils;
|
||||
import java.util.Random;
|
||||
import jdk.incubator.vector.*;
|
||||
|
||||
import compiler.lib.ir_framework.*;
|
||||
import compiler.lib.generators.*;
|
||||
@ -55,8 +54,6 @@ public class TestVectorAlgorithms {
|
||||
private static final Random RANDOM = Utils.getRandomInstance();
|
||||
private static final RestrictableGenerator<Integer> INT_GEN = Generators.G.ints();
|
||||
|
||||
private static final VectorSpecies<Integer> SPECIES_I = IntVector.SPECIES_PREFERRED;
|
||||
|
||||
interface TestFunction {
|
||||
Object run();
|
||||
}
|
||||
@ -67,7 +64,7 @@ public class TestVectorAlgorithms {
|
||||
|
||||
public static void main(String[] args) {
|
||||
TestFramework framework = new TestFramework();
|
||||
framework.addFlags("--add-modules=jdk.incubator.vector");
|
||||
framework.addFlags("--add-modules=jdk.incubator.vector", "-XX:CompileCommand=inline,*VectorAlgorithmsImpl::*");
|
||||
framework.start();
|
||||
}
|
||||
|
||||
@ -131,60 +128,21 @@ public class TestVectorAlgorithms {
|
||||
IRNode.ADD_VI, "> 0"},
|
||||
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
|
||||
public int reduceAddI_loop(int[] a) {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
// Relying on simple reduction loop should vectorize since JDK26.
|
||||
sum += a[i];
|
||||
}
|
||||
return sum;
|
||||
return VectorAlgorithmsImpl.reduceAddI_loop(a);
|
||||
}
|
||||
|
||||
@Test
|
||||
public int reduceAddI_reassociate(int[] a) {
|
||||
int sum = 0;
|
||||
int i;
|
||||
for (i = 0; i < a.length - 3; i+=4) {
|
||||
// Unroll 4x, reassociate inside.
|
||||
sum += a[i] + a[i + 1] + a[i + 2] + a[i + 3];
|
||||
}
|
||||
for (; i < a.length; i++) {
|
||||
// Tail
|
||||
sum += a[i];
|
||||
}
|
||||
return sum;
|
||||
return VectorAlgorithmsImpl.reduceAddI_reassociate(a);
|
||||
}
|
||||
|
||||
@Test
|
||||
public int reduceAddI_VectorAPI_naive(int[] a) {
|
||||
var sum = 0;
|
||||
int i;
|
||||
for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
|
||||
IntVector v = IntVector.fromArray(SPECIES_I, a, i);
|
||||
// reduceLanes in loop is better than scalar performance, but still
|
||||
// relatively slow.
|
||||
sum += v.reduceLanes(VectorOperators.ADD);
|
||||
}
|
||||
for (; i < a.length; i++) {
|
||||
sum += a[i];
|
||||
}
|
||||
return sum;
|
||||
return VectorAlgorithmsImpl.reduceAddI_VectorAPI_naive(aI);
|
||||
}
|
||||
|
||||
@Test
|
||||
public int reduceAddI_VectorAPI_reduction_after_loop(int[] a) {
|
||||
var acc = IntVector.broadcast(SPECIES_I, 0);
|
||||
int i;
|
||||
for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
|
||||
IntVector v = IntVector.fromArray(SPECIES_I, a, i);
|
||||
// Element-wide addition into a vector of partial sums is much faster.
|
||||
// Now, we only need to do a reduceLanes after the loop.
|
||||
// This works because int-addition is associative and commutative.
|
||||
acc = acc.add(v);
|
||||
}
|
||||
int sum = acc.reduceLanes(VectorOperators.ADD);
|
||||
for (; i < a.length; i++) {
|
||||
sum += a[i];
|
||||
}
|
||||
return sum;
|
||||
return VectorAlgorithmsImpl.reduceAddI_VectorAPI_reduction_after_loop(aI);
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,161 @@
|
||||
/*
|
||||
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
package compiler.vectorization;
|
||||
|
||||
import jdk.incubator.vector.*;
|
||||
|
||||
/**
|
||||
* The code below is supposed to be an exact copy of:
|
||||
* micro/org/openjdk/bench/vm/compiler/VectorAlgorithmsImpl.java
|
||||
*/
|
||||
public class VectorAlgorithmsImpl {
|
||||
private static final VectorSpecies<Integer> SPECIES_I = IntVector.SPECIES_PREFERRED;
|
||||
private static final VectorSpecies<Integer> SPECIES_I512 = IntVector.SPECIES_512;
|
||||
|
||||
public static int reduceAddI_loop(int[] a) {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
// Relying on simple reduction loop should vectorize since JDK26.
|
||||
sum += a[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
public static int reduceAddI_reassociate(int[] a) {
|
||||
int sum = 0;
|
||||
int i;
|
||||
for (i = 0; i < a.length - 3; i+=4) {
|
||||
// Unroll 4x, reassociate inside.
|
||||
sum += a[i] + a[i + 1] + a[i + 2] + a[i + 3];
|
||||
}
|
||||
for (; i < a.length; i++) {
|
||||
// Tail
|
||||
sum += a[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
public static int reduceAddI_VectorAPI_naive(int[] a) {
|
||||
var sum = 0;
|
||||
int i;
|
||||
for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
|
||||
IntVector v = IntVector.fromArray(SPECIES_I, a, i);
|
||||
// reduceLanes in loop is better than scalar performance, but still
|
||||
// relatively slow.
|
||||
sum += v.reduceLanes(VectorOperators.ADD);
|
||||
}
|
||||
for (; i < a.length; i++) {
|
||||
sum += a[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
public static int reduceAddI_VectorAPI_reduction_after_loop(int[] a) {
|
||||
var acc = IntVector.broadcast(SPECIES_I, 0);
|
||||
int i;
|
||||
for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
|
||||
IntVector v = IntVector.fromArray(SPECIES_I, a, i);
|
||||
// Element-wide addition into a vector of partial sums is much faster.
|
||||
// Now, we only need to do a reduceLanes after the loop.
|
||||
// This works because int-addition is associative and commutative.
|
||||
acc = acc.add(v);
|
||||
}
|
||||
int sum = acc.reduceLanes(VectorOperators.ADD);
|
||||
for (; i < a.length; i++) {
|
||||
sum += a[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
//@Benchmark
|
||||
//public void scanAddI_loop() {
|
||||
// int sum = 0;
|
||||
// for (int i = 0; i < AI.length; i++) {
|
||||
// sum += AI[i];
|
||||
// RI[i] = sum;
|
||||
// }
|
||||
//}
|
||||
|
||||
//@Benchmark
|
||||
//public void scanAddI_loop_reassociate() {
|
||||
// int sum = 0;
|
||||
// for (int i = 0; i < AI.length; i+=4) {
|
||||
// // We cut the latency by a factor of 4, but increase the number of additions.
|
||||
// int old_sum = sum;
|
||||
// int v0 = AI[i + 0];
|
||||
// int v1 = AI[i + 1];
|
||||
// int v2 = AI[i + 2];
|
||||
// int v3 = AI[i + 3];
|
||||
// int v01 = v0 + v1;
|
||||
// int v23 = v2 + v3;
|
||||
// int v0123 = v01 + v23;
|
||||
// sum += v0123;
|
||||
// RI[i + 0] = old_sum + v0;
|
||||
// RI[i + 1] = old_sum + v01;
|
||||
// RI[i + 2] = old_sum + v01 + v2;
|
||||
// RI[i + 3] = old_sum + v0123;
|
||||
// }
|
||||
//}
|
||||
|
||||
//@Benchmark
|
||||
//public void scanAddI_VectorAPI_shift_blend_add() {
|
||||
// // Using Naive Parallel Algorithm: Hills and Steele
|
||||
// int sum = 0;
|
||||
// for (int i = 0; i < SPECIES_I512.loopBound(AI.length); i += SPECIES_I512.length()) {
|
||||
// IntVector v = IntVector.fromArray(SPECIES_I512, AI, i);
|
||||
// v = v.add(v.lanewise(VectorOperators.LSHL, 1 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111111110)));
|
||||
// v = v.add(v.lanewise(VectorOperators.LSHL, 2 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111111100)));
|
||||
// v = v.add(v.lanewise(VectorOperators.LSHL, 4 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111110000)));
|
||||
// v = v.add(v.lanewise(VectorOperators.LSHL, 8 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111100000000)));
|
||||
// v = v.add(sum);
|
||||
// v.intoArray(RI, i);
|
||||
// sum = v.lane(SPECIES_I512.length() - 1);
|
||||
// }
|
||||
//}
|
||||
|
||||
//@Benchmark
|
||||
//public void scanAddI_VectorAPI_permute_add() {
|
||||
// // Using Naive Parallel Algorithm: Hills and Steele
|
||||
// int sum = 0;
|
||||
// var shf1 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, 0);
|
||||
// var shf2 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}, 0);
|
||||
// var shf3 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, -1, -1, 0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12}, 0);
|
||||
// var shf4 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 6, 7, 8}, 0);
|
||||
// var mask1 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111110);
|
||||
// var mask2 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111100);
|
||||
// var mask3 = VectorMask.fromLong(SPECIES_I512, 0b1111111111110000);
|
||||
// var mask4 = VectorMask.fromLong(SPECIES_I512, 0b1111111100000000);
|
||||
// for (int i = 0; i < SPECIES_I512.loopBound(AI.length); i += SPECIES_I512.length()) {
|
||||
// IntVector v = IntVector.fromArray(SPECIES_I512, AI, i);
|
||||
// v = v.add(v.rearrange(shf1), mask1);
|
||||
// v = v.add(v.rearrange(shf2), mask2);
|
||||
// v = v.add(v.rearrange(shf3), mask3);
|
||||
// v = v.add(v.rearrange(shf4), mask4);
|
||||
// v = v.add(sum);
|
||||
// v.intoArray(RI, i);
|
||||
// sum = v.lane(SPECIES_I512.length() - 1);
|
||||
// }
|
||||
//}
|
||||
}
|
||||
@ -24,7 +24,6 @@
|
||||
|
||||
package org.openjdk.bench.vm.compiler;
|
||||
|
||||
import jdk.incubator.vector.*;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.openjdk.jmh.annotations.*;
|
||||
|
||||
@ -46,11 +45,8 @@ import org.openjdk.jmh.annotations.*;
|
||||
@State(Scope.Thread)
|
||||
@Warmup(iterations = 2, time = 1)
|
||||
@Measurement(iterations = 3, time = 1)
|
||||
@Fork(value = 1, jvmArgs = {"--add-modules=jdk.incubator.vector"})
|
||||
@Fork(value = 1, jvmArgs = {"--add-modules=jdk.incubator.vector", "-XX:CompileCommand=inline,*VectorAlgorithmsImpl::*"})
|
||||
public class VectorAlgorithms {
|
||||
private static final VectorSpecies<Integer> SPECIES_I = IntVector.SPECIES_PREFERRED;
|
||||
private static final VectorSpecies<Integer> SPECIES_I512 = IntVector.SPECIES_512;
|
||||
|
||||
@Param({"640000"})
|
||||
public int SIZE;
|
||||
|
||||
@ -66,152 +62,22 @@ public class VectorAlgorithms {
|
||||
// ------------------------------------------------------------------------------------------
|
||||
|
||||
@Benchmark
|
||||
public int bench_reduceAddI_loop() {
|
||||
return reduceAddI_loop(aI);
|
||||
public int reduceAddI_loop() {
|
||||
return VectorAlgorithmsImpl.reduceAddI_loop(aI);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int bench_reduceAddI_reassociate() {
|
||||
return reduceAddI_reassociate(aI);
|
||||
public int reduceAddI_reassociate() {
|
||||
return VectorAlgorithmsImpl.reduceAddI_reassociate(aI);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int bench_reduceAddI_VectorAPI_naive() {
|
||||
return reduceAddI_VectorAPI_naive(aI);
|
||||
public int reduceAddI_VectorAPI_naive() {
|
||||
return VectorAlgorithmsImpl.reduceAddI_VectorAPI_naive(aI);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int bench_reduceAddI_VectorAPI_reduction_after_loop() {
|
||||
return reduceAddI_VectorAPI_reduction_after_loop(aI);
|
||||
public int reduceAddI_VectorAPI_reduction_after_loop() {
|
||||
return VectorAlgorithmsImpl.reduceAddI_VectorAPI_reduction_after_loop(aI);
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------------------------
|
||||
// Below: just copied from TestVectorAlgorithms.java
|
||||
// Only stripped @Test and @IR annotations.
|
||||
// ------------------------------------------------------------------------------------------
|
||||
|
||||
public int reduceAddI_loop(int[] a) {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
// Relying on simple reduction loop should vectorize since JDK26.
|
||||
sum += a[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
public int reduceAddI_reassociate(int[] a) {
|
||||
int sum = 0;
|
||||
int i;
|
||||
for (i = 0; i < a.length - 3; i+=4) {
|
||||
// Unroll 4x, reassociate inside.
|
||||
sum += a[i] + a[i + 1] + a[i + 2] + a[i + 3];
|
||||
}
|
||||
for (; i < a.length; i++) {
|
||||
// Tail
|
||||
sum += a[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
public int reduceAddI_VectorAPI_naive(int[] a) {
|
||||
var sum = 0;
|
||||
int i;
|
||||
for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
|
||||
IntVector v = IntVector.fromArray(SPECIES_I, a, i);
|
||||
// reduceLanes in loop is better than scalar performance, but still
|
||||
// relatively slow.
|
||||
sum += v.reduceLanes(VectorOperators.ADD);
|
||||
}
|
||||
for (; i < a.length; i++) {
|
||||
sum += a[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
public int reduceAddI_VectorAPI_reduction_after_loop(int[] a) {
|
||||
var acc = IntVector.broadcast(SPECIES_I, 0);
|
||||
int i;
|
||||
for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
|
||||
IntVector v = IntVector.fromArray(SPECIES_I, a, i);
|
||||
// Element-wide addition into a vector of partial sums is much faster.
|
||||
// Now, we only need to do a reduceLanes after the loop.
|
||||
// This works because int-addition is associative and commutative.
|
||||
acc = acc.add(v);
|
||||
}
|
||||
int sum = acc.reduceLanes(VectorOperators.ADD);
|
||||
for (; i < a.length; i++) {
|
||||
sum += a[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
//@Benchmark
|
||||
//public void scanAddI_loop() {
|
||||
// int sum = 0;
|
||||
// for (int i = 0; i < AI.length; i++) {
|
||||
// sum += AI[i];
|
||||
// RI[i] = sum;
|
||||
// }
|
||||
//}
|
||||
|
||||
//@Benchmark
|
||||
//public void scanAddI_loop_reassociate() {
|
||||
// int sum = 0;
|
||||
// for (int i = 0; i < AI.length; i+=4) {
|
||||
// // We cut the latency by a factor of 4, but increase the number of additions.
|
||||
// int old_sum = sum;
|
||||
// int v0 = AI[i + 0];
|
||||
// int v1 = AI[i + 1];
|
||||
// int v2 = AI[i + 2];
|
||||
// int v3 = AI[i + 3];
|
||||
// int v01 = v0 + v1;
|
||||
// int v23 = v2 + v3;
|
||||
// int v0123 = v01 + v23;
|
||||
// sum += v0123;
|
||||
// RI[i + 0] = old_sum + v0;
|
||||
// RI[i + 1] = old_sum + v01;
|
||||
// RI[i + 2] = old_sum + v01 + v2;
|
||||
// RI[i + 3] = old_sum + v0123;
|
||||
// }
|
||||
//}
|
||||
|
||||
//@Benchmark
|
||||
//public void scanAddI_VectorAPI_shift_blend_add() {
|
||||
// // Using Naive Parallel Algorithm: Hills and Steele
|
||||
// int sum = 0;
|
||||
// for (int i = 0; i < SPECIES_I512.loopBound(AI.length); i += SPECIES_I512.length()) {
|
||||
// IntVector v = IntVector.fromArray(SPECIES_I512, AI, i);
|
||||
// v = v.add(v.lanewise(VectorOperators.LSHL, 1 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111111110)));
|
||||
// v = v.add(v.lanewise(VectorOperators.LSHL, 2 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111111100)));
|
||||
// v = v.add(v.lanewise(VectorOperators.LSHL, 4 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111110000)));
|
||||
// v = v.add(v.lanewise(VectorOperators.LSHL, 8 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111100000000)));
|
||||
// v = v.add(sum);
|
||||
// v.intoArray(RI, i);
|
||||
// sum = v.lane(SPECIES_I512.length() - 1);
|
||||
// }
|
||||
//}
|
||||
|
||||
//@Benchmark
|
||||
//public void scanAddI_VectorAPI_permute_add() {
|
||||
// // Using Naive Parallel Algorithm: Hills and Steele
|
||||
// int sum = 0;
|
||||
// var shf1 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, 0);
|
||||
// var shf2 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}, 0);
|
||||
// var shf3 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, -1, -1, 0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12}, 0);
|
||||
// var shf4 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 6, 7, 8}, 0);
|
||||
// var mask1 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111110);
|
||||
// var mask2 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111100);
|
||||
// var mask3 = VectorMask.fromLong(SPECIES_I512, 0b1111111111110000);
|
||||
// var mask4 = VectorMask.fromLong(SPECIES_I512, 0b1111111100000000);
|
||||
// for (int i = 0; i < SPECIES_I512.loopBound(AI.length); i += SPECIES_I512.length()) {
|
||||
// IntVector v = IntVector.fromArray(SPECIES_I512, AI, i);
|
||||
// v = v.add(v.rearrange(shf1), mask1);
|
||||
// v = v.add(v.rearrange(shf2), mask2);
|
||||
// v = v.add(v.rearrange(shf3), mask3);
|
||||
// v = v.add(v.rearrange(shf4), mask4);
|
||||
// v = v.add(sum);
|
||||
// v.intoArray(RI, i);
|
||||
// sum = v.lane(SPECIES_I512.length() - 1);
|
||||
// }
|
||||
//}
|
||||
}
|
||||
|
||||
@ -0,0 +1,161 @@
|
||||
/*
|
||||
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
package org.openjdk.bench.vm.compiler;
|
||||
|
||||
import jdk.incubator.vector.*;
|
||||
|
||||
/**
|
||||
* The code below is supposed to be an exact copy of:
|
||||
* test/hotspot/jtreg/compiler/vectorization/VectorAlgorithmsImpl.java
|
||||
*/
|
||||
public class VectorAlgorithmsImpl {
|
||||
private static final VectorSpecies<Integer> SPECIES_I = IntVector.SPECIES_PREFERRED;
|
||||
private static final VectorSpecies<Integer> SPECIES_I512 = IntVector.SPECIES_512;
|
||||
|
||||
public static int reduceAddI_loop(int[] a) {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
// Relying on simple reduction loop should vectorize since JDK26.
|
||||
sum += a[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
public static int reduceAddI_reassociate(int[] a) {
|
||||
int sum = 0;
|
||||
int i;
|
||||
for (i = 0; i < a.length - 3; i+=4) {
|
||||
// Unroll 4x, reassociate inside.
|
||||
sum += a[i] + a[i + 1] + a[i + 2] + a[i + 3];
|
||||
}
|
||||
for (; i < a.length; i++) {
|
||||
// Tail
|
||||
sum += a[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
public static int reduceAddI_VectorAPI_naive(int[] a) {
|
||||
var sum = 0;
|
||||
int i;
|
||||
for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
|
||||
IntVector v = IntVector.fromArray(SPECIES_I, a, i);
|
||||
// reduceLanes in loop is better than scalar performance, but still
|
||||
// relatively slow.
|
||||
sum += v.reduceLanes(VectorOperators.ADD);
|
||||
}
|
||||
for (; i < a.length; i++) {
|
||||
sum += a[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
public static int reduceAddI_VectorAPI_reduction_after_loop(int[] a) {
|
||||
var acc = IntVector.broadcast(SPECIES_I, 0);
|
||||
int i;
|
||||
for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
|
||||
IntVector v = IntVector.fromArray(SPECIES_I, a, i);
|
||||
// Element-wide addition into a vector of partial sums is much faster.
|
||||
// Now, we only need to do a reduceLanes after the loop.
|
||||
// This works because int-addition is associative and commutative.
|
||||
acc = acc.add(v);
|
||||
}
|
||||
int sum = acc.reduceLanes(VectorOperators.ADD);
|
||||
for (; i < a.length; i++) {
|
||||
sum += a[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
//@Benchmark
|
||||
//public void scanAddI_loop() {
|
||||
// int sum = 0;
|
||||
// for (int i = 0; i < AI.length; i++) {
|
||||
// sum += AI[i];
|
||||
// RI[i] = sum;
|
||||
// }
|
||||
//}
|
||||
|
||||
//@Benchmark
|
||||
//public void scanAddI_loop_reassociate() {
|
||||
// int sum = 0;
|
||||
// for (int i = 0; i < AI.length; i+=4) {
|
||||
// // We cut the latency by a factor of 4, but increase the number of additions.
|
||||
// int old_sum = sum;
|
||||
// int v0 = AI[i + 0];
|
||||
// int v1 = AI[i + 1];
|
||||
// int v2 = AI[i + 2];
|
||||
// int v3 = AI[i + 3];
|
||||
// int v01 = v0 + v1;
|
||||
// int v23 = v2 + v3;
|
||||
// int v0123 = v01 + v23;
|
||||
// sum += v0123;
|
||||
// RI[i + 0] = old_sum + v0;
|
||||
// RI[i + 1] = old_sum + v01;
|
||||
// RI[i + 2] = old_sum + v01 + v2;
|
||||
// RI[i + 3] = old_sum + v0123;
|
||||
// }
|
||||
//}
|
||||
|
||||
//@Benchmark
|
||||
//public void scanAddI_VectorAPI_shift_blend_add() {
|
||||
// // Using Naive Parallel Algorithm: Hills and Steele
|
||||
// int sum = 0;
|
||||
// for (int i = 0; i < SPECIES_I512.loopBound(AI.length); i += SPECIES_I512.length()) {
|
||||
// IntVector v = IntVector.fromArray(SPECIES_I512, AI, i);
|
||||
// v = v.add(v.lanewise(VectorOperators.LSHL, 1 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111111110)));
|
||||
// v = v.add(v.lanewise(VectorOperators.LSHL, 2 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111111100)));
|
||||
// v = v.add(v.lanewise(VectorOperators.LSHL, 4 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111110000)));
|
||||
// v = v.add(v.lanewise(VectorOperators.LSHL, 8 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111100000000)));
|
||||
// v = v.add(sum);
|
||||
// v.intoArray(RI, i);
|
||||
// sum = v.lane(SPECIES_I512.length() - 1);
|
||||
// }
|
||||
//}
|
||||
|
||||
//@Benchmark
|
||||
//public void scanAddI_VectorAPI_permute_add() {
|
||||
// // Using Naive Parallel Algorithm: Hills and Steele
|
||||
// int sum = 0;
|
||||
// var shf1 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, 0);
|
||||
// var shf2 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}, 0);
|
||||
// var shf3 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, -1, -1, 0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12}, 0);
|
||||
// var shf4 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 6, 7, 8}, 0);
|
||||
// var mask1 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111110);
|
||||
// var mask2 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111100);
|
||||
// var mask3 = VectorMask.fromLong(SPECIES_I512, 0b1111111111110000);
|
||||
// var mask4 = VectorMask.fromLong(SPECIES_I512, 0b1111111100000000);
|
||||
// for (int i = 0; i < SPECIES_I512.loopBound(AI.length); i += SPECIES_I512.length()) {
|
||||
// IntVector v = IntVector.fromArray(SPECIES_I512, AI, i);
|
||||
// v = v.add(v.rearrange(shf1), mask1);
|
||||
// v = v.add(v.rearrange(shf2), mask2);
|
||||
// v = v.add(v.rearrange(shf3), mask3);
|
||||
// v = v.add(v.rearrange(shf4), mask4);
|
||||
// v = v.add(sum);
|
||||
// v.intoArray(RI, i);
|
||||
// sum = v.lane(SPECIES_I512.length() - 1);
|
||||
// }
|
||||
//}
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user