benchmark and test for scanAddI

This commit is contained in:
Emanuel Peter 2025-12-04 16:13:42 +01:00
parent e17d87f78b
commit 57f0d5eb79
4 changed files with 84 additions and 99 deletions

View File

@ -90,7 +90,6 @@ public class TestVectorAlgorithms {
testGroups.put("scanAddI", new HashMap<String,TestFunction>());
testGroups.get("scanAddI").put("scanAddI_loop", () -> { return scanAddI_loop(aI, rI1); });
testGroups.get("scanAddI").put("scanAddI_loop_reassociate", () -> { return scanAddI_loop_reassociate(aI, rI2); });
testGroups.get("scanAddI").put("scanAddI_VectorAPI_shift_blend_add", () -> { return scanAddI_VectorAPI_shift_blend_add(aI, rI3); });
testGroups.get("scanAddI").put("scanAddI_VectorAPI_permute_add", () -> { return scanAddI_VectorAPI_permute_add(aI, rI4); });
}
@ -101,7 +100,6 @@ public class TestVectorAlgorithms {
"reduceAddI_VectorAPI_reduction_after_loop",
"scanAddI_loop",
"scanAddI_loop_reassociate",
"scanAddI_VectorAPI_shift_blend_add",
"scanAddI_VectorAPI_permute_add"})
public void runTests(RunInfo info) {
// Repeat many times, so that we also have multiple iterations for post-warmup to potentially recompile
@ -177,11 +175,6 @@ public class TestVectorAlgorithms {
return VectorAlgorithmsImpl.scanAddI_loop_reassociate(a, r);
}
// Test entry point: passes both arrays through unchanged to
// VectorAlgorithmsImpl.scanAddI_VectorAPI_shift_blend_add and returns whatever it returns.
@Test
public Object scanAddI_VectorAPI_shift_blend_add(int[] a, int[] r) {
return VectorAlgorithmsImpl.scanAddI_VectorAPI_shift_blend_add(a, r);
}
@Test
public Object scanAddI_VectorAPI_permute_add(int[] a, int[] r) {
return VectorAlgorithmsImpl.scanAddI_VectorAPI_permute_add(a, r);

View File

@ -124,34 +124,14 @@ public class VectorAlgorithmsImpl {
return r;
}
/**
 * Writes the inclusive prefix sum of {@code a} into {@code r}.
 *
 * <p>Despite the name, no Vector API code runs here: the shift/blend/add
 * variant of the Hillis-Steele style scan is disabled, so every element is
 * processed by the scalar loop below.
 *
 * @param a input values
 * @param r output array; {@code r[k]} receives {@code a[0] + ... + a[k]}
 *          (int arithmetic, wrapping on overflow)
 * @return the same array instance that was passed as {@code r}
 */
public static Object scanAddI_VectorAPI_shift_blend_add(int[] a, int[] r) {
    // The vectorized main loop is intentionally absent; only the scalar pass remains.
    int running = 0;
    int idx = 0;
    while (idx < a.length) {
        running += a[idx];
        r[idx] = running;
        idx++;
    }
    return r;
}
public static Object scanAddI_VectorAPI_permute_add(int[] a, int[] r) {
// Using Naive Parallel Algorithm: Hills and Steele
int sum = 0;
var shf1 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, 0);
var shf2 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}, 0);
var shf3 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, 0);
var shf4 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7}, 0);
int xx = 0; // masked later anyway
var shf1 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, 0);
var shf2 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx, xx, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}, 0);
var shf3 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx, xx, xx, xx, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, 0);
var shf4 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx, xx, xx, xx, xx, xx, xx, xx, 0, 1, 2, 3, 4, 5, 6, 7}, 0);
var mask1 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111110);
var mask2 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111100);
var mask3 = VectorMask.fromLong(SPECIES_I512, 0b1111111111110000);

View File

@ -51,10 +51,12 @@ public class VectorAlgorithms {
public int SIZE;
public static int[] aI;
public static int[] rI;
// Benchmark setup: allocates the input array aI and the result array rI with
// SIZE elements each. Both come straight from `new int[...]`, so every element
// starts out as 0.
@Setup
public void init() {
aI = new int[SIZE];
rI = new int[SIZE];
}
// ------------------------------------------------------------------------------------------
@ -80,4 +82,19 @@ public class VectorAlgorithms {
// Forwards aI to VectorAlgorithmsImpl.reduceAddI_VectorAPI_reduction_after_loop
// and returns its int result.
public int reduceAddI_VectorAPI_reduction_after_loop() {
return VectorAlgorithmsImpl.reduceAddI_VectorAPI_reduction_after_loop(aI);
}
// Benchmark wrapper: runs VectorAlgorithmsImpl.scanAddI_loop on aI, writing into rI,
// and returns the implementation's result object.
@Benchmark
public Object scanAddI_loop() {
return VectorAlgorithmsImpl.scanAddI_loop(aI, rI);
}
// Benchmark wrapper: runs VectorAlgorithmsImpl.scanAddI_loop_reassociate on aI,
// writing into rI, and returns the implementation's result object.
@Benchmark
public Object scanAddI_loop_reassociate() {
return VectorAlgorithmsImpl.scanAddI_loop_reassociate(aI, rI);
}
// Benchmark wrapper: runs VectorAlgorithmsImpl.scanAddI_VectorAPI_permute_add on aI,
// writing into rI, and returns the implementation's result object.
@Benchmark
public Object scanAddI_VectorAPI_permute_add() {
return VectorAlgorithmsImpl.scanAddI_VectorAPI_permute_add(aI, rI);
}
}

View File

@ -89,73 +89,68 @@ public class VectorAlgorithmsImpl {
return sum;
}
//@Benchmark
//public void scanAddI_loop() {
// int sum = 0;
// for (int i = 0; i < AI.length; i++) {
// sum += AI[i];
// RI[i] = sum;
// }
//}
/**
 * Straightforward sequential inclusive prefix sum.
 *
 * @param a input values
 * @param r output array; {@code r[k]} receives {@code a[0] + ... + a[k]}
 *          (int arithmetic, wrapping on overflow)
 * @return the same array instance that was passed as {@code r}
 */
public static Object scanAddI_loop(int[] a, int[] r) {
    int acc = 0;
    int pos = 0;
    while (pos < a.length) {
        // Each output depends on the previous one: a single serial dependency chain.
        acc += a[pos];
        r[pos] = acc;
        pos++;
    }
    return r;
}
//@Benchmark
//public void scanAddI_loop_reassociate() {
// int sum = 0;
// for (int i = 0; i < AI.length; i+=4) {
// // We cut the latency by a factor of 4, but increase the number of additions.
// int old_sum = sum;
// int v0 = AI[i + 0];
// int v1 = AI[i + 1];
// int v2 = AI[i + 2];
// int v3 = AI[i + 3];
// int v01 = v0 + v1;
// int v23 = v2 + v3;
// int v0123 = v01 + v23;
// sum += v0123;
// RI[i + 0] = old_sum + v0;
// RI[i + 1] = old_sum + v01;
// RI[i + 2] = old_sum + v01 + v2;
// RI[i + 3] = old_sum + v0123;
// }
//}
/**
 * Inclusive prefix sum that handles four elements per main-loop iteration.
 *
 * <p>The additions inside each group of four are regrouped so that the running
 * total is only updated once per group; this shortens the loop-carried
 * dependency at the cost of a few extra additions. Leftover elements (when the
 * length is not a multiple of four) are handled by a plain scalar loop.
 *
 * @param a input values
 * @param r output array; {@code r[k]} receives {@code a[0] + ... + a[k]}
 *          (int arithmetic, wrapping on overflow)
 * @return the same array instance that was passed as {@code r}
 */
public static Object scanAddI_loop_reassociate(int[] a, int[] r) {
    final int limit = a.length - 3;
    int carry = 0;
    int base = 0;
    for (; base < limit; base += 4) {
        // Read the whole group before storing anything.
        int x0 = a[base];
        int x1 = a[base + 1];
        int x2 = a[base + 2];
        int x3 = a[base + 3];
        int pair01 = x0 + x1;
        int pair23 = x2 + x3;
        int quad = pair01 + pair23;
        r[base] = carry + x0;
        r[base + 1] = carry + pair01;
        r[base + 2] = carry + pair01 + x2;
        r[base + 3] = carry + quad;
        // One update of the running total per group of four.
        carry += quad;
    }
    // Scalar tail for the last 0..3 elements.
    while (base < a.length) {
        carry += a[base];
        r[base] = carry;
        base++;
    }
    return r;
}
//@Benchmark
//public void scanAddI_VectorAPI_shift_blend_add() {
// // Using Naive Parallel Algorithm: Hills and Steele
// int sum = 0;
// for (int i = 0; i < SPECIES_I512.loopBound(AI.length); i += SPECIES_I512.length()) {
// IntVector v = IntVector.fromArray(SPECIES_I512, AI, i);
// v = v.add(v.lanewise(VectorOperators.LSHL, 1 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111111110)));
// v = v.add(v.lanewise(VectorOperators.LSHL, 2 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111111100)));
// v = v.add(v.lanewise(VectorOperators.LSHL, 4 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111111110000)));
// v = v.add(v.lanewise(VectorOperators.LSHL, 8 ).blend(0, VectorMask.fromLong(SPECIES_I512, 0b1111111100000000)));
// v = v.add(sum);
// v.intoArray(RI, i);
// sum = v.lane(SPECIES_I512.length() - 1);
// }
//}
//@Benchmark
//public void scanAddI_VectorAPI_permute_add() {
// // Using Naive Parallel Algorithm: Hills and Steele
// int sum = 0;
// var shf1 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, 0);
// var shf2 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}, 0);
// var shf3 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, -1, -1, 0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12}, 0);
// var shf4 = VectorShuffle.fromArray(SPECIES_I512, new int[]{-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 6, 7, 8}, 0);
// var mask1 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111110);
// var mask2 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111100);
// var mask3 = VectorMask.fromLong(SPECIES_I512, 0b1111111111110000);
// var mask4 = VectorMask.fromLong(SPECIES_I512, 0b1111111100000000);
// for (int i = 0; i < SPECIES_I512.loopBound(AI.length); i += SPECIES_I512.length()) {
// IntVector v = IntVector.fromArray(SPECIES_I512, AI, i);
// v = v.add(v.rearrange(shf1), mask1);
// v = v.add(v.rearrange(shf2), mask2);
// v = v.add(v.rearrange(shf3), mask3);
// v = v.add(v.rearrange(shf4), mask4);
// v = v.add(sum);
// v.intoArray(RI, i);
// sum = v.lane(SPECIES_I512.length() - 1);
// }
//}
// Writes the inclusive prefix sum of a into r and returns r.
// Full vectors are scanned in four log-steps: at each step the vector is added to a
// copy of itself moved up by 1, 2, 4 and then 8 lanes (via rearrange), with a mask
// suppressing the add in the low lanes that have no source lane. The running total of
// all earlier elements is then added to every lane. Elements past the last full
// vector are handled by a scalar loop.
// NOTE(review): the 16-entry shuffle tables and 16-bit masks assume SPECIES_I512
// has 16 int lanes -- confirm against its declaration.
public static Object scanAddI_VectorAPI_permute_add(int[] a, int[] r) {
// Using Naive Parallel Algorithm: Hillis and Steele
int sum = 0;
int xx = 0; // masked later anyway
// shfK: lane j takes its value from lane j - 2^(K-1); the leading xx entries are
// placeholders for lanes that have no such source lane.
var shf1 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, 0);
var shf2 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx, xx, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}, 0);
var shf3 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx, xx, xx, xx, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, 0);
var shf4 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx, xx, xx, xx, xx, xx, xx, xx, 0, 1, 2, 3, 4, 5, 6, 7}, 0);
// maskK has a zero bit for exactly the lanes where shfK holds an xx placeholder,
// so the masked add leaves those lanes unchanged.
var mask1 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111110);
var mask2 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111100);
var mask3 = VectorMask.fromLong(SPECIES_I512, 0b1111111111110000);
var mask4 = VectorMask.fromLong(SPECIES_I512, 0b1111111100000000);
int i = 0;
// Main loop over whole vectors only (loopBound rounds the length down).
for (; i < SPECIES_I512.loopBound(a.length); i += SPECIES_I512.length()) {
IntVector v = IntVector.fromArray(SPECIES_I512, a, i);
// Four masked shifted adds: afterwards each lane holds the sum of itself and
// all lower lanes of this vector.
v = v.add(v.rearrange(shf1), mask1);
v = v.add(v.rearrange(shf2), mask2);
v = v.add(v.rearrange(shf3), mask3);
v = v.add(v.rearrange(shf4), mask4);
// Add the total of everything before this vector to every lane.
v = v.add(sum);
v.intoArray(r, i);
// The last lane now holds the running total carried into the next iteration.
sum = v.lane(SPECIES_I512.length() - 1);
}
// Scalar tail for the elements that do not fill a whole vector.
for (; i < a.length; i++) {
sum += a[i];
r[i] = sum;
}
return r;
}
}