jdk/test/hotspot/jtreg/compiler/vectorization/VectorAlgorithmsImpl.java
2026-01-16 17:00:47 +01:00

522 lines
18 KiB
Java

/*
* Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
package compiler.vectorization;
import java.util.Arrays;
import jdk.incubator.vector.*;
/**
* The code below is supposed to be an exact copy of:
* micro/org/openjdk/bench/vm/compiler/VectorAlgorithmsImpl.java
*/
public class VectorAlgorithmsImpl {
private static final VectorSpecies<Integer> SPECIES_I = IntVector.SPECIES_PREFERRED;
private static final VectorSpecies<Integer> SPECIES_I512 = IntVector.SPECIES_512;
private static final VectorSpecies<Integer> SPECIES_I256 = IntVector.SPECIES_256;
private static final VectorSpecies<Byte> SPECIES_B64 = ByteVector.SPECIES_64;
private static final VectorSpecies<Float> SPECIES_F = FloatVector.SPECIES_PREFERRED;
public static Object fillI_loop(int[] r) {
for (int i = 0; i < r.length; i++) {
r[i] = 42;
}
return r;
}
public static Object fillI_Arrays(int[] r) {
Arrays.fill(r, 42);
return r;
}
public static Object fillI_VectorAPI(int[] r) {
var v = IntVector.broadcast(SPECIES_I, 42);
int i = 0;
for (; i < SPECIES_I.loopBound(r.length); i += SPECIES_I.length()) {
v.intoArray(r, i);
}
for (; i < r.length; i++) {
r[i] = 42;
}
return r;
}
public static Object iotaI_loop(int[] r) {
for (int i = 0; i < r.length; i++) {
r[i] = i;
}
return r;
}
public static Object iotaI_VectorAPI(int[] r) {
var iota = IntVector.broadcast(SPECIES_I, 0).addIndex(1);
int i = 0;
for (; i < SPECIES_I.loopBound(r.length); i += SPECIES_I.length()) {
iota.intoArray(r, i);
iota = iota.add(SPECIES_I.length());
}
for (; i < r.length; i++) {
r[i] = i;
}
return r;
}
public static Object copyI_loop(int[] a, int[] r) {
for (int i = 0; i < a.length; i++) {
r[i] = a[i];
}
return r;
}
public static Object copyI_System_arraycopy(int[] a, int[] r) {
System.arraycopy(a, 0, r, 0, a.length);
return r;
}
public static Object copyI_VectorAPI(int[] a, int[] r) {
int i = 0;
for (; i < SPECIES_I.loopBound(r.length); i += SPECIES_I.length()) {
IntVector v = IntVector.fromArray(SPECIES_I, a, i);
v.intoArray(r, i);
}
for (; i < r.length; i++) {
r[i] = a[i];
}
return r;
}
public static Object mapI_loop(int[] a, int[] r) {
for (int i = 0; i < a.length; i++) {
r[i] = a[i] * 42;
}
return r;
}
public static Object mapI_VectorAPI(int[] a, int[] r) {
int i = 0;
for (; i < SPECIES_I.loopBound(r.length); i += SPECIES_I.length()) {
IntVector v = IntVector.fromArray(SPECIES_I, a, i);
v = v.mul(42);
v.intoArray(r, i);
}
for (; i < r.length; i++) {
r[i] = a[i];
}
return r;
}
public static int reduceAddI_loop(int[] a) {
int sum = 0;
for (int i = 0; i < a.length; i++) {
// Relying on simple reduction loop should vectorize since JDK26.
sum += a[i];
}
return sum;
}
public static int reduceAddI_reassociate(int[] a) {
int sum = 0;
int i;
for (i = 0; i < a.length - 3; i+=4) {
// Unroll 4x, reassociate inside.
sum += a[i] + a[i + 1] + a[i + 2] + a[i + 3];
}
for (; i < a.length; i++) {
// Tail
sum += a[i];
}
return sum;
}
public static int reduceAddI_VectorAPI_naive(int[] a) {
var sum = 0;
int i;
for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
IntVector v = IntVector.fromArray(SPECIES_I, a, i);
// reduceLanes in loop is better than scalar performance, but still
// relatively slow.
sum += v.reduceLanes(VectorOperators.ADD);
}
for (; i < a.length; i++) {
sum += a[i];
}
return sum;
}
public static int reduceAddI_VectorAPI_reduction_after_loop(int[] a) {
var acc = IntVector.broadcast(SPECIES_I, 0);
int i;
for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
IntVector v = IntVector.fromArray(SPECIES_I, a, i);
// Element-wide addition into a vector of partial sums is much faster.
// Now, we only need to do a reduceLanes after the loop.
// This works because int-addition is associative and commutative.
acc = acc.add(v);
}
int sum = acc.reduceLanes(VectorOperators.ADD);
for (; i < a.length; i++) {
sum += a[i];
}
return sum;
}
public static float dotProductF_loop(float[] a, float[] b) {
float sum = 0;
for (int i = 0; i < a.length; i++) {
sum += a[i] * b[i];
}
return sum;
}
public static float dotProductF_VectorAPI_naive(float[] a, float[] b) {
float sum = 0;
int i;
for (i = 0; i < SPECIES_F.loopBound(a.length); i += SPECIES_F.length()) {
var va = FloatVector.fromArray(SPECIES_F, a, i);
var vb = FloatVector.fromArray(SPECIES_F, b, i);
sum += va.mul(vb).reduceLanes(VectorOperators.ADD);
}
for (; i < a.length; i++) {
sum += a[i] * b[i];
}
return sum;
}
public static float dotProductF_VectorAPI_reduction_after_loop(float[] a, float[] b) {
var sums = FloatVector.broadcast(SPECIES_F, 0.0f);
int i;
for (i = 0; i < SPECIES_F.loopBound(a.length); i += SPECIES_F.length()) {
var va = FloatVector.fromArray(SPECIES_F, a, i);
var vb = FloatVector.fromArray(SPECIES_F, b, i);
sums = sums.add(va.mul(vb));
}
float sum = sums.reduceLanes(VectorOperators.ADD);
for (; i < a.length; i++) {
sum += a[i] * b[i];
}
return sum;
}
public static int hashCodeB_loop(byte[] a) {
int h = 1;
for (int i = 0; i < a.length; i++) {
h = 31 * h + a[i];
}
return h;
}
public static int hashCodeB_Arrays(byte[] a) {
return Arrays.hashCode(a);
}
// Simplified intrinsic code from C2_MacroAssembler::arrays_hashcode in c2_MacroAssembler_x86.cpp
//
// Ideas that may help understand the code:
//
// h(i) = 31 * h(i-1) + a[i]
// "unroll" by factor of L=8:
// h(i+8) = h(i) * 31^8 + a[i+1] * 31^7 + a[i+2] * 31^6 + ... + a[i+8] * 1
// ----------- ------------------------------------------------
// scalar vector: notice the powers of 31 in reverse
//
// We notice that we can load a[i+1 .. i+8], then element-wise multiply with
// the vector of reversed powers-of-31, and then do reduceLanes(ADD).
// But we can do even better: By looking at multiple such 8-unrolled iterations.
// Instead of applying the "next" factor of "31^8" to the reduced scalar, we can
// already apply it element-wise. That allows us to move the reduction out
// of the loop.
//
// Note: the intrinsic additionally unrolls the loop by a factor of 4,
// but we want to keep thins simple for demonstration purposes.
//
private static int[] REVERSE_POWERS_OF_31 = new int[9];
static {
int p = 1;
for (int i = REVERSE_POWERS_OF_31.length - 1; i >= 0; i--) {
REVERSE_POWERS_OF_31[i] = p;
p *= 31;
}
}
public static int hashCodeB_VectorAPI_v1(byte[] a) {
int result = 1; // initialValue
var vresult = IntVector.zero(SPECIES_I256);
int next = REVERSE_POWERS_OF_31[0]; // 31^L
var vnext = IntVector.broadcast(SPECIES_I256, next);
var vcoef = IntVector.fromArray(SPECIES_I256, REVERSE_POWERS_OF_31, 1); // powers of 2 in reverse
int i;
for (i = 0; i < SPECIES_B64.loopBound(a.length); i += SPECIES_B64.length()) {
// scalar part: result *= 31^L
result *= next;
// vector part: element-wise apply the next factor and add in the new values.
var vb = ByteVector.fromArray(SPECIES_B64, a, i);
var vi = vb.castShape(SPECIES_I256, 0);
vresult = vresult.mul(vnext).add(vi);
}
// reduce the partial hashes in the elements, using the reverse list of powers of 2.
result += vresult.mul(vcoef).reduceLanes(VectorOperators.ADD);
for (; i < a.length; i++) {
result = 31 * result + a[i];
}
return result;
}
public static Object scanAddI_loop(int[] a, int[] r) {
int sum = 0;
for (int i = 0; i < a.length; i++) {
sum += a[i];
r[i] = sum;
}
return r;
}
public static Object scanAddI_loop_reassociate(int[] a, int[] r) {
int sum = 0;
int i = 0;
for (; i < a.length - 3; i+=4) {
// We cut the latency by a factor of 4, but increase the number of additions.
int old_sum = sum;
int v0 = a[i + 0];
int v1 = a[i + 1];
int v2 = a[i + 2];
int v3 = a[i + 3];
int v01 = v0 + v1;
int v23 = v2 + v3;
int v0123 = v01 + v23;
sum += v0123;
r[i + 0] = old_sum + v0;
r[i + 1] = old_sum + v01;
r[i + 2] = old_sum + v01 + v2;
r[i + 3] = old_sum + v0123;
}
for (; i < a.length; i++) {
sum += a[i];
r[i] = sum;
}
return r;
}
public static Object scanAddI_VectorAPI_permute_add(int[] a, int[] r) {
// Using Naive Parallel Algorithm: Hills and Steele
int sum = 0;
int xx = 0; // masked later anyway
var shf1 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, 0);
var shf2 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx, xx, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}, 0);
var shf3 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx, xx, xx, xx, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, 0);
var shf4 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx, xx, xx, xx, xx, xx, xx, xx, 0, 1, 2, 3, 4, 5, 6, 7}, 0);
var mask1 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111110);
var mask2 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111100);
var mask3 = VectorMask.fromLong(SPECIES_I512, 0b1111111111110000);
var mask4 = VectorMask.fromLong(SPECIES_I512, 0b1111111100000000);
int i = 0;
for (; i < SPECIES_I512.loopBound(a.length); i += SPECIES_I512.length()) {
IntVector v = IntVector.fromArray(SPECIES_I512, a, i);
v = v.add(v.rearrange(shf1), mask1);
v = v.add(v.rearrange(shf2), mask2);
v = v.add(v.rearrange(shf3), mask3);
v = v.add(v.rearrange(shf4), mask4);
v = v.add(sum);
v.intoArray(r, i);
sum = v.lane(SPECIES_I512.length() - 1);
}
for (; i < a.length; i++) {
sum += a[i];
r[i] = sum;
}
return r;
}
public static int findMinIndexI_loop(int[] a) {
int min = a[0];
int index = 0;
for (int i = 1; i < a.length; i++) {
int ai = a[i];
if (ai < min) {
min = ai;
index = i;
}
}
return index;
}
public static int findMinIndexI_VectorAPI(int[] a) {
// Main approach: have partial results in mins and idxs.
var mins = IntVector.broadcast(SPECIES_I, a[0]);
var idxs = IntVector.broadcast(SPECIES_I, 0);
var iota = IntVector.broadcast(SPECIES_I, 0).addIndex(1);
int i = 0;
for (; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
IntVector v = IntVector.fromArray(SPECIES_I, a, i);
var mask = v.compare(VectorOperators.LT, mins);
mins = mins.blend(v, mask);
idxs = idxs.blend(iota, mask);
iota = iota.add(SPECIES_I.length());
}
// Reduce the vectors down
int min = mins.reduceLanes(VectorOperators.MIN);
var not_min_mask = mins.compare(VectorOperators.NE, min);
int index = idxs.blend(a.length, not_min_mask).reduceLanes(VectorOperators.MIN);
// Tail loop
for (; i < a.length; i++) {
int ai = a[i];
if (ai < min) {
min = ai;
index = i;
}
}
return index;
}
public static int findI_loop(int[] a, int e) {
for (int i = 0; i < a.length; i++) {
int ai = a[i];
if (ai == e) {
return i;
}
}
return -1;
}
public static int findI_VectorAPI(int[] a, int e) {
var es = IntVector.broadcast(SPECIES_I, e);
int i = 0;
for (; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
IntVector v = IntVector.fromArray(SPECIES_I, a, i);
var mask = v.compare(VectorOperators.EQ, es);
if (mask.anyTrue()) {
var ml = mask.toLong();
return i + Long.numberOfTrailingZeros(ml);
}
}
for (; i < a.length; i++) {
int ai = a[i];
if (ai == e) {
return i;
}
}
return -1;
}
public static Object reverseI_loop(int[] a, int[] r) {
for (int i = 0; i < a.length; i++) {
r[a.length - i - 1] = a[i];
}
return r;
}
private static final VectorShuffle<Integer> REVERSE_SHUFFLE_I = SPECIES_I.iotaShuffle(SPECIES_I.length()-1, -1, true);
public static Object reverseI_VectorAPI(int[] a, int[] r) {
int i = 0;
for (; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
IntVector v = IntVector.fromArray(SPECIES_I, a, i);
v = v.rearrange(REVERSE_SHUFFLE_I);
v.intoArray(r, r.length - SPECIES_I.length() - i);
}
for (; i < a.length; i++) {
r[a.length - i - 1] = a[i];
}
return r;
}
public static Object filterI_loop(int[] a, int[] r, int threshold) {
int j = 0;
for (int i = 0; i < a.length; i++) {
int ai = a[i];
if (ai >= threshold) {
r[j++] = ai;
}
}
// Just force the resulting length onto the same array.
r[r.length - 1] = j;
return r;
}
public static Object filterI_VectorAPI(int[] a, int[] r, int threshold) {
var thresholds = IntVector.broadcast(SPECIES_I, threshold);
int j = 0;
int i = 0;
for (; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
IntVector v = IntVector.fromArray(SPECIES_I, a, i);
var mask = v.compare(VectorOperators.GE, thresholds);
v = v.compress(mask);
int trueCount = mask.trueCount();
var prefixMask = mask.compress();
v.intoArray(r, j, prefixMask);
j += trueCount;
}
for (; i < a.length; i++) {
int ai = a[i];
if (ai >= threshold) {
r[j++] = ai;
}
}
// Just force the resulting length onto the same array.
r[r.length - 1] = j;
return r;
}
// X4: ints simulate 4-byte oops.
// oops: if non-zero (= non-null), every entry simpulates a 4-byte oop, pointing into mem.
// mem: an int array that simulates the memory.
//
// Task: Find all non-null oops, and dereference them, get the relevant field.
// Objects have 16 bytes, and the relevant field is at bytes 12-16.
// That maps to 4 ints, and the relevant field is the 4th element of 4.
// Sum up all the field values.
public static int reduceAddIFieldsX4_loop(int[] oops, int[] mem) {
int sum = 0;
for (int i = 0; i < oops.length; i++) {
int oop = oops[i];
if (oop != 0) {
int fieldValue = mem[oop + 3]; // oop+12
sum += fieldValue;
}
}
return sum;
}
public static int reduceAddIFieldsX4_VectorAPI(int[] oops, int[] mem) {
var nulls = IntVector.broadcast(SPECIES_I, 0);
var acc = IntVector.broadcast(SPECIES_I, 0);
int i = 0;
for (; i < SPECIES_I.loopBound(oops.length); i += SPECIES_I.length()) {
var oopv = IntVector.fromArray(SPECIES_I, oops, i);
var mask = oopv.compare(VectorOperators.NE, nulls);
// We are lucky today: we need to access mem[oop + 3]
var fieldValues = IntVector.fromArray(SPECIES_I, mem, 3, oops, i, mask);
acc = acc.add(fieldValues);
}
int sum = acc.reduceLanes(VectorOperators.ADD);
for (; i < oops.length; i++) {
int oop = oops[i];
if (oop != 0) {
int fieldValue = mem[oop + 3]; // oop+12
sum += fieldValue;
}
}
return sum;
}
}