/* * Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. * */ package compiler.vectorization; import java.util.Arrays; import jdk.incubator.vector.*; /** * The code below is supposed to be an exact copy of: * micro/org/openjdk/bench/vm/compiler/VectorAlgorithmsImpl.java */ public class VectorAlgorithmsImpl { private static final VectorSpecies SPECIES_I = IntVector.SPECIES_PREFERRED; private static final VectorSpecies SPECIES_I512 = IntVector.SPECIES_512; private static final VectorSpecies SPECIES_I256 = IntVector.SPECIES_256; private static final VectorSpecies SPECIES_B64 = ByteVector.SPECIES_64; private static final VectorSpecies SPECIES_F = FloatVector.SPECIES_PREFERRED; public static Object fillI_loop(int[] r) { for (int i = 0; i < r.length; i++) { r[i] = 42; } return r; } public static Object fillI_Arrays(int[] r) { Arrays.fill(r, 42); return r; } public static Object fillI_VectorAPI(int[] r) { var v = IntVector.broadcast(SPECIES_I, 42); int i = 0; for (; i < SPECIES_I.loopBound(r.length); i += SPECIES_I.length()) { v.intoArray(r, i); } for (; i < r.length; i++) { r[i] = 42; } return r; } public static Object iotaI_loop(int[] r) { for (int i = 0; i < r.length; i++) { r[i] = i; } return r; } public static Object iotaI_VectorAPI(int[] r) { var iota = IntVector.broadcast(SPECIES_I, 0).addIndex(1); int i = 0; for (; i < SPECIES_I.loopBound(r.length); i += SPECIES_I.length()) { iota.intoArray(r, i); iota = iota.add(SPECIES_I.length()); } for (; i < r.length; i++) { r[i] = i; } return r; } public static Object copyI_loop(int[] a, int[] r) { for (int i = 0; i < a.length; i++) { r[i] = a[i]; } return r; } public static Object copyI_System_arraycopy(int[] a, int[] r) { System.arraycopy(a, 0, r, 0, a.length); return r; } public static Object copyI_VectorAPI(int[] a, int[] r) { int i = 0; for (; i < SPECIES_I.loopBound(r.length); i += SPECIES_I.length()) { IntVector v = IntVector.fromArray(SPECIES_I, a, i); v.intoArray(r, i); } for (; i < r.length; i++) { r[i] = a[i]; } return r; } public static Object mapI_loop(int[] a, int[] r) { for (int i = 0; i < a.length; i++) { r[i] = a[i] * 42; } return r; } public static Object mapI_VectorAPI(int[] a, int[] r) { int i = 0; for (; i < SPECIES_I.loopBound(r.length); i += SPECIES_I.length()) { IntVector v = IntVector.fromArray(SPECIES_I, a, i); v = v.mul(42); v.intoArray(r, i); } for (; i < r.length; i++) { r[i] = a[i]; } return r; } public static int reduceAddI_loop(int[] a) { int sum = 0; for (int i = 0; i < a.length; i++) { // Relying on simple reduction loop should vectorize since JDK26. sum += a[i]; } return sum; } public static int reduceAddI_reassociate(int[] a) { int sum = 0; int i; for (i = 0; i < a.length - 3; i+=4) { // Unroll 4x, reassociate inside. sum += a[i] + a[i + 1] + a[i + 2] + a[i + 3]; } for (; i < a.length; i++) { // Tail sum += a[i]; } return sum; } public static int reduceAddI_VectorAPI_naive(int[] a) { var sum = 0; int i; for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) { IntVector v = IntVector.fromArray(SPECIES_I, a, i); // reduceLanes in loop is better than scalar performance, but still // relatively slow. sum += v.reduceLanes(VectorOperators.ADD); } for (; i < a.length; i++) { sum += a[i]; } return sum; } public static int reduceAddI_VectorAPI_reduction_after_loop(int[] a) { var acc = IntVector.broadcast(SPECIES_I, 0); int i; for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) { IntVector v = IntVector.fromArray(SPECIES_I, a, i); // Element-wide addition into a vector of partial sums is much faster. // Now, we only need to do a reduceLanes after the loop. // This works because int-addition is associative and commutative. acc = acc.add(v); } int sum = acc.reduceLanes(VectorOperators.ADD); for (; i < a.length; i++) { sum += a[i]; } return sum; } public static float dotProductF_loop(float[] a, float[] b) { float sum = 0; for (int i = 0; i < a.length; i++) { sum += a[i] * b[i]; } return sum; } public static float dotProductF_VectorAPI_naive(float[] a, float[] b) { float sum = 0; int i; for (i = 0; i < SPECIES_F.loopBound(a.length); i += SPECIES_F.length()) { var va = FloatVector.fromArray(SPECIES_F, a, i); var vb = FloatVector.fromArray(SPECIES_F, b, i); sum += va.mul(vb).reduceLanes(VectorOperators.ADD); } for (; i < a.length; i++) { sum += a[i] * b[i]; } return sum; } public static float dotProductF_VectorAPI_reduction_after_loop(float[] a, float[] b) { var sums = FloatVector.broadcast(SPECIES_F, 0.0f); int i; for (i = 0; i < SPECIES_F.loopBound(a.length); i += SPECIES_F.length()) { var va = FloatVector.fromArray(SPECIES_F, a, i); var vb = FloatVector.fromArray(SPECIES_F, b, i); sums = sums.add(va.mul(vb)); } float sum = sums.reduceLanes(VectorOperators.ADD); for (; i < a.length; i++) { sum += a[i] * b[i]; } return sum; } public static int hashCodeB_loop(byte[] a) { int h = 1; for (int i = 0; i < a.length; i++) { h = 31 * h + a[i]; } return h; } public static int hashCodeB_Arrays(byte[] a) { return Arrays.hashCode(a); } // Simplified intrinsic code from C2_MacroAssembler::arrays_hashcode in c2_MacroAssembler_x86.cpp // // Ideas that may help understand the code: // // h(i) = 31 * h(i-1) + a[i] // "unroll" by factor of L=8: // h(i+8) = h(i) * 31^8 + a[i+1] * 31^7 + a[i+2] * 31^6 + ... + a[i+8] * 1 // ----------- ------------------------------------------------ // scalar vector: notice the powers of 31 in reverse // // We notice that we can load a[i+1 .. i+8], then element-wise multiply with // the vector of reversed powers-of-31, and then do reduceLanes(ADD). // But we can do even better: By looking at multiple such 8-unrolled iterations. // Instead of applying the "next" factor of "31^8" to the reduced scalar, we can // already apply it element-wise. That allows us to move the reduction out // of the loop. // // Note: the intrinsic additionally unrolls the loop by a factor of 4, // but we want to keep thins simple for demonstration purposes. // private static int[] REVERSE_POWERS_OF_31 = new int[9]; static { int p = 1; for (int i = REVERSE_POWERS_OF_31.length - 1; i >= 0; i--) { REVERSE_POWERS_OF_31[i] = p; p *= 31; } } public static int hashCodeB_VectorAPI_v1(byte[] a) { int result = 1; // initialValue var vresult = IntVector.zero(SPECIES_I256); int next = REVERSE_POWERS_OF_31[0]; // 31^L var vnext = IntVector.broadcast(SPECIES_I256, next); var vcoef = IntVector.fromArray(SPECIES_I256, REVERSE_POWERS_OF_31, 1); // powers of 2 in reverse int i; for (i = 0; i < SPECIES_B64.loopBound(a.length); i += SPECIES_B64.length()) { // scalar part: result *= 31^L result *= next; // vector part: element-wise apply the next factor and add in the new values. var vb = ByteVector.fromArray(SPECIES_B64, a, i); var vi = vb.castShape(SPECIES_I256, 0); vresult = vresult.mul(vnext).add(vi); } // reduce the partial hashes in the elements, using the reverse list of powers of 2. result += vresult.mul(vcoef).reduceLanes(VectorOperators.ADD); for (; i < a.length; i++) { result = 31 * result + a[i]; } return result; } public static Object scanAddI_loop(int[] a, int[] r) { int sum = 0; for (int i = 0; i < a.length; i++) { sum += a[i]; r[i] = sum; } return r; } public static Object scanAddI_loop_reassociate(int[] a, int[] r) { int sum = 0; int i = 0; for (; i < a.length - 3; i+=4) { // We cut the latency by a factor of 4, but increase the number of additions. int old_sum = sum; int v0 = a[i + 0]; int v1 = a[i + 1]; int v2 = a[i + 2]; int v3 = a[i + 3]; int v01 = v0 + v1; int v23 = v2 + v3; int v0123 = v01 + v23; sum += v0123; r[i + 0] = old_sum + v0; r[i + 1] = old_sum + v01; r[i + 2] = old_sum + v01 + v2; r[i + 3] = old_sum + v0123; } for (; i < a.length; i++) { sum += a[i]; r[i] = sum; } return r; } public static Object scanAddI_VectorAPI_permute_add(int[] a, int[] r) { // Using Naive Parallel Algorithm: Hills and Steele int sum = 0; int xx = 0; // masked later anyway var shf1 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, 0); var shf2 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx, xx, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}, 0); var shf3 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx, xx, xx, xx, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, 0); var shf4 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx, xx, xx, xx, xx, xx, xx, xx, 0, 1, 2, 3, 4, 5, 6, 7}, 0); var mask1 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111110); var mask2 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111100); var mask3 = VectorMask.fromLong(SPECIES_I512, 0b1111111111110000); var mask4 = VectorMask.fromLong(SPECIES_I512, 0b1111111100000000); int i = 0; for (; i < SPECIES_I512.loopBound(a.length); i += SPECIES_I512.length()) { IntVector v = IntVector.fromArray(SPECIES_I512, a, i); v = v.add(v.rearrange(shf1), mask1); v = v.add(v.rearrange(shf2), mask2); v = v.add(v.rearrange(shf3), mask3); v = v.add(v.rearrange(shf4), mask4); v = v.add(sum); v.intoArray(r, i); sum = v.lane(SPECIES_I512.length() - 1); } for (; i < a.length; i++) { sum += a[i]; r[i] = sum; } return r; } public static int findMinIndexI_loop(int[] a) { int min = a[0]; int index = 0; for (int i = 1; i < a.length; i++) { int ai = a[i]; if (ai < min) { min = ai; index = i; } } return index; } public static int findMinIndexI_VectorAPI(int[] a) { // Main approach: have partial results in mins and idxs. var mins = IntVector.broadcast(SPECIES_I, a[0]); var idxs = IntVector.broadcast(SPECIES_I, 0); var iota = IntVector.broadcast(SPECIES_I, 0).addIndex(1); int i = 0; for (; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) { IntVector v = IntVector.fromArray(SPECIES_I, a, i); var mask = v.compare(VectorOperators.LT, mins); mins = mins.blend(v, mask); idxs = idxs.blend(iota, mask); iota = iota.add(SPECIES_I.length()); } // Reduce the vectors down int min = mins.reduceLanes(VectorOperators.MIN); var not_min_mask = mins.compare(VectorOperators.NE, min); int index = idxs.blend(a.length, not_min_mask).reduceLanes(VectorOperators.MIN); // Tail loop for (; i < a.length; i++) { int ai = a[i]; if (ai < min) { min = ai; index = i; } } return index; } public static int findI_loop(int[] a, int e) { for (int i = 0; i < a.length; i++) { int ai = a[i]; if (ai == e) { return i; } } return -1; } public static int findI_VectorAPI(int[] a, int e) { var es = IntVector.broadcast(SPECIES_I, e); int i = 0; for (; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) { IntVector v = IntVector.fromArray(SPECIES_I, a, i); var mask = v.compare(VectorOperators.EQ, es); if (mask.anyTrue()) { var ml = mask.toLong(); return i + Long.numberOfTrailingZeros(ml); } } for (; i < a.length; i++) { int ai = a[i]; if (ai == e) { return i; } } return -1; } public static Object reverseI_loop(int[] a, int[] r) { for (int i = 0; i < a.length; i++) { r[a.length - i - 1] = a[i]; } return r; } private static final VectorShuffle REVERSE_SHUFFLE_I = SPECIES_I.iotaShuffle(SPECIES_I.length()-1, -1, true); public static Object reverseI_VectorAPI(int[] a, int[] r) { int i = 0; for (; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) { IntVector v = IntVector.fromArray(SPECIES_I, a, i); v = v.rearrange(REVERSE_SHUFFLE_I); v.intoArray(r, r.length - SPECIES_I.length() - i); } for (; i < a.length; i++) { r[a.length - i - 1] = a[i]; } return r; } public static Object filterI_loop(int[] a, int[] r, int threshold) { int j = 0; for (int i = 0; i < a.length; i++) { int ai = a[i]; if (ai >= threshold) { r[j++] = ai; } } // Just force the resulting length onto the same array. r[r.length - 1] = j; return r; } public static Object filterI_VectorAPI(int[] a, int[] r, int threshold) { var thresholds = IntVector.broadcast(SPECIES_I, threshold); int j = 0; int i = 0; for (; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) { IntVector v = IntVector.fromArray(SPECIES_I, a, i); var mask = v.compare(VectorOperators.GE, thresholds); v = v.compress(mask); int trueCount = mask.trueCount(); var prefixMask = mask.compress(); v.intoArray(r, j, prefixMask); j += trueCount; } for (; i < a.length; i++) { int ai = a[i]; if (ai >= threshold) { r[j++] = ai; } } // Just force the resulting length onto the same array. r[r.length - 1] = j; return r; } // X4: ints simulate 4-byte oops. // oops: if non-zero (= non-null), every entry simpulates a 4-byte oop, pointing into mem. // mem: an int array that simulates the memory. // // Task: Find all non-null oops, and dereference them, get the relevant field. // Objects have 16 bytes, and the relevant field is at bytes 12-16. // That maps to 4 ints, and the relevant field is the 4th element of 4. // Sum up all the field values. public static int reduceAddIFieldsX4_loop(int[] oops, int[] mem) { int sum = 0; for (int i = 0; i < oops.length; i++) { int oop = oops[i]; if (oop != 0) { int fieldValue = mem[oop + 3]; // oop+12 sum += fieldValue; } } return sum; } public static int reduceAddIFieldsX4_VectorAPI(int[] oops, int[] mem) { var nulls = IntVector.broadcast(SPECIES_I, 0); var acc = IntVector.broadcast(SPECIES_I, 0); int i = 0; for (; i < SPECIES_I.loopBound(oops.length); i += SPECIES_I.length()) { var oopv = IntVector.fromArray(SPECIES_I, oops, i); var mask = oopv.compare(VectorOperators.NE, nulls); // We are lucky today: we need to access mem[oop + 3] var fieldValues = IntVector.fromArray(SPECIES_I, mem, 3, oops, i, mask); acc = acc.add(fieldValues); } int sum = acc.reduceLanes(VectorOperators.ADD); for (; i < oops.length; i++) { int oop = oops[i]; if (oop != 0) { int fieldValue = mem[oop + 3]; // oop+12 sum += fieldValue; } } return sum; } }