jdk/test/hotspot/jtreg/compiler/vectorization/VectorAlgorithmsImpl.java

/*
 *  Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved.
 *  DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 *  This code is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License version 2 only, as
 *  published by the Free Software Foundation.
 *
 *  This code is distributed in the hope that it will be useful, but WITHOUT
 *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 *  version 2 for more details (a copy is included in the LICENSE file that
 *  accompanied this code).
 *
 *  You should have received a copy of the GNU General Public License version
 *  2 along with this work; if not, write to the Free Software Foundation,
 *  Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 *  Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 *  or visit www.oracle.com if you need additional information or have any
 *  questions.
 *
 */

package compiler.vectorization;

import java.util.Arrays;
import jdk.incubator.vector.*;

/**
 * A collection of small array algorithms, each implemented several times:
 * as a plain scalar loop, sometimes via a JDK library call, and by hand with
 * the Vector API. All variants of one algorithm are meant to produce the same
 * result for the same input.
 *
 * The code below is supposed to be an exact copy of:
 *   micro/org/openjdk/bench/vm/compiler/VectorAlgorithmsImpl.java
 * Any change made here must therefore be mirrored in that file.
 *
 * Conventions used by the methods below:
 * - Methods that write into a result array {@code r} return that same array
 *   (typed as {@code Object}).
 * - Vector API variants process the bulk of the input in a vector loop bounded
 *   by {@code loopBound(...)} and handle the remaining elements in a scalar
 *   tail loop.
 */
public class VectorAlgorithmsImpl {
    private static final VectorSpecies<Integer> SPECIES_I    = IntVector.SPECIES_PREFERRED;
    private static final VectorSpecies<Integer> SPECIES_I512 = IntVector.SPECIES_512;
    private static final VectorSpecies<Integer> SPECIES_I256 = IntVector.SPECIES_256;
    private static final VectorSpecies<Byte> SPECIES_B64     = ByteVector.SPECIES_64;
    private static final VectorSpecies<Float> SPECIES_F      = FloatVector.SPECIES_PREFERRED;

    /**
     * Fills every element of {@code r} with 42, using a scalar loop.
     *
     * @param r array to fill
     * @return {@code r}
     */
    public static Object fillI_loop(int[] r) {
        for (int i = 0; i < r.length; i++) {
            r[i] = 42;
        }
        return r;
    }

    /**
     * Fills every element of {@code r} with 42, using {@link Arrays#fill(int[], int)}.
     *
     * @param r array to fill
     * @return {@code r}
     */
    public static Object fillI_Arrays(int[] r) {
        Arrays.fill(r, 42);
        return r;
    }

    /**
     * Fills every element of {@code r} with 42, using the Vector API.
     *
     * @param r array to fill
     * @return {@code r}
     */
    public static Object fillI_VectorAPI(int[] r) {
        var v = IntVector.broadcast(SPECIES_I, 42);
        int i = 0;
        for (; i < SPECIES_I.loopBound(r.length); i += SPECIES_I.length()) {
            v.intoArray(r, i);
        }
        // Tail
        for (; i < r.length; i++) {
            r[i] = 42;
        }
        return r;
    }

    /**
     * Writes the sequence 0, 1, 2, ... into {@code r}, i.e. {@code r[i] = i},
     * using a scalar loop.
     *
     * @param r array to fill
     * @return {@code r}
     */
    public static Object iotaI_loop(int[] r) {
        for (int i = 0; i < r.length; i++) {
            r[i] = i;
        }
        return r;
    }

    /**
     * Writes the sequence 0, 1, 2, ... into {@code r}, i.e. {@code r[i] = i},
     * using the Vector API.
     *
     * @param r array to fill
     * @return {@code r}
     */
    public static Object iotaI_VectorAPI(int[] r) {
        // Lane j starts with value j; every iteration advances all lanes by the vector length.
        var iota = IntVector.broadcast(SPECIES_I, 0).addIndex(1);
        int i = 0;
        for (; i < SPECIES_I.loopBound(r.length); i += SPECIES_I.length()) {
            iota.intoArray(r, i);
            iota = iota.add(SPECIES_I.length());
        }
        // Tail
        for (; i < r.length; i++) {
            r[i] = i;
        }
        return r;
    }

    /**
     * Copies {@code a} into {@code r} element by element, using a scalar loop.
     * The loop runs over {@code a.length} elements.
     *
     * @param a source array
     * @param r destination array
     * @return {@code r}
     */
    public static Object copyI_loop(int[] a, int[] r) {
        for (int i = 0; i < a.length; i++) {
            r[i] = a[i];
        }
        return r;
    }

    /**
     * Copies {@code a} into {@code r}, using {@link System#arraycopy}.
     * Copies {@code a.length} elements.
     *
     * @param a source array
     * @param r destination array
     * @return {@code r}
     */
    public static Object copyI_System_arraycopy(int[] a, int[] r) {
        System.arraycopy(a, 0, r, 0, a.length);
        return r;
    }

    /**
     * Copies {@code a} into {@code r}, using the Vector API.
     *
     * NOTE: unlike {@link #copyI_loop}, the loops here are bounded by
     * {@code r.length}. The two variants only agree when both arrays have
     * the same length.
     *
     * @param a source array
     * @param r destination array
     * @return {@code r}
     */
    public static Object copyI_VectorAPI(int[] a, int[] r) {
        int i = 0;
        for (; i < SPECIES_I.loopBound(r.length); i += SPECIES_I.length()) {
            IntVector v = IntVector.fromArray(SPECIES_I, a, i);
            v.intoArray(r, i);
        }
        // Tail
        for (; i < r.length; i++) {
            r[i] = a[i];
        }
        return r;
    }

    /**
     * Computes {@code r[i] = a[i] * 42} for every element, using a scalar loop.
     * The loop runs over {@code a.length} elements.
     *
     * @param a input array
     * @param r result array
     * @return {@code r}
     */
    public static Object mapI_loop(int[] a, int[] r) {
        for (int i = 0; i < a.length; i++) {
            r[i] = a[i] * 42;
        }
        return r;
    }

    /**
     * Computes {@code r[i] = a[i] * 42} for every element, using the Vector API.
     *
     * NOTE: unlike {@link #mapI_loop}, the loops here are bounded by
     * {@code r.length}. The two variants only agree when both arrays have
     * the same length.
     *
     * @param a input array
     * @param r result array
     * @return {@code r}
     */
    public static Object mapI_VectorAPI(int[] a, int[] r) {
        int i = 0;
        for (; i < SPECIES_I.loopBound(r.length); i += SPECIES_I.length()) {
            IntVector v = IntVector.fromArray(SPECIES_I, a, i);
            v = v.mul(42);
            v.intoArray(r, i);
        }
        // Tail: must apply the same mapping as the vector loop above.
        for (; i < r.length; i++) {
            r[i] = a[i] * 42;
        }
        return r;
    }

    /**
     * Sums all elements of {@code a} with a simple scalar reduction loop.
     * Int overflow wraps around.
     *
     * @param a input array
     * @return sum of all elements
     */
    public static int reduceAddI_loop(int[] a) {
        int sum = 0;
        for (int i = 0; i < a.length; i++) {
            // Relying on simple reduction loop should vectorize since JDK26.
            sum += a[i];
        }
        return sum;
    }

    /**
     * Sums all elements of {@code a} with a scalar loop that is manually
     * unrolled by 4 and reassociated. Int overflow wraps around.
     *
     * @param a input array
     * @return sum of all elements
     */
    public static int reduceAddI_reassociate(int[] a) {
        int sum = 0;
        int i;
        for (i = 0; i < a.length - 3; i+=4) {
            // Unroll 4x, reassociate inside.
            sum += a[i] + a[i + 1] + a[i + 2] + a[i + 3];
        }
        for (; i < a.length; i++) {
            // Tail
            sum += a[i];
        }
        return sum;
    }

    /**
     * Sums all elements of {@code a} with the Vector API, reducing every
     * loaded vector to a scalar inside the loop.
     *
     * @param a input array
     * @return sum of all elements
     */
    public static int reduceAddI_VectorAPI_naive(int[] a) {
        var sum = 0;
        int i;
        for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
            IntVector v = IntVector.fromArray(SPECIES_I, a, i);
            // reduceLanes in loop is better than scalar performance, but still
            // relatively slow.
            sum += v.reduceLanes(VectorOperators.ADD);
        }
        // Tail
        for (; i < a.length; i++) {
            sum += a[i];
        }
        return sum;
    }

    /**
     * Sums all elements of {@code a} with the Vector API, accumulating
     * per-lane partial sums in the loop and reducing them only once after
     * the loop.
     *
     * @param a input array
     * @return sum of all elements
     */
    public static int reduceAddI_VectorAPI_reduction_after_loop(int[] a) {
        var acc = IntVector.broadcast(SPECIES_I, 0);
        int i;
        for (i = 0; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
            IntVector v = IntVector.fromArray(SPECIES_I, a, i);
            // Element-wise addition into a vector of partial sums is much faster.
            // Now, we only need to do a reduceLanes after the loop.
            // This works because int-addition is associative and commutative.
            acc = acc.add(v);
        }
        int sum = acc.reduceLanes(VectorOperators.ADD);
        // Tail
        for (; i < a.length; i++) {
            sum += a[i];
        }
        return sum;
    }

    /**
     * Computes the dot product of {@code a} and {@code b} with a scalar loop,
     * adding the products strictly in index order. The loop runs over
     * {@code a.length} elements.
     *
     * @param a first input array
     * @param b second input array
     * @return sum over i of {@code a[i] * b[i]}
     */
    public static float dotProductF_loop(float[] a, float[] b) {
        float sum = 0;
        for (int i = 0; i < a.length; i++) {
            sum += a[i] * b[i];
        }
        return sum;
    }

    /**
     * Computes the dot product of {@code a} and {@code b} with the Vector API,
     * reducing every product vector to a scalar inside the loop.
     *
     * NOTE: the additions happen in a different order than in
     * {@link #dotProductF_loop}. Float addition is not associative, so the
     * result may differ from the scalar loop by rounding.
     *
     * @param a first input array
     * @param b second input array
     * @return sum over i of {@code a[i] * b[i]}
     */
    public static float dotProductF_VectorAPI_naive(float[] a, float[] b) {
        float sum = 0;
        int i;
        for (i = 0; i < SPECIES_F.loopBound(a.length); i += SPECIES_F.length()) {
            var va = FloatVector.fromArray(SPECIES_F, a, i);
            var vb = FloatVector.fromArray(SPECIES_F, b, i);
            sum += va.mul(vb).reduceLanes(VectorOperators.ADD);
        }
        // Tail
        for (; i < a.length; i++) {
            sum += a[i] * b[i];
        }
        return sum;
    }

    /**
     * Computes the dot product of {@code a} and {@code b} with the Vector API,
     * accumulating per-lane partial sums in the loop and reducing them only
     * once after the loop.
     *
     * NOTE: the additions happen in a different order than in
     * {@link #dotProductF_loop}. Float addition is not associative, so the
     * result may differ from the scalar loop by rounding.
     *
     * @param a first input array
     * @param b second input array
     * @return sum over i of {@code a[i] * b[i]}
     */
    public static float dotProductF_VectorAPI_reduction_after_loop(float[] a, float[] b) {
        var sums = FloatVector.broadcast(SPECIES_F, 0.0f);
        int i;
        for (i = 0; i < SPECIES_F.loopBound(a.length); i += SPECIES_F.length()) {
            var va = FloatVector.fromArray(SPECIES_F, a, i);
            var vb = FloatVector.fromArray(SPECIES_F, b, i);
            sums = sums.add(va.mul(vb));
        }
        float sum = sums.reduceLanes(VectorOperators.ADD);
        // Tail
        for (; i < a.length; i++) {
            sum += a[i] * b[i];
        }
        return sum;
    }

    /**
     * Computes the polynomial hash {@code h = 31 * h + a[i]} over all bytes
     * of {@code a}, starting with {@code h = 1}, using a scalar loop.
     *
     * @param a input bytes
     * @return the hash value
     */
    public static int hashCodeB_loop(byte[] a) {
        int h = 1;
        for (int i = 0; i < a.length; i++) {
            h = 31 * h + a[i];
        }
        return h;
    }

    /**
     * Computes the hash of {@code a} using {@link Arrays#hashCode(byte[])}.
     *
     * @param a input bytes
     * @return the hash value
     */
    public static int hashCodeB_Arrays(byte[] a) {
        return Arrays.hashCode(a);
    }

    // Simplified intrinsic code from C2_MacroAssembler::arrays_hashcode in c2_MacroAssembler_x86.cpp
    //
    // Ideas that may help understand the code:
    //   h(i) = 31 * h(i-1) + a[i]
    // "unroll" by factor of L=8:
    //   h(i+8) = h(i) * 31^8 + a[i+1] * 31^7 + a[i+2] * 31^6 + ... + a[i+8] * 1
    //            -----------   ------------------------------------------------
    //            scalar        vector: notice the powers of 31 in reverse
    //
    // We notice that we can load a[i+1 .. i+8], then element-wise multiply with
    // the vector of reversed powers-of-31, and then do reduceLanes(ADD).
    // But we can do even better: By looking at multiple such 8-unrolled iterations.
    // Instead of applying the "next" factor of "31^8" to the reduced scalar, we can
    // already apply it element-wise. That allows us to move the reduction out
    // of the loop.
    //
    // Note: the intrinsic additionally unrolls the loop by a factor of 4,
    //       but we want to keep things simple for demonstration purposes.
    //
    // Table layout: REVERSE_POWERS_OF_31[k] = 31^(8-k), i.e. {31^8, 31^7, ..., 31^1, 31^0}.
    private static final int[] REVERSE_POWERS_OF_31 = new int[9];
    static {
        int p = 1;
        for (int i = REVERSE_POWERS_OF_31.length - 1; i >= 0; i--) {
            REVERSE_POWERS_OF_31[i] = p;
            p *= 31;
        }
    }

    /**
     * Computes the same hash as {@link #hashCodeB_loop} with the Vector API,
     * processing 8 bytes per iteration: a 64-bit byte vector is widened to a
     * 256-bit int vector (see the explanation in the comment above).
     *
     * @param a input bytes
     * @return the hash value
     */
    public static int hashCodeB_VectorAPI_v1(byte[] a) {
        int result = 1; // initialValue
        var vresult = IntVector.zero(SPECIES_I256);
        int next = REVERSE_POWERS_OF_31[0]; // 31^L
        var vcoef = IntVector.fromArray(SPECIES_I256, REVERSE_POWERS_OF_31, 1); // powers of 31 in reverse
        int i;
        for (i = 0; i < SPECIES_B64.loopBound(a.length); i += SPECIES_B64.length()) {
            // scalar part: result *= 31^L
            result *= next;
            // vector part: element-wise apply the next factor and add in the new values.
            var vb = ByteVector.fromArray(SPECIES_B64, a, i);
            var vi = vb.castShape(SPECIES_I256, 0);
            vresult = vresult.mul(next).add(vi);
        }
        // reduce the partial hashes in the elements, using the reverse list of powers of 31.
        result += vresult.mul(vcoef).reduceLanes(VectorOperators.ADD);
        // Tail
        for (; i < a.length; i++) {
            result = 31 * result + a[i];
        }
        return result;
    }

    // This second approach follows the idea from this blog post by Otmar Ertl:
    // https://www.dynatrace.com/news/blog/java-arrays-hashcode-byte-efficiency-techniques/
    //
    // I simplified the algorithm a little, so that it is a bit closer
    // to the solution "v1" above.
    //
    // The major issue with "v1" is that we cannot load a full vector of bytes,
    // because of the cast to ints. So we can only fill 1/4 of the maximal
    // vector size. The trick here is to do an unrolling of factor 4, from:
    //   h(i) = 31 * h(i-1) + a[i]
    // to:
    //   h(i+4) = h(i) * 31^4 + a[i + 1] * 31^3
    //                        + a[i + 2] * 31^2
    //                        + a[i + 3] * 31^1
    //                        + a[i + 4] * 31^0
    // The goal is now to compute this value for 4 bytes within a 4 byte
    // lane of the vector. One concern is that we start with byte values,
    // but need to do int-multiplication with powers of 31. If we instead
    // did a byte-multiplication, we could get overflows that we would not
    // have had in the int-multiplication.
    // One trick that helps with changing the size of the lanes from byte
    // to short to int is doing all operations with unsigned integers. That
    // way, we can zero-extend instead of sign-bit extend. The first step
    // is thus to convert the bytes into unsigned values. Since byte is in
    // range [-128..128), doing "a[i+j] + 128" makes it a positive value,
    // allowing for unsigned multiplication.
    // h(i+4) = h(i) * 31^4 +   a[i + 1]              * 31^3
    //                      +   a[i + 2]              * 31^2
    //                      +   a[i + 3]              * 31^1
    //                      +   a[i + 4]              * 31^0
    //        = h(i) * 31^4 +  (a[i + 1] + 128 - 128) * 31^3
    //                      +  (a[i + 2] + 128 - 128) * 31^2
    //                      +  (a[i + 3] + 128 - 128) * 31^1
    //                      +  (a[i + 4] + 128 - 128) * 31^0
    //        = h(i) * 31^4 +  (a[i + 1] + 128      ) * 31^3
    //                      +  (a[i + 2] + 128      ) * 31^2
    //                      +  (a[i + 3] + 128      ) * 31^1
    //                      +  (a[i + 4] + 128      ) * 31^0
    //                      +  -128 * (31^3 + 31^2 + 31^1 + 1)
    //        = h(i) * 31^4 + ((a[i + 1] + 128      ) * 31
    //                      +  (a[i + 2] + 128      )) * 31^2
    //                      + ((a[i + 3] + 128      ) * 31
    //                      +  (a[i + 4] + 128      ))
    //                      +  -128 * (31^3 + 31^2 + 31^1 + 1)
    //
    // Getting from the signed a[i] value to unsigned with +128, we can
    // just xor with 0x80=128. Any numbers there in range [-128..0) are
    // now in range [0..128). And any numbers that were in range [0..128)
    // are now in unsigned range [128..256). What a neat trick!
    //
    // We then apply a byte->short transition where we crunch 2 bytes
    // into one short, applying a multiplication with 31 to one of the
    // two bytes. This multiplication cannot overflow in a short.
    // Then we apply a short->int transition where we crunch 2 shorts
    // into one int, applying a multiplication with 31^2 to one of the
    // two shorts. This multiplication cannot overflow in an int.
    //

    /**
     * Computes the same hash as {@link #hashCodeB_loop} with the Vector API,
     * loading full byte vectors and combining groups of 4 bytes inside each
     * int lane (see the explanation in the comment above).
     *
     * @param a input bytes
     * @return the hash value
     */
    public static int hashCodeB_VectorAPI_v2(byte[] a) {
        return HashCodeB_VectorAPI_V2.compute(a);
    }

    /**
     * Holder for the constants and the implementation of
     * {@link #hashCodeB_VectorAPI_v2}. The vector shape is chosen in the
     * static initializers of this nested class.
     */
    private static class HashCodeB_VectorAPI_V2 {
        // Number of bytes processed per iteration: limited by both the preferred
        // byte vector and 4x the preferred int vector lane count.
        private static final int L = Math.min(ByteVector.SPECIES_PREFERRED.length(),
                                              IntVector.SPECIES_PREFERRED.length() * 4);
        private static final VectorShape SHAPE = VectorShape.forBitSize(8 * L);
        private static final VectorSpecies<Byte>    SPECIES_B = SHAPE.withLanes(byte.class);
        private static final VectorSpecies<Integer> SPECIES_I = SHAPE.withLanes(int.class);

        // Table layout: {31^L, 31^(L-4), ..., 31^4, 31^0}, i.e. powers of 31^4 in reverse.
        private static final int[] REVERSE_POWERS_OF_31_STEP_4 = new int[L / 4 + 1];
        static {
            int p = 1;
            int step = 31 * 31 * 31 * 31; // step by 4
            for (int i = REVERSE_POWERS_OF_31_STEP_4.length - 1; i >= 0; i--) {
                REVERSE_POWERS_OF_31_STEP_4[i] = p;
                p *= step;
            }
        }

        /**
         * Computes the hash of {@code a}; see {@link #hashCodeB_VectorAPI_v2}.
         *
         * @param a input bytes
         * @return the hash value
         */
        public static int compute(byte[] a) {
            int result = 1; // initialValue
            int next = REVERSE_POWERS_OF_31_STEP_4[0]; // 31^L
            // Per-lane weights for the final reduction: 31^(L-4), ..., 31^4, 31^0.
            var vcoef = IntVector.fromArray(SPECIES_I, REVERSE_POWERS_OF_31_STEP_4, 1);
            var vresult = IntVector.zero(SPECIES_I);
            int i;
            for (i = 0; i < SPECIES_B.loopBound(a.length); i += SPECIES_B.length()) {
                var vb = ByteVector.fromArray(SPECIES_B, a, i);
                // Add 128 to each byte.
                var vs = vb.lanewise(VectorOperators.XOR, (byte)0x80)
                           .reinterpretAsShorts();
                // Each short lane contains 2 bytes, crunch them.
                // The lower byte is treated as the earlier array element, hence it gets the factor 31.
                var vi = vs.and((short)0xff) // lower byte
                           .mul((short)31)
                           .add(vs.lanewise(VectorOperators.LSHR, 8)) // upper byte
                           .reinterpretAsInts();
                // Each int contains 2 shorts, crunch them.
                var v  = vi.and(0xffff) // lower short
                           .mul(31 * 31)
                           .add(vi.lanewise(VectorOperators.LSHR, 16)); // upper short
                // Add the correction for the 128 additions above.
                v = v.add(-128 * (31*31*31 + 31*31 + 31 + 1));
                // Every element of v now contains a crunched int-package of 4 bytes.
                result *= next;
                vresult = vresult.mul(next).add(v);
            }
            result += vresult.mul(vcoef).reduceLanes(VectorOperators.ADD);
            // Tail
            for (; i < a.length; i++) {
                result = 31 * result + a[i];
            }
            return result;
        }
    }

    /**
     * Computes the inclusive prefix sum of {@code a} into {@code r}, i.e.
     * {@code r[i] = a[0] + ... + a[i]}, using a scalar loop.
     *
     * @param a input array
     * @param r result array
     * @return {@code r}
     */
    public static Object scanAddI_loop(int[] a, int[] r) {
        int sum = 0;
        for (int i = 0; i < a.length; i++) {
            sum += a[i];
            r[i] = sum;
        }
        return r;
    }

    /**
     * Computes the inclusive prefix sum of {@code a} into {@code r} with a
     * scalar loop that is manually unrolled by 4 and reassociated.
     *
     * @param a input array
     * @param r result array
     * @return {@code r}
     */
    public static Object scanAddI_loop_reassociate(int[] a, int[] r) {
        int sum = 0;
        int i = 0;
        for (; i < a.length - 3; i+=4) {
            // We cut the latency by a factor of 4, but increase the number of additions.
            int old_sum = sum;
            int v0 = a[i + 0];
            int v1 = a[i + 1];
            int v2 = a[i + 2];
            int v3 = a[i + 3];
            int v01 = v0 + v1;
            int v23 = v2 + v3;
            int v0123 = v01 + v23;
            sum += v0123;
            r[i + 0] = old_sum + v0;
            r[i + 1] = old_sum + v01;
            r[i + 2] = old_sum + v01 + v2;
            r[i + 3] = old_sum + v0123;
        }
        // Tail
        for (; i < a.length; i++) {
            sum += a[i];
            r[i] = sum;
        }
        return r;
    }

    /**
     * Computes the inclusive prefix sum of {@code a} into {@code r} with the
     * Vector API, using a fixed 512-bit int species (16 lanes). Inside each
     * vector the prefix sum is built in 4 shift-and-add steps with shift
     * distances 1, 2, 4 and 8; the running total of all previous vectors is
     * then added to every lane.
     *
     * @param a input array
     * @param r result array
     * @return {@code r}
     */
    public static Object scanAddI_VectorAPI_permute_add(int[] a, int[] r) {
        // Using Naive Parallel Algorithm: Hillis and Steele
        int sum = 0;
        int xx = 0; // masked later anyway
        // shfK moves every lane up by 1, 2, 4, 8 positions respectively.
        var shf1 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14}, 0);
        var shf2 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx, xx,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13}, 0);
        var shf3 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx, xx, xx, xx,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11}, 0);
        var shf4 = VectorShuffle.fromArray(SPECIES_I512, new int[]{xx, xx, xx, xx, xx, xx, xx, xx,  0,  1,  2,  3,  4,  5,  6,  7}, 0);
        // maskK excludes the low lanes that have no source lane in shfK (the "xx" entries).
        var mask1 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111110);
        var mask2 = VectorMask.fromLong(SPECIES_I512, 0b1111111111111100);
        var mask3 = VectorMask.fromLong(SPECIES_I512, 0b1111111111110000);
        var mask4 = VectorMask.fromLong(SPECIES_I512, 0b1111111100000000);
        int i = 0;
        for (; i < SPECIES_I512.loopBound(a.length); i += SPECIES_I512.length()) {
            IntVector v = IntVector.fromArray(SPECIES_I512, a, i);
            v = v.add(v.rearrange(shf1), mask1);
            v = v.add(v.rearrange(shf2), mask2);
            v = v.add(v.rearrange(shf3), mask3);
            v = v.add(v.rearrange(shf4), mask4);
            // Add the running total of everything before this vector.
            v = v.add(sum);
            v.intoArray(r, i);
            // The last lane now holds the total up to and including this vector.
            sum = v.lane(SPECIES_I512.length() - 1);
        }
        // Tail
        for (; i < a.length; i++) {
            sum += a[i];
            r[i] = sum;
        }
        return r;
    }

    /**
     * Finds the index of the first occurrence of the minimum of {@code a},
     * using a scalar loop. Reads {@code a[0]} unconditionally, so it throws
     * {@link ArrayIndexOutOfBoundsException} for an empty array.
     *
     * @param a input array, must not be empty
     * @return index of the first minimal element
     */
    public static int findMinIndexI_loop(int[] a) {
        int min = a[0];
        int index = 0;
        for (int i = 1; i < a.length; i++) {
            int ai = a[i];
            if (ai < min) {
                min = ai;
                index = i;
            }
        }
        return index;
    }

    /**
     * Finds the index of the first occurrence of the minimum of {@code a},
     * using the Vector API. Reads {@code a[0]} unconditionally, so it throws
     * {@link ArrayIndexOutOfBoundsException} for an empty array.
     *
     * @param a input array, must not be empty
     * @return index of the first minimal element
     */
    public static int findMinIndexI_VectorAPI(int[] a) {
        // Main approach: have partial results in mins and idxs.
        var mins = IntVector.broadcast(SPECIES_I, a[0]);
        var idxs = IntVector.broadcast(SPECIES_I, 0);
        var iota = IntVector.broadcast(SPECIES_I, 0).addIndex(1);
        int i = 0;
        for (; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
            IntVector v = IntVector.fromArray(SPECIES_I, a, i);
            // Strict "less than": a lane keeps the earliest index of its minimum.
            var mask = v.compare(VectorOperators.LT, mins);
            mins = mins.blend(v, mask);
            idxs = idxs.blend(iota, mask);
            iota = iota.add(SPECIES_I.length());
        }
        // Reduce the vectors down
        int min = mins.reduceLanes(VectorOperators.MIN);
        var not_min_mask = mins.compare(VectorOperators.NE, min);
        // Lanes that do not hold the global minimum get index a.length, so they
        // cannot win the MIN reduction over the indices.
        int index = idxs.blend(a.length, not_min_mask).reduceLanes(VectorOperators.MIN);
        // Tail loop
        for (; i < a.length; i++) {
            int ai = a[i];
            if (ai < min) {
                min = ai;
                index = i;
            }
        }
        return index;
    }

    /**
     * Finds the index of the first element of {@code a} equal to {@code e},
     * using a scalar loop.
     *
     * @param a input array
     * @param e value to search for
     * @return index of the first match, or -1 if there is none
     */
    public static int findI_loop(int[] a, int e) {
        for (int i = 0; i < a.length; i++) {
            int ai = a[i];
            if (ai == e) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Finds the index of the first element of {@code a} equal to {@code e},
     * using the Vector API.
     *
     * @param a input array
     * @param e value to search for
     * @return index of the first match, or -1 if there is none
     */
    public static int findI_VectorAPI(int[] a, int e) {
        var es = IntVector.broadcast(SPECIES_I, e);
        int i = 0;
        for (; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
            IntVector v = IntVector.fromArray(SPECIES_I, a, i);
            var mask = v.compare(VectorOperators.EQ, es);
            if (mask.anyTrue()) {
                // The lowest set bit of the mask corresponds to the first matching lane.
                var ml = mask.toLong();
                return i + Long.numberOfTrailingZeros(ml);
            }
        }
        // Tail
        for (; i < a.length; i++) {
            int ai = a[i];
            if (ai == e) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Writes the elements of {@code a} into {@code r} in reverse order, i.e.
     * {@code r[a.length - i - 1] = a[i]}, using a scalar loop.
     *
     * @param a input array
     * @param r result array
     * @return {@code r}
     */
    public static Object reverseI_loop(int[] a, int[] r) {
        for (int i = 0; i < a.length; i++) {
            r[a.length - i - 1] = a[i];
        }
        return r;
    }

    // Shuffle that reverses the lane order: lane j reads from lane (length - 1 - j).
    private static final VectorShuffle<Integer> REVERSE_SHUFFLE_I = SPECIES_I.iotaShuffle(SPECIES_I.length()-1, -1, true);

    /**
     * Writes the elements of {@code a} into {@code r} in reverse order, using
     * the Vector API.
     *
     * NOTE: the vector loop computes the store position from {@code r.length},
     * whereas the tail loop and {@link #reverseI_loop} use {@code a.length}.
     * The variants only agree when both arrays have the same length.
     *
     * @param a input array
     * @param r result array
     * @return {@code r}
     */
    public static Object reverseI_VectorAPI(int[] a, int[] r) {
        int i = 0;
        for (; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
            IntVector v = IntVector.fromArray(SPECIES_I, a, i);
            v = v.rearrange(REVERSE_SHUFFLE_I);
            // The vector read at offset i from the front is stored at offset i from the back.
            v.intoArray(r, r.length - SPECIES_I.length() - i);
        }
        // Tail
        for (; i < a.length; i++) {
            r[a.length - i - 1] = a[i];
        }
        return r;
    }

    /**
     * Copies all elements of {@code a} that are {@code >= threshold} to the
     * front of {@code r}, preserving their order, using a scalar loop. The
     * number of copied elements is then stored in the last slot of {@code r},
     * overwriting whatever was there.
     *
     * @param a input array
     * @param r result array
     * @param threshold elements greater than or equal to this value are kept
     * @return {@code r}
     */
    public static Object filterI_loop(int[] a, int[] r, int threshold) {
        int j = 0;
        for (int i = 0; i < a.length; i++) {
            int ai = a[i];
            if (ai >= threshold) {
                r[j++] = ai;
            }
        }
        // Just force the resulting length onto the same array.
        r[r.length - 1] = j;
        return r;
    }

    /**
     * Copies all elements of {@code a} that are {@code >= threshold} to the
     * front of {@code r}, preserving their order, using the Vector API. The
     * number of copied elements is then stored in the last slot of {@code r},
     * overwriting whatever was there.
     *
     * @param a input array
     * @param r result array
     * @param threshold elements greater than or equal to this value are kept
     * @return {@code r}
     */
    public static Object filterI_VectorAPI(int[] a, int[] r, int threshold) {
        var thresholds = IntVector.broadcast(SPECIES_I, threshold);
        int j = 0;
        int i = 0;
        for (; i < SPECIES_I.loopBound(a.length); i += SPECIES_I.length()) {
            IntVector v = IntVector.fromArray(SPECIES_I, a, i);
            var mask = v.compare(VectorOperators.GE, thresholds);
            // Pack the selected lanes to the front of the vector ...
            v = v.compress(mask);
            int trueCount = mask.trueCount();
            // ... and store only that packed prefix.
            var prefixMask = mask.compress();
            v.intoArray(r, j, prefixMask);
            j += trueCount;
        }

        // Tail
        for (; i < a.length; i++) {
            int ai = a[i];
            if (ai >= threshold) {
                r[j++] = ai;
            }
        }
        // Just force the resulting length onto the same array.
        r[r.length - 1] = j;
        return r;
    }

    // X4: ints simulate 4-byte oops.
    // oops: if non-zero (= non-null), every entry simulates a 4-byte oop, pointing into mem.
    // mem: an int array that simulates the memory.
    //
    // Task: Find all non-null oops, and dereference them, get the relevant field.
    //       Objects have 16 bytes, and the relevant field is at bytes 12-16.
    //       That maps to 4 ints, and the relevant field is the 4th element of 4.
    //       Sum up all the field values.

    /**
     * Sums {@code mem[oop + 3]} over all non-zero entries {@code oop} of
     * {@code oops}, using a scalar loop.
     *
     * @param oops simulated references; 0 means null
     * @param mem simulated memory
     * @return sum of the selected field values
     */
    public static int reduceAddIFieldsX4_loop(int[] oops, int[] mem) {
        int sum = 0;
        for (int i = 0; i < oops.length; i++) {
            int oop = oops[i];
            if (oop != 0) {
                int fieldValue = mem[oop + 3]; // oop+12
                sum += fieldValue;
            }
        }
        return sum;
    }

    /**
     * Sums {@code mem[oop + 3]} over all non-zero entries {@code oop} of
     * {@code oops}, using a masked gather from the Vector API.
     *
     * @param oops simulated references; 0 means null
     * @param mem simulated memory
     * @return sum of the selected field values
     */
    public static int reduceAddIFieldsX4_VectorAPI(int[] oops, int[] mem) {
        var nulls = IntVector.broadcast(SPECIES_I, 0);
        var acc = IntVector.broadcast(SPECIES_I, 0);
        int i = 0;
        for (; i < SPECIES_I.loopBound(oops.length); i += SPECIES_I.length()) {
            var oopv = IntVector.fromArray(SPECIES_I, oops, i);
            var mask = oopv.compare(VectorOperators.NE, nulls);
            // We are lucky today: we need to access mem[oop + 3]
            // Masked gather: offset 3 plus index oops[i + lane]; lanes with a
            // null oop are not loaded and contribute 0 to the sum.
            var fieldValues = IntVector.fromArray(SPECIES_I, mem, 3, oops, i, mask);
            acc = acc.add(fieldValues);
        }
        int sum = acc.reduceLanes(VectorOperators.ADD);
        // Tail
        for (; i < oops.length; i++) {
            int oop = oops[i];
            if (oop != 0) {
                int fieldValue = mem[oop + 3]; // oop+12
                sum += fieldValue;
            }
        }
        return sum;
    }
}