8350126: Regression ~3% on Crypto-ChaCha20Poly1305.encrypt for MacOSX aarch64

Reviewed-by: aph
2026-03-14 18:03:44 +00:00 · 2025-04-22 16:49:29 +00:00 · 2025-04-22 16:49:29 +00:00 · 594b26516e
commit 594b26516e
parent d783a94098
3 changed files with 305 additions and 248 deletions
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
@ -1611,11 +1611,15 @@ public:
  void aes_round(FloatRegister input, FloatRegister subkey);

  // ChaCha20 functions support block
-  void cc20_quarter_round(FloatRegister aVec, FloatRegister bVec,
-          FloatRegister cVec, FloatRegister dVec, FloatRegister scratch,
-          FloatRegister tbl);
-  void cc20_shift_lane_org(FloatRegister bVec, FloatRegister cVec,
-          FloatRegister dVec, bool colToDiag);
+  void cc20_qr_add4(FloatRegister (&addFirst)[4],
+          FloatRegister (&addSecond)[4]);
+  void cc20_qr_xor4(FloatRegister (&firstElem)[4],
+          FloatRegister (&secondElem)[4], FloatRegister (&result)[4]);
+  void cc20_qr_lrot4(FloatRegister (&sourceReg)[4],
+          FloatRegister (&destReg)[4], int bits, FloatRegister table);
+  void cc20_set_qr_registers(FloatRegister (&vectorSet)[4],
+          const FloatRegister (&stateVectors)[16], int idx1, int idx2,
+          int idx3, int idx4);

  // Place an ISB after code may have been modified due to a safepoint.
  void safepoint_isb();
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64_chacha.cpp
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64_chacha.cpp
@ -28,60 +28,119 @@
 #include "runtime/stubRoutines.hpp"

 /**
- * Perform the quarter round calculations on values contained within
- * four SIMD registers.
+ * Perform the vectorized add for a group of 4 quarter round operations.
+ * In the ChaCha20 quarter round, there are two add ops: a += b and c += d.
+ * Each parameter is a set of 4 registers representing the 4 registers
+ * for the each addend in the add operation for each of the quarter rounds.
+ * (e.g. for "a" it would consist of v0/v1/v2/v3).  The result of the add
+ * is placed into the vectors in the "addFirst" array.
 *
- * @param aVec the SIMD register containing only the "a" values
- * @param bVec the SIMD register containing only the "b" values
- * @param cVec the SIMD register containing only the "c" values
- * @param dVec the SIMD register containing only the "d" values
- * @param scratch scratch SIMD register used for 12 and 7 bit left rotations
- * @param table the SIMD register used as a table for 8 bit left rotations
+ * @param addFirst array of SIMD registers representing the first addend.
+ * @param addSecond array of SIMD registers representing the second addend.
 */
-void MacroAssembler::cc20_quarter_round(FloatRegister aVec, FloatRegister bVec,
-    FloatRegister cVec, FloatRegister dVec, FloatRegister scratch,
-     FloatRegister table) {
+void MacroAssembler::cc20_qr_add4(FloatRegister (&addFirst)[4],
+    FloatRegister (&addSecond)[4]) {
+  for (int i = 0; i < 4; i++) {
+      addv(addFirst[i], T4S, addFirst[i], addSecond[i]);
+  }
+}

-  // a += b, d ^= a, d <<<= 16
-  addv(aVec, T4S, aVec, bVec);
-  eor(dVec, T16B, dVec, aVec);
-  rev32(dVec, T8H, dVec);

-  // c += d, b ^= c, b <<<= 12
-  addv(cVec, T4S, cVec, dVec);
-  eor(scratch, T16B, bVec, cVec);
-  ushr(bVec, T4S, scratch, 20);
-  sli(bVec, T4S, scratch, 12);
-
-  // a += b, d ^= a, d <<<= 8
-  addv(aVec, T4S, aVec, bVec);
-  eor(dVec, T16B, dVec, aVec);
-  tbl(dVec, T16B, dVec,  1, table);
-
-  // c += d, b ^= c, b <<<= 7
-  addv(cVec, T4S, cVec, dVec);
-  eor(scratch, T16B, bVec, cVec);
-  ushr(bVec, T4S, scratch, 25);
-  sli(bVec, T4S, scratch, 7);
+/**
+ * Perform the vectorized XOR for a group of 4 quarter round operations.
+ * In the ChaCha20 quarter round, there are two XOR ops: d ^= a and b ^= c
+ * Each parameter is a set of 4 registers representing the 4 registers
+ * for the each element in the xor operation for each of the quarter rounds.
+ * (e.g. for "a" it would consist of v0/v1/v2/v3)
+ * Note: because the b ^= c ops precede a non-byte-aligned left-rotation,
+ *       there is a third parameter which can take a set of scratch registers
+ *       for the result, which facilitates doing the subsequent operations for
+ *       the left rotation.
+ *
+ * @param firstElem array of SIMD registers representing the first element.
+ * @param secondElem array of SIMD registers representing the second element.
+ * @param result array of SIMD registers representing the destination.
+ *        May be the same as firstElem or secondElem, or a separate array.
+ */
+void MacroAssembler::cc20_qr_xor4(FloatRegister (&firstElem)[4],
+    FloatRegister (&secondElem)[4], FloatRegister (&result)[4]) {
+  for (int i = 0; i < 4; i++) {
+    eor(result[i], T16B, firstElem[i], secondElem[i]);
+  }
 }

 /**
- * Shift the b, c, and d vectors between columnar and diagonal representations.
- * Note that the "a" vector does not shift.
+ * Perform the vectorized left-rotation on 32-bit lanes for a group of
+ * 4 quarter round operations.
+ * Each parameter is a set of 4 registers representing the 4 registers
+ * for the each element in the source and destination for each of the quarter
+ * rounds (e.g. for "d" it would consist of v12/v13/v14/v15 on columns and
+ * v15/v12/v13/v14 on diagonal alignments).
 *
- * @param bVec the SIMD register containing only the "b" values
- * @param cVec the SIMD register containing only the "c" values
- * @param dVec the SIMD register containing only the "d" values
- * @param colToDiag true if moving columnar to diagonal, false if
- *                  moving diagonal back to columnar.
+ * @param sourceReg array of SIMD registers representing the source
+ * @param destReg array of SIMD registers representing the destination
+ * @param bits the distance of the rotation in bits, must be 16/12/8/7 per
+ *        the ChaCha20 specification.
 */
-void MacroAssembler::cc20_shift_lane_org(FloatRegister bVec, FloatRegister cVec,
-    FloatRegister dVec, bool colToDiag) {
-  int bShift = colToDiag ? 4 : 12;
-  int cShift = 8;
-  int dShift = colToDiag ? 12 : 4;
+void MacroAssembler::cc20_qr_lrot4(FloatRegister (&sourceReg)[4],
+    FloatRegister (&destReg)[4], int bits, FloatRegister table) {
+  switch (bits) {
+  case 16:      // reg <<<= 16, in-place swap of half-words
+    for (int i = 0; i < 4; i++) {
+      rev32(destReg[i], T8H, sourceReg[i]);
+    }
+    break;

-  ext(bVec, T16B, bVec, bVec, bShift);
-  ext(cVec, T16B, cVec, cVec, cShift);
-  ext(dVec, T16B, dVec, dVec, dShift);
+  case 7:       // reg <<<= (12 || 7)
+  case 12:      // r-shift src -> dest, l-shift src & ins to dest
+    for (int i = 0; i < 4; i++) {
+      ushr(destReg[i], T4S, sourceReg[i], 32 - bits);
+    }
+
+    for (int i = 0; i < 4; i++) {
+      sli(destReg[i], T4S, sourceReg[i], bits);
+    }
+    break;
+
+  case 8:       // reg <<<= 8, simulate left rotation with table reorg
+    for (int i = 0; i < 4; i++) {
+      tbl(destReg[i], T16B, sourceReg[i], 1, table);
+    }
+    break;
+
+  default:
+    // The caller shouldn't be sending bit rotation values outside
+    // of the 16/12/8/7 as defined in the specification.
+    ShouldNotReachHere();
+  }
+}
+
+/**
+ * Set the FloatRegisters for a 4-vector register set.  These will be used
+ * during various quarter round transformations (adds, xors and left-rotations).
+ * This method itself does not result in the output of any assembly
+ * instructions.  It just organizes the vectors so they can be in columnar or
+ * diagonal alignments.
+ *
+ * @param vectorSet a 4-vector array to be altered into a new alignment
+ * @param stateVectors the 16-vector array that represents the current
+ *        working state.  The indices of this array match up with the
+ *        organization of the ChaCha20 state per RFC 7539 (e.g. stateVectors[12]
+ *        would contain the vector that holds the 32-bit counter, etc.)
+ * @param idx1 the index of the stateVectors array to be assigned to the
+ *        first vectorSet element.
+ * @param idx2 the index of the stateVectors array to be assigned to the
+ *        second vectorSet element.
+ * @param idx3 the index of the stateVectors array to be assigned to the
+ *        third vectorSet element.
+ * @param idx4 the index of the stateVectors array to be assigned to the
+ *        fourth vectorSet element.
+ */
+void MacroAssembler::cc20_set_qr_registers(FloatRegister (&vectorSet)[4],
+    const FloatRegister (&stateVectors)[16], int idx1, int idx2,
+    int idx3, int idx4) {
+  vectorSet[0] = stateVectors[idx1];
+  vectorSet[1] = stateVectors[idx2];
+  vectorSet[2] = stateVectors[idx3];
+  vectorSet[3] = stateVectors[idx4];
 }
--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
@ -4405,89 +4405,44 @@ class StubGenerator: public StubCodeGenerator {
    return start;
  }

-  /**
-   *  Arguments:
-   *
-   * Inputs:
-   *   c_rarg0   - int crc
-   *   c_rarg1   - byte* buf
-   *   c_rarg2   - int length
-   *
-   * Output:
-   *       rax   - int crc result
-   */
-  address generate_updateBytesCRC32() {
-    assert(UseCRC32Intrinsics, "what are we doing here?");
-
-    __ align(CodeEntryAlignment);
-    StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
-    StubCodeMark mark(this, stub_id);
-
-    address start = __ pc();
-
-    const Register crc   = c_rarg0;  // crc
-    const Register buf   = c_rarg1;  // source java byte array address
-    const Register len   = c_rarg2;  // length
-    const Register table0 = c_rarg3; // crc_table address
-    const Register table1 = c_rarg4;
-    const Register table2 = c_rarg5;
-    const Register table3 = c_rarg6;
-    const Register tmp3 = c_rarg7;
-
-    BLOCK_COMMENT("Entry:");
-    __ enter(); // required for proper stackwalking of RuntimeStub frame
-
-    __ kernel_crc32(crc, buf, len,
-              table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
-
-    __ leave(); // required for proper stackwalking of RuntimeStub frame
-    __ ret(lr);
-
-    return start;
-  }
-
-  // ChaCha20 block function.  This version parallelizes 4 quarter
-  // round operations at a time.  It uses 16 SIMD registers to
-  // produce 4 blocks of key stream.
+  // ChaCha20 block function.  This version parallelizes the 32-bit
+  // state elements on each of 16 vectors, producing 4 blocks of
+  // keystream at a time.
  //
  // state (int[16]) = c_rarg0
  // keystream (byte[256]) = c_rarg1
-  // return - number of bytes of keystream (always 256)
+  // return - number of bytes of produced keystream (always 256)
  //
-  // In this approach, we load the 512-bit start state sequentially into
-  // 4 128-bit vectors.  We then make 4 4-vector copies of that starting
-  // state, with each successive set of 4 vectors having a +1 added into
-  // the first 32-bit lane of the 4th vector in that group (the counter).
-  // By doing this, we can perform the block function on 4 512-bit blocks
-  // within one run of this intrinsic.
-  // The alignment of the data across the 4-vector group is such that at
-  // the start it is already aligned for the first round of each two-round
-  // loop iteration.  In other words, the corresponding lanes of each vector
-  // will contain the values needed for that quarter round operation (e.g.
-  // elements 0/4/8/12, 1/5/9/13, 2/6/10/14, etc.).
-  // In between each full round, a lane shift must occur.  Within a loop
-  // iteration, between the first and second rounds, the 2nd, 3rd, and 4th
-  // vectors are rotated left 32, 64 and 96 bits, respectively.  The result
-  // is effectively a diagonal orientation in columnar form.  After the
-  // second full round, those registers are left-rotated again, this time
-  // 96, 64, and 32 bits - returning the vectors to their columnar organization.
-  // After all 10 iterations, the original state is added to each 4-vector
-  // working state along with the add mask, and the 4 vector groups are
-  // sequentially written to the memory dedicated for the output key stream.
-  //
-  // For a more detailed explanation, see Goll and Gueron, "Vectorization of
-  // ChaCha Stream Cipher", 2014 11th Int. Conf. on Information Technology:
-  // New Generations, Las Vegas, NV, USA, April 2014, DOI: 10.1109/ITNG.2014.33
-  address generate_chacha20Block_qrpar() {
-    Label L_Q_twoRounds, L_Q_cc20_const;
+  // This implementation takes each 32-bit integer from the state
+  // array and broadcasts it across all 4 32-bit lanes of a vector register
+  // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
+  // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
+  // the quarter round schedule is implemented as outlined in RFC 7539 section
+  // 2.3.  However, instead of sequentially processing the 3 quarter round
+  // operations represented by one QUARTERROUND function, we instead stack all
+  // the adds, xors and left-rotations from the first 4 quarter rounds together
+  // and then do the same for the second set of 4 quarter rounds.  This removes
+  // some latency that would otherwise be incurred by waiting for an add to
+  // complete before performing an xor (which depends on the result of the
+  // add), etc. An adjustment happens between the first and second groups of 4
+  // quarter rounds, but this is done only in the inputs to the macro functions
+  // that generate the assembly instructions - these adjustments themselves are
+  // not part of the resulting assembly.
+  // The 4 registers v0-v3 are used during the quarter round operations as
+  // scratch registers.  Once the 20 rounds are complete, these 4 scratch
+  // registers become the vectors involved in adding the start state back onto
+  // the post-QR working state.  After the adds are complete, each of the 16
+  // vectors write their first lane back to the keystream buffer, followed
+  // by the second lane from all vectors and so on.
+  address generate_chacha20Block_blockpar() {
+    Label L_twoRounds, L_cc20_const;
    // The constant data is broken into two 128-bit segments to be loaded
-    // onto SIMD registers.  The first 128 bits are a counter add overlay
-    // that adds +1/+0/+0/+0 to the vectors holding replicated state[12].
+    // onto FloatRegisters.  The first 128 bits are a counter add overlay
+    // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
    // The second 128-bits is a table constant used for 8-bit left rotations.
-    // on 32-bit lanes within a SIMD register.
-    __ BIND(L_Q_cc20_const);
-    __ emit_int64(0x0000000000000001UL);
-    __ emit_int64(0x0000000000000000UL);
+    __ BIND(L_cc20_const);
+    __ emit_int64(0x0000000100000000UL);
+    __ emit_int64(0x0000000300000002UL);
    __ emit_int64(0x0605040702010003UL);
    __ emit_int64(0x0E0D0C0F0A09080BUL);

@ -4497,144 +4452,142 @@ class StubGenerator: public StubCodeGenerator {
    address start = __ pc();
    __ enter();

+    int i, j;
    const Register state = c_rarg0;
    const Register keystream = c_rarg1;
    const Register loopCtr = r10;
    const Register tmpAddr = r11;
+    const FloatRegister ctrAddOverlay = v28;
+    const FloatRegister lrot8Tbl = v29;

-    const FloatRegister aState = v0;
-    const FloatRegister bState = v1;
-    const FloatRegister cState = v2;
-    const FloatRegister dState = v3;
-    const FloatRegister a1Vec = v4;
-    const FloatRegister b1Vec = v5;
-    const FloatRegister c1Vec = v6;
-    const FloatRegister d1Vec = v7;
-    // Skip the callee-saved registers v8 - v15
-    const FloatRegister a2Vec = v16;
-    const FloatRegister b2Vec = v17;
-    const FloatRegister c2Vec = v18;
-    const FloatRegister d2Vec = v19;
-    const FloatRegister a3Vec = v20;
-    const FloatRegister b3Vec = v21;
-    const FloatRegister c3Vec = v22;
-    const FloatRegister d3Vec = v23;
-    const FloatRegister a4Vec = v24;
-    const FloatRegister b4Vec = v25;
-    const FloatRegister c4Vec = v26;
-    const FloatRegister d4Vec = v27;
-    const FloatRegister scratch = v28;
-    const FloatRegister addMask = v29;
-    const FloatRegister lrot8Tbl = v30;
+    // Organize SIMD registers in an array that facilitates
+    // putting repetitive opcodes into loop structures.  It is
+    // important that each grouping of 4 registers is monotonically
+    // increasing to support the requirements of multi-register
+    // instructions (e.g. ld4r, st4, etc.)
+    const FloatRegister workSt[16] = {
+         v4,  v5,  v6,  v7, v16, v17, v18, v19,
+        v20, v21, v22, v23, v24, v25, v26, v27
+    };

-    // Load the initial state in the first 4 quadword registers,
-    // then copy the initial state into the next 4 quadword registers
-    // that will be used for the working state.
-    __ ld1(aState, bState, cState, dState, __ T16B, Address(state));
+    // Pull in constant data.  The first 16 bytes are the add overlay
+    // which is applied to the vector holding the counter (state[12]).
+    // The second 16 bytes is the index register for the 8-bit left
+    // rotation tbl instruction.
+    __ adr(tmpAddr, L_cc20_const);
+    __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));

-    // Load the index register for 2 constant 128-bit data fields.
-    // The first represents the +1/+0/+0/+0 add mask.  The second is
-    // the 8-bit left rotation.
-    __ adr(tmpAddr, L_Q_cc20_const);
-    __ ldpq(addMask, lrot8Tbl, Address(tmpAddr));
+    // Load from memory and interlace across 16 SIMD registers,
+    // With each word from memory being broadcast to all lanes of
+    // each successive SIMD register.
+    //      Addr(0) -> All lanes in workSt[i]
+    //      Addr(4) -> All lanes workSt[i + 1], etc.
+    __ mov(tmpAddr, state);
+    for (i = 0; i < 16; i += 4) {
+      __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
+          __ post(tmpAddr, 16));
+    }
+    __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay

-    __ mov(a1Vec, __ T16B, aState);
-    __ mov(b1Vec, __ T16B, bState);
-    __ mov(c1Vec, __ T16B, cState);
-    __ mov(d1Vec, __ T16B, dState);
+    // Before entering the loop, create 5 4-register arrays.  These
+    // will hold the 4 registers that represent the a/b/c/d fields
+    // in the quarter round operation.  For instance the "b" field
+    // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
+    // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
+    // since it is part of a diagonal organization.  The aSet and scratch
+    // register sets are defined at declaration time because they do not change
+    // organization at any point during the 20-round processing.
+    FloatRegister aSet[4] = { v4, v5, v6, v7 };
+    FloatRegister bSet[4];
+    FloatRegister cSet[4];
+    FloatRegister dSet[4];
+    FloatRegister scratch[4] = { v0, v1, v2, v3 };

-    __ mov(a2Vec, __ T16B, aState);
-    __ mov(b2Vec, __ T16B, bState);
-    __ mov(c2Vec, __ T16B, cState);
-    __ addv(d2Vec, __ T4S, d1Vec, addMask);
-
-    __ mov(a3Vec, __ T16B, aState);
-    __ mov(b3Vec, __ T16B, bState);
-    __ mov(c3Vec, __ T16B, cState);
-    __ addv(d3Vec, __ T4S, d2Vec, addMask);
-
-    __ mov(a4Vec, __ T16B, aState);
-    __ mov(b4Vec, __ T16B, bState);
-    __ mov(c4Vec, __ T16B, cState);
-    __ addv(d4Vec, __ T4S, d3Vec, addMask);
-
-    // Set up the 10 iteration loop
+    // Set up the 10 iteration loop and perform all 8 quarter round ops
    __ mov(loopCtr, 10);
-    __ BIND(L_Q_twoRounds);
+    __ BIND(L_twoRounds);

-    // The first set of operations on the vectors covers the first 4 quarter
-    // round operations:
-    //  Qround(state, 0, 4, 8,12)
-    //  Qround(state, 1, 5, 9,13)
-    //  Qround(state, 2, 6,10,14)
-    //  Qround(state, 3, 7,11,15)
-    __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
-    __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
-    __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
-    __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
+    // Set to columnar organization and do the following 4 quarter-rounds:
+    // QUARTERROUND(0, 4, 8, 12)
+    // QUARTERROUND(1, 5, 9, 13)
+    // QUARTERROUND(2, 6, 10, 14)
+    // QUARTERROUND(3, 7, 11, 15)
+    __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
+    __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
+    __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);

-    // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors to
-    // diagonals. The a1Vec does not need to change orientation.
-    __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, true);
-    __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, true);
-    __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, true);
-    __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, true);
+    __ cc20_qr_add4(aSet, bSet);                    // a += b
+    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
+    __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16

-    // The second set of operations on the vectors covers the second 4 quarter
-    // round operations, now acting on the diagonals:
-    //  Qround(state, 0, 5,10,15)
-    //  Qround(state, 1, 6,11,12)
-    //  Qround(state, 2, 7, 8,13)
-    //  Qround(state, 3, 4, 9,14)
-    __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
-    __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
-    __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
-    __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
+    __ cc20_qr_add4(cSet, dSet);                    // c += d
+    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
+    __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12

-    // Before we start the next iteration, we need to perform shuffles
-    // on the b/c/d vectors to move them back to columnar organizations
-    // from their current diagonal orientation.
-    __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, false);
-    __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, false);
-    __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, false);
-    __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, false);
+    __ cc20_qr_add4(aSet, bSet);                    // a += b
+    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
+    __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
+
+    __ cc20_qr_add4(cSet, dSet);                    // c += d
+    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
+    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 12
+
+    // Set to diagonal organization and do the next 4 quarter-rounds:
+    // QUARTERROUND(0, 5, 10, 15)
+    // QUARTERROUND(1, 6, 11, 12)
+    // QUARTERROUND(2, 7, 8, 13)
+    // QUARTERROUND(3, 4, 9, 14)
+    __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
+    __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
+    __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
+
+    __ cc20_qr_add4(aSet, bSet);                    // a += b
+    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
+    __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
+
+    __ cc20_qr_add4(cSet, dSet);                    // c += d
+    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
+    __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
+
+    __ cc20_qr_add4(aSet, bSet);                    // a += b
+    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
+    __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
+
+    __ cc20_qr_add4(cSet, dSet);                    // c += d
+    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
+    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 12

    // Decrement and iterate
    __ sub(loopCtr, loopCtr, 1);
-    __ cbnz(loopCtr, L_Q_twoRounds);
+    __ cbnz(loopCtr, L_twoRounds);

-    // Once the counter reaches zero, we fall out of the loop
-    // and need to add the initial state back into the working state
-    // represented by the a/b/c/d1Vec registers.  This is destructive
-    // on the dState register but we no longer will need it.
-    __ addv(a1Vec, __ T4S, a1Vec, aState);
-    __ addv(b1Vec, __ T4S, b1Vec, bState);
-    __ addv(c1Vec, __ T4S, c1Vec, cState);
-    __ addv(d1Vec, __ T4S, d1Vec, dState);
+    __ mov(tmpAddr, state);

-    __ addv(a2Vec, __ T4S, a2Vec, aState);
-    __ addv(b2Vec, __ T4S, b2Vec, bState);
-    __ addv(c2Vec, __ T4S, c2Vec, cState);
-    __ addv(dState, __ T4S, dState, addMask);
-    __ addv(d2Vec, __ T4S, d2Vec, dState);
+    // Add the starting state back to the post-loop keystream
+    // state.  We read/interlace the state array from memory into
+    // 4 registers similar to what we did in the beginning.  Then
+    // add the counter overlay onto workSt[12] at the end.
+    for (i = 0; i < 16; i += 4) {
+      __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
+      __ addv(workSt[i], __ T4S, workSt[i], v0);
+      __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
+      __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
+      __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
+    }
+    __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay

-    __ addv(a3Vec, __ T4S, a3Vec, aState);
-    __ addv(b3Vec, __ T4S, b3Vec, bState);
-    __ addv(c3Vec, __ T4S, c3Vec, cState);
-    __ addv(dState, __ T4S, dState, addMask);
-    __ addv(d3Vec, __ T4S, d3Vec, dState);
-
-    __ addv(a4Vec, __ T4S, a4Vec, aState);
-    __ addv(b4Vec, __ T4S, b4Vec, bState);
-    __ addv(c4Vec, __ T4S, c4Vec, cState);
-    __ addv(dState, __ T4S, dState, addMask);
-    __ addv(d4Vec, __ T4S, d4Vec, dState);
-
-    // Write the final state back to the result buffer
-    __ st1(a1Vec, b1Vec, c1Vec, d1Vec, __ T16B, __ post(keystream, 64));
-    __ st1(a2Vec, b2Vec, c2Vec, d2Vec, __ T16B, __ post(keystream, 64));
-    __ st1(a3Vec, b3Vec, c3Vec, d3Vec, __ T16B, __ post(keystream, 64));
-    __ st1(a4Vec, b4Vec, c4Vec, d4Vec, __ T16B, __ post(keystream, 64));
+    // Write working state into the keystream buffer.  This is accomplished
+    // by taking the lane "i" from each of the four vectors and writing
+    // it to consecutive 4-byte offsets, then post-incrementing by 16 and
+    // repeating with the next 4 vectors until all 16 vectors have been used.
+    // Then move to the next lane and repeat the process until all lanes have
+    // been written.
+    for (i = 0; i < 4; i++) {
+      for (j = 0; j < 16; j += 4) {
+        __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
+            __ post(keystream, 16));
+      }
+    }

    __ mov(r0, 256);             // Return length of output keystream
    __ leave();
@ -7008,6 +6961,47 @@ class StubGenerator: public StubCodeGenerator {
    return start;
  }

+  /**
+   *  Arguments:
+   *
+   * Inputs:
+   *   c_rarg0   - int crc
+   *   c_rarg1   - byte* buf
+   *   c_rarg2   - int length
+   *
+   * Output:
+   *       rax   - int crc result
+   */
+  address generate_updateBytesCRC32() {
+    assert(UseCRC32Intrinsics, "what are we doing here?");
+
+    __ align(CodeEntryAlignment);
+    StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
+    StubCodeMark mark(this, stub_id);
+
+    address start = __ pc();
+
+    const Register crc   = c_rarg0;  // crc
+    const Register buf   = c_rarg1;  // source java byte array address
+    const Register len   = c_rarg2;  // length
+    const Register table0 = c_rarg3; // crc_table address
+    const Register table1 = c_rarg4;
+    const Register table2 = c_rarg5;
+    const Register table3 = c_rarg6;
+    const Register tmp3 = c_rarg7;
+
+    BLOCK_COMMENT("Entry:");
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+    __ kernel_crc32(crc, buf, len,
+              table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
+
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(lr);
+
+    return start;
+  }
+
  /**
   *  Arguments:
   *
@ -11403,7 +11397,7 @@ class StubGenerator: public StubCodeGenerator {
 #endif // COMPILER2

    if (UseChaCha20Intrinsics) {
-      StubRoutines::_chacha20Block = generate_chacha20Block_qrpar();
+      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    if (UseKyberIntrinsics) {