8384847: Fix documentation typos around ML-KEM and ML-DSA intrinsic code for aarch64

Reviewed-by: adinn, aph
2026-07-02 15:20:27 +00:00 · 2026-06-23 13:08:15 +00:00 · 2026-06-23 13:08:15 +00:00 · e356cbb395
commit e356cbb395
parent acbedab719
1 changed files with 39 additions and 38 deletions
--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
@ -4977,7 +4977,7 @@ class StubGenerator: public StubCodeGenerator {
      return start;
    }
    // Implements the double_keccak() method of the
-    // sun.secyrity.provider.SHA3Parallel class
+    // sun.security.provider.SHA3Parallel class
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    start = __ pc();
@ -5045,7 +5045,8 @@ class StubGenerator: public StubCodeGenerator {
    __ ldpd(v8, v9, __ post(sp, 64));

    __ leave(); // required for proper stackwalking of RuntimeStub frame
-    __ mov(r0, zr); // return 0
+
+    __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
    __ ret(lr);

    // record the stub entry and end
@ -5458,7 +5459,7 @@ class StubGenerator: public StubCodeGenerator {

  // load N/2 pairs of quadword values from memory into N vector
  // registers via the address supplied in base with each pair indexed
-  // using the the start offset plus the corresponding entry in the
+  // using the start offset plus the corresponding entry in the
  // offsets array
  template<int N>
  void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
@ -5469,7 +5470,7 @@ class StubGenerator: public StubCodeGenerator {

  // store N vector registers into N/2 pairs of quadword memory
  // locations via the address supplied in base with each pair indexed
-  // using the the start offset plus the corresponding entry in the
+  // using the start offset plus the corresponding entry in the
  // offsets array
  template<int N>
  void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
@ -5480,7 +5481,7 @@ class StubGenerator: public StubCodeGenerator {

  // load N single quadword values from memory into N vector registers
  // via the address supplied in base with each value indexed using
-  // the the start offset plus the corresponding entry in the offsets
+  // the start offset plus the corresponding entry in the offsets
  // array
  template<int N>
  void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
@ -5492,7 +5493,7 @@ class StubGenerator: public StubCodeGenerator {

  // store N vector registers into N single quadword memory locations
  // via the address supplied in base with each value indexed using
-  // the the start offset plus the corresponding entry in the offsets
+  // the start offset plus the corresponding entry in the offsets
  // array
  template<int N>
  void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
@ -5504,7 +5505,7 @@ class StubGenerator: public StubCodeGenerator {

  // load N/2 pairs of quadword values from memory de-interleaved into
  // N vector registers 2 at a time via the address supplied in base
-  // with each pair indexed using the the start offset plus the
+  // with each pair indexed using the start offset plus the
  // corresponding entry in the offsets array
  template<int N>
  void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
@ -5517,7 +5518,7 @@ class StubGenerator: public StubCodeGenerator {

  // store N vector registers 2 at a time interleaved into N/2 pairs
  // of quadword memory locations via the address supplied in base
-  // with each pair indexed using the the start offset plus the
+  // with each pair indexed using the start offset plus the
  // corresponding entry in the offsets array
  template<int N>
  void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
@ -5776,7 +5777,7 @@ class StubGenerator: public StubCodeGenerator {
    // registers.
    // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
    // multiplications (this is because that way there should not be any
-    // overflow during the inverse NTT computation), here we usr R = 2^16 so
+    // overflow during the inverse NTT computation), here we use R = 2^16 so
    // that we can use the 16-bit arithmetic in the vector unit.
    //
    // On each level, we fill up the vector registers in such a way that the
@ -5898,7 +5899,7 @@ class StubGenerator: public StubCodeGenerator {

    // level 4
    // At level 4 coefficients occur in 8 discrete blocks of size 16
-    // so they are loaded using employing an ldr at 8 distinct offsets.
+    // so they are loaded by employing an ldr at 8 distinct offsets.

    vs_ldpq(vq, kyberConsts);
    int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
@ -5954,7 +5955,6 @@ class StubGenerator: public StubCodeGenerator {
    kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
    vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
-    // __ ldpq(v18, v19, __ post(zetas, 32));
    load32shorts(vs_front(vs2), zetas);
    kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
@ -5970,7 +5970,7 @@ class StubGenerator: public StubCodeGenerator {
    vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
-    __ mov(r0, zr); // return 0
+    __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
    __ ret(lr);

    // record the stub entry and end
@ -6067,7 +6067,7 @@ class StubGenerator: public StubCodeGenerator {

    // level 2
    // At level 2 coefficients occur in 8 discrete blocks of size 16
-    // so they are loaded using employing an ldr at 8 distinct offsets.
+    // so they are loaded by employing an ldr at 8 distinct offsets.

    int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
    vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
@ -6262,7 +6262,7 @@ class StubGenerator: public StubCodeGenerator {
    store64shorts(vs2, tmpAddr);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
-    __ mov(r0, zr); // return 0
+    __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
    __ ret(lr);

    // record the stub entry and end
@ -6407,7 +6407,7 @@ class StubGenerator: public StubCodeGenerator {
    __ br(Assembler::NE, kyberNttMult_loop);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
-    __ mov(r0, zr); // return 0
+    __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
    __ ret(lr);

    // record the stub entry and end
@ -6499,7 +6499,7 @@ class StubGenerator: public StubCodeGenerator {
    }

    __ leave(); // required for proper stackwalking of RuntimeStub frame
-    __ mov(r0, zr); // return 0
+    __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
    __ ret(lr);

    // record the stub entry and end
@ -6606,7 +6606,7 @@ class StubGenerator: public StubCodeGenerator {
    }

    __ leave(); // required for proper stackwalking of RuntimeStub frame
-    __ mov(r0, zr); // return 0
+    __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
    __ ret(lr);

    // record the stub entry and end
@ -6692,8 +6692,8 @@ class StubGenerator: public StubCodeGenerator {
    // twice, one copy manipulated to provide the lower 4 bits
    // belonging to the first short in a pair and another copy
    // manipulated to provide the higher 4 bits belonging to the
-    // second short in a pair. This is why the the vector sequences va
-    // and vb used to hold the expanded 8H elements are of length 8.
+    // second short in a pair. This is why the vector sequences va
+    // and vb used to hold the expanded 8H elements are of length 8.

    // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
    // n.b. target elements 2 and 3 duplicate elements 4 and 5
@ -6763,7 +6763,7 @@ class StubGenerator: public StubCodeGenerator {
    __ br(Assembler::GT, L_loop);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
-    __ mov(r0, zr); // return 0
+    __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
    __ ret(lr);

    // bind label and generate constant data used by this stub
@ -6869,7 +6869,7 @@ class StubGenerator: public StubCodeGenerator {
    }

    __ leave(); // required for proper stackwalking of RuntimeStub frame
-    __ mov(r0, zr); // return 0
+    __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
    __ ret(lr);

    // record the stub entry and end
@ -6943,7 +6943,7 @@ class StubGenerator: public StubCodeGenerator {
    vs_addv(va0, __ T4S, va0, vc);
  }

-  // Perform combined add/sub then montul on 4x4S vectors.
+  // Perform combined add/sub then montmul on 4x4S vectors.
  void dilithium_sub_add_montmul16(
          const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
          const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
@ -7079,7 +7079,7 @@ class StubGenerator: public StubCodeGenerator {
    // coefficients we load 4 adjacent values at 8 different offsets
    // using an indexed ldr with register variant Q and multiply them
    // in sequence order by the next set of inputs. Likewise we store
-    // the resuls using an indexed str with register variant Q.
+    // the results using an indexed str with register variant Q.
    for (int i = 0; i < 1024; i += 256) {
      // reload constants q, qinv each iteration as they get clobbered later
      vs_ldpq(vq, dilithiumConsts); // qInv, q
@ -7129,11 +7129,11 @@ class StubGenerator: public StubCodeGenerator {

    // level 7
    // At level 7 the coefficients we need to combine with the zetas
-    // occur singly with montmul inputs alterating with add/sub
+    // occur singly with montmul inputs alternating with add/sub
    // inputs. Once again we can use 4-way parallelism to combine 16
    // zetas at a time. However, we have to load 8 adjacent values at
    // 4 different offsets using an ld2 load with arrangement 4S. That
-    // interleaves the the odd words of each pair into one
+    // interleaves the odd words of each pair into one
    // coefficients vector register and the even words of the pair
    // into the next register. We then need to montmul the 4 even
    // elements of the coefficients register sequence by the zetas in
@ -7155,7 +7155,7 @@ class StubGenerator: public StubCodeGenerator {
      vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
    }
    __ leave(); // required for proper stackwalking of RuntimeStub frame
-    __ mov(r0, zr); // return 0
+    __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
    __ ret(lr);

    // record the stub entry and end
@ -7334,7 +7334,7 @@ class StubGenerator: public StubCodeGenerator {
      // c0 load 32 (8x4S) coefficients via first offsets
      vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
      // c1 load 32 (8x4S) coefficients via second offsets
-      vs_ldr_indexed(vs2, __ Q,coeffs, i, offsets2);
+      vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
      // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
      vs_addv(vs3, __ T4S, vs1, vs2);
      // c = c0 - c1
@ -7355,7 +7355,7 @@ class StubGenerator: public StubCodeGenerator {
    dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
-    __ mov(r0, zr); // return 0
+    __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
    __ ret(lr);

    // record the stub entry and end
@ -7367,8 +7367,8 @@ class StubGenerator: public StubCodeGenerator {
  // Dilithium multiply polynomials in the NTT domain.
  // Straightforward implementation of the method
  // static int implDilithiumNttMult(
-  //              int[] result, int[] ntta, int[] nttb {} of
-  // the sun.security.provider.ML_DSA class.
+  //              int[] product, int[] coeffs1, int[] coeffs2) {}
+  // of the sun.security.provider.ML_DSA class.
  //
  // result (int[256]) = c_rarg0
  // poly1 (int[256]) = c_rarg1
@ -7429,7 +7429,7 @@ class StubGenerator: public StubCodeGenerator {
    __ br(Assembler::GE, L_loop);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
-    __ mov(r0, zr); // return 0
+    __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
    __ ret(lr);

    // record the stub entry and end
@ -7438,10 +7438,10 @@ class StubGenerator: public StubCodeGenerator {
    return start;
  }

-  // Dilithium Motgomery multiply an array by a constant.
+  // Dilithium Montgomery multiply an array by a constant.
  // A straightforward implementation of the method
  // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
-  // of the sun.security.provider.MLDSA class
+  // of the sun.security.provider.ML_DSA class
  //
  // coeffs (int[256]) = c_rarg0
  // constant (int) = c_rarg1
@ -7498,7 +7498,7 @@ class StubGenerator: public StubCodeGenerator {
    __ br(Assembler::GE, L_loop);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
-    __ mov(r0, zr); // return 0
+    __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
    __ ret(lr);

    // record the stub entry and end
@ -7509,7 +7509,8 @@ class StubGenerator: public StubCodeGenerator {

  // Dilithium decompose poly.
  // Implements the method
-  // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {}
+  //    static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart,
+  //                                          int twoGamma2, int multiplier) {}
  // of the sun.security.provider.ML_DSA class
  //
  // input (int[256]) = c_rarg0
@ -7613,7 +7614,7 @@ class StubGenerator: public StubCodeGenerator {
    vs_andr(vtmp, vs4, twog2);
    vs_subv(vs3, __ T4S, vs3, vtmp);

-    //  quotient += (mask & 1);
+    // quotient += (mask & 1);
    vs_andr(vtmp, vs4, one);
    vs_addv(vs2, __ T4S, vs2, vtmp);

@ -7647,7 +7648,7 @@ class StubGenerator: public StubCodeGenerator {
    // r1 = r1 & quotient;
    vs_andr(vs1, vs2, vs1);

-    // store results inteleaved
+    // store results interleaved
    // lowPart[m] = r0;
    // highPart[m] = r1;
    __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
@ -7664,7 +7665,7 @@ class StubGenerator: public StubCodeGenerator {
    __ ldpd(v8, v9, __ post(sp, 64));

    __ leave(); // required for proper stackwalking of RuntimeStub frame
-    __ mov(r0, zr); // return 0
+    __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
    __ ret(lr);

    // record the stub entry and end