From e356cbb3958dcd3329765716d8b5376c6a213e89 Mon Sep 17 00:00:00 2001 From: Ivan Bereziuk Date: Tue, 23 Jun 2026 13:08:15 +0000 Subject: [PATCH] 8384847: Fix documentation typos around ML-KEM and ML-DSA intrinsic code for aarch64 Reviewed-by: adinn, aph --- .../cpu/aarch64/stubGenerator_aarch64.cpp | 77 ++++++++++--------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index 8e9af2b7b8a..f41a54e9d26 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -4977,7 +4977,7 @@ class StubGenerator: public StubCodeGenerator { return start; } // Implements the double_keccak() method of the - // sun.secyrity.provider.SHA3Parallel class + // sun.security.provider.SHA3Parallel class __ align(CodeEntryAlignment); StubCodeMark mark(this, stub_id); start = __ pc(); @@ -5045,7 +5045,8 @@ class StubGenerator: public StubCodeGenerator { __ ldpd(v8, v9, __ post(sp, 64)); __ leave(); // required for proper stackwalking of RuntimeStub frame - __ mov(r0, zr); // return 0 + + __ mov(r0, zr); // return 0 (Java callees return 1. 
Caller ignores the return value) __ ret(lr); // record the stub entry and end @@ -5458,7 +5459,7 @@ class StubGenerator: public StubCodeGenerator { // load N/2 pairs of quadword values from memory into N vector // registers via the address supplied in base with each pair indexed - // using the the start offset plus the corresponding entry in the + // using the start offset plus the corresponding entry in the // offsets array template void vs_ldpq_indexed(const VSeq& v, Register base, int start, int (&offsets)[N/2]) { @@ -5469,7 +5470,7 @@ class StubGenerator: public StubCodeGenerator { // store N vector registers into N/2 pairs of quadword memory // locations via the address supplied in base with each pair indexed - // using the the start offset plus the corresponding entry in the + // using the start offset plus the corresponding entry in the // offsets array template void vs_stpq_indexed(const VSeq& v, Register base, int start, int offsets[N/2]) { @@ -5480,7 +5481,7 @@ class StubGenerator: public StubCodeGenerator { // load N single quadword values from memory into N vector registers // via the address supplied in base with each value indexed using - // the the start offset plus the corresponding entry in the offsets + // the start offset plus the corresponding entry in the offsets // array template void vs_ldr_indexed(const VSeq& v, Assembler::SIMD_RegVariant T, Register base, @@ -5492,7 +5493,7 @@ class StubGenerator: public StubCodeGenerator { // store N vector registers into N single quadword memory locations // via the address supplied in base with each value indexed using - // the the start offset plus the corresponding entry in the offsets + // the start offset plus the corresponding entry in the offsets // array template void vs_str_indexed(const VSeq& v, Assembler::SIMD_RegVariant T, Register base, @@ -5504,7 +5505,7 @@ class StubGenerator: public StubCodeGenerator { // load N/2 pairs of quadword values from memory de-interleaved into // N vector 
registers 2 at a time via the address supplied in base - // with each pair indexed using the the start offset plus the + // with each pair indexed using the start offset plus the // corresponding entry in the offsets array template void vs_ld2_indexed(const VSeq& v, Assembler::SIMD_Arrangement T, Register base, @@ -5517,7 +5518,7 @@ class StubGenerator: public StubCodeGenerator { // store N vector registers 2 at a time interleaved into N/2 pairs // of quadword memory locations via the address supplied in base - // with each pair indexed using the the start offset plus the + // with each pair indexed using the start offset plus the // corresponding entry in the offsets array template void vs_st2_indexed(const VSeq& v, Assembler::SIMD_Arrangement T, Register base, @@ -5776,7 +5777,7 @@ class StubGenerator: public StubCodeGenerator { // registers. // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery // multiplications (this is because that way there should not be any - // overflow during the inverse NTT computation), here we usr R = 2^16 so + // overflow during the inverse NTT computation), here we use R = 2^16 so // that we can use the 16-bit arithmetic in the vector unit. // // On each level, we fill up the vector registers in such a way that the @@ -5898,7 +5899,7 @@ class StubGenerator: public StubCodeGenerator { // level 4 // At level 4 coefficients occur in 8 discrete blocks of size 16 - // so they are loaded using employing an ldr at 8 distinct offsets. + // so they are loaded by employing an ldr at 8 distinct offsets. 
vs_ldpq(vq, kyberConsts); int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; @@ -5954,7 +5955,6 @@ class StubGenerator: public StubCodeGenerator { kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); - // __ ldpq(v18, v19, __ post(zetas, 32)); load32shorts(vs_front(vs2), zetas); kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); @@ -5970,7 +5970,7 @@ class StubGenerator: public StubCodeGenerator { vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); __ leave(); // required for proper stackwalking of RuntimeStub frame - __ mov(r0, zr); // return 0 + __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value) __ ret(lr); // record the stub entry and end @@ -6067,7 +6067,7 @@ class StubGenerator: public StubCodeGenerator { // level 2 // At level 2 coefficients occur in 8 discrete blocks of size 16 - // so they are loaded using employing an ldr at 8 distinct offsets. + // so they are loaded by employing an ldr at 8 distinct offsets. int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); @@ -6262,7 +6262,7 @@ class StubGenerator: public StubCodeGenerator { store64shorts(vs2, tmpAddr); __ leave(); // required for proper stackwalking of RuntimeStub frame - __ mov(r0, zr); // return 0 + __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value) __ ret(lr); // record the stub entry and end @@ -6407,7 +6407,7 @@ class StubGenerator: public StubCodeGenerator { __ br(Assembler::NE, kyberNttMult_loop); __ leave(); // required for proper stackwalking of RuntimeStub frame - __ mov(r0, zr); // return 0 + __ mov(r0, zr); // return 0 (Java callees return 1. 
Caller ignores the return value) __ ret(lr); // record the stub entry and end @@ -6499,7 +6499,7 @@ class StubGenerator: public StubCodeGenerator { } __ leave(); // required for proper stackwalking of RuntimeStub frame - __ mov(r0, zr); // return 0 + __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value) __ ret(lr); // record the stub entry and end @@ -6606,7 +6606,7 @@ class StubGenerator: public StubCodeGenerator { } __ leave(); // required for proper stackwalking of RuntimeStub frame - __ mov(r0, zr); // return 0 + __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value) __ ret(lr); // record the stub entry and end @@ -6692,8 +6692,8 @@ class StubGenerator: public StubCodeGenerator { // twice, one copy manipulated to provide the lower 4 bits // belonging to the first short in a pair and another copy // manipulated to provide the higher 4 bits belonging to the - // second short in a pair. This is why the the vector sequences va - // and vb used to hold the expanded 8H elements are of length 8. + // second short in a pair. This is why the vector sequences va + // and vb used to hold the expanded 8H elements are of length 8. // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5] // n.b. target elements 2 and 3 duplicate elements 4 and 5 @@ -6763,7 +6763,7 @@ class StubGenerator: public StubCodeGenerator { __ br(Assembler::GT, L_loop); __ leave(); // required for proper stackwalking of RuntimeStub frame - __ mov(r0, zr); // return 0 + __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value) __ ret(lr); // bind label and generate constant data used by this stub @@ -6869,7 +6869,7 @@ class StubGenerator: public StubCodeGenerator { } __ leave(); // required for proper stackwalking of RuntimeStub frame - __ mov(r0, zr); // return 0 + __ mov(r0, zr); // return 0 (Java callees return 1. 
Caller ignores the return value) __ ret(lr); // record the stub entry and end @@ -6943,7 +6943,7 @@ class StubGenerator: public StubCodeGenerator { vs_addv(va0, __ T4S, va0, vc); } - // Perform combined add/sub then montul on 4x4S vectors. + // Perform combined add/sub then montmul on 4x4S vectors. void dilithium_sub_add_montmul16( const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb, const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) { @@ -7079,7 +7079,7 @@ class StubGenerator: public StubCodeGenerator { // coefficients we load 4 adjacent values at 8 different offsets // using an indexed ldr with register variant Q and multiply them // in sequence order by the next set of inputs. Likewise we store - // the resuls using an indexed str with register variant Q. + // the results using an indexed str with register variant Q. for (int i = 0; i < 1024; i += 256) { // reload constants q, qinv each iteration as they get clobbered later vs_ldpq(vq, dilithiumConsts); // qInv, q @@ -7129,11 +7129,11 @@ class StubGenerator: public StubCodeGenerator { // level 7 // At level 7 the coefficients we need to combine with the zetas - // occur singly with montmul inputs alterating with add/sub + // occur singly with montmul inputs alternating with add/sub // inputs. Once again we can use 4-way parallelism to combine 16 // zetas at a time. However, we have to load 8 adjacent values at // 4 different offsets using an ld2 load with arrangement 4S. That - // interleaves the the odd words of each pair into one + // interleaves the odd words of each pair into one // coefficients vector register and the even words of the pair // into the next register. 
We then need to montmul the 4 even // elements of the coefficients register sequence by the zetas in @@ -7155,7 +7155,7 @@ class StubGenerator: public StubCodeGenerator { vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); } __ leave(); // required for proper stackwalking of RuntimeStub frame - __ mov(r0, zr); // return 0 + __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value) __ ret(lr); // record the stub entry and end @@ -7334,7 +7334,7 @@ class StubGenerator: public StubCodeGenerator { // c0 load 32 (8x4S) coefficients via first offsets vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); // c1 load 32 (8x4S) coefficients via second offsets - vs_ldr_indexed(vs2, __ Q,coeffs, i, offsets2); + vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2); // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3 vs_addv(vs3, __ T4S, vs1, vs2); // c = c0 - c1 @@ -7355,7 +7355,7 @@ class StubGenerator: public StubCodeGenerator { dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas); __ leave(); // required for proper stackwalking of RuntimeStub frame - __ mov(r0, zr); // return 0 + __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value) __ ret(lr); // record the stub entry and end @@ -7367,8 +7367,8 @@ class StubGenerator: public StubCodeGenerator { // Dilithium multiply polynomials in the NTT domain. // Straightforward implementation of the method // static int implDilithiumNttMult( - // int[] result, int[] ntta, int[] nttb {} of - // the sun.security.provider.ML_DSA class. + // int[] product, int[] coeffs1, int[] coeffs2) {} + // of the sun.security.provider.ML_DSA class. // // result (int[256]) = c_rarg0 // poly1 (int[256]) = c_rarg1 @@ -7429,7 +7429,7 @@ class StubGenerator: public StubCodeGenerator { __ br(Assembler::GE, L_loop); __ leave(); // required for proper stackwalking of RuntimeStub frame - __ mov(r0, zr); // return 0 + __ mov(r0, zr); // return 0 (Java callees return 1. 
Caller ignores the return value) __ ret(lr); // record the stub entry and end @@ -7438,10 +7438,10 @@ class StubGenerator: public StubCodeGenerator { return start; } - // Dilithium Motgomery multiply an array by a constant. + // Dilithium Montgomery multiply an array by a constant. // A straightforward implementation of the method // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {} - // of the sun.security.provider.MLDSA class + // of the sun.security.provider.ML_DSA class // // coeffs (int[256]) = c_rarg0 // constant (int) = c_rarg1 @@ -7498,7 +7498,7 @@ class StubGenerator: public StubCodeGenerator { __ br(Assembler::GE, L_loop); __ leave(); // required for proper stackwalking of RuntimeStub frame - __ mov(r0, zr); // return 0 + __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value) __ ret(lr); // record the stub entry and end @@ -7509,7 +7509,8 @@ class StubGenerator: public StubCodeGenerator { // Dilithium decompose poly. // Implements the method - // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {} + // static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart, + // int twoGamma2, int multiplier) {} // of the sun.security.provider.ML_DSA class // // input (int[256]) = c_rarg0 @@ -7613,7 +7614,7 @@ class StubGenerator: public StubCodeGenerator { vs_andr(vtmp, vs4, twog2); vs_subv(vs3, __ T4S, vs3, vtmp); - // quotient += (mask & 1); + // quotient += (mask & 1); vs_andr(vtmp, vs4, one); vs_addv(vs2, __ T4S, vs2, vtmp); @@ -7647,7 +7648,7 @@ class StubGenerator: public StubCodeGenerator { // r1 = r1 & quotient; vs_andr(vs1, vs2, vs1); - // store results inteleaved + // store results interleaved // lowPart[m] = r0; // highPart[m] = r1; __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64)); @@ -7664,7 +7665,7 @@ class StubGenerator: public StubCodeGenerator { __ ldpd(v8, v9, __ post(sp, 64)); __ leave(); // required for proper stackwalking of RuntimeStub 
frame - __ mov(r0, zr); // return 0 + __ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value) __ ret(lr); // record the stub entry and end