diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
index 19a1256d3b3..9fed91b2316 100644
--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
@@ -4829,6 +4829,32 @@ class StubGenerator: public StubCodeGenerator {
     }
   }
 
+  template<int N>
+  void vs_shl(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
+              const VSeq<N>& v1, int shift) {
+    // output must not be constant
+    assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
+    // output cannot overwrite pending inputs
+    assert(!vs_write_before_read(v, v1), "output overwrites input");
+
+    for (int i = 0; i < N; i++) {
+      __ shl(v[i], T, v1[i], shift);
+    }
+  }
+
+  template<int N>
+  void vs_ushr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
+               const VSeq<N>& v1, int shift) {
+    // output must not be constant
+    assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
+    // output cannot overwrite pending inputs
+    assert(!vs_write_before_read(v, v1), "output overwrites input");
+
+    for (int i = 0; i < N; i++) {
+      __ ushr(v[i], T, v1[i], shift);
+    }
+  }
+
   template<int N>
   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
               const VSeq<N>& v1, int shift) {
@@ -4853,6 +4879,29 @@ class StubGenerator: public StubCodeGenerator {
     }
   }
 
+  template<int N>
+  void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const FloatRegister v2) {
+    // output must not be constant
+    assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
+    // output cannot overwrite pending inputs
+    assert(!vs_write_before_read(v, v1), "output overwrites input");
+    for (int i = 0; i < N; i++) {
+      __ andr(v[i], __ T16B, v1[i], v2);
+    }
+  }
+
+  template<int N>
+  void vs_eor(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
+    // output must not be constant
+    assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
+    // output cannot overwrite pending inputs
+    assert(!vs_write_before_read(v, v1), "output overwrites input");
+    assert(!vs_write_before_read(v, v2), "output overwrites input");
+    for (int i = 0; i < N; i++) {
+      __ eor(v[i], __ T16B, v1[i], v2[i]);
+    }
+  }
+
   template<int N>
   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
     // output must not be constant
@@ -7140,15 +7189,6 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }
 
-  // P256 Montgomery Multiplication.
- // Implements the method protected void mult(long[] a, long[] b, long[] r) {} - // of the sun.security.util.math.intpoly.MontgomeryIntegerPolynomialP256 class - // - // a (long[5]) = c_rarg0 - // b (long[5]) = c_rarg1 - // r (long[5]) = c_rarg2 - // - // Note that each arg represents a 256-bit integer broken into 52-bit limbs address generate_intpoly_montgomeryMult_P256() { __ align(CodeEntryAlignment); @@ -7157,10 +7197,6 @@ class StubGenerator: public StubCodeGenerator { address start = __ pc(); __ enter(); - const Register a = c_rarg0; - const Register b = c_rarg1; - const Register result = c_rarg2; - //Omit 3rd limb of modulus since it is 0 static const int64_t modulus[5] = { 0x000fffffffffffffL, 0x00000fffffffffffL, @@ -7170,288 +7206,522 @@ class StubGenerator: public StubCodeGenerator { int shift1 = 12; // 64 - bits per limb int shift2 = 52; // bits per limb - // GPRs that are used throughout loop - Register b_j = r3; - Register mod_ptr = r4; - Register limb_mask_scalar = r5; - Register c_ptr = r6; + // Registers that are used throughout entire routine + const Register a = c_rarg0; + const Register b = c_rarg1; + const Register result = c_rarg2; + Register limb_mask = r3; + Register c_ptr = r4; + Register mod_0 = r5; + Register mod_1 = r6; + Register mod_3 = r7; + Register mod_4 = r10; + Register b_0 = r11; + Register b_1 = r12; + Register b_2 = r13; + Register b_3 = r14; + Register b_4 = r15; - // These neon registers remain constant through the main loop - FloatRegister limb_mask = v0; - FloatRegister mask_32_vec = v1; - FloatRegister b_lows = v2; - FloatRegister b_highs = v3; - FloatRegister mod_lows = v4; - FloatRegister mod_highs = v5; + FloatRegister limb_mask_vec = v0; + FloatRegister b_lows = v1; + FloatRegister b_highs = v2; + FloatRegister a_vals = v3; // Push callee saved registers on to the stack RegSet callee_saved = RegSet::range(r19, r28); __ push(callee_saved, sp); - // Allocate space on the stack for carry values and zero memory - __ sub(sp, sp, 80); + // Allocate space on the stack for carry values + __ sub(sp, sp, 48); __ mov(c_ptr, sp); - __ eor(b_j, b_j, b_j); //Create a 0 reg to clear memory - for (int i = 0; i < 10; i++) { - __ str(b_j, Address(sp, i * 8)); - } // Calculate limb mask - __ mov(limb_mask_scalar, -UCONST64(1) >> (64 - shift2)); - __ dup(limb_mask, __ T2D, limb_mask_scalar); + __ mov(limb_mask, -UCONST64(1) >> (64 - shift2)); + __ dup(limb_mask_vec, __ T2D, limb_mask); - // Calculate 32-bit mask + //Load input arrays and modulus { - Register mask_32 = r7; - __ mov(mask_32, (UCONST64(1) << 32) - 1); - __ dup(mask_32_vec, __ T2D, mask_32); + Register a_ptr = r27; + Register mod_ptr = r28; + __ add(a_ptr, a, 24); + __ lea(mod_ptr, ExternalAddress((address)modulus)); + __ ldr(b_0, Address(b)); + __ ldr(b_1, Address(b, 8)); + __ ldr(b_2, Address(b, 16)); + __ ldr(b_3, Address(b, 24)); + __ ldr(b_4, Address(b, 32)); + __ ldr(mod_0, __ post(mod_ptr, 8)); + __ ldr(mod_1, __ post(mod_ptr, 8)); + __ ldr(mod_3, __ post(mod_ptr, 8)); + __ ldr(mod_4, mod_ptr); + __ ld1(a_vals, __ T2D, a_ptr); + __ ld2(b_lows, b_highs, __ T4S, b); } - // Load modulus and input array b - __ lea(mod_ptr, ExternalAddress((address)modulus)); - __ ld2(b_lows, b_highs, __ T4S, Address(b)); - __ ld2(mod_lows, mod_highs, __ T4S, Address(mod_ptr)); - __ ldr(b_j, Address(b, 32)); + //Regs used throughout the main "loop", which is partially unrolled here + Register high = r19; + Register low = r20; + Register mul_ptr = r21; + Register mod_high = r23; + Register mod_low = r24; + Register 
a_i = r25; + Register c_i = r26; + Register tmp = r27; + Register n = r28; - for (int i = 0; i < 5; i++) { - Register c_idx = r10; - Register mul_tmp = r11; - Register scalar_ai = r12; + VSeq<4> A(16); + VSeq<4> B(20); + VSeq<4> C(24); + VSeq<4> D(28); - FloatRegister A = v6; - FloatRegister B = v7; - FloatRegister C = v8; - FloatRegister D = v16; - FloatRegister a_i = v17; - FloatRegister n = v18; - FloatRegister middle = v19; - FloatRegister tmp = v20; - FloatRegister modmul_low = v21; - FloatRegister modmul_high = v22; - FloatRegister c_01 = v23; - FloatRegister c_23 = v24; - FloatRegister low_34 = v25; - FloatRegister low_01 = v26; - FloatRegister low_23 = v27; - FloatRegister low_4x = v28; - FloatRegister high_01 = v29; - FloatRegister high_23 = v30; - FloatRegister high_4x = v31; + ///////////////////////// + __ sub(sp, sp, 128); + __ mov(mul_ptr, sp); + + __ umullv(A[0], __ T2D, b_lows, __ T2S, a_vals, __ S, 0); + __ umull2v(A[1], __ T2D, b_lows, __ T4S, a_vals, __ S, 0); + __ umullv(A[2], __ T2D, b_lows, __ T2S, a_vals, __ S, 2); + __ umull2v(A[3], __ T2D, b_lows, __ T4S, a_vals, __ S, 2); + + // Limb 0 + __ ldr(a_i, __ post(a, 8)); + __ umulh(high, a_i, b_0); + __ mul(low, a_i, b_0); + __ lsl(high, high, shift1); + __ lsr(tmp, low, shift2); + __ orr(high, high, tmp); + __ andr(low, low, limb_mask); + __ andr(n, low, limb_mask); + + __ umullv(B[0], __ T2D, b_highs, __ T2S, a_vals, __ S, 0); + __ umull2v(B[1], __ T2D, b_highs, __ T4S, a_vals, __ S, 0); + __ umullv(B[2], __ T2D, b_highs, __ T2S, a_vals, __ S, 2); + __ umull2v(B[3], __ T2D, b_highs, __ T4S, a_vals, __ S, 2); + + // Limb 0 cont + __ umulh(mod_high, n, mod_0); + __ mul(mod_low, n, mod_0); + __ lsl(mod_high, mod_high, shift1); + __ lsr(tmp, mod_low, shift2); + __ orr(mod_high, mod_high, tmp); + __ andr(mod_low, mod_low, limb_mask); + __ add(low, low, mod_low); + __ add(high, high, mod_high); + __ lsr(c_i, low, shift2); + __ add(c_i, c_i, high); + + __ umullv(C[0], __ T2D, b_lows, __ T2S, a_vals, __ S, 1); + __ umull2v(C[1], __ T2D, b_lows, __ T4S, a_vals, __ S, 1); + __ umullv(C[2], __ T2D, b_lows, __ T2S, a_vals, __ S, 3); + __ umull2v(C[3], __ T2D, b_lows, __ T4S, a_vals, __ S, 3); + + // Limb 1 + __ umulh(high, a_i, b_1); + __ mul(low, a_i, b_1); + __ lsl(high, high, shift1); + __ lsr(tmp, low, shift2); + __ orr(high, high, tmp); + __ andr(low, low, limb_mask); + + __ umullv(D[0], __ T2D, b_highs, __ T2S, a_vals, __ S, 1); + __ umull2v(D[1], __ T2D, b_highs, __ T4S, a_vals, __ S, 1); + __ umullv(D[2], __ T2D, b_highs, __ T2S, a_vals, __ S, 3); + __ umull2v(D[3], __ T2D, b_highs, __ T4S, a_vals, __ S, 3); + + __ umulh(mod_high, n, mod_1); + __ mul(mod_low, n, mod_1); + __ lsl(mod_high, mod_high, shift1); + __ lsr(tmp, mod_low, shift2); + __ orr(mod_high, mod_high, tmp); + __ andr(mod_low, mod_low, limb_mask); + __ add(low, low, mod_low); + __ add(high, high, mod_high); + __ add(c_i, c_i, low); + __ str(c_i, c_ptr); + __ mov(c_i, high); + + vs_addv(B, __ T2D, B, C); // Store (B+C) in B + + // Limb 2 + __ umulh(high, a_i, b_2); + __ mul(low, a_i, b_2); + __ lsl(high, high, shift1); + __ lsr(tmp, low, shift2); + __ orr(high, high, tmp); + __ andr(low, low, limb_mask); + __ add(c_i, c_i, low); + __ str(c_i, Address(c_ptr, 8)); + __ mov(c_i, high); + + vs_shl(D, __ T2D, D, 12); + + // Limb 3 + __ umulh(high, a_i, b_3); //compute next mult to avoid waiting for result + __ mul(low, a_i, b_3); + __ lsl(high, high, shift1); + __ lsr(tmp, low, shift2); + __ orr(high, high, tmp); + __ andr(low, low, limb_mask); + + vs_ushr(C, __ 
T2D, B, 20); // Use C for ((B+C) >>> 20) + + __ umulh(mod_high, n, mod_3); + __ mul(mod_low, n, mod_3); + __ lsl(mod_high, mod_high, shift1); + __ lsr(tmp, mod_low, shift2); + __ orr(mod_high, mod_high, tmp); + __ andr(mod_low, mod_low, limb_mask); + __ add(low, low, mod_low); + __ add(high, high, mod_high); + __ add(c_i, c_i, low); + __ str(c_i, Address(c_ptr, 16)); + __ mov(c_i, high); + + vs_shl(B, __ T2D, B, 32); + + // Limb 4 + __ umulh(high, a_i, b_4); + __ mul(low, a_i, b_4); + __ lsl(high, high, shift1); + __ lsr(tmp, low, shift2); + __ orr(high, high, tmp); + __ andr(low, low, limb_mask); + + vs_addv(D, __ T2D, D, C); + + __ umulh(mod_high, n, mod_4); + __ mul(mod_low, n, mod_4); + __ lsl(mod_high, mod_high, shift1); + __ lsr(tmp, mod_low, shift2); + __ orr(mod_high, mod_high, tmp); + __ andr(mod_low, mod_low, limb_mask); + __ add(low, low, mod_low); + __ add(high, high, mod_high); + __ add(c_i, c_i, low); + __ str(c_i, Address(c_ptr, 24)); + __ str(high, Address(c_ptr, 32)); + + vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52) + vs_andr(B, B, limb_mask_vec); + vs_andr(A, A, limb_mask_vec); + vs_addv(D, __ T2D, D, C); + vs_addv(A, __ T2D, A, B); + + vs_ushr(B, __ T2D, A, shift2); + vs_andr(A, A, limb_mask_vec); + vs_addv(D, __ T2D, D, B); + + __ st1(A[0], __ T2D, __ post(mul_ptr, 16)); + __ st1(D[0], __ T2D, __ post(mul_ptr, 16)); + __ st1(A[1], __ T2D, __ post(mul_ptr, 16)); + __ st1(D[1], __ T2D, __ post(mul_ptr, 16)); + + __ st1(A[2], __ T2D, __ post(mul_ptr, 16)); + __ st1(D[2], __ T2D, __ post(mul_ptr, 16)); + __ st1(A[3], __ T2D, __ post(mul_ptr, 16)); + __ st1(D[3], __ T2D, mul_ptr); + + ///////////////////////// + // Loop 2 & 3 + ///////////////////////// + + for (int i = 0; i < 2; i++) { // Load a_i and increment by 8 bytes - __ ldr(scalar_ai, a); - __ ld1(a_i, __ D, 0, __ post(a, 8)); + __ ldr(a_i, __ post(a, 8)); + __ ldr(c_i, c_ptr); //Load prior c_i - // Start computing final multiply with GPR since it is not - // worth it to vectorize a single mult - __ mul(mul_tmp, scalar_ai, b_j); - __ mov(low_4x, Assembler::D, 0, mul_tmp); - __ umulh(mul_tmp, scalar_ai, b_j); - __ mov(high_4x, Assembler::D, 0, mul_tmp); + // Limb 0 + __ umulh(high, a_i, b_0); + __ mul(low, a_i, b_0); + __ lsl(high, high, shift1); + __ lsr(tmp, low, shift2); + __ orr(high, high, tmp); + __ andr(low, low, limb_mask); + __ add(low, low, c_i); + __ ldr(c_i, Address(c_ptr, 8)); + __ andr(n, low, limb_mask); + __ umulh(mod_high, n, mod_0); + __ mul(mod_low, n, mod_0); + __ lsl(mod_high, mod_high, shift1); + __ lsr(tmp, mod_low, shift2); + __ orr(mod_high, mod_high, tmp); + __ andr(mod_low, mod_low, limb_mask); + __ add(low, low, mod_low); + __ add(high, high, mod_high); + __ lsr(tmp, low, shift2); + __ add(c_i, c_i, tmp); + __ add(c_i, c_i, high); - // Iterate through b, multiplying each limb by a_i - // storing low and high parts in separate vectors. 
- // Compute high[i] = high[i] << shift1 | (low[i] >>> shift2) - // and low[i] &= LIMB_MASK + // Limb 1 + __ umulh(high, a_i, b_1); + __ mul(low, a_i, b_1); + __ lsl(high, high, shift1); + __ lsr(tmp, low, shift2); + __ orr(high, high, tmp); + __ andr(low, low, limb_mask); + __ umulh(mod_high, n, mod_1); + __ mul(mod_low, n, mod_1); + __ lsl(mod_high, mod_high, shift1); + __ lsr(tmp, mod_low, shift2); + __ orr(mod_high, mod_high, tmp); + __ ldr(tmp, Address(c_ptr, 16)); + __ andr(mod_low, mod_low, limb_mask); + __ add(low, low, mod_low); + __ add(high, high, mod_high); + __ add(c_i, c_i, low); + __ str(c_i, c_ptr); + __ add(c_i, tmp, high); - // Calculus low_01 and high_01 - __ umullv(A, __ T2D, b_lows, __ T2S, a_i, __ S, 0); - __ umullv(B, __ T2D, b_highs, __ T2S, a_i, __ S, 0); - __ umullv(C, __ T2D, b_lows, __ T2S, a_i, __ S, 1); - __ umullv(D, __ T2D, b_highs, __ T2S, a_i, __ S, 1); + // Limb 2 + __ umulh(high, a_i, b_2); + __ mul(low, a_i, b_2); + __ lsl(high, high, shift1); + __ lsr(tmp, low, shift2); + __ orr(high, high, tmp); + __ ldr(tmp, Address(c_ptr, 24)); + __ andr(low, low, limb_mask); + __ add(c_i, c_i, low); + __ str(c_i, Address(c_ptr, 8)); + __ add(c_i, tmp, high); - __ andr(middle, __ T16B, B, mask_32_vec); - __ ushr(tmp, __ T2D, A, 32); - __ addv(middle, __ T2D, middle, tmp); - __ addv(middle, __ T2D, middle, C); + // Limb 3 + __ umulh(high, a_i, b_3); + __ mul(low, a_i, b_3); + __ lsl(high, high, shift1); + __ lsr(tmp, low, shift2); + __ orr(high, high, tmp); + __ andr(low, low, limb_mask); + __ umulh(mod_high, n, mod_3); + __ mul(mod_low, n, mod_3); + __ lsl(mod_high, mod_high, shift1); + __ lsr(tmp, mod_low, shift2); + __ orr(mod_high, mod_high, tmp); + __ ldr(tmp, Address(c_ptr, 32)); + __ andr(mod_low, mod_low, limb_mask); + __ add(low, low, mod_low); + __ add(high, high, mod_high); + __ add(c_i, c_i, low); + __ str(c_i, Address(c_ptr, 16)); + __ add(c_i, tmp, high); - __ shl(low_01, __ T2D, middle, 32); - __ andr(tmp, __ T16B, A, mask_32_vec); - __ orr(low_01, __ T16B, low_01, tmp); - - __ ushr(high_01, __ T2D, middle, 32); - __ addv(high_01, __ T2D, high_01, D); - __ ushr(tmp, __ T2D, B, 32); - __ addv(high_01, __ T2D, high_01, tmp); - - __ shl(high_01, __ T2D, high_01, shift1); - __ ushr(tmp, __ T2D, low_01, shift2); - __ orr(high_01, __ T16B, high_01, tmp); - __ andr(low_01, __ T16B, low_01, limb_mask); - - // Calculate low_23 and high_23 - __ umull2v(A, __ T2D, b_lows, __ T4S, a_i, __ S, 0); - __ umull2v(B, __ T2D, b_highs, __ T4S, a_i, __ S, 0); - __ umull2v(C, __ T2D, b_lows, __ T4S, a_i, __ S, 1); - __ umull2v(D, __ T2D, b_highs, __ T4S, a_i, __ S, 1); - - __ andr(middle, __ T16B, B, mask_32_vec); - __ ushr(tmp, __ T2D, A, 32); - __ addv(middle, __ T2D, middle, tmp); - __ addv(middle, __ T2D, middle, C); - - __ shl(low_23, __ T2D, middle, 32); - __ andr(tmp, __ T16B, A, mask_32_vec); - __ orr(low_23, __ T16B, low_23, tmp); - - __ ushr(high_23, __ T2D, middle, 32); - __ addv(high_23, __ T2D, high_23, D); - __ ushr(tmp, __ T2D, B, 32); - __ addv(high_23, __ T2D, high_23, tmp); - - __ shl(high_23, __ T2D, high_23, shift1); - __ ushr(tmp, __ T2D, low_23, shift2); - __ orr(high_23, __ T16B, high_23, tmp); - __ andr(low_23, __ T16B, low_23, limb_mask); - - // Finish computing high_4x - __ shl(high_4x, __ T2D, high_4x, shift1); - __ ushr(tmp, __ T2D, low_4x, shift2); - __ orr(high_4x, __ T16B, high_4x, tmp); - __ andr(low_4x, __ T16B, low_4x, limb_mask); - - // low_0 += c_i - // n = low_0 & limb_mask - __ eor(c_01, __ T16B, c_01, c_01); - __ ld1(c_01, __ D, 0, 
c_ptr); - __ addv(low_01, __ T2D, low_01, c_01); - __ andr(n, __ T16B, low_01, limb_mask); - - // Iterate through the modulus, multiplying each limb by n and - // storing low and high parts in separate vectors. - // Compute high += modmul_high << shift1 | (modmul_low >>> shift2); - // and low += modmul_low & LIMB_MASK - - // Calculate modmul_low and modmul_high for modulus[0] and modulus[1] - __ umullv(A, __ T2D, mod_lows, __ T2S, n, __ S, 0); - __ umullv(B, __ T2D, mod_highs, __ T2S, n, __ S, 0); - __ umullv(C, __ T2D, mod_lows, __ T2S, n, __ S, 1); - __ umullv(D, __ T2D, mod_highs, __ T2S, n, __ S, 1); - - __ andr(middle, __ T16B, B, mask_32_vec); - __ ushr(tmp, __ T2D, A, 32); - __ addv(middle, __ T2D, middle, tmp); - __ addv(middle, __ T2D, middle, C); - - __ shl(modmul_low, __ T2D, middle, 32); - __ andr(tmp, __ T16B, A, mask_32_vec); - __ orr(modmul_low, __ T16B, modmul_low, tmp); - - __ ushr(modmul_high, __ T2D, middle, 32); - __ addv(modmul_high, __ T2D, modmul_high, D); - __ ushr(tmp, __ T2D, B, 32); - __ addv(modmul_high, __ T2D, modmul_high, tmp); - - __ shl(modmul_high, __ T2D, modmul_high, shift1); - __ ushr(tmp, __ T2D, modmul_low, shift2); - __ orr(modmul_high, __ T16B, modmul_high, tmp); - __ addv(high_01, __ T2D, high_01, modmul_high); - __ andr(modmul_low, __ T16B, modmul_low, limb_mask); - __ addv(low_01, __ T2D, low_01, modmul_low); - - // Calculate modmul_low and modmul_high for modulus[3] and modulus[4]. - // Can omit modulus[2] since it is 0 - __ umull2v(A, __ T2D, mod_lows, __ T4S, n, __ S, 0); - __ umull2v(B, __ T2D, mod_highs, __ T4S, n, __ S, 0); - __ umull2v(C, __ T2D, mod_lows, __ T4S, n, __ S, 1); - __ umull2v(D, __ T2D, mod_highs, __ T4S, n, __ S, 1); - - __ andr(middle, __ T16B, B, mask_32_vec); - __ ushr(tmp, __ T2D, A, 32); - __ addv(middle, __ T2D, middle, tmp); - __ addv(middle, __ T2D, middle, C); - - __ shl(modmul_low, __ T2D, middle, 32); - __ andr(tmp, __ T16B, A, mask_32_vec); - __ orr(modmul_low, __ T16B, modmul_low, tmp); - - __ ushr(modmul_high, __ T2D, middle, 32); - __ addv(modmul_high, __ T2D, modmul_high, D); - __ ushr(tmp, __ T2D, B, 32); - __ addv(modmul_high, __ T2D, modmul_high, tmp); - - __ shl(modmul_high, __ T2D, modmul_high, shift1); - __ ushr(tmp, __ T2D, modmul_low, shift2); - __ orr(modmul_high, __ T16B, modmul_high, tmp); - __ andr(modmul_low, __ T16B, modmul_low, limb_mask); - - //Need to shift around vectors to get right layout bc of no modulus[2] - __ ins(low_34, __ D, low_23, 0, 1); - __ ins(low_34, __ D, low_4x, 1, 0); - __ addv(low_34, __ T2D, low_34, modmul_low); - - __ eor(tmp, __ T16B, tmp, tmp); - __ ins(tmp, __ D, modmul_high, 1, 0); // tmp = [0, nn3] - __ addv(high_23, __ T2D, high_23, tmp); - __ ins(tmp, __ D, modmul_high, 0, 1); // tmp = [nn4, nn3] - __ addv(high_4x, __ T2D, high_4x, tmp); - - // Compute carry values - // c_i+1 += low_1 + high_0 + (low_0 >>> shift2) - // c_i+2 += low_2 + high_1 - // c_i+3 += low_3 + high_2 - // c_i+4 += low_4 + high_3; - // c_i+5 = high_4 - __ add(c_ptr, c_ptr, 8); - __ ld1(c_01, c_23, __ T2D, c_ptr); - __ add(c_idx, c_ptr, 32); - __ st1(high_4x, __ D, 0, c_idx); - - // Add high values to c - __ addv(c_01, __ T2D, c_01, high_01); - __ addv(c_23, __ T2D, c_23, high_23); - __ addv(c_23, __ T2D, c_23, low_34); - - // Reorder low vectors to enable simd ops - __ ins(tmp, __ D, low_01, 0, 1); - __ ins(tmp, __ D, low_23, 1, 0); - __ addv(c_01, __ T2D, c_01, tmp); - - // clear tmp_4x and put low_0 in first lane - // Shift low_0 and add to c_i+1 - __ ushr(low_01, __ T2D, low_01, shift2); - __ 
eor(tmp, __ T16B, tmp, tmp); //zero out tmp - __ ins(tmp, __ D, low_01, 0, 0); - __ addv(c_01, __ T2D, c_01, tmp); - - // Write back carry values to stack - __ st1(c_01, c_23, __ T2D, c_ptr); + // Limb 4 + __ umulh(high, a_i, b_4); + __ mul(low, a_i, b_4); + __ lsl(high, high, shift1); + __ lsr(tmp, low, shift2); + __ orr(high, high, tmp); + __ andr(low, low, limb_mask); + __ umulh(mod_high, n, mod_4); + __ mul(mod_low, n, mod_4); + __ lsl(mod_high, mod_high, shift1); + __ lsr(tmp, mod_low, shift2); + __ orr(mod_high, mod_high, tmp); + __ andr(mod_low, mod_low, limb_mask); + __ add(low, low, mod_low); + __ add(high, high, mod_high); + __ add(c_i, c_i, low); + __ str(c_i, Address(c_ptr, 24)); + __ str(high, Address(c_ptr, 32)); } - // Final carry propagate and write result - Register mod_j = r3; // b_j is not used after loop - Register tmp = r6; // c_ptr is not used after loop - Register c0 = r19; - Register c1 = r20; - Register c2 = r21; - Register c3 = r22; - Register c4 = r23; - Register c5 = r24; - Register c6 = r25; - Register c7 = r26; - Register c8 = r27; - Register c9 = r28; + Register low_1 = r21; + Register high_1 = r22; - __ pop(callee_saved, sp); //the callee saved registers overlap exactly with the carry values + ////////////////////////////// + // a[3] + ////////////////////////////// + __ ldr(low_1, Address(sp)); + __ ldr(high_1, Address(sp, 16)); + + __ ldr(low, Address(sp, 8)); + __ ldr(high, Address(sp, 24)); + __ ldr(a_i, __ post(a, 8)); + __ ldr(c_i, c_ptr); + + // Limb 1 + __ add(low_1, low_1, c_i); + __ ldr(c_i, Address(c_ptr, 8)); + __ andr(n, low_1, limb_mask); + __ umulh(mod_high, n, mod_0); + __ mul(mod_low, n, mod_0); + __ lsl(mod_high, mod_high, shift1); + __ lsr(tmp, mod_low, shift2); + __ orr(mod_high, mod_high, tmp); + __ andr(mod_low, mod_low, limb_mask); + __ add(low_1, low_1, mod_low); + __ add(high_1, high_1, mod_high); + __ lsr(tmp, low_1, shift2); + __ add(c_i, c_i, tmp); + __ add(c_i, c_i, high_1); + + // Limb 2 + __ ldr(low_1, Address(sp, 32)); + __ ldr(high_1, Address(sp, 48)); + __ umulh(mod_high, n, mod_1); + __ mul(mod_low, n, mod_1); + __ lsl(mod_high, mod_high, shift1); + __ lsr(tmp, mod_low, shift2); + __ orr(mod_high, mod_high, tmp); + __ ldr(tmp, Address(c_ptr, 16)); + __ andr(mod_low, mod_low, limb_mask); + __ add(low, low, mod_low); + __ add(high, high, mod_high); + __ add(c_i, c_i, low); + __ str(c_i, c_ptr); + __ add(c_i, tmp, high); + + // Limb 2 + __ ldr(low, Address(sp, 40)); + __ ldr(high, Address(sp, 56)); + __ ldr(tmp, Address(c_ptr, 24)); + __ add(c_i, c_i, low_1); + __ str(c_i, Address(c_ptr, 8)); + __ add(c_i, tmp, high_1); + + // Limb 3 + __ umulh(mod_high, n, mod_3); + __ mul(mod_low, n, mod_3); + __ lsl(mod_high, mod_high, shift1); + __ lsr(tmp, mod_low, shift2); + __ orr(mod_high, mod_high, tmp); + __ ldr(tmp, Address(c_ptr, 32)); + __ andr(mod_low, mod_low, limb_mask); + __ add(low, low, mod_low); + __ add(high, high, mod_high); + __ add(c_i, c_i, low); + __ str(c_i, Address(c_ptr, 16)); + __ add(c_i, tmp, high); + + // Limb 4 + __ ldr(low, Address(sp, 64)); + __ ldr(high, Address(sp, 80)); + __ umulh(high_1, a_i, b_4); + __ mul(low_1, a_i, b_4); + __ lsl(high_1, high_1, shift1); + __ lsr(tmp, low_1, shift2); + __ orr(high_1, high_1, tmp); + __ andr(low_1, low_1, limb_mask); + __ umulh(mod_high, n, mod_4); + __ mul(mod_low, n, mod_4); + __ lsl(mod_high, mod_high, shift1); + __ lsr(tmp, mod_low, shift2); + __ orr(mod_high, mod_high, tmp); + __ andr(mod_low, mod_low, limb_mask); + __ add(low_1, low_1, mod_low); + __ add(high_1, 
high_1, mod_high); + __ add(c_i, c_i, low_1); + __ str(c_i, Address(c_ptr, 24)); + __ str(high_1, Address(c_ptr, 32)); + + ////////////////////////////// + // a[4] + ////////////////////////////// + __ ldr(a_i, a); + __ ldr(c_i, c_ptr); + + // Limb 0 + __ ldr(low_1, Address(sp, 72)); + __ ldr(high_1, Address(sp, 88)); + + __ add(low, low, c_i); + __ ldr(c_i, Address(c_ptr, 8)); + __ andr(n, low, limb_mask); + __ umulh(mod_high, n, mod_0); + __ mul(mod_low, n, mod_0); + __ lsl(mod_high, mod_high, shift1); + __ lsr(tmp, mod_low, shift2); + __ orr(mod_high, mod_high, tmp); + __ andr(mod_low, mod_low, limb_mask); + __ add(low, low, mod_low); + __ add(high, high, mod_high); + __ lsr(tmp, low, shift2); + __ add(c_i, c_i, tmp); + __ add(c_i, c_i, high); + + // Limb 1 + __ ldr(low, Address(sp, 96)); + __ ldr(high, Address(sp, 112)); + __ umulh(mod_high, n, mod_1); + __ mul(mod_low, n, mod_1); + __ lsl(mod_high, mod_high, shift1); + __ lsr(tmp, mod_low, shift2); + __ orr(mod_high, mod_high, tmp); + __ andr(mod_low, mod_low, limb_mask); + __ add(low_1, low_1, mod_low); + __ add(high_1, high_1, mod_high); + + Register c5 = r11; //replace b_0 + __ add(c5, c_i, low_1); + __ ldr(c_i, Address(c_ptr, 16)); + __ lsr(tmp, c5, shift2); + __ add(c_i, c_i, tmp); + __ add(c_i, c_i, high_1); + + // Limb 2 + __ ldr(low_1, Address(sp, 104)); + __ ldr(high_1, Address(sp, 120)); + Register c6 = r12; + __ add(c6, c_i, low); + __ ldr(c_i, Address(c_ptr, 24)); + __ lsr(tmp, c6, shift2); + __ add(c_i, c_i, tmp); + __ add(c_i, c_i, high); + + // Limb 3 + __ umulh(mod_high, n, mod_3); + __ mul(mod_low, n, mod_3); + __ lsl(mod_high, mod_high, shift1); + __ lsr(tmp, mod_low, shift2); + __ orr(mod_high, mod_high, tmp); + __ andr(mod_low, mod_low, limb_mask); + __ add(low_1, low_1, mod_low); + __ add(high_1, high_1, mod_high); + + Register c7 = r13; + __ add(c7, c_i, low_1); + __ ldr(c_i, Address(c_ptr, 32)); + __ lsr(tmp, c7, shift2); + __ add(c_i, c_i, tmp); + __ add(c_i, c_i, high_1); + + // Limb 4 + __ umulh(high, a_i, b_4); + __ mul(low, a_i, b_4); + __ lsl(high, high, shift1); + __ lsr(tmp, low, shift2); + __ orr(high, high, tmp); + __ andr(low, low, limb_mask); + __ umulh(mod_high, n, mod_4); + __ mul(mod_low, n, mod_4); + __ lsl(mod_high, mod_high, shift1); + __ lsr(tmp, mod_low, shift2); + __ orr(mod_high, mod_high, tmp); + __ andr(mod_low, mod_low, limb_mask); + __ add(low, low, mod_low); + __ add(high, high, mod_high); + + ///////////////////////////// + // Final carry propagate + ///////////////////////////// + + // c5 += d1 + dd0 + (d0 >>> BITS_PER_LIMB); // c6 += (c5 >>> BITS_PER_LIMB); // c7 += (c6 >>> BITS_PER_LIMB); // c8 += (c7 >>> BITS_PER_LIMB); // c9 += (c8 >>> BITS_PER_LIMB); - __ lsr(tmp, c5, shift2); - __ add(c6, c6, tmp); - __ lsr(tmp, c6, shift2); - __ add(c7, c7, tmp); - __ lsr(tmp, c7, shift2); - __ add(c8, c8, tmp); - __ lsr(tmp, c8, shift2); - __ add(c9, c9, tmp); + Register c8 = r14; + Register c9 = r15; + __ add(c8, c_i, low); + __ lsr(c9, c8, shift2); + __ add(c9, c9, high); - __ andr(c5, c5, limb_mask_scalar); - __ andr(c6, c6, limb_mask_scalar); - __ andr(c7, c7, limb_mask_scalar); - __ andr(c8, c8, limb_mask_scalar); + __ andr(c5, c5, limb_mask); + __ andr(c6, c6, limb_mask); + __ andr(c7, c7, limb_mask); + __ andr(c8, c8, limb_mask); // c0 = c5 - modulus[0]; // c1 = c6 - modulus[1] + (c0 >> BITS_PER_LIMB); @@ -7462,33 +7732,30 @@ class StubGenerator: public StubCodeGenerator { // c2 &= LIMB_MASK; // c4 = c9 - modulus[4] + (c3 >> BITS_PER_LIMB); // c3 &= LIMB_MASK; - __ ldr(mod_j, 
Address(mod_ptr)); - __ sub(c0, c5, mod_j); - __ ldr(mod_j, Address(mod_ptr, 8)); - __ sub(c1, c6, mod_j); - __ asr(tmp, c0, shift2); - __ add(c1, c1, tmp); + Register c0 = r19; + Register c1 = r20; + Register c2 = r21; + Register c3 = r22; + Register c4 = r23; + Register tmp0 = r24; + Register tmp1 = r25; + Register tmp2 = r26; + Register tmp3 = r27; + Register tmp4 = r28; - // Modulus[2] is zero - __ asr(c2, c1, shift2); - __ add(c2, c2, c7); - - __ ldr(mod_j, Address(mod_ptr, 16)); - __ sub(c3, c8, mod_j); - __ asr(tmp, c2, shift2); - __ add(c3, c3, tmp); - - __ ldr(mod_j, Address(mod_ptr, 24)); - __ sub(c4, c9, mod_j); - __ asr(tmp, c3, shift2); - __ add(c4, c4, tmp); - - // Apply limb mask - __ andr(c0, c0, limb_mask_scalar); - __ andr(c1, c1, limb_mask_scalar); - __ andr(c2, c2, limb_mask_scalar); - __ andr(c3, c3, limb_mask_scalar); + __ sub(c0, c5, mod_0); + __ sub(c1, c6, mod_1); + __ sub(c3, c8, mod_3); + __ sub(c4, c9, mod_4); + __ add(c1, c1, c0, Assembler::ASR, shift2); + __ andr(c0, c0, limb_mask); + __ add(c2, c7, c1, Assembler::ASR, shift2); + __ andr(c1, c1, limb_mask); + __ add(c3, c3, c2, Assembler::ASR, shift2); + __ andr(c2, c2, limb_mask); + __ add(c4, c4, c3, Assembler::ASR, shift2); + __ andr(c3, c3, limb_mask); // Final write back // mask = c4 >> 63 @@ -7498,54 +7765,365 @@ class StubGenerator: public StubCodeGenerator { // r[3] = ((c8 & mask) | (c3 & ~mask)); // r[4] = ((c9 & mask) | (c4 & ~mask)); - Register res_0 = r11; - Register res_1 = r12; - Register res_2 = r13; - Register res_3 = r14; - Register res_4 = r15; - Register mask = r7; - Register nmask = r10; - - RegSet res = RegSet::range(r11, r15); + Register mask = r5; + Register nmask = r6; __ asr(mask, c4, 63); __ mvn(nmask, mask); - - __ andr(res_0, c5, mask); + __ andr(c5, c5, mask); __ andr(tmp, c0, nmask); - __ orr(res_0, res_0, tmp); - - __ andr(res_1, c6, mask); + __ orr(c5, c5, tmp); + __ andr(c6, c6, mask); __ andr(tmp, c1, nmask); - __ orr(res_1, res_1, tmp); - - __ andr(res_2, c7, mask); + __ orr(c6, c6, tmp); + __ andr(c7, c7, mask); __ andr(tmp, c2, nmask); - __ orr(res_2, res_2, tmp); - - __ andr(res_3, c8, mask); + __ orr(c7, c7, tmp); + __ andr(c8, c8, mask); __ andr(tmp, c3, nmask); - __ orr(res_3, res_3, tmp); - - __ andr(res_4, c9, mask); + __ orr(c8, c8, tmp); + __ andr(c9, c9, mask); __ andr(tmp, c4, nmask); - __ orr(res_4, res_4, tmp); + __ orr(c9, c9, tmp); - __ str(res_0, result); - __ str(res_1, Address(result, 8)); - __ str(res_2, Address(result, 16)); - __ str(res_3, Address(result, 24)); - __ str(res_4, Address(result, 32)); + __ str(c5, result); + __ str(c6, Address(result, 8)); + __ str(c7, Address(result, 16)); + __ str(c8, Address(result, 24)); + __ str(c9, Address(result, 32)); // End intrinsic call + __ add(sp, sp, 176); __ pop(callee_saved, sp); - __ leave(); // required for proper stackwalking of RuntimeStub frame + __ leave(); __ mov(r0, zr); // return 0 __ ret(lr); return start; } + address generate_intpoly_assign() { + // KNOWN Lengths: + // MontgomeryIntPolynP256: 5 = 4 + 1 + // IntegerPolynomial1305: 5 = 4 + 1 + // IntegerPolynomial25519: 10 = 8 + 2 + // IntegerPolynomialP256: 10 = 8 + 2 + // Curve25519OrderField: 10 = 8 + 2 + // Curve25519OrderField: 10 = 8 + 2 + // P256OrderField: 10 = 8 + 2 + // IntegerPolynomialP384: 14 = 8 + 4 + 2 + // P384OrderField: 14 = 8 + 4 + 2 + // IntegerPolynomial448: 16 = 8 + 8 + // Curve448OrderField: 16 = 8 + 8 + // Curve448OrderField: 16 = 8 + 8 + // IntegerPolynomialP521: 19 = 8 + 8 + 2 + 1 + // P521OrderField: 19 = 8 + 8 + 2 + 1 + 
// Special Cases 5, 10, 14, 16, 19 + + __ align(CodeEntryAlignment); + StubId stub_id = StubId::stubgen_intpoly_assign_id; + StubCodeMark mark(this, stub_id); + address start = __ pc(); + __ enter(); + + // Inputs + const Register set = c_rarg0; + const Register aLimbs = c_rarg1; + const Register bLimbs = c_rarg2; + const Register length = c_rarg3; + + Label L_Length5, L_Length10, L_Length14, L_Length16, L_Length19, L_DefaultLoop, L_Done; + + /* + int maskValue = -set; + for (int i = 0; i < a.length; i++) { + long dummyLimbs = maskValue & (a[i] ^ b[i]); + a[i] = dummyLimbs ^ a[i]; + } + */ + Register mask_scalar = r4; + FloatRegister mask_vec = v0; + + __ neg(mask_scalar, set); + __ dup(mask_vec, __ T2D, mask_scalar); + + __ push(r19, sp); //needed for length = 5 + + __ cmp(length, (u1)5); + __ br(Assembler::EQ, L_Length5); + __ cmp(length, (u1)10); + __ br(Assembler::EQ, L_Length10); + __ cmp(length, (u1)14); + __ br(Assembler::EQ, L_Length14); + __ cmp(length, (u1)16); + __ br(Assembler::EQ, L_Length16); + __ cmp(length, (u1)19); + __ br(Assembler::EQ, L_Length19); + + // Length = 5 + // Use 5 GPRs (neon not faster with this few limbs) + __ BIND(L_Length5); + { + Register a0 = r5; + Register a1 = r6; + Register a2 = r7; + Register a3 = r10; + Register a4 = r11; + Register b0 = r12; + Register b1 = r13; + Register b2 = r14; + Register b3 = r15; + Register b4 = r19; + + __ ldr(a0, aLimbs); + __ ldr(a1, Address(aLimbs, 8)); + __ ldr(a2, Address(aLimbs, 16)); + __ ldr(a3, Address(aLimbs, 24)); + __ ldr(a4, Address(aLimbs, 32)); + + __ ldr(b0, bLimbs); + __ ldr(b1, Address(bLimbs, 8)); + __ ldr(b2, Address(bLimbs, 16)); + __ ldr(b3, Address(bLimbs, 24)); + __ ldr(b4, Address(bLimbs, 32)); + + __ eor(b0, b0, a0); + __ eor(b1, b1, a1); + __ eor(b2, b2, a2); + __ eor(b3, b3, a3); + __ eor(b4, b4, a4); + + __ andr(b0, b0, mask_scalar); + __ andr(b1, b1, mask_scalar); + __ andr(b2, b2, mask_scalar); + __ andr(b3, b3, mask_scalar); + __ andr(b4, b4, mask_scalar); + + __ eor(a0, a0, b0); + __ eor(a1, a1, b1); + __ eor(a2, a2, b2); + __ eor(a3, a3, b3); + __ eor(a4, a4, b4); + + __ str(a0, aLimbs); + __ str(a1, Address(aLimbs, 8)); + __ str(a2, Address(aLimbs, 16)); + __ str(a3, Address(aLimbs, 24)); + __ str(a4, Address(aLimbs, 32)); + + __ b(L_Done); + } + + // Length = 10 + // Split into 4 neon regs and 2 GPRs + __ BIND(L_Length10); + { + Register a9 = r10; + Register a10 = r11; + Register b9 = r12; + Register b10 = r13; + + VSeq<4> a_vec(16); + VSeq<4> b_vec(20); + + __ ldr(a9, Address(aLimbs, 64)); + __ ldr(a10, Address(aLimbs, 72)); + __ ldr(b9, Address(bLimbs, 64)); + __ ldr(b10, Address(bLimbs, 72)); + + __ ldpq(a_vec[0], a_vec[1], Address(aLimbs)); + __ ldpq(a_vec[2], a_vec[3], Address(aLimbs, 32)); + + __ eor(b9, b9, a9); + __ eor(b10, b10, a10); + + __ ldpq(b_vec[0], b_vec[1], Address(bLimbs)); + __ ldpq(b_vec[2], b_vec[3], Address(bLimbs, 32)); + + __ andr(b9, b9, mask_scalar); + __ andr(b10, b10, mask_scalar); + + vs_eor(b_vec, b_vec, a_vec); + + __ eor(a9, a9, b9); + __ eor(a10, a10, b10); + + vs_andr(b_vec, b_vec, mask_vec); + + __ str(a9, Address(aLimbs, 64)); + __ str(a10, Address(aLimbs, 72)); + + vs_eor(a_vec, a_vec, b_vec); + + __ stpq(a_vec[0], a_vec[1], Address(aLimbs)); + __ stpq(a_vec[2], a_vec[3], Address(aLimbs, 32)); + + __ b(L_Done); + } + + // Length = 14 + // Split into 5 neon regs and 4 GPRs + __ BIND(L_Length14); + { + Register a10 = r5; + Register a11 = r6; + Register a12 = r7; + Register a13 = r8; + Register b10 = r9; + Register b11 = r10; + Register b12 = r11; + 
Register b13 = r12; + + VSeq<5> a_vec(16); + VSeq<5> b_vec(22); + + __ ldr(a10, Address(aLimbs, 80)); + __ ldr(a11, Address(aLimbs, 88)); + __ ldr(a12, Address(aLimbs, 96)); + __ ldr(a13, Address(aLimbs, 104)); + + __ ldr(b10, Address(bLimbs, 80)); + __ ldr(b11, Address(bLimbs, 88)); + __ ldr(b12, Address(bLimbs, 96)); + __ ldr(b13, Address(bLimbs, 104)); + + __ ld1(a_vec[0], __ T2D, aLimbs); + __ ldpq(a_vec[1], a_vec[2], Address(aLimbs, 16)); + __ ldpq(a_vec[3], a_vec[4], Address(aLimbs, 48)); + + __ eor(b10, b10, a10); + __ eor(b11, b11, a11); + __ eor(b12, b12, a12); + __ eor(b13, b13, a13); + + __ ld1(b_vec[0], __ T2D, bLimbs); + __ ldpq(b_vec[1], b_vec[2], Address(bLimbs, 16)); + __ ldpq(b_vec[3], b_vec[4], Address(bLimbs, 48)); + + __ andr(b10, b10, mask_scalar); + __ andr(b11, b11, mask_scalar); + __ andr(b12, b12, mask_scalar); + __ andr(b13, b13, mask_scalar); + + vs_eor(b_vec, b_vec, a_vec); + + __ eor(a10, a10, b10); + __ eor(a11, a11, b11); + __ eor(a12, a12, b12); + __ eor(a13, a13, b13); + + vs_andr(b_vec, b_vec, mask_vec); + + __ str(a10, Address(aLimbs, 80)); + __ str(a11, Address(aLimbs, 88)); + __ str(a12, Address(aLimbs, 96)); + __ str(a13, Address(aLimbs, 104)); + + vs_eor(a_vec, a_vec, b_vec); + + __ st1(a_vec[0], __ T2D, aLimbs); + __ stpq(a_vec[1], a_vec[2], Address(aLimbs, 16)); + __ stpq(a_vec[3], a_vec[4], Address(aLimbs, 48)); + + __ b(L_Done); + } + + // Length = 16 + // Use 8 neon regs + __ BIND(L_Length16); + { + VSeq<8> a_vec(16); + VSeq<8> b_vec(24); + + __ ldpq(a_vec[0], a_vec[1], aLimbs); + __ ldpq(a_vec[2], a_vec[3], Address(aLimbs, 32)); + __ ldpq(a_vec[4], a_vec[5], Address(aLimbs, 64)); + __ ldpq(a_vec[6], a_vec[7], Address(aLimbs, 96)); + + __ ldpq(b_vec[0], b_vec[1], bLimbs); + __ ldpq(b_vec[2], b_vec[3], Address(bLimbs, 32)); + __ ldpq(b_vec[4], b_vec[5], Address(bLimbs, 64)); + __ ldpq(b_vec[6], b_vec[7], Address(bLimbs, 96)); + + vs_eor(b_vec, b_vec, a_vec); + vs_andr(b_vec, b_vec, mask_vec); + vs_eor(a_vec, a_vec, b_vec); + + __ stpq(a_vec[0], a_vec[1], Address(aLimbs)); + __ stpq(a_vec[2], a_vec[3], Address(aLimbs, 32)); + __ stpq(a_vec[4], a_vec[5], Address(aLimbs, 64)); + __ stpq(a_vec[6], a_vec[7], Address(aLimbs, 96)); + + __ b(L_Done); + } + + // Length = 19 + // Split into 8 neon regs and 3 GPRs + __ BIND(L_Length19); + { + Register a17 = r10; + Register a18 = r11; + Register a19 = r12; + Register b17 = r13; + Register b18 = r14; + Register b19 = r15; + + VSeq<8> a_vec(16); + VSeq<8> b_vec(24); + + __ ldr(a17, Address(aLimbs, 128)); + __ ldr(a18, Address(aLimbs, 136)); + __ ldr(a19, Address(aLimbs, 144)); + __ ldr(b17, Address(bLimbs, 128)); + __ ldr(b18, Address(bLimbs, 136)); + __ ldr(b19, Address(bLimbs, 144)); + + __ ldpq(a_vec[0], a_vec[1], aLimbs); + __ ldpq(a_vec[2], a_vec[3], Address(aLimbs, 32)); + __ ldpq(a_vec[4], a_vec[5], Address(aLimbs, 64)); + __ ldpq(a_vec[6], a_vec[7], Address(aLimbs, 96)); + + __ eor(b17, b17, a17); + __ eor(b18, b18, a18); + __ eor(b19, b19, a19); + + __ ldpq(b_vec[0], b_vec[1], bLimbs); + __ ldpq(b_vec[2], b_vec[3], Address(bLimbs, 32)); + __ ldpq(b_vec[4], b_vec[5], Address(bLimbs, 64)); + __ ldpq(b_vec[6], b_vec[7], Address(bLimbs, 96)); + + __ andr(b17, b17, mask_scalar); + __ andr(b18, b18, mask_scalar); + __ andr(b19, b19, mask_scalar); + + vs_eor(b_vec, b_vec, a_vec); + + __ eor(a17, a17, b17); + __ eor(a18, a18, b18); + __ eor(a19, a19, b19); + + vs_andr(b_vec, b_vec, mask_vec); + + __ str(a17, Address(aLimbs, 128)); + __ str(a18, Address(aLimbs, 136)); + __ str(a19, Address(aLimbs, 144)); + + 
vs_eor(a_vec, a_vec, b_vec); + + __ stpq(a_vec[0], a_vec[1], Address(aLimbs)); + __ stpq(a_vec[2], a_vec[3], Address(aLimbs, 32)); + __ stpq(a_vec[4], a_vec[5], Address(aLimbs, 64)); + __ stpq(a_vec[6], a_vec[7], Address(aLimbs, 96)); + } + + __ BIND(L_Done); + __ pop(r19, sp); + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ mov(r0, zr); // return 0 + __ ret(lr); + return start; + } + void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4, Register tmp0, Register tmp1, Register tmp2) { __ bic(tmp0, a2, a1); // for a0 @@ -12288,6 +12866,7 @@ class StubGenerator: public StubCodeGenerator { if (UseIntPolyIntrinsics) { StubRoutines::_intpoly_montgomeryMult_P256 = generate_intpoly_montgomeryMult_P256(); + StubRoutines::_intpoly_assign = generate_intpoly_assign(); } if (UseKyberIntrinsics) {
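
Reviewer note, not part of the patch: every scalar "Limb k" block in generate_intpoly_montgomeryMult_P256 above expands to the same mul/umulh + lsl/lsr/orr + and pattern. The C++ sketch below shows what that pattern computes for the 52-bit limbs used here (shift2 == 52, shift1 == 64 - 52 == 12); the helper name mul_limb and the use of unsigned __int128 are illustrative only, not something the patch defines.

#include <cstdint>

static const int BITS_PER_LIMB = 52;
static const uint64_t LIMB_MASK = (UINT64_C(1) << BITS_PER_LIMB) - 1;

// Split the 104-bit product of two 52-bit limbs at bit 52. The stub obtains
// the same split from the 64x64->128 multiply as
//   high = (umulh(a, b) << 12) | (mul(a, b) >>> 52),  low = mul(a, b) & LIMB_MASK.
static inline void mul_limb(uint64_t a, uint64_t b, uint64_t& low, uint64_t& high) {
  unsigned __int128 p = (unsigned __int128)a * b;
  low  = (uint64_t)p & LIMB_MASK;
  high = (uint64_t)(p >> BITS_PER_LIMB);
}

The NEON portion of the first iteration builds the same low/high limbs from 32x32->64 partial products (umullv/umull2v over the ld2-deinterleaved halves of b), which the vs_shl/vs_ushr/vs_andr/vs_addv helper sequence then recombines.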
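
The final write back in the same stub (the "mask = c4 >> 63" comment block) is a branchless choice between the unreduced limbs c5..c9 and the limbs c0..c4 produced by the trial subtraction of the modulus. A reference sketch of that per-limb selection; select_limb is an illustrative name only:

#include <cstdint>

// If the trial subtraction underflowed, c4 is negative and its sign bit smears
// into an all-ones mask, so the unreduced limb is kept; otherwise the reduced
// limb is written to r[i]. Either way there is no data-dependent branch.
static inline uint64_t select_limb(uint64_t unreduced, uint64_t reduced, int64_t c4) {
  uint64_t mask = (uint64_t)(c4 >> 63);   // 0 or all-ones
  return (unreduced & mask) | (reduced & ~mask);
}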
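
generate_intpoly_assign implements the constant-time conditional copy quoted in its own comment block (maskValue = -set; dummyLimbs = maskValue & (a[i] ^ b[i]); a[i] ^= dummyLimbs). A scalar C++ sketch of what all five length-specialized paths unroll; cond_assign is an illustrative name, and length corresponds to the stub's c_rarg3:

#include <cstdint>

// Copy b into a when set != 0, leave a untouched when set == 0, with no branch
// and no memory access pattern that depends on set or on the limb values.
// Assumes set is 0 or 1, as in the Java callers.
static void cond_assign(int set, int64_t* a, const int64_t* b, int length) {
  int64_t mask = -(int64_t)set;            // 0 when set == 0, all-ones when set == 1
  for (int i = 0; i < length; i++) {
    int64_t dummy = mask & (a[i] ^ b[i]);
    a[i] ^= dummy;                         // a[i] = set ? b[i] : a[i]
  }
}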