8355216: Accelerate P-256 arithmetic on aarch64

Reviewed-by: adinn, aph
2026-07-02 15:20:27 +00:00 · 2026-06-24 10:21:22 +00:00 · 2026-06-24 10:21:22 +00:00 · f1cd7f6ab9
commit f1cd7f6ab9
parent 05cd2d948c
6 changed files with 1032 additions and 3 deletions
--- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
@ -3151,6 +3151,34 @@ public:
    _pmull(Vd, Ta, Vn, Vm, Tb);
  }

+  // Emits UMULL/UMULL2 (vector, by element); shared encoder behind umullv and umull2v.
+  //   Vd/Ta  destination register and its arrangement (must be T4S or T2D)
+  //   Vn/Tb  first source register and its arrangement; a T4H or T2S source
+  //          clears the Q bit, any other arrangement sets it
+  //   Vm/Ts  register holding the multiplier element and that element's size
+  //   lane   index of the multiplier element within Vm
+  // NOTE(review): with half-word elements the low bit of the lane is never
+  // encoded here (see the commented-out f(m, 20) below) -- confirm that odd H
+  // lanes are not required, or encode M explicitly, before relying on them.
+  void _umullv(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn,
+                SIMD_Arrangement Tb, FloatRegister Vm, SIMD_RegVariant Ts, int lane) {
+    starti;
+    // Validate the destination first: size below is derived from Ta.
+    assert(Ta == T4S || Ta == T2D, "umull{2}v destination register must have arrangement T4S or T2D");
+    // size 0b01 is selected for a T4S destination, 0b10 for T2D.
+    int size = (Ta == T4S) ? 0b01 : 0b10;
+    int q = (Tb == T4H || Tb == T2S) ? 0 : 1;
+    // size 0b01 accepts lanes 0..7, size 0b10 accepts lanes 0..3; negative lanes are rejected.
+    assert(lane >= 0 && (size == 0b10 ? lane < 4 : lane < 8), "umull{2}v assumes lane < 8 when using half-words and lane < 4 otherwise");
+    assert(Ts == H ? Vm->encoding() < 16 : Vm->encoding() < 32, "umull{2}v requires Vm to be in range V0..V15 when Ts is H");
+    // The lane index is split across the H and L instruction bits.
+    int h = (size == 0b01) ? ((lane >> 2) & 1) : ((lane >> 1) & 1);
+    int l = (size == 0b01) ? ((lane >> 1) & 1) : (lane & 1);
+    f(0, 31), f(q, 30), f(0b101111, 29, 24), f(size, 23, 22), f(l, 21); //f(m, 20);
+    rf(Vm, 16), f(0b1010, 15, 12), f(h, 11), f(0, 10), rf(Vn, 5), rf(Vd, 0);
+  }
+
+  // UMULL (vector, by element). Accepts either a T4S destination with a T4H
+  // source and an H-sized multiplier element, or, for any other Ta, a T2S
+  // source with an S-sized multiplier element. Encoding is delegated to _umullv.
+  void umullv(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn,
+               SIMD_Arrangement Tb, FloatRegister Vm, SIMD_RegVariant Ts, int lane) {
+    assert((Ta == T4S && Tb == T4H && Ts == H) || (Ta != T4S && Tb == T2S && Ts == S),
+           "umullv register arrangements must adhere to spec");
+    _umullv(Vd, Ta, Vn, Tb, Vm, Ts, lane);
+  }
+
+  // UMULL2 (vector, by element). Accepts either a T4S destination with a T8H
+  // source and an H-sized multiplier element, or, for any other Ta, a T4S
+  // source with an S-sized multiplier element. Encoding is delegated to _umullv.
+  void umull2v(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn,
+               SIMD_Arrangement Tb, FloatRegister Vm, SIMD_RegVariant Ts, int lane) {
+    assert((Ta == T4S && Tb == T8H && Ts == H) || (Ta != T4S && Tb == T4S && Ts == S),
+           "umull2v register arrangements must adhere to spec");
+    _umullv(Vd, Ta, Vn, Tb, Vm, Ts, lane);
+  }
+
  void uqxtn(FloatRegister Vd, SIMD_Arrangement Tb, FloatRegister Vn, SIMD_Arrangement Ta) {
    starti;
    int size_b = (int)Tb >> 1;
--- a/src/hotspot/cpu/aarch64/register_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/register_aarch64.hpp
@ -535,6 +535,17 @@ VSeq<N/2> vs_odd(const VSeq<N>& v) {
  return VSeq<N/2>(v.base() + v.delta(), v.delta() * 2);
 }

+// convenience method to obtain the base register of a vector
+// register sequence, i.e. the register the sequence starts at
+template<int N>
+FloatRegister vs_head(const VSeq<N>& v) {
+  static_assert(N > 1, "sequence length must be greater than 1");
+  const FloatRegister head = v.base();
+  return head;
+}
+
+// convenience method to construct the sequence that remains after
+// dropping the head of the original: it starts one delta past the
+// original base, keeps the same delta and has one element fewer
+template<int N>
+VSeq<N-1> vs_tail(const VSeq<N>& v) {
+  // mirror the check in vs_head so a too-short sequence is diagnosed here;
+  // VSeq<N-1> may impose its own, stricter minimum length
+  static_assert(N > 1, "sequence length must be greater than 1");
+  return VSeq<N-1>(v.base() + v.delta(), v.delta());
+}
+
 // convenience method to construct a vector register sequence that
 // indexes its elements in reverse order to the original

--- a/src/hotspot/cpu/aarch64/stubDeclarations_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/stubDeclarations_aarch64.hpp
@ -57,7 +57,7 @@
                                       do_arch_entry,                   \
                                       do_arch_entry_init,              \
                                       do_arch_entry_array)             \
-  do_arch_blob(compiler, 70000)                                         \
+  do_arch_blob(compiler, 75000)                                         \
  do_stub(compiler, vector_iota_indices)                                \
  do_arch_entry_array(aarch64, compiler, vector_iota_indices,           \
                      vector_iota_indices, vector_iota_indices,         \
--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
--- a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp
@ -454,6 +454,10 @@ void VM_Version::initialize() {
    FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
  }

+  if (FLAG_IS_DEFAULT(UseIntPolyIntrinsics)) {
+     UseIntPolyIntrinsics = true;
+  }
+
  if (supports_feature(CPU_ASIMD)) {
      if (FLAG_IS_DEFAULT(UseKyberIntrinsics)) {
          UseKyberIntrinsics = true;
--- a/src/hotspot/share/code/aotCodeCache.hpp
+++ b/src/hotspot/share/code/aotCodeCache.hpp
@ -299,6 +299,7 @@ public:
  do_var(bool,  UseSHA256Intrinsics) \
  do_var(bool,  UseSHA3Intrinsics) \
  do_var(bool,  UseSHA512Intrinsics) \
+  do_var(bool,  UseIntPolyIntrinsics) \
  do_var(bool,  UseVectorizedMismatchIntrinsic) \
  do_fun(int,   CompressedKlassPointers_shift,          CompressedKlassPointers::shift()) \
  do_fun(bool,  JavaAssertions_systemClassDefault,      JavaAssertions::systemClassDefault()) \
@ -342,7 +343,6 @@ public:
  do_var(int,   AVX3Threshold)                          /* array copy stubs and nmethods */ \
  do_var(bool,  EnableX86ECoreOpts)                     /* nmethods */ \
  do_var(bool,  UseLibmIntrinsic) \
-  do_var(bool,  UseIntPolyIntrinsics) \
  // END
 #else
 #define AOTCODECACHE_CONFIGS_X86_DO(do_var, do_fun)