8355216: Accelerate P-256 arithmetic on aarch64

Reviewed-by: adinn, aph
This commit is contained in:
Ferenc Rakoczi 2026-06-24 10:21:22 +00:00 committed by Andrew Dinn
parent 05cd2d948c
commit f1cd7f6ab9
6 changed files with 1032 additions and 3 deletions

View File

@ -3151,6 +3151,34 @@ public:
_pmull(Vd, Ta, Vn, Vm, Tb);
}
//Vector by element variant of UMULL
// Shared encoder for UMULL and UMULL2 (by element). The instruction layout is
//   0 Q 1 01111 size L M Rm 1010 H 0 Rn Rd
// Ta is the destination arrangement (T4S or T2D), Tb the arrangement of the
// vector source Vn, and lane selects the element of Vm used as multiplier.
// Callers are expected to go through umullv/umull2v, which check that
// Ta, Tb and Ts are mutually consistent.
void _umullv(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn,
             SIMD_Arrangement Tb, FloatRegister Vm, SIMD_RegVariant Ts, int lane) {
  starti;
  // A T4S destination means half-word sources (size 0b01), otherwise word
  // sources (size 0b10).
  int size = (Ta == T4S) ? 0b01 : 0b10;
  // Q is clear for the T4H/T2S source arrangements and set for all others.
  int q = (Tb == T4H || Tb == T2S) ? 0 : 1;
  // Half-word lanes are encoded as H:L:M (three bits), word lanes as H:L.
  int h = (size == 0b01) ? ((lane >> 2) & 1) : ((lane >> 1) & 1);
  int l = (size == 0b01) ? ((lane >> 1) & 1) : (lane & 1);
  // Bits 20..16 hold M:Rm. For half-words Rm is only four bits wide and M is
  // the lowest bit of the lane index; for words M is simply the top bit of
  // the five bit register number.
  int m_rm = (size == 0b01) ? (((lane & 1) << 4) | Vm->encoding()) : Vm->encoding();
  assert(Ta == T4S || Ta == T2D, "umull{2}v destination register must have arrangement T4S or T2D");
  assert(lane >= 0 && (size == 0b10 ? lane < 4 : lane < 8), "umull{2}v assumes lane < 4 when using words and lane < 8 when using half-words");
  assert(Ts == H ? Vm->encoding() < 16 : Vm->encoding() < 32, "umull{2}v requires Vm to be in range V0..V15 when Ts is H");
  f(0, 31), f(q, 30), f(0b101111, 29, 24), f(size, 23, 22), f(l, 21);
  f(m_rm, 20, 16), f(0b1010, 15, 12), f(h, 11), f(0, 10), rf(Vn, 5), rf(Vd, 0);
}
//Vector by element variant of UMULL
// Checks the operand arrangements and then emits the instruction via _umullv.
// Accepted combinations: Ta == T4S with Tb == T4H and Ts == H, or any other Ta
// with Tb == T2S and Ts == S (_umullv itself restricts Ta to T4S or T2D).
void umullv(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn,
            SIMD_Arrangement Tb, FloatRegister Vm, SIMD_RegVariant Ts, int lane) {
  assert((Ta == T4S && Tb == T4H && Ts == H) ||
         (Ta != T4S && Tb == T2S && Ts == S),
         "umullv register arrangements must adhere to spec");
  _umullv(Vd, Ta, Vn, Tb, Vm, Ts, lane);
}
// Checks the operand arrangements and then emits the instruction via _umullv.
// Accepted combinations: Ta == T4S with Tb == T8H and Ts == H, or any other Ta
// with Tb == T4S and Ts == S (_umullv itself restricts Ta to T4S or T2D).
void umull2v(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn,
             SIMD_Arrangement Tb, FloatRegister Vm, SIMD_RegVariant Ts, int lane) {
  assert((Ta == T4S && Tb == T8H && Ts == H) ||
         (Ta != T4S && Tb == T4S && Ts == S),
         "umull2v register arrangements must adhere to spec");
  _umullv(Vd, Ta, Vn, Tb, Vm, Ts, lane);
}
void uqxtn(FloatRegister Vd, SIMD_Arrangement Tb, FloatRegister Vn, SIMD_Arrangement Ta) {
starti;
int size_b = (int)Tb >> 1;

View File

@ -535,6 +535,17 @@ VSeq<N/2> vs_odd(const VSeq<N>& v) {
return VSeq<N/2>(v.base() + v.delta(), v.delta() * 2);
}
// Returns the first register of a vector register sequence, i.e. the
// sequence's base register. Only sequences of length two or more are accepted.
template<int N>
FloatRegister vs_head(const VSeq<N>& v) {
  static_assert(N > 1, "sequence length must be greater than 1");
  FloatRegister first = v.base();
  return first;
}
// Returns the sequence that remains after dropping the first register of v:
// it starts one delta past v's base, keeps the same delta and is one element
// shorter.
template<int N>
VSeq<N-1> vs_tail(const VSeq<N>& v) {
  // Same guard as vs_head: with N == 1 the result type would be VSeq<0>.
  static_assert(N > 1, "sequence length must be greater than 1");
  return VSeq<N-1>(v.base() + v.delta(), v.delta());
}
// convenience method to construct a vector register sequence that
// indexes its elements in reverse order to the original

View File

@ -57,7 +57,7 @@
do_arch_entry, \
do_arch_entry_init, \
do_arch_entry_array) \
do_arch_blob(compiler, 70000) \
do_arch_blob(compiler, 75000) \
do_stub(compiler, vector_iota_indices) \
do_arch_entry_array(aarch64, compiler, vector_iota_indices, \
vector_iota_indices, vector_iota_indices, \

File diff suppressed because it is too large Load Diff

View File

@ -454,6 +454,10 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
}
if (FLAG_IS_DEFAULT(UseIntPolyIntrinsics)) {
UseIntPolyIntrinsics = true;
}
if (supports_feature(CPU_ASIMD)) {
if (FLAG_IS_DEFAULT(UseKyberIntrinsics)) {
UseKyberIntrinsics = true;

View File

@ -299,6 +299,7 @@ public:
do_var(bool, UseSHA256Intrinsics) \
do_var(bool, UseSHA3Intrinsics) \
do_var(bool, UseSHA512Intrinsics) \
do_var(bool, UseIntPolyIntrinsics) \
do_var(bool, UseVectorizedMismatchIntrinsic) \
do_fun(int, CompressedKlassPointers_shift, CompressedKlassPointers::shift()) \
do_fun(bool, JavaAssertions_systemClassDefault, JavaAssertions::systemClassDefault()) \
@ -342,7 +343,6 @@ public:
do_var(int, AVX3Threshold) /* array copy stubs and nmethods */ \
do_var(bool, EnableX86ECoreOpts) /* nmethods */ \
do_var(bool, UseLibmIntrinsic) \
do_var(bool, UseIntPolyIntrinsics) \
// END
#else
#define AOTCODECACHE_CONFIGS_X86_DO(do_var, do_fun)