mirror of
https://github.com/openjdk/jdk.git
synced 2026-01-28 03:58:21 +00:00
8360934: Add AVX-512 intrinsics for ML-KEM - enhancement on AVX512_VBMI
Co-authored-by: Sandhya Viswanathan <sviswanathan@openjdk.org> Reviewed-by: jbhateja, vpaprotski
This commit is contained in:
parent
436c62afd2
commit
a0e6f028a8
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2025, 2026, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -64,6 +64,39 @@ static address kyberAvx512ConstsAddr(int offset) {
|
||||
|
||||
const Register scratch = r10;
|
||||
|
||||
ATTRIBUTE_ALIGNED(64) static const uint8_t kyberAvx512_12To16Dup[] = {
|
||||
// 0 - 63
|
||||
0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16,
|
||||
16, 17, 18, 19, 19, 20, 21, 22, 22, 23, 24, 25, 25, 26, 27, 28, 28, 29, 30,
|
||||
31, 31, 32, 33, 34, 34, 35, 36, 37, 37, 38, 39, 40, 40, 41, 42, 43, 43, 44,
|
||||
45, 46, 46, 47
|
||||
};
|
||||
|
||||
static address kyberAvx512_12To16DupAddr() {
|
||||
return (address) kyberAvx512_12To16Dup;
|
||||
}
|
||||
|
||||
ATTRIBUTE_ALIGNED(64) static const uint16_t kyberAvx512_12To16Shift[] = {
|
||||
// 0 - 31
|
||||
0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0,
|
||||
4, 0, 4, 0, 4, 0, 4
|
||||
};
|
||||
|
||||
static address kyberAvx512_12To16ShiftAddr() {
|
||||
return (address) kyberAvx512_12To16Shift;
|
||||
}
|
||||
|
||||
ATTRIBUTE_ALIGNED(64) static const uint64_t kyberAvx512_12To16And[] = {
|
||||
// 0 - 7
|
||||
0x0FFF0FFF0FFF0FFF, 0x0FFF0FFF0FFF0FFF, 0x0FFF0FFF0FFF0FFF,
|
||||
0x0FFF0FFF0FFF0FFF, 0x0FFF0FFF0FFF0FFF, 0x0FFF0FFF0FFF0FFF,
|
||||
0x0FFF0FFF0FFF0FFF, 0x0FFF0FFF0FFF0FFF
|
||||
};
|
||||
|
||||
static address kyberAvx512_12To16AndAddr() {
|
||||
return (address) kyberAvx512_12To16And;
|
||||
}
|
||||
|
||||
ATTRIBUTE_ALIGNED(64) static const uint16_t kyberAvx512NttPerms[] = {
|
||||
// 0
|
||||
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
|
||||
@ -822,10 +855,65 @@ address generate_kyber12To16_avx512(StubGenerator *stubgen,
|
||||
|
||||
const Register perms = r11;
|
||||
|
||||
Label Loop;
|
||||
Label Loop, VBMILoop;
|
||||
|
||||
__ addptr(condensed, condensedOffs);
|
||||
|
||||
if (VM_Version::supports_avx512_vbmi()) {
|
||||
// mask load for the first 48 bytes of each vector
|
||||
__ mov64(rax, 0x0000FFFFFFFFFFFF);
|
||||
__ kmovql(k1, rax);
|
||||
|
||||
__ lea(perms, ExternalAddress(kyberAvx512_12To16DupAddr()));
|
||||
__ evmovdqub(xmm20, Address(perms), Assembler::AVX_512bit);
|
||||
|
||||
__ lea(perms, ExternalAddress(kyberAvx512_12To16ShiftAddr()));
|
||||
__ evmovdquw(xmm21, Address(perms), Assembler::AVX_512bit);
|
||||
|
||||
__ lea(perms, ExternalAddress(kyberAvx512_12To16AndAddr()));
|
||||
__ evmovdquq(xmm22, Address(perms), Assembler::AVX_512bit);
|
||||
|
||||
__ align(OptoLoopAlignment);
|
||||
__ BIND(VBMILoop);
|
||||
|
||||
__ evmovdqub(xmm0, k1, Address(condensed, 0), false,
|
||||
Assembler::AVX_512bit);
|
||||
__ evmovdqub(xmm1, k1, Address(condensed, 48), false,
|
||||
Assembler::AVX_512bit);
|
||||
__ evmovdqub(xmm2, k1, Address(condensed, 96), false,
|
||||
Assembler::AVX_512bit);
|
||||
__ evmovdqub(xmm3, k1, Address(condensed, 144), false,
|
||||
Assembler::AVX_512bit);
|
||||
|
||||
__ evpermb(xmm4, k0, xmm20, xmm0, false, Assembler::AVX_512bit);
|
||||
__ evpermb(xmm5, k0, xmm20, xmm1, false, Assembler::AVX_512bit);
|
||||
__ evpermb(xmm6, k0, xmm20, xmm2, false, Assembler::AVX_512bit);
|
||||
__ evpermb(xmm7, k0, xmm20, xmm3, false, Assembler::AVX_512bit);
|
||||
|
||||
__ evpsrlvw(xmm4, xmm4, xmm21, Assembler::AVX_512bit);
|
||||
__ evpsrlvw(xmm5, xmm5, xmm21, Assembler::AVX_512bit);
|
||||
__ evpsrlvw(xmm6, xmm6, xmm21, Assembler::AVX_512bit);
|
||||
__ evpsrlvw(xmm7, xmm7, xmm21, Assembler::AVX_512bit);
|
||||
|
||||
__ evpandq(xmm0, xmm22, xmm4, Assembler::AVX_512bit);
|
||||
__ evpandq(xmm1, xmm22, xmm5, Assembler::AVX_512bit);
|
||||
__ evpandq(xmm2, xmm22, xmm6, Assembler::AVX_512bit);
|
||||
__ evpandq(xmm3, xmm22, xmm7, Assembler::AVX_512bit);
|
||||
|
||||
store4regs(parsed, 0, xmm0_3, _masm);
|
||||
|
||||
__ addptr(condensed, 192);
|
||||
__ addptr(parsed, 256);
|
||||
__ subl(parsedLength, 128);
|
||||
__ jcc(Assembler::greater, VBMILoop);
|
||||
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
__ mov64(rax, 0); // return 0
|
||||
__ ret(0);
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
__ lea(perms, ExternalAddress(kyberAvx512_12To16PermsAddr()));
|
||||
|
||||
load4regs(xmm24_27, perms, 0, _masm);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user