mirror of
https://github.com/openjdk/jdk.git
synced 2026-03-14 09:53:18 +00:00
4295 lines
172 KiB
C++
4295 lines
172 KiB
C++
/*
|
|
* Copyright (c) 2019, 2026, Intel Corporation. All rights reserved.
|
|
*
|
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
|
*
|
|
* This code is free software; you can redistribute it and/or modify it
|
|
* under the terms of the GNU General Public License version 2 only, as
|
|
* published by the Free Software Foundation.
|
|
*
|
|
* This code is distributed in the hope that it will be useful, but WITHOUT
|
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
* version 2 for more details (a copy is included in the LICENSE file that
|
|
* accompanied this code).
|
|
*
|
|
* You should have received a copy of the GNU General Public License version
|
|
* 2 along with this work; if not, write to the Free Software Foundation,
|
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
*
|
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
|
* or visit www.oracle.com if you need additional information or have any
|
|
* questions.
|
|
*
|
|
*/
|
|
|
|
#include "asm/assembler.hpp"
|
|
#include "asm/assembler.inline.hpp"
|
|
#include "runtime/stubRoutines.hpp"
|
|
#include "macroAssembler_x86.hpp"
|
|
#include "stubGenerator_x86_64.hpp"
|
|
|
|
// Shorthand used throughout this file: "__ foo(...)" expands to "_masm->foo(...)",
// i.e. a call on the current macro assembler.
#define __ _masm->
|
|
|
|
// BLOCK_COMMENT(str) expands to nothing when PRODUCT is defined; otherwise it
// expands to a block_comment(str) call on the macro assembler.
#ifdef PRODUCT
|
|
#define BLOCK_COMMENT(str) /* nothing */
|
|
#else
|
|
#define BLOCK_COMMENT(str) __ block_comment(str)
|
|
#endif // PRODUCT
|
|
|
|
// Used as "__ BIND(label);": binds the label and then emits a block comment
// made of the stringified label name followed by ":".
// NOTE: this expands to two statements, so it must not be used as the
// unbraced body of an if/else/loop.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
|
|
|
|
// Constants
|
|
|
|
// Size in bytes of one AES block. The stubs below use it as the stride between
// consecutive 16-byte loads/stores and to step the position/length registers.
const int AESBlockSize = 16;
|
|
|
|
// Shuffle mask for fixing up 128-bit words consisting of big-endian 32-bit integers.
|
|
ATTRIBUTE_ALIGNED(16) static const uint64_t KEY_SHUFFLE_MASK[] = {
|
|
0x0405060700010203UL, 0x0C0D0E0F08090A0BUL
|
|
};
|
|
static address key_shuffle_mask_addr() {
|
|
return (address)KEY_SHUFFLE_MASK;
|
|
}
|
|
|
|
// Shuffle mask for big-endian 128-bit integers.
|
|
ATTRIBUTE_ALIGNED(64) static const uint64_t COUNTER_SHUFFLE_MASK[] = {
|
|
0x08090A0B0C0D0E0FUL, 0x0001020304050607UL,
|
|
0x08090A0B0C0D0E0FUL, 0x0001020304050607UL,
|
|
0x08090A0B0C0D0E0FUL, 0x0001020304050607UL,
|
|
0x08090A0B0C0D0E0FUL, 0x0001020304050607UL,
|
|
};
|
|
static address counter_shuffle_mask_addr() {
|
|
return (address)COUNTER_SHUFFLE_MASK;
|
|
}
|
|
|
|
// This mask is used for incrementing counter value
|
|
ATTRIBUTE_ALIGNED(64) static const uint64_t COUNTER_MASK_LINC0[] = {
|
|
0x0000000000000000UL, 0x0000000000000000UL,
|
|
0x0000000000000001UL, 0x0000000000000000UL,
|
|
0x0000000000000002UL, 0x0000000000000000UL,
|
|
0x0000000000000003UL, 0x0000000000000000UL,
|
|
};
|
|
static address counter_mask_linc0_addr() {
|
|
return (address)COUNTER_MASK_LINC0;
|
|
}
|
|
|
|
ATTRIBUTE_ALIGNED(16) static const uint64_t COUNTER_MASK_LINC1[] = {
|
|
0x0000000000000001UL, 0x0000000000000000UL,
|
|
};
|
|
static address counter_mask_linc1_addr() {
|
|
return (address)COUNTER_MASK_LINC1;
|
|
}
|
|
|
|
ATTRIBUTE_ALIGNED(16) uint64_t COUNTER_MASK_LINC1F[] = {
|
|
0x0000000000000000UL, 0x0100000000000000UL,
|
|
};
|
|
|
|
static address counter_mask_linc1f_addr() {
|
|
return (address)COUNTER_MASK_LINC1F;
|
|
}
|
|
|
|
ATTRIBUTE_ALIGNED(16) uint64_t COUNTER_MASK_LINC2[] = {
|
|
0x0000000000000002UL, 0x0000000000000000UL,
|
|
};
|
|
|
|
static address counter_mask_linc2_addr() {
|
|
return (address)COUNTER_MASK_LINC2;
|
|
}
|
|
|
|
ATTRIBUTE_ALIGNED(16) uint64_t COUNTER_MASK_LINC2F[] = {
|
|
0x0000000000000000UL, 0x0200000000000000UL,
|
|
};
|
|
|
|
static address counter_mask_linc2f_addr() {
|
|
return (address)COUNTER_MASK_LINC2F;
|
|
}
|
|
|
|
ATTRIBUTE_ALIGNED(64) static const uint64_t COUNTER_MASK_LINC4[] = {
|
|
0x0000000000000004UL, 0x0000000000000000UL,
|
|
0x0000000000000004UL, 0x0000000000000000UL,
|
|
0x0000000000000004UL, 0x0000000000000000UL,
|
|
0x0000000000000004UL, 0x0000000000000000UL,
|
|
};
|
|
static address counter_mask_linc4_addr() {
|
|
return (address)COUNTER_MASK_LINC4;
|
|
}
|
|
|
|
ATTRIBUTE_ALIGNED(64) static const uint64_t COUNTER_MASK_LINC8[] = {
|
|
0x0000000000000008UL, 0x0000000000000000UL,
|
|
0x0000000000000008UL, 0x0000000000000000UL,
|
|
0x0000000000000008UL, 0x0000000000000000UL,
|
|
0x0000000000000008UL, 0x0000000000000000UL,
|
|
};
|
|
static address counter_mask_linc8_addr() {
|
|
return (address)COUNTER_MASK_LINC8;
|
|
}
|
|
|
|
ATTRIBUTE_ALIGNED(64) static const uint64_t COUNTER_MASK_LINC16[] = {
|
|
0x0000000000000010UL, 0x0000000000000000UL,
|
|
0x0000000000000010UL, 0x0000000000000000UL,
|
|
0x0000000000000010UL, 0x0000000000000000UL,
|
|
0x0000000000000010UL, 0x0000000000000000UL,
|
|
};
|
|
static address counter_mask_linc16_addr() {
|
|
return (address)COUNTER_MASK_LINC16;
|
|
}
|
|
|
|
ATTRIBUTE_ALIGNED(64) static const uint64_t COUNTER_MASK_LINC32[] = {
|
|
0x0000000000000020UL, 0x0000000000000000UL,
|
|
0x0000000000000020UL, 0x0000000000000000UL,
|
|
0x0000000000000020UL, 0x0000000000000000UL,
|
|
0x0000000000000020UL, 0x0000000000000000UL,
|
|
};
|
|
static address counter_mask_linc32_addr() {
|
|
return (address)COUNTER_MASK_LINC32;
|
|
}
|
|
|
|
ATTRIBUTE_ALIGNED(64) uint64_t COUNTER_MASK_ONES[] = {
|
|
0x0000000000000000UL, 0x0000000000000001UL,
|
|
0x0000000000000000UL, 0x0000000000000001UL,
|
|
0x0000000000000000UL, 0x0000000000000001UL,
|
|
0x0000000000000000UL, 0x0000000000000001UL,
|
|
};
|
|
static address counter_mask_ones_addr() {
|
|
return (address)COUNTER_MASK_ONES;
|
|
}
|
|
|
|
ATTRIBUTE_ALIGNED(64) static const uint64_t GHASH_POLYNOMIAL_REDUCTION[] = {
|
|
0x00000001C2000000UL, 0xC200000000000000UL,
|
|
0x00000001C2000000UL, 0xC200000000000000UL,
|
|
0x00000001C2000000UL, 0xC200000000000000UL,
|
|
0x00000001C2000000UL, 0xC200000000000000UL,
|
|
};
|
|
static address ghash_polynomial_reduction_addr() {
|
|
return (address)GHASH_POLYNOMIAL_REDUCTION;
|
|
}
|
|
|
|
ATTRIBUTE_ALIGNED(16) static const uint64_t GHASH_POLYNOMIAL_TWO_ONE[] = {
|
|
0x0000000000000001UL, 0x0000000100000000UL,
|
|
};
|
|
static address ghash_polynomial_two_one_addr() {
|
|
return (address)GHASH_POLYNOMIAL_TWO_ONE;
|
|
}
|
|
|
|
// This mask is used for incrementing counter value
|
|
ATTRIBUTE_ALIGNED(64) static const uint64_t COUNTER_MASK_ADDBE_4444[] = {
|
|
0x0000000000000000ULL, 0x0400000000000000ULL,
|
|
0x0000000000000000ULL, 0x0400000000000000ULL,
|
|
0x0000000000000000ULL, 0x0400000000000000ULL,
|
|
0x0000000000000000ULL, 0x0400000000000000ULL,
|
|
};
|
|
static address counter_mask_addbe_4444_addr() {
|
|
return (address)COUNTER_MASK_ADDBE_4444;
|
|
}
|
|
|
|
// This mask is used for incrementing counter value
|
|
ATTRIBUTE_ALIGNED(64) static const uint64_t COUNTER_MASK_ADDBE_1234[] = {
|
|
0x0000000000000000ULL, 0x0100000000000000ULL,
|
|
0x0000000000000000ULL, 0x0200000000000000ULL,
|
|
0x0000000000000000ULL, 0x0300000000000000ULL,
|
|
0x0000000000000000ULL, 0x0400000000000000ULL,
|
|
};
|
|
static address counter_mask_addbe_1234_addr() {
|
|
return (address)COUNTER_MASK_ADDBE_1234;
|
|
}
|
|
|
|
// This mask is used for incrementing counter value
|
|
ATTRIBUTE_ALIGNED(64) static const uint64_t COUNTER_MASK_ADD_1234[] = {
|
|
0x0000000000000001ULL, 0x0000000000000000ULL,
|
|
0x0000000000000002ULL, 0x0000000000000000ULL,
|
|
0x0000000000000003ULL, 0x0000000000000000ULL,
|
|
0x0000000000000004ULL, 0x0000000000000000ULL,
|
|
};
|
|
static address counter_mask_add_1234_addr() {
|
|
return (address)COUNTER_MASK_ADD_1234;
|
|
}
|
|
|
|
// AES intrinsic stubs
|
|
|
|
void StubGenerator::generate_aes_stubs() {
|
|
if (UseAESIntrinsics) {
|
|
StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
|
|
StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
|
|
StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
|
|
if (VM_Version::supports_avx512_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq() ) {
|
|
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
|
|
StubRoutines::_electronicCodeBook_encryptAESCrypt = generate_electronicCodeBook_encryptAESCrypt();
|
|
StubRoutines::_electronicCodeBook_decryptAESCrypt = generate_electronicCodeBook_decryptAESCrypt();
|
|
StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
|
|
} else {
|
|
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
|
|
StubRoutines::_electronicCodeBook_encryptAESCrypt = generate_electronicCodeBook_encryptAESCrypt_Parallel();
|
|
StubRoutines::_electronicCodeBook_decryptAESCrypt = generate_electronicCodeBook_decryptAESCrypt_Parallel();
|
|
if (VM_Version::supports_avx2()) {
|
|
StubRoutines::_galoisCounterMode_AESCrypt = generate_avx2_galoisCounterMode_AESCrypt();
|
|
}
|
|
}
|
|
}
|
|
|
|
if (UseAESCTRIntrinsics) {
|
|
if (VM_Version::supports_avx512_vaes() && VM_Version::supports_avx512bw() && VM_Version::supports_avx512vl()) {
|
|
StubRoutines::_counterMode_AESCrypt = generate_counterMode_VectorAESCrypt();
|
|
} else {
|
|
StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
|
|
}
|
|
}
|
|
}
|
|
|
|
// Vector AES Galois Counter Mode implementation (AVX-512 variant).
|
|
//
|
|
// Emits the stub prologue/epilogue (register saves, argument loading, aligned
// stack scratch area) and delegates the cipher work to aesgcm_avx512().
// Returns the entry address of the generated stub.
//
// Inputs: Windows | Linux
|
|
// in = rcx (c_rarg0) | rdi (c_rarg0)
|
|
// len = rdx (c_rarg1) | rsi (c_rarg1)
|
|
// ct = r8 (c_rarg2) | rdx (c_rarg2)
|
|
// out = r9 (c_rarg3) | rcx (c_rarg3)
|
|
// key = rsi | r8 (c_rarg4)
|
|
// state = rdi | r9 (c_rarg5)
|
|
// subkeyHtbl = r10 | r10
|
|
// counter = r11 | r11
|
|
// (registers without a c_rargN note are loaded from the caller's stack below)
//
|
|
// Output:
|
|
// rax - number of processed bytes
|
|
// (this wrapper never writes rax itself; presumably aesgcm_avx512() leaves the count there)
address StubGenerator::generate_galoisCounterMode_AESCrypt() {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
const Register in = c_rarg0;
|
|
const Register len = c_rarg1;
|
|
const Register ct = c_rarg2;
|
|
const Register out = c_rarg3;
|
|
// Remaining arguments: register or rbp-relative stack slot, depending on the platform.
|
|
#ifndef _WIN64
|
|
const Register key = c_rarg4;
|
|
const Register state = c_rarg5;
|
|
const Address subkeyH_mem(rbp, 2 * wordSize);
|
|
const Register subkeyHtbl = r10;
|
|
const Register avx512_subkeyHtbl = r12;
|
|
const Address counter_mem(rbp, 3 * wordSize);
|
|
const Register counter = r11;
|
|
#else
|
|
const Address key_mem(rbp, 6 * wordSize);
|
|
const Register key = rsi;
|
|
const Address state_mem(rbp, 7 * wordSize);
|
|
const Register state = rdi;
|
|
const Address subkeyH_mem(rbp, 8 * wordSize);
|
|
const Register subkeyHtbl = r10;
|
|
const Register avx512_subkeyHtbl = r12;
|
|
const Address counter_mem(rbp, 9 * wordSize);
|
|
const Register counter = r11;
|
|
#endif
|
|
__ enter();
|
|
// Save state before entering routine
|
|
__ push_ppx(r12);//holds pointer to avx512_subkeyHtbl
|
|
__ push_ppx(r14);//holds CTR_CHECK value to check for overflow
|
|
__ push_ppx(r15);//holds number of rounds
|
|
__ push_ppx(rbx);//scratch register
|
|
#ifdef _WIN64
|
|
// on win64, save rsi/rdi and then load key and state from their stack slots
|
|
__ push_ppx(rsi);
|
|
__ push_ppx(rdi);
|
|
__ movptr(key, key_mem);
|
|
__ movptr(state, state_mem);
|
|
#endif
|
|
// subkeyHtbl and counter are read from the stack on both platforms
__ movptr(subkeyHtbl, subkeyH_mem);
|
|
__ movptr(counter, counter_mem);
|
|
// Align stack
|
|
__ andq(rsp, -64);
|
|
__ subptr(rsp, 200 * longSize); // Create space on the stack for 64 htbl entries and 8 zmm AES entries
|
|
// the scratch area just reserved is handed to aesgcm_avx512() via avx512_subkeyHtbl
__ movptr(avx512_subkeyHtbl, rsp);
|
|
|
|
aesgcm_avx512(in, len, ct, out, key, state, subkeyHtbl, avx512_subkeyHtbl, counter);
|
|
|
|
__ vzeroupper();
|
|
|
|
// Restore state before leaving routine
|
|
// rsp was realigned above, so it is recomputed from rbp: one word below rbp
// per register pushed in the prologue (6 on win64, 4 otherwise).
#ifdef _WIN64
|
|
__ lea(rsp, Address(rbp, -6 * wordSize));
|
|
__ pop_ppx(rdi);
|
|
__ pop_ppx(rsi);
|
|
#else
|
|
__ lea(rsp, Address(rbp, -4 * wordSize));
|
|
#endif
|
|
__ pop_ppx(rbx);
|
|
__ pop_ppx(r15);
|
|
__ pop_ppx(r14);
|
|
__ pop_ppx(r12);
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ ret(0);
|
|
|
|
return start;
|
|
}
|
|
|
|
// AVX2 Vector AES Galois Counter Mode implementation.
|
|
//
|
|
// Emits the stub prologue/epilogue (register saves, argument loading, aligned
// stack scratch area) and delegates the cipher work to aesgcm_avx2().
// Returns the entry address of the generated stub.
//
// Inputs: Windows | Linux
|
|
// in = rcx (c_rarg0) | rdi (c_rarg0)
|
|
// len = rdx (c_rarg1) | rsi (c_rarg1)
|
|
// ct = r8 (c_rarg2) | rdx (c_rarg2)
|
|
// out = r9 (c_rarg3) | rcx (c_rarg3)
|
|
// key = rdi | r8 (c_rarg4)
|
|
// state = r13 | r9 (c_rarg5)
|
|
// subkeyHtbl = r11 | r11
|
|
// counter = rsi | r12
|
|
// (registers without a c_rargN note are loaded from the caller's stack below)
//
|
|
// Output:
|
|
// rax - number of processed bytes
|
|
// (this wrapper never writes rax itself; presumably aesgcm_avx2() leaves the count there)
address StubGenerator::generate_avx2_galoisCounterMode_AESCrypt() {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
const Register in = c_rarg0;
|
|
const Register len = c_rarg1;
|
|
const Register ct = c_rarg2;
|
|
const Register out = c_rarg3;
|
|
// Remaining arguments: register or rbp-relative stack slot, depending on the platform.
|
|
#ifndef _WIN64
|
|
const Register key = c_rarg4;
|
|
const Register state = c_rarg5;
|
|
const Address subkeyH_mem(rbp, 2 * wordSize);
|
|
const Register subkeyHtbl = r11;
|
|
const Address counter_mem(rbp, 3 * wordSize);
|
|
const Register counter = r12;
|
|
#else
|
|
const Address key_mem(rbp, 6 * wordSize);
|
|
const Register key = rdi;
|
|
const Address state_mem(rbp, 7 * wordSize);
|
|
const Register state = r13;
|
|
const Address subkeyH_mem(rbp, 8 * wordSize);
|
|
const Register subkeyHtbl = r11;
|
|
const Address counter_mem(rbp, 9 * wordSize);
|
|
const Register counter = rsi;
|
|
#endif
|
|
__ enter();
|
|
// Save state before entering routine
|
|
__ push_ppx(r12);
|
|
__ push_ppx(r13);
|
|
__ push_ppx(r14);
|
|
__ push_ppx(r15);
|
|
__ push_ppx(rbx);
|
|
#ifdef _WIN64
|
|
// on win64, save rsi/rdi and then load key and state from their stack slots
|
|
__ push_ppx(rsi);
|
|
__ push_ppx(rdi);
|
|
__ movptr(key, key_mem);
|
|
__ movptr(state, state_mem);
|
|
#endif
|
|
// subkeyHtbl and counter are read from the stack on both platforms
__ movptr(subkeyHtbl, subkeyH_mem);
|
|
__ movptr(counter, counter_mem);
|
|
|
|
// Save rsp
|
|
// (kept in r14 across the aesgcm_avx2() code and copied back afterwards, so
// that code must leave r14 intact - NOTE(review): confirm in aesgcm_avx2)
__ movq(r14, rsp);
|
|
// Align stack
|
|
__ andq(rsp, -64);
|
|
__ subptr(rsp, 16 * longSize); // Create space on the stack for saving AES entries
|
|
|
|
aesgcm_avx2(in, len, ct, out, key, state, subkeyHtbl, counter);
|
|
__ vzeroupper();
|
|
// undo the stack alignment and scratch allocation in one step
__ movq(rsp, r14);
|
|
// Restore state before leaving routine
|
|
#ifdef _WIN64
|
|
__ pop_ppx(rdi);
|
|
__ pop_ppx(rsi);
|
|
#endif
|
|
__ pop_ppx(rbx);
|
|
__ pop_ppx(r15);
|
|
__ pop_ppx(r14);
|
|
__ pop_ppx(r13);
|
|
__ pop_ppx(r12);
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ ret(0);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Vector AES Counter implementation
|
|
//
// Emits the stub prologue/epilogue (register saves, argument loading) and
// delegates the cipher work to aesctr_encrypt(). The generated stub returns
// the original input length in rax. Returns the stub's entry address.
address StubGenerator::generate_counterMode_VectorAESCrypt() {
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
const Register from = c_rarg0; // source array address
|
|
const Register to = c_rarg1; // destination array address
|
|
const Register key = c_rarg2; // key array address r8
|
|
const Register counter = c_rarg3; // counter byte array initialized from counter array address
|
|
// and updated with the incremented counter in the end
|
|
#ifndef _WIN64
|
|
const Register len_reg = c_rarg4;
|
|
const Register saved_encCounter_start = c_rarg5;
|
|
const Register used_addr = r10;
|
|
const Address used_mem(rbp, 2 * wordSize);
|
|
const Register used = r11;
|
|
#else
|
|
const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
|
|
const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
|
|
const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
|
|
const Register len_reg = r10; // pick the first volatile windows register
|
|
const Register saved_encCounter_start = r11;
|
|
const Register used_addr = r13;
|
|
const Register used = r14;
|
|
#endif
|
|
__ enter();
|
|
// Save state before entering routine
|
|
__ push_ppx(r12);
|
|
__ push_ppx(r13);
|
|
__ push_ppx(r14);
|
|
__ push_ppx(r15);
|
|
#ifdef _WIN64
|
|
// on win64, fill len_reg (and the other stack-passed arguments) from stack position
|
|
__ movl(len_reg, len_mem);
|
|
__ movptr(saved_encCounter_start, saved_encCounter_mem);
|
|
__ movptr(used_addr, used_mem);
|
|
// used_addr points at the 32-bit "used" value; load its current contents
__ movl(used, Address(used_addr, 0));
|
|
#else
|
|
__ push_ppx(len_reg); // Save
|
|
__ movptr(used_addr, used_mem);
|
|
// used_addr points at the 32-bit "used" value; load its current contents
__ movl(used, Address(used_addr, 0));
|
|
#endif
|
|
__ push_ppx(rbx);
|
|
|
|
aesctr_encrypt(from, to, key, counter, len_reg, used, used_addr, saved_encCounter_start);
|
|
|
|
__ vzeroupper();
|
|
// Restore state before leaving routine
|
|
__ pop_ppx(rbx);
|
|
// The return value is the original length: re-read from its stack argument
// slot on win64, popped from the copy pushed in the prologue otherwise.
#ifdef _WIN64
|
|
__ movl(rax, len_mem); // return length
|
|
#else
|
|
__ pop_ppx(rax); // return length
|
|
#endif
|
|
__ pop_ppx(r15);
|
|
__ pop_ppx(r14);
|
|
__ pop_ppx(r13);
|
|
__ pop_ppx(r12);
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ ret(0);
|
|
|
|
return start;
|
|
}
|
|
|
|
// This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
|
|
// to hide instruction latency
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Inputs:
|
|
// c_rarg0 - source byte array address
|
|
// c_rarg1 - destination byte array address
|
|
// c_rarg2 - sessionKe (key) in little endian int array
|
|
// c_rarg3 - counter vector byte array address
|
|
// Linux
|
|
// c_rarg4 - input length
|
|
// c_rarg5 - saved encryptedCounter start
|
|
// rbp + 2 * wordSize - saved used length
|
|
// Windows
|
|
// rbp + 6 * wordSize - input length
|
|
// rbp + 7 * wordSize - saved encryptedCounter start
|
|
// rbp + 8 * wordSize - saved used length
|
|
//
|
|
// Output:
|
|
// rax - input length
|
|
//
|
|
// Generated code structure:
//   1. pre-loop: consume bytes left over in the saved encrypted counter
//      from the previous invocation (while used < 16 and len > 0);
//   2. dispatch on the key array length (44/52/60 ints) to one of three
//      unrolled code copies (10/12/14 rounds);
//   3. per copy: a 6-block loop, a single-block loop, and a tail path for a
//      final partial block, which also saves the encrypted counter and the
//      number of tail bytes for the next invocation;
//   4. exit: write the updated counter back and return the input length.
address StubGenerator::generate_counterMode_AESCrypt_Parallel() {
|
|
assert(UseAES, "need AES instructions and misaligned SSE support");
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
const Register from = c_rarg0; // source array address
|
|
const Register to = c_rarg1; // destination array address
|
|
const Register key = c_rarg2; // key array address
|
|
const Register counter = c_rarg3; // counter byte array initialized from counter array address
|
|
// and updated with the incremented counter in the end
|
|
#ifndef _WIN64
|
|
const Register len_reg = c_rarg4;
|
|
const Register saved_encCounter_start = c_rarg5;
|
|
const Register used_addr = r10;
|
|
const Address used_mem(rbp, 2 * wordSize);
|
|
const Register used = r11;
|
|
#else
|
|
const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
|
|
const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
|
|
const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
|
|
const Register len_reg = r10; // pick the first volatile windows register
|
|
const Register saved_encCounter_start = r11;
|
|
const Register used_addr = r13;
|
|
const Register used = r14;
|
|
#endif
|
|
// byte offset into both the source and the destination array
const Register pos = rax;
|
|
|
|
const int PARALLEL_FACTOR = 6;
|
|
const XMMRegister xmm_counter_shuf_mask = xmm0;
|
|
const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
|
|
const XMMRegister xmm_curr_counter = xmm2;
|
|
|
|
const XMMRegister xmm_key_tmp0 = xmm3;
|
|
const XMMRegister xmm_key_tmp1 = xmm4;
|
|
|
|
// registers holding the six results in the parallelized loop
|
|
const XMMRegister xmm_result0 = xmm5;
|
|
const XMMRegister xmm_result1 = xmm6;
|
|
const XMMRegister xmm_result2 = xmm7;
|
|
const XMMRegister xmm_result3 = xmm8;
|
|
const XMMRegister xmm_result4 = xmm9;
|
|
const XMMRegister xmm_result5 = xmm10;
|
|
|
|
const XMMRegister xmm_from0 = xmm11;
|
|
const XMMRegister xmm_from1 = xmm12;
|
|
const XMMRegister xmm_from2 = xmm13;
|
|
const XMMRegister xmm_from3 = xmm14; //the last one is xmm14. we have to preserve it on WIN64.
|
|
const XMMRegister xmm_from4 = xmm3; //reuse xmm3~4. Because xmm_key_tmp0~1 are useless when loading input text
|
|
const XMMRegister xmm_from5 = xmm4;
|
|
|
|
//for key_128, key_192, key_256
|
|
const int rounds[3] = {10, 12, 14};
|
|
Label L_exit_preLoop, L_preLoop_start;
|
|
Label L_multiBlock_loopTop[3];
|
|
Label L_singleBlockLoopTop[3];
|
|
Label L__incCounter[3][6]; //for 6 blocks
|
|
Label L__incCounter_single[3]; //for single block, key128, key192, key256
|
|
Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
|
|
Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
|
|
|
|
Label L_exit;
|
|
|
|
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
|
|
|
#ifdef _WIN64
|
|
// allocate spill slots for r13, r14
|
|
enum {
|
|
saved_r13_offset,
|
|
saved_r14_offset
|
|
};
|
|
__ subptr(rsp, 2 * wordSize);
|
|
__ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
|
|
__ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
|
|
|
|
// on win64, fill len_reg (and the other stack-passed arguments) from stack position
|
|
__ movl(len_reg, len_mem);
|
|
__ movptr(saved_encCounter_start, saved_encCounter_mem);
|
|
__ movptr(used_addr, used_mem);
|
|
__ movl(used, Address(used_addr, 0));
|
|
#else
|
|
__ push_ppx(len_reg); // Save
|
|
__ movptr(used_addr, used_mem);
|
|
__ movl(used, Address(used_addr, 0));
|
|
#endif
|
|
|
|
__ push_ppx(rbx); // Save RBX
|
|
__ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
|
|
__ movdqu(xmm_counter_shuf_mask, ExternalAddress(counter_shuffle_mask_addr()), pos /*rscratch*/);
|
|
__ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
|
|
// pos was only a scratch register above; from here on it is the byte offset
__ movptr(pos, 0);
|
|
|
|
// Use the partially used encrypted counter from last invocation
|
|
// One byte per iteration: to[pos] = saved_encCounter[used] ^ from[pos],
// until the saved block is exhausted (used >= 16) or no input is left.
__ BIND(L_preLoop_start);
|
|
__ cmpptr(used, 16);
|
|
__ jcc(Assembler::aboveEqual, L_exit_preLoop);
|
|
__ cmpptr(len_reg, 0);
|
|
__ jcc(Assembler::lessEqual, L_exit_preLoop);
|
|
__ movb(rbx, Address(saved_encCounter_start, used));
|
|
__ xorb(rbx, Address(from, pos));
|
|
__ movb(Address(to, pos), rbx);
|
|
__ addptr(pos, 1);
|
|
__ addptr(used, 1);
|
|
__ subptr(len_reg, 1);
|
|
|
|
__ jmp(L_preLoop_start);
|
|
|
|
__ BIND(L_exit_preLoop);
|
|
// publish the updated "used" count
__ movl(Address(used_addr, 0), used);
|
|
|
|
// key length could be only {11, 13, 15} * 4 = {44, 52, 60}
|
|
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), rbx /*rscratch*/);
|
|
__ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
|
|
// 52 -> key_192 copy, 60 -> key_256 copy, anything else falls through to the key_128 copy
__ cmpl(rbx, 52);
|
|
__ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
|
|
__ cmpl(rbx, 60);
|
|
__ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
|
|
|
|
// Applies instruction "opc" with source operand src_reg to each of the six result registers.
#define CTR_DoSix(opc, src_reg) \
|
|
__ opc(xmm_result0, src_reg); \
|
|
__ opc(xmm_result1, src_reg); \
|
|
__ opc(xmm_result2, src_reg); \
|
|
__ opc(xmm_result3, src_reg); \
|
|
__ opc(xmm_result4, src_reg); \
|
|
__ opc(xmm_result5, src_reg);
|
|
|
|
// k == 0 : generate code for key_128
|
|
// k == 1 : generate code for key_192
|
|
// k == 2 : generate code for key_256
|
|
for (int k = 0; k < 3; ++k) {
|
|
//multi blocks starts here
|
|
__ align(OptoLoopAlignment);
|
|
__ BIND(L_multiBlock_loopTop[k]);
|
|
__ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
|
|
__ jcc(Assembler::less, L_singleBlockLoopTop[k]);
|
|
load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
|
|
|
|
//load, then increase counters
|
|
// result0 keeps the current counter, result1..5 get counter+1..+5, and
// xmm_curr_counter itself advances by 6 for the next iteration.
CTR_DoSix(movdqa, xmm_curr_counter);
|
|
inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
|
|
inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
|
|
inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
|
|
inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
|
|
inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]);
|
|
inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
|
|
CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
|
|
CTR_DoSix(pxor, xmm_key_tmp0); //PXOR with Round 0 key
|
|
|
|
//load two ROUND_KEYs at a time
|
|
// i advances by two per iteration; the second key of the final pair is the
// last round key and is applied with aesenclast.
for (int i = 1; i < rounds[k]; ) {
|
|
load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
|
|
load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
|
|
CTR_DoSix(aesenc, xmm_key_tmp1);
|
|
i++;
|
|
if (i != rounds[k]) {
|
|
CTR_DoSix(aesenc, xmm_key_tmp0);
|
|
} else {
|
|
CTR_DoSix(aesenclast, xmm_key_tmp0);
|
|
}
|
|
i++;
|
|
}
|
|
|
|
// get next PARALLEL_FACTOR blocks into xmm_from registers
|
|
__ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
|
|
__ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
|
|
__ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
|
|
__ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
|
|
__ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
|
|
__ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
|
|
|
|
// xor the input blocks into the encrypted counters
__ pxor(xmm_result0, xmm_from0);
|
|
__ pxor(xmm_result1, xmm_from1);
|
|
__ pxor(xmm_result2, xmm_from2);
|
|
__ pxor(xmm_result3, xmm_from3);
|
|
__ pxor(xmm_result4, xmm_from4);
|
|
__ pxor(xmm_result5, xmm_from5);
|
|
|
|
// store 6 results into the next 96 bytes of output
|
|
__ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
|
|
__ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
|
|
__ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
|
|
__ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
|
|
__ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
|
|
__ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
|
|
|
|
__ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text
|
|
__ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
|
|
__ jmp(L_multiBlock_loopTop[k]);
|
|
|
|
// singleBlock starts here
|
|
__ align(OptoLoopAlignment);
|
|
__ BIND(L_singleBlockLoopTop[k]);
|
|
__ cmpptr(len_reg, 0);
|
|
__ jcc(Assembler::lessEqual, L_exit);
|
|
load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
|
|
__ movdqa(xmm_result0, xmm_curr_counter);
|
|
inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
|
|
__ pshufb(xmm_result0, xmm_counter_shuf_mask);
|
|
__ pxor(xmm_result0, xmm_key_tmp0);
|
|
for (int i = 1; i < rounds[k]; i++) {
|
|
load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
|
|
__ aesenc(xmm_result0, xmm_key_tmp0);
|
|
}
|
|
load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
|
|
__ aesenclast(xmm_result0, xmm_key_tmp0);
|
|
// the counter block is encrypted before checking whether a full input block remains
__ cmpptr(len_reg, AESBlockSize);
|
|
__ jcc(Assembler::less, L_processTail_insr[k]);
|
|
__ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
|
|
__ pxor(xmm_result0, xmm_from0);
|
|
__ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
|
|
__ addptr(pos, AESBlockSize);
|
|
__ subptr(len_reg, AESBlockSize);
|
|
__ jmp(L_singleBlockLoopTop[k]);
|
|
// Tail (fewer than 16 bytes left): pos is first moved to the end of the input
// and then walked backwards in chunks of 8/4/2/1 bytes, selected by the bits
// of len_reg, shifting each chunk into the low end of xmm_from0.
__ BIND(L_processTail_insr[k]); // Process the tail part of the input array
|
|
__ addptr(pos, len_reg); // 1. Insert bytes from src array into xmm_from0 register
|
|
__ testptr(len_reg, 8);
|
|
__ jcc(Assembler::zero, L_processTail_4_insr[k]);
|
|
__ subptr(pos,8);
|
|
__ pinsrq(xmm_from0, Address(from, pos), 0);
|
|
__ BIND(L_processTail_4_insr[k]);
|
|
__ testptr(len_reg, 4);
|
|
__ jcc(Assembler::zero, L_processTail_2_insr[k]);
|
|
__ subptr(pos,4);
|
|
__ pslldq(xmm_from0, 4);
|
|
__ pinsrd(xmm_from0, Address(from, pos), 0);
|
|
__ BIND(L_processTail_2_insr[k]);
|
|
__ testptr(len_reg, 2);
|
|
__ jcc(Assembler::zero, L_processTail_1_insr[k]);
|
|
__ subptr(pos, 2);
|
|
__ pslldq(xmm_from0, 2);
|
|
__ pinsrw(xmm_from0, Address(from, pos), 0);
|
|
__ BIND(L_processTail_1_insr[k]);
|
|
__ testptr(len_reg, 1);
|
|
__ jcc(Assembler::zero, L_processTail_exit_insr[k]);
|
|
__ subptr(pos, 1);
|
|
__ pslldq(xmm_from0, 1);
|
|
__ pinsrb(xmm_from0, Address(from, pos), 0);
|
|
__ BIND(L_processTail_exit_insr[k]);
|
|
|
|
__ movdqu(Address(saved_encCounter_start, 0), xmm_result0); // 2. Perform pxor of the encrypted counter and plaintext Bytes.
|
|
__ pxor(xmm_result0, xmm_from0); // Also the encrypted counter is saved for next invocation.
|
|
|
|
// Write the result back in chunks of 8/4/2/1 bytes (again selected by the
// bits of len_reg), shifting consumed bytes out of xmm_result0.
__ testptr(len_reg, 8);
|
|
__ jcc(Assembler::zero, L_processTail_4_extr[k]); // 3. Extract bytes from xmm_result0 into the dest. array
|
|
__ pextrq(Address(to, pos), xmm_result0, 0);
|
|
__ psrldq(xmm_result0, 8);
|
|
__ addptr(pos, 8);
|
|
__ BIND(L_processTail_4_extr[k]);
|
|
__ testptr(len_reg, 4);
|
|
__ jcc(Assembler::zero, L_processTail_2_extr[k]);
|
|
__ pextrd(Address(to, pos), xmm_result0, 0);
|
|
__ psrldq(xmm_result0, 4);
|
|
__ addptr(pos, 4);
|
|
__ BIND(L_processTail_2_extr[k]);
|
|
__ testptr(len_reg, 2);
|
|
__ jcc(Assembler::zero, L_processTail_1_extr[k]);
|
|
__ pextrw(Address(to, pos), xmm_result0, 0);
|
|
__ psrldq(xmm_result0, 2);
|
|
__ addptr(pos, 2);
|
|
__ BIND(L_processTail_1_extr[k]);
|
|
__ testptr(len_reg, 1);
|
|
__ jcc(Assembler::zero, L_processTail_exit_extr[k]);
|
|
__ pextrb(Address(to, pos), xmm_result0, 0);
|
|
|
|
__ BIND(L_processTail_exit_extr[k]);
|
|
// record how many bytes of the saved encrypted counter were consumed by this tail
__ movl(Address(used_addr, 0), len_reg);
|
|
__ jmp(L_exit);
|
|
}
|
|
|
|
__ BIND(L_exit);
|
|
__ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
|
|
__ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
|
|
__ pop_ppx(rbx); // pop the saved RBX.
|
|
#ifdef _WIN64
|
|
// return 'len' (re-read from its stack argument slot), then restore r13/r14 and release their spill slots
__ movl(rax, len_mem);
|
|
__ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
|
|
__ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
|
|
__ addptr(rsp, 2 * wordSize);
|
|
#else
|
|
__ pop_ppx(rax); // return 'len'
|
|
#endif
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ ret(0);
|
|
|
|
return start;
|
|
}
|
|
|
|
// AES/CBC decryption stub built on 512-bit VAES instructions.
//
// Input is consumed in 512-byte chunks (8 ZMM registers holding 4 AES blocks
// each) for as long as at least 512 bytes remain; whatever is left over is
// decrypted one 16-byte block at a time.
//
// Inputs:
//   c_rarg0   - source byte array address
//   c_rarg1   - destination byte array address
//   c_rarg2   - expanded key int array (44, 52 or 60 ints)
//   c_rarg3   - r vector byte array address (loaded on entry, stored back on exit)
//   c_rarg4   - input length (taken from the stack on Win64)
//
// Output:
//   rax       - input length
//
address StubGenerator::generate_cipherBlockChaining_decryptVectorAESCrypt() {
  assert(VM_Version::supports_avx512_vaes(), "need AES instructions and misaligned SSE support");
  __ align(CodeEntryAlignment);
  StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
  StubCodeMark mark(this, stub_id);
  address start = __ pc();

  const int VEC_BYTES   = 64;                     // bytes per ZMM register
  const int NUM_VECS    = 8;                      // ZMM registers processed per bulk iteration
  const int CHUNK_BYTES = NUM_VECS * VEC_BYTES;   // 512

  // General purpose registers
  const Register from   = c_rarg0;  // source array address
  const Register to     = c_rarg1;  // destination array address
  const Register key    = c_rarg2;  // key array address
  const Register rvec   = c_rarg3;  // r byte array: chaining value in, chaining value out
#ifndef _WIN64
  const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
#else
  const Address  len_mem(rbp, 6 * wordSize); // length is on stack on Win64
  const Register len_reg = r11;              // pick the volatile windows register
#endif
  const Register rounds = rbx;      // key array length in ints: 44, 52 or 60

  // Vector registers
  const XMMRegister IV                = xmm0;
  const XMMRegister xmm_key_shuf_mask = xmm1;   // only needed while the keys are loaded (aliases B[0])

  // Round keys, indexed by their position in the expanded key array.
  const XMMRegister RK[15] = {
    xmm30,                                                          // RK0
    xmm9,                                                           // RK1
    xmm18, xmm19, xmm20, xmm21, xmm22, xmm23, xmm24, xmm25, xmm26,  // RK2 .. RK10
    xmm27, xmm28,                                                   // RK11, RK12 (192/256-bit keys)
    xmm29, xmm31                                                    // RK13, RK14 (256-bit keys)
  };
  // Source cipher text
  const XMMRegister S[NUM_VECS] = { xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm16, xmm17 };
  // Decrypted text; roundDec()/roundDeclast() are applied to this set
  const XMMRegister B[NUM_VECS] = { xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 };

  const Address key_length(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT));

  Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop,
        Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit;

  __ enter();

#ifdef _WIN64
  // on win64, fill len_reg from stack position
  __ movl(len_reg, len_mem);
#else
  __ push_ppx(len_reg); // Save; popped into rax as the return value
#endif
  __ push_ppx(rbx);
  __ vzeroupper();

  __ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), rbx /*rscratch*/);

  // Number of rounds follows from the key size: 44 for 10-rounds, 52 for 12-rounds, 60 for 14-rounds
  __ movl(rounds, key_length);

  // Load IV and broadcast value to 512-bits
  __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);

  // Load and shuffle the round keys every key size needs.
  // The java expanded key ordering is rotated one position from what we want,
  // so slots 1..10 are fetched first and slot 0 last.
  for (int r = 1; r <= 10; r++) {
    ev_load_key(RK[r], key, r * 16, xmm_key_shuf_mask);
  }
  ev_load_key(RK[0], key, 0 * 16, xmm_key_shuf_mask);

  __ cmpl(rounds, 44);
  __ jcc(Assembler::greater, KEY_192);
  __ jmp(Loop);

  // Two more round keys for 192-bit and 256-bit keys
  __ BIND(KEY_192);
  for (int r = 11; r <= 12; r++) {
    ev_load_key(RK[r], key, r * 16, xmm_key_shuf_mask);
  }

  __ cmpl(rounds, 52);
  __ jcc(Assembler::greater, KEY_256);
  __ jmp(Loop);

  // And another two for 256-bit keys
  __ BIND(KEY_256);
  for (int r = 13; r <= 14; r++) {
    ev_load_key(RK[r], key, r * 16, xmm_key_shuf_mask);
  }

  // ---- bulk path: 512 bytes per iteration ----
  __ BIND(Loop);
  __ cmpl(len_reg, CHUNK_BYTES);
  __ jcc(Assembler::below, Lcbc_dec_rem);
  __ BIND(Loop1);
  __ subl(len_reg, CHUNK_BYTES);
  for (int i = 0; i < NUM_VECS; i++) {
    __ evmovdquq(S[i], Address(from, i * VEC_BYTES), Assembler::AVX_512bit);
  }
  __ leaq(from, Address(from, CHUNK_BYTES));

  // Initial xor with RK1
  for (int i = 0; i < NUM_VECS; i++) {
    __ evpxorq(B[i], S[i], RK[1], Assembler::AVX_512bit);
  }

  // Build the chaining operands: IV receives data combined from S[0] and the
  // old IV, and each S[i] (i < 7) is combined with S[i + 1]. S[7] stays intact
  // because it becomes the next IV below.
  __ evalignq(IV, S[0], IV, 0x06);
  for (int i = 0; i < NUM_VECS - 1; i++) {
    __ evalignq(S[i], S[i + 1], S[i], 0x06);
  }

  for (int r = 2; r <= 10; r++) {
    roundDec(RK[r]);
  }

  __ cmpl(rounds, 44);
  __ jcc(Assembler::belowEqual, L_128);
  roundDec(RK[11]);
  roundDec(RK[12]);

  __ cmpl(rounds, 52);
  __ jcc(Assembler::belowEqual, L_192);
  roundDec(RK[13]);
  roundDec(RK[14]);

  __ BIND(L_256);
  roundDeclast(RK[0]);
  __ jmp(Loop2);

  __ BIND(L_128);
  roundDeclast(RK[0]);
  __ jmp(Loop2);

  __ BIND(L_192);
  roundDeclast(RK[0]);

  // CBC xor with the chaining operands prepared above, then store
  __ BIND(Loop2);
  __ evpxorq(B[0], B[0], IV, Assembler::AVX_512bit);
  for (int i = 1; i < NUM_VECS; i++) {
    __ evpxorq(B[i], B[i], S[i - 1], Assembler::AVX_512bit);
  }
  __ evmovdquq(IV, S[NUM_VECS - 1], Assembler::AVX_512bit);

  for (int i = 0; i < NUM_VECS; i++) {
    __ evmovdquq(Address(to, i * VEC_BYTES), B[i], Assembler::AVX_512bit);
  }
  __ leaq(to, Address(to, CHUNK_BYTES));
  __ jmp(Loop);

  // ---- remainder path: one 16-byte block per iteration ----
  __ BIND(Lcbc_dec_rem);
  __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit);

  __ BIND(Lcbc_dec_rem_loop);
  __ subl(len_reg, AESBlockSize);
  __ jcc(Assembler::carrySet, Lcbc_dec_ret);   // borrow => nothing left to decrypt

  __ movdqu(S[0], Address(from, 0));
  __ evpxorq(B[0], S[0], RK[1], Assembler::AVX_512bit);
  for (int r = 2; r <= 10; r++) {
    __ vaesdec(B[0], B[0], RK[r], Assembler::AVX_512bit);
  }
  __ cmpl(rounds, 44);
  __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);

  __ vaesdec(B[0], B[0], RK[11], Assembler::AVX_512bit);
  __ vaesdec(B[0], B[0], RK[12], Assembler::AVX_512bit);
  __ cmpl(rounds, 52);
  __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);

  __ vaesdec(B[0], B[0], RK[13], Assembler::AVX_512bit);
  __ vaesdec(B[0], B[0], RK[14], Assembler::AVX_512bit);

  __ BIND(Lcbc_dec_rem_last);
  __ vaesdeclast(B[0], B[0], RK[0], Assembler::AVX_512bit);

  __ evpxorq(B[0], B[0], IV, Assembler::AVX_512bit);
  __ evmovdquq(IV, S[0], Assembler::AVX_512bit);   // this cipher block chains into the next one
  __ movdqu(Address(to, 0), B[0]);
  __ leaq(from, Address(from, AESBlockSize));
  __ leaq(to, Address(to, AESBlockSize));
  __ jmp(Lcbc_dec_rem_loop);

  // ---- epilogue ----
  __ BIND(Lcbc_dec_ret);
  __ movdqu(Address(rvec, 0), IV);

  // Zero out the round keys; the optional ones are cleared only when they were loaded
  for (int r = 0; r <= 10; r++) {
    __ evpxorq(RK[r], RK[r], RK[r], Assembler::AVX_512bit);
  }
  __ cmpl(rounds, 44);
  __ jcc(Assembler::belowEqual, Lcbc_exit);
  __ evpxorq(RK[11], RK[11], RK[11], Assembler::AVX_512bit);
  __ evpxorq(RK[12], RK[12], RK[12], Assembler::AVX_512bit);
  __ cmpl(rounds, 52);
  __ jcc(Assembler::belowEqual, Lcbc_exit);
  __ evpxorq(RK[13], RK[13], RK[13], Assembler::AVX_512bit);
  __ evpxorq(RK[14], RK[14], RK[14], Assembler::AVX_512bit);

  __ BIND(Lcbc_exit);
  __ vzeroupper();
  __ pop_ppx(rbx);
#ifdef _WIN64
  __ movl(rax, len_mem);
#else
  __ pop_ppx(rax); // return length
#endif
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}
|
|
|
|
// Arguments:
|
|
//
|
|
// Inputs:
|
|
// c_rarg0 - source byte array address
|
|
// c_rarg1 - destination byte array address
|
|
// c_rarg2 - sessionKe (key) in little endian int array
|
|
//
|
|
address StubGenerator::generate_aescrypt_encryptBlock() {
|
|
assert(UseAES, "need AES instructions and misaligned SSE support");
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
Label L_doLast;
|
|
address start = __ pc();
|
|
|
|
const Register from = c_rarg0; // source array address
|
|
const Register to = c_rarg1; // destination array address
|
|
const Register key = c_rarg2; // key array address
|
|
const Register keylen = rax;
|
|
|
|
const XMMRegister xmm_result = xmm0;
|
|
const XMMRegister xmm_key_shuf_mask = xmm1;
|
|
// On win64 xmm6-xmm15 must be preserved so don't use them.
|
|
const XMMRegister xmm_temp1 = xmm2;
|
|
const XMMRegister xmm_temp2 = xmm3;
|
|
const XMMRegister xmm_temp3 = xmm4;
|
|
const XMMRegister xmm_temp4 = xmm5;
|
|
|
|
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
|
|
|
// keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
|
|
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
|
|
|
|
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), r10 /*rscratch*/);
|
|
__ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input
|
|
|
|
// For encryption, the java expanded key ordering is just what we need
|
|
// we don't know if the key is aligned, hence not using load-execute form
|
|
|
|
load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
|
|
__ pxor(xmm_result, xmm_temp1);
|
|
|
|
load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
|
|
load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
|
|
load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
|
|
load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
|
|
|
|
__ aesenc(xmm_result, xmm_temp1);
|
|
__ aesenc(xmm_result, xmm_temp2);
|
|
__ aesenc(xmm_result, xmm_temp3);
|
|
__ aesenc(xmm_result, xmm_temp4);
|
|
|
|
load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
|
|
load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
|
|
load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
|
|
load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
|
|
|
|
__ aesenc(xmm_result, xmm_temp1);
|
|
__ aesenc(xmm_result, xmm_temp2);
|
|
__ aesenc(xmm_result, xmm_temp3);
|
|
__ aesenc(xmm_result, xmm_temp4);
|
|
|
|
load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
|
|
load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
|
|
|
|
__ cmpl(keylen, 44);
|
|
__ jccb(Assembler::equal, L_doLast);
|
|
|
|
__ aesenc(xmm_result, xmm_temp1);
|
|
__ aesenc(xmm_result, xmm_temp2);
|
|
|
|
load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
|
|
load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
|
|
|
|
__ cmpl(keylen, 52);
|
|
__ jccb(Assembler::equal, L_doLast);
|
|
|
|
__ aesenc(xmm_result, xmm_temp1);
|
|
__ aesenc(xmm_result, xmm_temp2);
|
|
|
|
load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
|
|
load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
|
|
|
|
__ BIND(L_doLast);
|
|
__ aesenc(xmm_result, xmm_temp1);
|
|
__ aesenclast(xmm_result, xmm_temp2);
|
|
__ movdqu(Address(to, 0), xmm_result); // store the result
|
|
__ xorptr(rax, rax); // return 0
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ ret(0);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Arguments:
|
|
//
|
|
// Inputs:
|
|
// c_rarg0 - source byte array address
|
|
// c_rarg1 - destination byte array address
|
|
// c_rarg2 - sessionKd (key) in little endian int array
|
|
//
|
|
address StubGenerator::generate_aescrypt_decryptBlock() {
|
|
assert(UseAES, "need AES instructions and misaligned SSE support");
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
Label L_doLast;
|
|
address start = __ pc();
|
|
|
|
const Register from = c_rarg0; // source array address
|
|
const Register to = c_rarg1; // destination array address
|
|
const Register key = c_rarg2; // key array address
|
|
const Register keylen = rax;
|
|
|
|
const XMMRegister xmm_result = xmm0;
|
|
const XMMRegister xmm_key_shuf_mask = xmm1;
|
|
// On win64 xmm6-xmm15 must be preserved so don't use them.
|
|
const XMMRegister xmm_temp1 = xmm2;
|
|
const XMMRegister xmm_temp2 = xmm3;
|
|
const XMMRegister xmm_temp3 = xmm4;
|
|
const XMMRegister xmm_temp4 = xmm5;
|
|
|
|
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
|
|
|
// keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
|
|
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
|
|
|
|
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), r10 /*rscratch*/);
|
|
__ movdqu(xmm_result, Address(from, 0));
|
|
|
|
// for decryption java expanded key ordering is rotated one position from what we want
|
|
// so we start from 0x10 here and hit 0x00 last
|
|
// we don't know if the key is aligned, hence not using load-execute form
|
|
load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
|
|
load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
|
|
load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
|
|
load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
|
|
|
|
__ pxor (xmm_result, xmm_temp1);
|
|
__ aesdec(xmm_result, xmm_temp2);
|
|
__ aesdec(xmm_result, xmm_temp3);
|
|
__ aesdec(xmm_result, xmm_temp4);
|
|
|
|
load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
|
|
load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
|
|
load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
|
|
load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
|
|
|
|
__ aesdec(xmm_result, xmm_temp1);
|
|
__ aesdec(xmm_result, xmm_temp2);
|
|
__ aesdec(xmm_result, xmm_temp3);
|
|
__ aesdec(xmm_result, xmm_temp4);
|
|
|
|
load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
|
|
load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
|
|
load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
|
|
|
|
__ cmpl(keylen, 44);
|
|
__ jccb(Assembler::equal, L_doLast);
|
|
|
|
__ aesdec(xmm_result, xmm_temp1);
|
|
__ aesdec(xmm_result, xmm_temp2);
|
|
|
|
load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
|
|
load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
|
|
|
|
__ cmpl(keylen, 52);
|
|
__ jccb(Assembler::equal, L_doLast);
|
|
|
|
__ aesdec(xmm_result, xmm_temp1);
|
|
__ aesdec(xmm_result, xmm_temp2);
|
|
|
|
load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
|
|
load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
|
|
|
|
__ BIND(L_doLast);
|
|
__ aesdec(xmm_result, xmm_temp1);
|
|
__ aesdec(xmm_result, xmm_temp2);
|
|
|
|
// for decryption the aesdeclast operation is always on key+0x00
|
|
__ aesdeclast(xmm_result, xmm_temp3);
|
|
__ movdqu(Address(to, 0), xmm_result); // store the result
|
|
__ xorptr(rax, rax); // return 0
|
|
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ ret(0);
|
|
|
|
return start;
|
|
}
|
|
|
|
|
|
// Arguments:
|
|
//
|
|
// Inputs:
|
|
// c_rarg0 - source byte array address
|
|
// c_rarg1 - destination byte array address
|
|
// c_rarg2 - sessionKe (key) in little endian int array
|
|
// c_rarg3 - r vector byte array address
|
|
// c_rarg4 - input length
|
|
//
|
|
// Output:
|
|
// rax - input length
|
|
//
|
|
address StubGenerator::generate_cipherBlockChaining_encryptAESCrypt() {
|
|
assert(UseAES, "need AES instructions and misaligned SSE support");
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
|
|
const Register from = c_rarg0; // source array address
|
|
const Register to = c_rarg1; // destination array address
|
|
const Register key = c_rarg2; // key array address
|
|
const Register rvec = c_rarg3; // r byte array initialized from initvector array address
|
|
// and left with the results of the last encryption block
|
|
#ifdef _WIN64
|
|
const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
|
|
const Register len_reg = r11; // pick the volatile windows register
|
|
#else
|
|
const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
|
|
#endif
|
|
const Register pos = rax;
|
|
|
|
// xmm register assignments for the loops below
|
|
const XMMRegister xmm_result = xmm0;
|
|
const XMMRegister xmm_temp = xmm1;
|
|
// keys 0-10 preloaded into xmm2-xmm12
|
|
const int XMM_REG_NUM_KEY_FIRST = 2;
|
|
const int XMM_REG_NUM_KEY_LAST = 15;
|
|
const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
|
|
const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
|
|
const XMMRegister xmm_key11 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
|
|
const XMMRegister xmm_key12 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
|
|
const XMMRegister xmm_key13 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
|
|
|
|
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
|
|
|
#ifdef _WIN64
|
|
// on win64, fill len_reg from stack position
|
|
__ movl(len_reg, len_mem);
|
|
#else
|
|
__ push_ppx(len_reg); // Save
|
|
#endif
|
|
|
|
const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front
|
|
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), r10 /*rscratch*/);
|
|
// load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
|
|
for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
|
|
load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
|
|
offset += 0x10;
|
|
}
|
|
__ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec
|
|
|
|
|
|
// now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
|
|
__ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
|
|
__ cmpl(rax, 44);
|
|
__ jcc(Assembler::notEqual, L_key_192_256);
|
|
|
|
// 128 bit code follows here
|
|
__ movptr(pos, 0);
|
|
__ align(OptoLoopAlignment);
|
|
|
|
__ BIND(L_loopTop_128);
|
|
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
|
|
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
|
|
__ pxor (xmm_result, xmm_key0); // do the aes rounds
|
|
for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
|
|
__ aesenc(xmm_result, as_XMMRegister(rnum));
|
|
}
|
|
__ aesenclast(xmm_result, xmm_key10);
|
|
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
|
|
// no need to store r to memory until we exit
|
|
__ addptr(pos, AESBlockSize);
|
|
__ subptr(len_reg, AESBlockSize);
|
|
__ jcc(Assembler::notEqual, L_loopTop_128);
|
|
|
|
__ BIND(L_exit);
|
|
__ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object
|
|
|
|
#ifdef _WIN64
|
|
__ movl(rax, len_mem);
|
|
#else
|
|
__ pop_ppx(rax); // return length
|
|
#endif
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ ret(0);
|
|
|
|
__ BIND(L_key_192_256);
|
|
// here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
|
|
load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
|
|
load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
|
|
__ cmpl(rax, 52);
|
|
__ jcc(Assembler::notEqual, L_key_256);
|
|
|
|
// 192-bit code follows here (could be changed to use more xmm registers)
|
|
__ movptr(pos, 0);
|
|
__ align(OptoLoopAlignment);
|
|
|
|
__ BIND(L_loopTop_192);
|
|
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
|
|
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
|
|
__ pxor (xmm_result, xmm_key0); // do the aes rounds
|
|
for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
|
|
__ aesenc(xmm_result, as_XMMRegister(rnum));
|
|
}
|
|
__ aesenclast(xmm_result, xmm_key12);
|
|
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
|
|
// no need to store r to memory until we exit
|
|
__ addptr(pos, AESBlockSize);
|
|
__ subptr(len_reg, AESBlockSize);
|
|
__ jcc(Assembler::notEqual, L_loopTop_192);
|
|
__ jmp(L_exit);
|
|
|
|
__ BIND(L_key_256);
|
|
// 256-bit code follows here (could be changed to use more xmm registers)
|
|
load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
|
|
__ movptr(pos, 0);
|
|
__ align(OptoLoopAlignment);
|
|
|
|
__ BIND(L_loopTop_256);
|
|
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
|
|
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
|
|
__ pxor (xmm_result, xmm_key0); // do the aes rounds
|
|
for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
|
|
__ aesenc(xmm_result, as_XMMRegister(rnum));
|
|
}
|
|
load_key(xmm_temp, key, 0xe0, r10 /*rscratch*/);
|
|
__ aesenclast(xmm_result, xmm_temp);
|
|
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
|
|
// no need to store r to memory until we exit
|
|
__ addptr(pos, AESBlockSize);
|
|
__ subptr(len_reg, AESBlockSize);
|
|
__ jcc(Assembler::notEqual, L_loopTop_256);
|
|
__ jmp(L_exit);
|
|
|
|
return start;
|
|
}
|
|
|
|
// This is a version of ECB/AES Encrypt/Decrypt which does 4 blocks in a loop
|
|
// at a time to hide instruction latency.
|
|
//
|
|
// For encryption (is_encrypt=true):
|
|
// pxor key[0], aesenc key[1..rounds-1], aesenclast key[rounds]
|
|
// For decryption (is_encrypt=false):
|
|
// pxor key[1], aesdec key[2..rounds], aesdeclast key[0]
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Inputs:
|
|
// c_rarg0 - source byte array address
|
|
// c_rarg1 - destination byte array address
|
|
// c_rarg2 - session key (Ke/Kd) in little endian int array
|
|
// c_rarg3 - input length (must be multiple of blocksize 16)
|
|
//
|
|
// Output:
|
|
// rax - input length
|
|
//
|
|
address StubGenerator::generate_electronicCodeBook_AESCrypt_Parallel(bool is_encrypt) {
|
|
assert(UseAES, "need AES instructions and misaligned SSE support");
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = is_encrypt ? StubId::stubgen_electronicCodeBook_encryptAESCrypt_id
|
|
: StubId::stubgen_electronicCodeBook_decryptAESCrypt_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
const Register from = c_rarg0; // source array address
|
|
const Register to = c_rarg1; // destination array address
|
|
const Register key = c_rarg2; // key array address
|
|
const Register len_reg = c_rarg3; // src len (must be multiple of blocksize 16)
|
|
const Register pos = rax;
|
|
const Register keylen = r11;
|
|
|
|
const XMMRegister xmm_result0 = xmm0;
|
|
const XMMRegister xmm_result1 = xmm1;
|
|
const XMMRegister xmm_result2 = xmm2;
|
|
const XMMRegister xmm_result3 = xmm3;
|
|
const XMMRegister xmm_key_shuf_mask = xmm4;
|
|
const XMMRegister xmm_key_tmp = xmm5;
|
|
// keys 0-9 pre-loaded into xmm6-xmm15
|
|
const int XMM_REG_NUM_KEY_FIRST = 6;
|
|
const int XMM_REG_NUM_KEY_LAST = 15;
|
|
const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
|
|
|
|
// for key_128, key_192, key_256
|
|
const int ROUNDS[3] = {10, 12, 14};
|
|
|
|
Label L_exit;
|
|
Label L_loop4[3], L_single[3], L_done[3];
|
|
|
|
#ifdef DoFour
|
|
#undef DoFour
|
|
#endif
|
|
#ifdef DoOne
|
|
#undef DoOne
|
|
#endif
|
|
|
|
#define DoFour(opc, reg) \
|
|
__ opc(xmm_result0, reg); \
|
|
__ opc(xmm_result1, reg); \
|
|
__ opc(xmm_result2, reg); \
|
|
__ opc(xmm_result3, reg);
|
|
|
|
#define DoOne(opc, reg) \
|
|
__ opc(xmm_result0, reg);
|
|
|
|
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
|
__ push(len_reg); // save original length for return value
|
|
|
|
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
|
|
|
|
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), r10 /*rscratch*/);
|
|
// load up xmm regs 6 thru 15 with keys 0x00 - 0x90
|
|
for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++, offset += 0x10) {
|
|
load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
|
|
}
|
|
__ xorptr(pos, pos);
|
|
|
|
// key length could be only {11, 13, 15} * 4 = {44, 52, 60}
|
|
__ cmpl(keylen, 52);
|
|
__ jcc(Assembler::equal, L_loop4[1]);
|
|
__ cmpl(keylen, 60);
|
|
__ jcc(Assembler::equal, L_loop4[2]);
|
|
|
|
// k == 0: generate code for key_128
|
|
// k == 1: generate code for key_192
|
|
// k == 2: generate code for key_256
|
|
for (int k = 0; k < 3; ++k) {
|
|
__ align(OptoLoopAlignment);
|
|
__ BIND(L_loop4[k]);
|
|
__ cmpptr(len_reg, 4 * AESBlockSize);
|
|
__ jcc(Assembler::less, L_single[k]);
|
|
|
|
__ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
|
|
__ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
|
|
__ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
|
|
__ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
|
|
|
|
if (is_encrypt) {
|
|
DoFour(pxor, xmm_key_first);
|
|
for (int rnum = 1; rnum < 10; rnum++) {
|
|
DoFour(aesenc, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
|
|
}
|
|
for (int i = 10; i < ROUNDS[k]; i++) {
|
|
load_key(xmm_key_tmp, key, i * 0x10, xmm_key_shuf_mask);
|
|
DoFour(aesenc, xmm_key_tmp);
|
|
}
|
|
load_key(xmm_key_tmp, key, ROUNDS[k] * 0x10, xmm_key_shuf_mask);
|
|
DoFour(aesenclast, xmm_key_tmp);
|
|
} else {
|
|
DoFour(pxor, as_XMMRegister(1 + XMM_REG_NUM_KEY_FIRST));
|
|
for (int rnum = 2; rnum < 10; rnum++) {
|
|
DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
|
|
}
|
|
for (int i = 10; i <= ROUNDS[k]; i++) {
|
|
load_key(xmm_key_tmp, key, i * 0x10, xmm_key_shuf_mask);
|
|
DoFour(aesdec, xmm_key_tmp);
|
|
}
|
|
DoFour(aesdeclast, xmm_key_first);
|
|
}
|
|
|
|
__ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
|
|
__ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
|
|
__ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
|
|
__ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
|
|
|
|
__ addptr(pos, 4 * AESBlockSize);
|
|
__ subptr(len_reg, 4 * AESBlockSize);
|
|
__ jmp(L_loop4[k]);
|
|
|
|
__ align(OptoLoopAlignment);
|
|
__ BIND(L_single[k]);
|
|
__ cmpptr(len_reg, AESBlockSize);
|
|
__ jcc(Assembler::less, L_done[k]);
|
|
|
|
__ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0));
|
|
|
|
if (is_encrypt) {
|
|
DoOne(pxor, xmm_key_first);
|
|
for (int rnum = 1; rnum < 10; rnum++) {
|
|
DoOne(aesenc, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
|
|
}
|
|
for (int i = 10; i < ROUNDS[k]; i++) {
|
|
load_key(xmm_key_tmp, key, i * 0x10, xmm_key_shuf_mask);
|
|
DoOne(aesenc, xmm_key_tmp);
|
|
}
|
|
load_key(xmm_key_tmp, key, ROUNDS[k] * 0x10, xmm_key_shuf_mask);
|
|
DoOne(aesenclast, xmm_key_tmp);
|
|
} else {
|
|
DoOne(pxor, as_XMMRegister(1 + XMM_REG_NUM_KEY_FIRST));
|
|
for (int rnum = 2; rnum < 10; rnum++) {
|
|
DoOne(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
|
|
}
|
|
for (int i = 10; i <= ROUNDS[k]; i++) {
|
|
load_key(xmm_key_tmp, key, i * 0x10, xmm_key_shuf_mask);
|
|
DoOne(aesdec, xmm_key_tmp);
|
|
}
|
|
DoOne(aesdeclast, xmm_key_first);
|
|
}
|
|
|
|
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result0);
|
|
__ addptr(pos, AESBlockSize);
|
|
__ subptr(len_reg, AESBlockSize);
|
|
__ jmp(L_single[k]);
|
|
|
|
__ BIND(L_done[k]);
|
|
if (k < 2) __ jmp(L_exit);
|
|
} //for key_128/192/256
|
|
|
|
__ BIND(L_exit);
|
|
// Clear all XMM registers holding sensitive key material before returning
|
|
__ pxor(xmm_key_tmp, xmm_key_tmp);
|
|
for (int rnum = XMM_REG_NUM_KEY_FIRST; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
|
|
__ pxor(as_XMMRegister(rnum), as_XMMRegister(rnum));
|
|
}
|
|
__ pop(rax);
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ ret(0);
|
|
|
|
return start;
|
|
|
|
#undef DoFour
|
|
#undef DoOne
|
|
}
|
|
|
|
// ECB/AES encryption entry point: emits the shared parallel ECB stub in encrypt mode.
address StubGenerator::generate_electronicCodeBook_encryptAESCrypt_Parallel() {
  const bool is_encrypt = true;
  return generate_electronicCodeBook_AESCrypt_Parallel(is_encrypt);
}
|
|
|
|
// ECB/AES decryption entry point: emits the shared parallel ECB stub in decrypt mode.
address StubGenerator::generate_electronicCodeBook_decryptAESCrypt_Parallel() {
  const bool is_encrypt = false;
  return generate_electronicCodeBook_AESCrypt_Parallel(is_encrypt);
}
|
|
|
|
// This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
|
|
// to hide instruction latency
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Inputs:
|
|
// c_rarg0 - source byte array address
|
|
// c_rarg1 - destination byte array address
|
|
// c_rarg2 - sessionKd (key) in little endian int array
|
|
// c_rarg3 - r vector byte array address
|
|
// c_rarg4 - input length
|
|
//
|
|
// Output:
|
|
// rax - input length
|
|
//
|
|
address StubGenerator::generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
|
|
assert(UseAES, "need AES instructions and misaligned SSE support");
|
|
__ align(CodeEntryAlignment);
|
|
StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
|
|
StubCodeMark mark(this, stub_id);
|
|
address start = __ pc();
|
|
|
|
const Register from = c_rarg0; // source array address
|
|
const Register to = c_rarg1; // destination array address
|
|
const Register key = c_rarg2; // key array address
|
|
const Register rvec = c_rarg3; // r byte array initialized from initvector array address
|
|
// and left with the results of the last encryption block
|
|
#ifndef _WIN64
|
|
const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
|
|
#else
|
|
const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
|
|
const Register len_reg = r11; // pick the volatile windows register
|
|
#endif
|
|
const Register pos = rax;
|
|
|
|
const int PARALLEL_FACTOR = 4;
|
|
const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
|
|
|
|
Label L_exit;
|
|
Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
|
|
Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
|
|
Label L_singleBlock_loopTop[3]; // 128, 192, 256
|
|
Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
|
|
Label L_multiBlock_loopTop[3]; // 128, 192, 256
|
|
|
|
// keys 0-10 preloaded into xmm5-xmm15
|
|
const int XMM_REG_NUM_KEY_FIRST = 5;
|
|
const int XMM_REG_NUM_KEY_LAST = 15;
|
|
const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
|
|
const XMMRegister xmm_key_last = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
|
|
|
|
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
|
|
|
#ifdef _WIN64
|
|
// on win64, fill len_reg from stack position
|
|
__ movl(len_reg, len_mem);
|
|
#else
|
|
__ push_ppx(len_reg); // Save
|
|
#endif
|
|
__ push_ppx(rbx);
|
|
// the java expanded key ordering is rotated one position from what we want
|
|
// so we start from 0x10 here and hit 0x00 last
|
|
const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
|
|
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), rbx /*rscratch*/);
|
|
// load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
|
|
for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
|
|
load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
|
|
offset += 0x10;
|
|
}
|
|
load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
|
|
|
|
const XMMRegister xmm_prev_block_cipher = xmm1; // holds cipher of previous block
|
|
|
|
// registers holding the four results in the parallelized loop
|
|
const XMMRegister xmm_result0 = xmm0;
|
|
const XMMRegister xmm_result1 = xmm2;
|
|
const XMMRegister xmm_result2 = xmm3;
|
|
const XMMRegister xmm_result3 = xmm4;
|
|
|
|
__ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec
|
|
|
|
__ xorptr(pos, pos);
|
|
|
|
// now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
|
|
__ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
|
|
__ cmpl(rbx, 52);
|
|
__ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
|
|
__ cmpl(rbx, 60);
|
|
__ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
|
|
|
|
#define DoFour(opc, src_reg) \
|
|
__ opc(xmm_result0, src_reg); \
|
|
__ opc(xmm_result1, src_reg); \
|
|
__ opc(xmm_result2, src_reg); \
|
|
__ opc(xmm_result3, src_reg);
|
|
|
|
for (int k = 0; k < 3; ++k) {
|
|
__ BIND(L_multiBlock_loopTopHead[k]);
|
|
if (k != 0) {
|
|
__ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
|
|
__ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
|
|
}
|
|
if (k == 1) {
|
|
__ subptr(rsp, 6 * wordSize);
|
|
__ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
|
|
load_key(xmm15, key, 0xb0, rbx /*rscratch*/); // 0xb0; 192-bit key goes up to 0xc0
|
|
__ movdqu(Address(rsp, 2 * wordSize), xmm15);
|
|
load_key(xmm1, key, 0xc0, rbx /*rscratch*/); // 0xc0;
|
|
__ movdqu(Address(rsp, 4 * wordSize), xmm1);
|
|
} else if (k == 2) {
|
|
__ subptr(rsp, 10 * wordSize);
|
|
__ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
|
|
load_key(xmm15, key, 0xd0, rbx /*rscratch*/); // 0xd0; 256-bit key goes up to 0xe0
|
|
__ movdqu(Address(rsp, 6 * wordSize), xmm15);
|
|
load_key(xmm1, key, 0xe0, rbx /*rscratch*/); // 0xe0;
|
|
__ movdqu(Address(rsp, 8 * wordSize), xmm1);
|
|
load_key(xmm15, key, 0xb0, rbx /*rscratch*/); // 0xb0;
|
|
__ movdqu(Address(rsp, 2 * wordSize), xmm15);
|
|
load_key(xmm1, key, 0xc0, rbx /*rscratch*/); // 0xc0;
|
|
__ movdqu(Address(rsp, 4 * wordSize), xmm1);
|
|
}
|
|
__ align(OptoLoopAlignment);
|
|
__ BIND(L_multiBlock_loopTop[k]);
|
|
__ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
|
|
__ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
|
|
|
|
if (k != 0) {
|
|
__ movdqu(xmm15, Address(rsp, 2 * wordSize));
|
|
__ movdqu(xmm1, Address(rsp, 4 * wordSize));
|
|
}
|
|
|
|
__ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
|
|
__ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
|
|
__ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
|
|
__ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
|
|
|
|
DoFour(pxor, xmm_key_first);
|
|
if (k == 0) {
|
|
for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
|
|
DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
|
|
}
|
|
DoFour(aesdeclast, xmm_key_last);
|
|
} else if (k == 1) {
|
|
for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
|
|
DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
|
|
}
|
|
__ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
|
|
DoFour(aesdec, xmm1); // key : 0xc0
|
|
__ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // xmm1 needs to be loaded again
|
|
DoFour(aesdeclast, xmm_key_last);
|
|
} else if (k == 2) {
|
|
for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
|
|
DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
|
|
}
|
|
DoFour(aesdec, xmm1); // key : 0xc0
|
|
__ movdqu(xmm15, Address(rsp, 6 * wordSize));
|
|
__ movdqu(xmm1, Address(rsp, 8 * wordSize));
|
|
DoFour(aesdec, xmm15); // key : 0xd0
|
|
__ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
|
|
DoFour(aesdec, xmm1); // key : 0xe0
|
|
__ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // xmm1 needs to be loaded again
|
|
DoFour(aesdeclast, xmm_key_last);
|
|
}
|
|
|
|
// for each result, xor with the r vector of previous cipher block
|
|
__ pxor(xmm_result0, xmm_prev_block_cipher);
|
|
__ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
|
|
__ pxor(xmm_result1, xmm_prev_block_cipher);
|
|
__ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
|
|
__ pxor(xmm_result2, xmm_prev_block_cipher);
|
|
__ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
|
|
__ pxor(xmm_result3, xmm_prev_block_cipher);
|
|
__ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks
|
|
if (k != 0) {
|
|
__ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
|
|
}
|
|
|
|
__ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); // store 4 results into the next 64 bytes of output
|
|
__ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
|
|
__ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
|
|
__ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
|
|
|
|
__ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
|
|
__ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
|
|
__ jmp(L_multiBlock_loopTop[k]);
|
|
|
|
// registers used in the non-parallelized loops
|
|
// xmm register assignments for the loops below
|
|
const XMMRegister xmm_result = xmm0;
|
|
const XMMRegister xmm_prev_block_cipher_save = xmm2;
|
|
const XMMRegister xmm_key11 = xmm3;
|
|
const XMMRegister xmm_key12 = xmm4;
|
|
const XMMRegister key_tmp = xmm4;
|
|
|
|
__ BIND(L_singleBlock_loopTopHead[k]);
|
|
if (k == 1) {
|
|
__ addptr(rsp, 6 * wordSize);
|
|
} else if (k == 2) {
|
|
__ addptr(rsp, 10 * wordSize);
|
|
}
|
|
__ cmpptr(len_reg, 0); // any blocks left??
|
|
__ jcc(Assembler::equal, L_exit);
|
|
__ BIND(L_singleBlock_loopTopHead2[k]);
|
|
if (k == 1) {
|
|
load_key(xmm_key11, key, 0xb0, rbx /*rscratch*/); // 0xb0; 192-bit key goes up to 0xc0
|
|
load_key(xmm_key12, key, 0xc0, rbx /*rscratch*/); // 0xc0; 192-bit key goes up to 0xc0
|
|
}
|
|
if (k == 2) {
|
|
load_key(xmm_key11, key, 0xb0, rbx /*rscratch*/); // 0xb0; 256-bit key goes up to 0xe0
|
|
}
|
|
__ align(OptoLoopAlignment);
|
|
__ BIND(L_singleBlock_loopTop[k]);
|
|
__ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
|
|
__ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
|
|
__ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
|
|
for (int rnum = 1; rnum <= 9 ; rnum++) {
|
|
__ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
|
|
}
|
|
if (k == 1) {
|
|
__ aesdec(xmm_result, xmm_key11);
|
|
__ aesdec(xmm_result, xmm_key12);
|
|
}
|
|
if (k == 2) {
|
|
__ aesdec(xmm_result, xmm_key11);
|
|
load_key(key_tmp, key, 0xc0, rbx /*rscratch*/);
|
|
__ aesdec(xmm_result, key_tmp);
|
|
load_key(key_tmp, key, 0xd0, rbx /*rscratch*/);
|
|
__ aesdec(xmm_result, key_tmp);
|
|
load_key(key_tmp, key, 0xe0, rbx /*rscratch*/);
|
|
__ aesdec(xmm_result, key_tmp);
|
|
}
|
|
|
|
__ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
|
|
__ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
|
|
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
|
|
// no need to store r to memory until we exit
|
|
__ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
|
|
__ addptr(pos, AESBlockSize);
|
|
__ subptr(len_reg, AESBlockSize);
|
|
__ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
|
|
if (k != 2) {
|
|
__ jmp(L_exit);
|
|
}
|
|
} //for 128/192/256
|
|
|
|
__ BIND(L_exit);
|
|
__ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object
|
|
__ pop_ppx(rbx);
|
|
#ifdef _WIN64
|
|
__ movl(rax, len_mem);
|
|
#else
|
|
__ pop_ppx(rax); // return length
|
|
#endif
|
|
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
|
__ ret(0);
|
|
|
|
return start;
|
|
}
|
|
|
|
// Generates the AES-ECB encryption stub and returns the address of its
// first instruction (the pc captured right after the entry alignment).
//
// Arguments expected by the generated code:
//   c_rarg0 - source array address
//   c_rarg1 - destination array address
//   c_rarg2 - key array address
//   c_rarg3 - source length (must be a multiple of the 16-byte block size)
address StubGenerator::generate_electronicCodeBook_encryptAESCrypt() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, StubId::stubgen_electronicCodeBook_encryptAESCrypt_id);
  address entry = __ pc();

  // Argument registers of the generated stub.
  const Register src_reg = c_rarg0;
  const Register dst_reg = c_rarg1;
  const Register key_reg = c_rarg2;
  const Register len_reg = c_rarg3;

  // Frame setup/teardown is required for proper stackwalking of the RuntimeStub frame.
  __ enter();

  // The actual cipher code is emitted by the shared helper.
  aesecb_encrypt(src_reg, dst_reg, key_reg, len_reg);

  __ vzeroupper();
  __ leave();
  __ ret(0);

  return entry;
}
|
|
|
|
// Generates the AES-ECB decryption stub and returns the address of its
// first instruction (the pc captured right after the entry alignment).
//
// Arguments expected by the generated code:
//   c_rarg0 - source array address
//   c_rarg1 - destination array address
//   c_rarg2 - key array address
//   c_rarg3 - source length (must be a multiple of the 16-byte block size)
address StubGenerator::generate_electronicCodeBook_decryptAESCrypt() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, StubId::stubgen_electronicCodeBook_decryptAESCrypt_id);
  address entry = __ pc();

  // Argument registers of the generated stub.
  const Register src_reg = c_rarg0;
  const Register dst_reg = c_rarg1;
  const Register key_reg = c_rarg2;
  const Register len_reg = c_rarg3;

  // Frame setup/teardown is required for proper stackwalking of the RuntimeStub frame.
  __ enter();

  // The actual cipher code is emitted by the shared helper.
  aesecb_decrypt(src_reg, dst_reg, key_reg, len_reg);

  __ vzeroupper();
  __ leave();
  __ ret(0);

  return entry;
}
|
|
|
|
// Utility routine for increase 128bit counter (iv in CTR mode)
|
|
void StubGenerator::inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
|
|
__ pextrq(reg, xmmdst, 0x0);
|
|
__ addq(reg, inc_delta);
|
|
__ pinsrq(xmmdst, reg, 0x0);
|
|
__ jcc(Assembler::carryClear, next_block); // jump if no carry
|
|
__ pextrq(reg, xmmdst, 0x01); // Carry
|
|
__ addq(reg, 0x01);
|
|
__ pinsrq(xmmdst, reg, 0x01); //Carry end
|
|
__ BIND(next_block); // next instruction
|
|
}
|
|
|
|
|
|
// Emits one 512-bit vaesenc with round key 'key' for each of the registers
// xmm0 .. xmm<rnum> (rnum inclusive), updating each register in place.
void StubGenerator::roundEnc(XMMRegister key, int rnum) {
  int idx = 0;
  while (idx <= rnum) {
    const XMMRegister state = as_XMMRegister(idx);
    __ vaesenc(state, state, key, Assembler::AVX_512bit);
    idx++;
  }
}
|
|
|
|
// Emits one 512-bit vaesenclast with round key 'key' for each of the
// registers xmm0 .. xmm<rnum> (rnum inclusive), updating each in place.
void StubGenerator::lastroundEnc(XMMRegister key, int rnum) {
  int idx = 0;
  while (idx <= rnum) {
    const XMMRegister state = as_XMMRegister(idx);
    __ vaesenclast(state, state, key, Assembler::AVX_512bit);
    idx++;
  }
}
|
|
|
|
// Emits one 512-bit vaesdec with round key 'key' for each of the registers
// xmm0 .. xmm<rnum> (rnum inclusive), updating each register in place.
void StubGenerator::roundDec(XMMRegister key, int rnum) {
  int idx = 0;
  while (idx <= rnum) {
    const XMMRegister state = as_XMMRegister(idx);
    __ vaesdec(state, state, key, Assembler::AVX_512bit);
    idx++;
  }
}
|
|
|
|
// Emits one 512-bit vaesdeclast with round key 'key' for each of the
// registers xmm0 .. xmm<rnum> (rnum inclusive), updating each in place.
void StubGenerator::lastroundDec(XMMRegister key, int rnum) {
  int idx = 0;
  while (idx <= rnum) {
    const XMMRegister state = as_XMMRegister(idx);
    __ vaesdeclast(state, state, key, Assembler::AVX_512bit);
    idx++;
  }
}
|
|
|
|
// Emits one 512-bit vaesdec with round key xmm_reg for each of the eight
// registers xmm1 .. xmm8, in ascending order, updating each in place.
void StubGenerator::roundDec(XMMRegister xmm_reg) {
  const int first_state_reg = 1;
  const int last_state_reg  = 8;
  for (int n = first_state_reg; n <= last_state_reg; n++) {
    const XMMRegister state = as_XMMRegister(n);
    __ vaesdec(state, state, xmm_reg, Assembler::AVX_512bit);
  }
}
|
|
|
|
// Emits one 512-bit vaesdeclast with round key xmm_reg for each of the eight
// registers xmm1 .. xmm8, in ascending order, updating each in place.
void StubGenerator::roundDeclast(XMMRegister xmm_reg) {
  const int first_state_reg = 1;
  const int last_state_reg  = 8;
  for (int n = first_state_reg; n <= last_state_reg; n++) {
    const XMMRegister state = as_XMMRegister(n);
    __ vaesdeclast(state, state, xmm_reg, Assembler::AVX_512bit);
  }
}
|
|
|
|
// Check incoming byte offset against the int[] len. key is the pointer to the int[0].
|
|
// This check happens often, so it is important for it to be very compact.
|
|
void StubGenerator::check_key_offset(Register key, int offset, int load_size) {
|
|
#ifdef ASSERT
|
|
Address key_length(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT));
|
|
assert((offset + load_size) % 4 == 0, "Alignment is good: %d + %d", offset, load_size);
|
|
int end_offset = (offset + load_size) / 4;
|
|
Label L_good;
|
|
__ cmpl(key_length, end_offset);
|
|
__ jccb(Assembler::greaterEqual, L_good);
|
|
__ hlt();
|
|
__ bind(L_good);
|
|
#endif
|
|
}
|
|
|
|
// Utility routine for loading a 128-bit key word in little endian format
|
|
void StubGenerator::load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask) {
|
|
check_key_offset(key, offset, 16);
|
|
__ movdqu(xmmdst, Address(key, offset));
|
|
__ pshufb(xmmdst, xmm_shuf_mask);
|
|
}
|
|
|
|
// Variant of load_key that shuffles through the in-memory key shuffle mask
// (key_shuffle_mask_addr()) instead of a mask register; rscratch is handed
// to the assembler as the scratch register for that memory operand.
void StubGenerator::load_key(XMMRegister xmmdst, Register key, int offset, Register rscratch) {
  const int key_word_bytes = 16;
  check_key_offset(key, offset, key_word_bytes);

  const Address key_word(key, offset);
  __ movdqu(xmmdst, key_word);
  __ pshufb(xmmdst, ExternalAddress(key_shuffle_mask_addr()), rscratch);
}
|
|
|
|
// Loads and byte-shuffles the 128-bit key word at key + offset exactly as
// load_key() does, then emits an evshufi64x2 with selector 0x0 so that the
// low 128-bit lane is replicated across the whole 512-bit register.
void StubGenerator::ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask) {
  // check + movdqu + pshufb
  load_key(xmmdst, key, offset, xmm_shuf_mask);
  // broadcast lane 0
  __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
}
|
|
|
|
// Variant of ev_load_key that shuffles through the in-memory key shuffle
// mask (with rscratch as the assembler scratch register), then emits an
// evshufi64x2 with selector 0x0 so that the low 128-bit lane is replicated
// across the whole 512-bit register.
void StubGenerator::ev_load_key(XMMRegister xmmdst, Register key, int offset, Register rscratch) {
  // check + movdqu + pshufb
  load_key(xmmdst, key, offset, rscratch);
  // broadcast lane 0
  __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
}
|
|
|
|
// Add 128-bit integers in xmmsrc1 to xmmsrc2, then place the result in xmmdst.
|
|
// Clobber ktmp and rscratch.
|
|
// Used by aesctr_encrypt.
|
|
void StubGenerator::ev_add128(XMMRegister xmmdst, XMMRegister xmmsrc1, XMMRegister xmmsrc2,
|
|
int vector_len, KRegister ktmp, XMMRegister ones) {
|
|
__ vpaddq(xmmdst, xmmsrc1, xmmsrc2, vector_len);
|
|
__ evpcmpuq(ktmp, xmmdst, xmmsrc2, __ lt, vector_len); // set mask[0/1] bit if addq to dst[0/1] wraps
|
|
__ kshiftlbl(ktmp, ktmp, 1); // mask[1] <- mask[0], mask[0] <- 0, etc
|
|
|
|
__ evpaddq(xmmdst, ktmp, xmmdst, ones, /*merge*/true, vector_len); // dst[1]++ if mask[1] set
|
|
}
|
|
|
|
// AES-ECB Encrypt Operation
|
|
void StubGenerator::aesecb_encrypt(Register src_addr, Register dest_addr, Register key, Register len) {
|
|
const Register pos = rax;
|
|
const Register rounds = r12;
|
|
|
|
Label NO_PARTS, LOOP, Loop_start, LOOP2, AES192, END_LOOP, AES256, REMAINDER, LAST2, END, KEY_192, KEY_256, EXIT;
|
|
__ push_ppx(r13);
|
|
__ push_ppx(r12);
|
|
|
|
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
|
|
// context for the registers used, where all instructions below are using 128-bit mode
|
|
// On EVEX without VL and BW, these instructions will all be AVX.
|
|
if (VM_Version::supports_avx512vlbw()) {
|
|
__ movl(rax, 0xffff);
|
|
__ kmovql(k1, rax);
|
|
}
|
|
__ push_ppx(len); // Save
|
|
__ push_ppx(rbx);
|
|
|
|
__ vzeroupper();
|
|
|
|
__ xorptr(pos, pos);
|
|
|
|
// Calculate number of rounds based on key length(128, 192, 256):44 for 10-rounds, 52 for 12-rounds, 60 for 14-rounds
|
|
__ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
|
|
|
|
// Load Key shuf mask
|
|
const XMMRegister xmm_key_shuf_mask = xmm31; // used temporarily to swap key bytes up front
|
|
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), rbx /*rscratch*/);
|
|
|
|
// Load and shuffle key based on number of rounds
|
|
ev_load_key(xmm8, key, 0 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm9, key, 1 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm10, key, 2 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm23, key, 3 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm12, key, 4 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm13, key, 5 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm14, key, 6 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm15, key, 7 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm16, key, 8 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm17, key, 9 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm24, key, 10 * 16, xmm_key_shuf_mask);
|
|
__ cmpl(rounds, 52);
|
|
__ jcc(Assembler::greaterEqual, KEY_192);
|
|
__ jmp(Loop_start);
|
|
|
|
__ bind(KEY_192);
|
|
ev_load_key(xmm19, key, 11 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm20, key, 12 * 16, xmm_key_shuf_mask);
|
|
__ cmpl(rounds, 60);
|
|
__ jcc(Assembler::equal, KEY_256);
|
|
__ jmp(Loop_start);
|
|
|
|
__ bind(KEY_256);
|
|
ev_load_key(xmm21, key, 13 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm22, key, 14 * 16, xmm_key_shuf_mask);
|
|
|
|
__ bind(Loop_start);
|
|
__ movq(rbx, len);
|
|
// Divide length by 16 to convert it to number of blocks
|
|
__ shrq(len, 4);
|
|
__ shlq(rbx, 60);
|
|
__ jcc(Assembler::equal, NO_PARTS);
|
|
__ addq(len, 1);
|
|
// Check if number of blocks is greater than or equal to 32
|
|
// If true, 512 bytes are processed at a time (code marked by label LOOP)
|
|
// If not, 16 bytes are processed (code marked by REMAINDER label)
|
|
__ bind(NO_PARTS);
|
|
__ movq(rbx, len);
|
|
__ shrq(len, 5);
|
|
__ jcc(Assembler::equal, REMAINDER);
|
|
__ movl(r13, len);
|
|
// Compute number of blocks that will be processed 512 bytes at a time
|
|
// Subtract this from the total number of blocks which will then be processed by REMAINDER loop
|
|
__ shlq(r13, 5);
|
|
__ subq(rbx, r13);
|
|
//Begin processing 512 bytes
|
|
__ bind(LOOP);
|
|
// Move 64 bytes of PT data into a zmm register, as a result 512 bytes of PT loaded in zmm0-7
|
|
__ evmovdquq(xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit);
|
|
// Xor with the first round key
|
|
__ evpxorq(xmm0, xmm0, xmm8, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm1, xmm1, xmm8, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm2, xmm2, xmm8, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm3, xmm3, xmm8, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm4, xmm4, xmm8, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm5, xmm5, xmm8, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm6, xmm6, xmm8, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm7, xmm7, xmm8, Assembler::AVX_512bit);
|
|
// 9 Aes encode round operations
|
|
roundEnc(xmm9, 7);
|
|
roundEnc(xmm10, 7);
|
|
roundEnc(xmm23, 7);
|
|
roundEnc(xmm12, 7);
|
|
roundEnc(xmm13, 7);
|
|
roundEnc(xmm14, 7);
|
|
roundEnc(xmm15, 7);
|
|
roundEnc(xmm16, 7);
|
|
roundEnc(xmm17, 7);
|
|
__ cmpl(rounds, 52);
|
|
__ jcc(Assembler::aboveEqual, AES192);
|
|
// Aesenclast round operation for keysize = 128
|
|
lastroundEnc(xmm24, 7);
|
|
__ jmp(END_LOOP);
|
|
//Additional 2 rounds of Aesenc operation for keysize = 192
|
|
__ bind(AES192);
|
|
roundEnc(xmm24, 7);
|
|
roundEnc(xmm19, 7);
|
|
__ cmpl(rounds, 60);
|
|
__ jcc(Assembler::aboveEqual, AES256);
|
|
// Aesenclast round for keysize = 192
|
|
lastroundEnc(xmm20, 7);
|
|
__ jmp(END_LOOP);
|
|
// 2 rounds of Aesenc operation and Aesenclast for keysize = 256
|
|
__ bind(AES256);
|
|
roundEnc(xmm20, 7);
|
|
roundEnc(xmm21, 7);
|
|
lastroundEnc(xmm22, 7);
|
|
|
|
__ bind(END_LOOP);
|
|
// Move 512 bytes of CT to destination
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit);
|
|
|
|
__ addq(pos, 512);
|
|
__ decq(len);
|
|
__ jcc(Assembler::notEqual, LOOP);
|
|
|
|
__ bind(REMAINDER);
|
|
__ vzeroupper();
|
|
__ cmpq(rbx, 0);
|
|
__ jcc(Assembler::equal, END);
|
|
// Process 16 bytes at a time
|
|
__ bind(LOOP2);
|
|
__ movdqu(xmm1, Address(src_addr, pos, Address::times_1, 0));
|
|
__ vpxor(xmm1, xmm1, xmm8, Assembler::AVX_128bit);
|
|
// xmm2 contains shuffled key for Aesenclast operation.
|
|
__ vmovdqu(xmm2, xmm24);
|
|
|
|
__ vaesenc(xmm1, xmm1, xmm9, Assembler::AVX_128bit);
|
|
__ vaesenc(xmm1, xmm1, xmm10, Assembler::AVX_128bit);
|
|
__ vaesenc(xmm1, xmm1, xmm23, Assembler::AVX_128bit);
|
|
__ vaesenc(xmm1, xmm1, xmm12, Assembler::AVX_128bit);
|
|
__ vaesenc(xmm1, xmm1, xmm13, Assembler::AVX_128bit);
|
|
__ vaesenc(xmm1, xmm1, xmm14, Assembler::AVX_128bit);
|
|
__ vaesenc(xmm1, xmm1, xmm15, Assembler::AVX_128bit);
|
|
__ vaesenc(xmm1, xmm1, xmm16, Assembler::AVX_128bit);
|
|
__ vaesenc(xmm1, xmm1, xmm17, Assembler::AVX_128bit);
|
|
|
|
__ cmpl(rounds, 52);
|
|
__ jcc(Assembler::below, LAST2);
|
|
__ vmovdqu(xmm2, xmm20);
|
|
__ vaesenc(xmm1, xmm1, xmm24, Assembler::AVX_128bit);
|
|
__ vaesenc(xmm1, xmm1, xmm19, Assembler::AVX_128bit);
|
|
__ cmpl(rounds, 60);
|
|
__ jcc(Assembler::below, LAST2);
|
|
__ vmovdqu(xmm2, xmm22);
|
|
__ vaesenc(xmm1, xmm1, xmm20, Assembler::AVX_128bit);
|
|
__ vaesenc(xmm1, xmm1, xmm21, Assembler::AVX_128bit);
|
|
|
|
__ bind(LAST2);
|
|
// Aesenclast round
|
|
__ vaesenclast(xmm1, xmm1, xmm2, Assembler::AVX_128bit);
|
|
// Write 16 bytes of CT to destination
|
|
__ movdqu(Address(dest_addr, pos, Address::times_1, 0), xmm1);
|
|
__ addq(pos, 16);
|
|
__ decq(rbx);
|
|
__ jcc(Assembler::notEqual, LOOP2);
|
|
|
|
__ bind(END);
|
|
// Zero out the round keys
|
|
__ evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm9, xmm9, xmm9, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm10, xmm10, xmm10, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm23, xmm23, xmm23, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm12, xmm12, xmm12, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm13, xmm13, xmm13, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm14, xmm14, xmm14, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm15, xmm15, xmm15, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm16, xmm16, xmm16, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm17, xmm17, xmm17, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm24, xmm24, xmm24, Assembler::AVX_512bit);
|
|
__ cmpl(rounds, 44);
|
|
__ jcc(Assembler::belowEqual, EXIT);
|
|
__ evpxorq(xmm19, xmm19, xmm19, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit);
|
|
__ cmpl(rounds, 52);
|
|
__ jcc(Assembler::belowEqual, EXIT);
|
|
__ evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit);
|
|
__ bind(EXIT);
|
|
__ pop_ppx(rbx);
|
|
__ pop_ppx(rax); // return length
|
|
__ pop_ppx(r12);
|
|
__ pop_ppx(r13);
|
|
}
|
|
|
|
// AES-ECB Decrypt Operation
|
|
void StubGenerator::aesecb_decrypt(Register src_addr, Register dest_addr, Register key, Register len) {
|
|
|
|
Label NO_PARTS, LOOP, Loop_start, LOOP2, AES192, END_LOOP, AES256, REMAINDER, LAST2, END, KEY_192, KEY_256, EXIT;
|
|
const Register pos = rax;
|
|
const Register rounds = r12;
|
|
__ push_ppx(r13);
|
|
__ push_ppx(r12);
|
|
|
|
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
|
|
// context for the registers used, where all instructions below are using 128-bit mode
|
|
// On EVEX without VL and BW, these instructions will all be AVX.
|
|
if (VM_Version::supports_avx512vlbw()) {
|
|
__ movl(rax, 0xffff);
|
|
__ kmovql(k1, rax);
|
|
}
|
|
|
|
__ push_ppx(len); // Save
|
|
__ push_ppx(rbx);
|
|
|
|
__ vzeroupper();
|
|
|
|
__ xorptr(pos, pos);
|
|
// Calculate number of rounds i.e. based on key length(128, 192, 256):44 for 10-rounds, 52 for 12-rounds, 60 for 14-rounds
|
|
__ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
|
|
|
|
// Load Key shuf mask
|
|
const XMMRegister xmm_key_shuf_mask = xmm31; // used temporarily to swap key bytes up front
|
|
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), rbx /*rscratch*/);
|
|
|
|
// Load and shuffle round keys. The java expanded key ordering is rotated one position in decryption.
|
|
// So the first round key is loaded from 1*16 here and last round key is loaded from 0*16
|
|
ev_load_key(xmm9, key, 1 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm10, key, 2 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm11, key, 3 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm12, key, 4 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm13, key, 5 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm14, key, 6 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm15, key, 7 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm16, key, 8 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm17, key, 9 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm18, key, 10 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm27, key, 0 * 16, xmm_key_shuf_mask);
|
|
__ cmpl(rounds, 52);
|
|
__ jcc(Assembler::greaterEqual, KEY_192);
|
|
__ jmp(Loop_start);
|
|
|
|
__ bind(KEY_192);
|
|
ev_load_key(xmm19, key, 11 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm20, key, 12 * 16, xmm_key_shuf_mask);
|
|
__ cmpl(rounds, 60);
|
|
__ jcc(Assembler::equal, KEY_256);
|
|
__ jmp(Loop_start);
|
|
|
|
__ bind(KEY_256);
|
|
ev_load_key(xmm21, key, 13 * 16, xmm_key_shuf_mask);
|
|
ev_load_key(xmm22, key, 14 * 16, xmm_key_shuf_mask);
|
|
__ bind(Loop_start);
|
|
__ movq(rbx, len);
|
|
// Convert input length to number of blocks
|
|
__ shrq(len, 4);
|
|
__ shlq(rbx, 60);
|
|
__ jcc(Assembler::equal, NO_PARTS);
|
|
__ addq(len, 1);
|
|
// Check if number of blocks is greater than/ equal to 32
|
|
// If true, blocks then 512 bytes are processed at a time (code marked by label LOOP)
|
|
// If not, 16 bytes are processed (code marked by label REMAINDER)
|
|
__ bind(NO_PARTS);
|
|
__ movq(rbx, len);
|
|
__ shrq(len, 5);
|
|
__ jcc(Assembler::equal, REMAINDER);
|
|
__ movl(r13, len);
|
|
// Compute number of blocks that will be processed as 512 bytes at a time
|
|
// Subtract this from the total number of blocks, which will then be processed by REMAINDER loop.
|
|
__ shlq(r13, 5);
|
|
__ subq(rbx, r13);
|
|
|
|
__ bind(LOOP);
|
|
// Move 64 bytes of CT data into a zmm register, as a result 512 bytes of CT loaded in zmm0-7
|
|
__ evmovdquq(xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit);
|
|
// Xor with the first round key
|
|
__ evpxorq(xmm0, xmm0, xmm9, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm1, xmm1, xmm9, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm2, xmm2, xmm9, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm3, xmm3, xmm9, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm4, xmm4, xmm9, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm5, xmm5, xmm9, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm6, xmm6, xmm9, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm7, xmm7, xmm9, Assembler::AVX_512bit);
|
|
// 9 rounds of Aesdec
|
|
roundDec(xmm10, 7);
|
|
roundDec(xmm11, 7);
|
|
roundDec(xmm12, 7);
|
|
roundDec(xmm13, 7);
|
|
roundDec(xmm14, 7);
|
|
roundDec(xmm15, 7);
|
|
roundDec(xmm16, 7);
|
|
roundDec(xmm17, 7);
|
|
roundDec(xmm18, 7);
|
|
__ cmpl(rounds, 52);
|
|
__ jcc(Assembler::aboveEqual, AES192);
|
|
// Aesdeclast round for keysize = 128
|
|
lastroundDec(xmm27, 7);
|
|
__ jmp(END_LOOP);
|
|
|
|
__ bind(AES192);
|
|
// 2 Additional rounds for keysize = 192
|
|
roundDec(xmm19, 7);
|
|
roundDec(xmm20, 7);
|
|
__ cmpl(rounds, 60);
|
|
__ jcc(Assembler::aboveEqual, AES256);
|
|
// Aesdeclast round for keysize = 192
|
|
lastroundDec(xmm27, 7);
|
|
__ jmp(END_LOOP);
|
|
__ bind(AES256);
|
|
// 2 Additional rounds and Aesdeclast for keysize = 256
|
|
roundDec(xmm21, 7);
|
|
roundDec(xmm22, 7);
|
|
lastroundDec(xmm27, 7);
|
|
|
|
__ bind(END_LOOP);
|
|
// Write 512 bytes of PT to the destination
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit);
|
|
|
|
__ addq(pos, 512);
|
|
__ decq(len);
|
|
__ jcc(Assembler::notEqual, LOOP);
|
|
|
|
__ bind(REMAINDER);
|
|
__ vzeroupper();
|
|
__ cmpq(rbx, 0);
|
|
__ jcc(Assembler::equal, END);
|
|
// Process 16 bytes at a time
|
|
__ bind(LOOP2);
|
|
__ movdqu(xmm1, Address(src_addr, pos, Address::times_1, 0));
|
|
__ vpxor(xmm1, xmm1, xmm9, Assembler::AVX_128bit);
|
|
// xmm2 contains shuffled key for Aesdeclast operation.
|
|
__ vmovdqu(xmm2, xmm27);
|
|
|
|
__ vaesdec(xmm1, xmm1, xmm10, Assembler::AVX_128bit);
|
|
__ vaesdec(xmm1, xmm1, xmm11, Assembler::AVX_128bit);
|
|
__ vaesdec(xmm1, xmm1, xmm12, Assembler::AVX_128bit);
|
|
__ vaesdec(xmm1, xmm1, xmm13, Assembler::AVX_128bit);
|
|
__ vaesdec(xmm1, xmm1, xmm14, Assembler::AVX_128bit);
|
|
__ vaesdec(xmm1, xmm1, xmm15, Assembler::AVX_128bit);
|
|
__ vaesdec(xmm1, xmm1, xmm16, Assembler::AVX_128bit);
|
|
__ vaesdec(xmm1, xmm1, xmm17, Assembler::AVX_128bit);
|
|
__ vaesdec(xmm1, xmm1, xmm18, Assembler::AVX_128bit);
|
|
|
|
__ cmpl(rounds, 52);
|
|
__ jcc(Assembler::below, LAST2);
|
|
__ vaesdec(xmm1, xmm1, xmm19, Assembler::AVX_128bit);
|
|
__ vaesdec(xmm1, xmm1, xmm20, Assembler::AVX_128bit);
|
|
__ cmpl(rounds, 60);
|
|
__ jcc(Assembler::below, LAST2);
|
|
__ vaesdec(xmm1, xmm1, xmm21, Assembler::AVX_128bit);
|
|
__ vaesdec(xmm1, xmm1, xmm22, Assembler::AVX_128bit);
|
|
|
|
__ bind(LAST2);
|
|
// Aesdeclast round
|
|
__ vaesdeclast(xmm1, xmm1, xmm2, Assembler::AVX_128bit);
|
|
// Write 16 bytes of PT to destination
|
|
__ movdqu(Address(dest_addr, pos, Address::times_1, 0), xmm1);
|
|
__ addq(pos, 16);
|
|
__ decq(rbx);
|
|
__ jcc(Assembler::notEqual, LOOP2);
|
|
|
|
__ bind(END);
|
|
// Zero out the round keys
|
|
__ evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm9, xmm9, xmm9, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm10, xmm10, xmm10, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm11, xmm11, xmm11, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm12, xmm12, xmm12, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm13, xmm13, xmm13, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm14, xmm14, xmm14, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm15, xmm15, xmm15, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm16, xmm16, xmm16, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm17, xmm17, xmm17, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm18, xmm18, xmm18, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm27, xmm27, xmm27, Assembler::AVX_512bit);
|
|
__ cmpl(rounds, 44);
|
|
__ jcc(Assembler::belowEqual, EXIT);
|
|
__ evpxorq(xmm19, xmm19, xmm19, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit);
|
|
__ cmpl(rounds, 52);
|
|
__ jcc(Assembler::belowEqual, EXIT);
|
|
__ evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit);
|
|
|
|
__ bind(EXIT);
|
|
__ pop_ppx(rbx);
|
|
__ pop_ppx(rax); // return length
|
|
__ pop_ppx(r12);
|
|
__ pop_ppx(r13);
|
|
}
|
|
|
|
|
|
// AES Counter Mode using VAES instructions
|
|
void StubGenerator::aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter,
|
|
Register len_reg, Register used, Register used_addr, Register saved_encCounter_start) {
|
|
|
|
const Register rounds = rax;
|
|
const Register pos = r12;
|
|
const Register tail = r15;
|
|
|
|
Label PRELOOP_START, EXIT_PRELOOP, REMAINDER, REMAINDER_16, LOOP, END, EXIT, END_LOOP,
|
|
AES192, AES256, AES192_REMAINDER16, REMAINDER16_END_LOOP, AES256_REMAINDER16,
|
|
REMAINDER_8, REMAINDER_4, AES192_REMAINDER8, REMAINDER_LOOP, AES256_REMINDER,
|
|
AES192_REMAINDER, END_REMAINDER_LOOP, AES256_REMAINDER8, REMAINDER8_END_LOOP,
|
|
AES192_REMAINDER4, AES256_REMAINDER4, AES256_REMAINDER, END_REMAINDER4, EXTRACT_TAILBYTES,
|
|
EXTRACT_TAIL_4BYTES, EXTRACT_TAIL_2BYTES, EXTRACT_TAIL_1BYTE, STORE_CTR;
|
|
|
|
__ cmpl(len_reg, 0);
|
|
__ jcc(Assembler::belowEqual, EXIT);
|
|
|
|
__ movl(pos, 0);
|
|
// if the number of used encrypted counter bytes < 16,
|
|
// XOR PT with saved encrypted counter to obtain CT
|
|
__ bind(PRELOOP_START);
|
|
__ cmpl(used, 16);
|
|
__ jcc(Assembler::aboveEqual, EXIT_PRELOOP);
|
|
__ movb(rbx, Address(saved_encCounter_start, used));
|
|
__ xorb(rbx, Address(src_addr, pos));
|
|
__ movb(Address(dest_addr, pos), rbx);
|
|
__ addptr(pos, 1);
|
|
__ addptr(used, 1);
|
|
__ decrement(len_reg);
|
|
__ jcc(Assembler::notEqual, PRELOOP_START);
|
|
|
|
__ bind(EXIT_PRELOOP);
|
|
__ movl(Address(used_addr, 0), used);
|
|
|
|
__ cmpl(len_reg, 0);
|
|
__ jcc(Assembler::equal, EXIT);
|
|
|
|
// Calculate number of rounds i.e. 10, 12, 14, based on key length(128, 192, 256).
|
|
__ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
|
|
|
|
__ vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
|
|
// Move initial counter value in xmm0
|
|
__ movdqu(xmm0, Address(counter, 0));
|
|
// broadcast counter value to zmm8
|
|
__ evshufi64x2(xmm8, xmm0, xmm0, 0, Assembler::AVX_512bit);
|
|
|
|
// load lbswap mask
|
|
__ evmovdquq(xmm16, ExternalAddress(counter_shuffle_mask_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
|
|
|
|
//shuffle counter using lbswap_mask
|
|
__ vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_512bit);
|
|
|
|
// pre-increment and propagate counter values to zmm9-zmm15 registers.
|
|
// Linc0 increments the zmm8 by 1 (initial value being 0), Linc4 increments the counters zmm9-zmm15 by 4
|
|
// The counter is incremented after each block i.e. 16 bytes is processed;
|
|
// each zmm register has 4 counter values as its MSB
|
|
// the counters are incremented in parallel
|
|
|
|
const XMMRegister ones = xmm17;
|
|
// Vector value to propagate carries
|
|
__ evmovdquq(ones, ExternalAddress(counter_mask_ones_addr()), Assembler::AVX_512bit, r15);
|
|
|
|
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc0_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
|
|
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
|
|
ev_add128(xmm9, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
ev_add128(xmm10, xmm9, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
ev_add128(xmm11, xmm10, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
ev_add128(xmm12, xmm11, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
ev_add128(xmm13, xmm12, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
ev_add128(xmm14, xmm13, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
ev_add128(xmm15, xmm14, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
|
|
// load linc32 mask in zmm register.linc32 increments counter by 32
|
|
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc32_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
|
|
|
|
// xmm31 contains the key shuffle mask.
|
|
__ movdqu(xmm31, ExternalAddress(key_shuffle_mask_addr()), r15 /*rscratch*/);
|
|
// Load key function loads 128 bit key and shuffles it. Then we broadcast the shuffled key to convert it into a 512 bit value.
|
|
// For broadcasting the values to ZMM, vshufi64 is used instead of evbroadcasti64x2 as the source in this case is ZMM register
|
|
// that holds shuffled key value.
|
|
ev_load_key(xmm20, key, 0, xmm31);
|
|
ev_load_key(xmm21, key, 1 * 16, xmm31);
|
|
ev_load_key(xmm22, key, 2 * 16, xmm31);
|
|
ev_load_key(xmm23, key, 3 * 16, xmm31);
|
|
ev_load_key(xmm24, key, 4 * 16, xmm31);
|
|
ev_load_key(xmm25, key, 5 * 16, xmm31);
|
|
ev_load_key(xmm26, key, 6 * 16, xmm31);
|
|
ev_load_key(xmm27, key, 7 * 16, xmm31);
|
|
ev_load_key(xmm28, key, 8 * 16, xmm31);
|
|
ev_load_key(xmm29, key, 9 * 16, xmm31);
|
|
ev_load_key(xmm30, key, 10 * 16, xmm31);
|
|
|
|
// Process 32 blocks or 512 bytes of data
|
|
__ bind(LOOP);
|
|
__ cmpl(len_reg, 512);
|
|
__ jcc(Assembler::less, REMAINDER);
|
|
__ subq(len_reg, 512);
|
|
//Shuffle counter and Exor it with roundkey1. Result is stored in zmm0-7
|
|
__ vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
|
|
__ vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
|
|
__ vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit);
|
|
__ vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit);
|
|
__ vpshufb(xmm4, xmm12, xmm16, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm4, xmm4, xmm20, Assembler::AVX_512bit);
|
|
__ vpshufb(xmm5, xmm13, xmm16, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm5, xmm5, xmm20, Assembler::AVX_512bit);
|
|
__ vpshufb(xmm6, xmm14, xmm16, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm6, xmm6, xmm20, Assembler::AVX_512bit);
|
|
__ vpshufb(xmm7, xmm15, xmm16, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm7, xmm7, xmm20, Assembler::AVX_512bit);
|
|
// Perform AES encode operations and put results in zmm0-zmm7.
|
|
// This is followed by incrementing counter values in zmm8-zmm15.
|
|
// Since we will be processing 32 blocks at a time, the counter is incremented by 32.
|
|
roundEnc(xmm21, 7);
|
|
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
roundEnc(xmm22, 7);
|
|
ev_add128(xmm9, xmm9, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
roundEnc(xmm23, 7);
|
|
ev_add128(xmm10, xmm10, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
roundEnc(xmm24, 7);
|
|
ev_add128(xmm11, xmm11, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
roundEnc(xmm25, 7);
|
|
ev_add128(xmm12, xmm12, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
roundEnc(xmm26, 7);
|
|
ev_add128(xmm13, xmm13, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
roundEnc(xmm27, 7);
|
|
ev_add128(xmm14, xmm14, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
roundEnc(xmm28, 7);
|
|
ev_add128(xmm15, xmm15, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
roundEnc(xmm29, 7);
|
|
|
|
__ cmpl(rounds, 52);
|
|
__ jcc(Assembler::aboveEqual, AES192);
|
|
lastroundEnc(xmm30, 7);
|
|
__ jmp(END_LOOP);
|
|
|
|
__ bind(AES192);
|
|
roundEnc(xmm30, 7);
|
|
ev_load_key(xmm18, key, 11 * 16, xmm31);
|
|
roundEnc(xmm18, 7);
|
|
__ cmpl(rounds, 60);
|
|
__ jcc(Assembler::aboveEqual, AES256);
|
|
ev_load_key(xmm18, key, 12 * 16, xmm31);
|
|
lastroundEnc(xmm18, 7);
|
|
__ jmp(END_LOOP);
|
|
|
|
__ bind(AES256);
|
|
ev_load_key(xmm18, key, 12 * 16, xmm31);
|
|
roundEnc(xmm18, 7);
|
|
ev_load_key(xmm18, key, 13 * 16, xmm31);
|
|
roundEnc(xmm18, 7);
|
|
ev_load_key(xmm18, key, 14 * 16, xmm31);
|
|
lastroundEnc(xmm18, 7);
|
|
|
|
// After AES encode rounds, the encrypted block cipher lies in zmm0-zmm7
|
|
// xor encrypted block cipher and input plaintext and store resultant ciphertext
|
|
__ bind(END_LOOP);
|
|
__ evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 64), xmm1, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm4, xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm5, xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm6, xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm7, xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit);
|
|
__ addq(pos, 512);
|
|
__ jmp(LOOP);
|
|
|
|
// Encode 256, 128, 64 or 16 bytes at a time if length is less than 512 bytes
|
|
__ bind(REMAINDER);
|
|
__ cmpl(len_reg, 0);
|
|
__ jcc(Assembler::equal, END);
|
|
__ cmpl(len_reg, 256);
|
|
__ jcc(Assembler::aboveEqual, REMAINDER_16);
|
|
__ cmpl(len_reg, 128);
|
|
__ jcc(Assembler::aboveEqual, REMAINDER_8);
|
|
__ cmpl(len_reg, 64);
|
|
__ jcc(Assembler::aboveEqual, REMAINDER_4);
|
|
// At this point, we will process 16 bytes of data at a time.
|
|
// So load xmm19 with counter increment value as 1
|
|
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc1_addr()), Assembler::AVX_128bit, r15 /*rscratch*/);
|
|
__ jmp(REMAINDER_LOOP);
|
|
|
|
// Each ZMM register can be used to encode 64 bytes of data, so we have 4 ZMM registers to encode 256 bytes of data
|
|
__ bind(REMAINDER_16);
|
|
__ subq(len_reg, 256);
|
|
// As we process 16 blocks at a time, load mask for incrementing the counter value by 16
|
|
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc16_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
|
|
// shuffle counter and XOR counter with roundkey1
|
|
__ vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
|
|
__ vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
|
|
__ vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit);
|
|
__ vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit);
|
|
// Increment counter values by 16
|
|
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
ev_add128(xmm9, xmm9, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
// AES encode rounds
|
|
roundEnc(xmm21, 3);
|
|
roundEnc(xmm22, 3);
|
|
roundEnc(xmm23, 3);
|
|
roundEnc(xmm24, 3);
|
|
roundEnc(xmm25, 3);
|
|
roundEnc(xmm26, 3);
|
|
roundEnc(xmm27, 3);
|
|
roundEnc(xmm28, 3);
|
|
roundEnc(xmm29, 3);
|
|
|
|
__ cmpl(rounds, 52);
|
|
__ jcc(Assembler::aboveEqual, AES192_REMAINDER16);
|
|
lastroundEnc(xmm30, 3);
|
|
__ jmp(REMAINDER16_END_LOOP);
|
|
|
|
__ bind(AES192_REMAINDER16);
|
|
roundEnc(xmm30, 3);
|
|
ev_load_key(xmm18, key, 11 * 16, xmm31);
|
|
roundEnc(xmm18, 3);
|
|
ev_load_key(xmm5, key, 12 * 16, xmm31);
|
|
|
|
__ cmpl(rounds, 60);
|
|
__ jcc(Assembler::aboveEqual, AES256_REMAINDER16);
|
|
lastroundEnc(xmm5, 3);
|
|
__ jmp(REMAINDER16_END_LOOP);
|
|
__ bind(AES256_REMAINDER16);
|
|
roundEnc(xmm5, 3);
|
|
ev_load_key(xmm6, key, 13 * 16, xmm31);
|
|
roundEnc(xmm6, 3);
|
|
ev_load_key(xmm7, key, 14 * 16, xmm31);
|
|
lastroundEnc(xmm7, 3);
|
|
|
|
// After AES encode rounds, the encrypted block cipher lies in zmm0-zmm3
|
|
// xor 256 bytes of PT with the encrypted counters to produce CT.
|
|
__ bind(REMAINDER16_END_LOOP);
|
|
__ evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
|
|
__ addq(pos, 256);
|
|
|
|
__ cmpl(len_reg, 128);
|
|
__ jcc(Assembler::aboveEqual, REMAINDER_8);
|
|
|
|
__ cmpl(len_reg, 64);
|
|
__ jcc(Assembler::aboveEqual, REMAINDER_4);
|
|
//load mask for incrementing the counter value by 1
|
|
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc1_addr()), Assembler::AVX_128bit, r15 /*rscratch*/);
|
|
__ jmp(REMAINDER_LOOP);
|
|
|
|
// Each ZMM register can be used to encode 64 bytes of data, so we have 2 ZMM registers to encode 128 bytes of data
|
|
__ bind(REMAINDER_8);
|
|
__ subq(len_reg, 128);
|
|
// As we process 8 blocks at a time, load mask for incrementing the counter value by 8
|
|
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc8_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
|
|
// shuffle counters and xor with roundkey1
|
|
__ vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
|
|
__ vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
|
|
// increment counter by 8
|
|
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
// AES encode
|
|
roundEnc(xmm21, 1);
|
|
roundEnc(xmm22, 1);
|
|
roundEnc(xmm23, 1);
|
|
roundEnc(xmm24, 1);
|
|
roundEnc(xmm25, 1);
|
|
roundEnc(xmm26, 1);
|
|
roundEnc(xmm27, 1);
|
|
roundEnc(xmm28, 1);
|
|
roundEnc(xmm29, 1);
|
|
|
|
__ cmpl(rounds, 52);
|
|
__ jcc(Assembler::aboveEqual, AES192_REMAINDER8);
|
|
lastroundEnc(xmm30, 1);
|
|
__ jmp(REMAINDER8_END_LOOP);
|
|
|
|
__ bind(AES192_REMAINDER8);
|
|
roundEnc(xmm30, 1);
|
|
ev_load_key(xmm18, key, 11 * 16, xmm31);
|
|
roundEnc(xmm18, 1);
|
|
ev_load_key(xmm5, key, 12 * 16, xmm31);
|
|
__ cmpl(rounds, 60);
|
|
__ jcc(Assembler::aboveEqual, AES256_REMAINDER8);
|
|
lastroundEnc(xmm5, 1);
|
|
__ jmp(REMAINDER8_END_LOOP);
|
|
|
|
__ bind(AES256_REMAINDER8);
|
|
roundEnc(xmm5, 1);
|
|
ev_load_key(xmm6, key, 13 * 16, xmm31);
|
|
roundEnc(xmm6, 1);
|
|
ev_load_key(xmm7, key, 14 * 16, xmm31);
|
|
lastroundEnc(xmm7, 1);
|
|
|
|
__ bind(REMAINDER8_END_LOOP);
|
|
// After AES encode rounds, the encrypted block cipher lies in zmm0-zmm1
|
|
// XOR PT with the encrypted counter and store as CT
|
|
__ evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
|
|
__ addq(pos, 128);
|
|
|
|
__ cmpl(len_reg, 64);
|
|
__ jcc(Assembler::aboveEqual, REMAINDER_4);
|
|
// load mask for incrementing the counter value by 1
|
|
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc1_addr()), Assembler::AVX_128bit, r15 /*rscratch*/);
|
|
__ jmp(REMAINDER_LOOP);
|
|
|
|
// Each ZMM register can be used to encode 64 bytes of data, so we have 1 ZMM register used in this block of code
|
|
__ bind(REMAINDER_4);
|
|
__ subq(len_reg, 64);
|
|
// As we process 4 blocks at a time, load mask for incrementing the counter value by 4
|
|
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
|
|
// XOR counter with first roundkey
|
|
__ vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
|
|
|
|
// Increment counter
|
|
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
|
|
__ vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_512bit);
|
|
__ vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_512bit);
|
|
__ vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_512bit);
|
|
__ vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_512bit);
|
|
__ vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_512bit);
|
|
__ vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_512bit);
|
|
__ vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_512bit);
|
|
__ vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_512bit);
|
|
__ vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_512bit);
|
|
__ cmpl(rounds, 52);
|
|
__ jcc(Assembler::aboveEqual, AES192_REMAINDER4);
|
|
__ vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_512bit);
|
|
__ jmp(END_REMAINDER4);
|
|
|
|
__ bind(AES192_REMAINDER4);
|
|
__ vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_512bit);
|
|
ev_load_key(xmm18, key, 11 * 16, xmm31);
|
|
__ vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_512bit);
|
|
ev_load_key(xmm5, key, 12 * 16, xmm31);
|
|
|
|
__ cmpl(rounds, 60);
|
|
__ jcc(Assembler::aboveEqual, AES256_REMAINDER4);
|
|
__ vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_512bit);
|
|
__ jmp(END_REMAINDER4);
|
|
|
|
__ bind(AES256_REMAINDER4);
|
|
__ vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_512bit);
|
|
ev_load_key(xmm6, key, 13 * 16, xmm31);
|
|
__ vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_512bit);
|
|
ev_load_key(xmm7, key, 14 * 16, xmm31);
|
|
__ vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_512bit);
|
|
// After AES encode rounds, the encrypted block cipher lies in zmm0.
|
|
// XOR encrypted block cipher with PT and store 64 bytes of ciphertext
|
|
__ bind(END_REMAINDER4);
|
|
__ evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
|
|
__ addq(pos, 64);
|
|
// load mask for incrementing the counter value by 1
|
|
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc1_addr()), Assembler::AVX_128bit, r15 /*rscratch*/);
|
|
|
|
// For a single block, the AES rounds start here.
|
|
__ bind(REMAINDER_LOOP);
|
|
__ cmpl(len_reg, 0);
|
|
__ jcc(Assembler::belowEqual, END);
|
|
// XOR counter with first roundkey
|
|
__ vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_128bit);
|
|
__ evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_128bit);
|
|
__ vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_128bit);
|
|
// Increment counter by 1
|
|
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_128bit, /*ktmp*/k1, ones);
|
|
__ vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_128bit);
|
|
__ vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_128bit);
|
|
__ vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_128bit);
|
|
__ vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_128bit);
|
|
__ vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_128bit);
|
|
__ vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_128bit);
|
|
__ vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_128bit);
|
|
__ vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_128bit);
|
|
|
|
__ cmpl(rounds, 52);
|
|
__ jcc(Assembler::aboveEqual, AES192_REMAINDER);
|
|
__ vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_128bit);
|
|
__ jmp(END_REMAINDER_LOOP);
|
|
|
|
__ bind(AES192_REMAINDER);
|
|
__ vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_128bit);
|
|
ev_load_key(xmm18, key, 11 * 16, xmm31);
|
|
__ vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_128bit);
|
|
ev_load_key(xmm5, key, 12 * 16, xmm31);
|
|
__ cmpl(rounds, 60);
|
|
__ jcc(Assembler::aboveEqual, AES256_REMAINDER);
|
|
__ vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_128bit);
|
|
__ jmp(END_REMAINDER_LOOP);
|
|
|
|
__ bind(AES256_REMAINDER);
|
|
__ vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_128bit);
|
|
ev_load_key(xmm6, key, 13 * 16, xmm31);
|
|
__ vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_128bit);
|
|
ev_load_key(xmm7, key, 14 * 16, xmm31);
|
|
__ vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_128bit);
|
|
|
|
__ bind(END_REMAINDER_LOOP);
|
|
// If the length register is less than the blockSize i.e. 16
|
|
// then we store only those bytes of the CT to the destination
|
|
// corresponding to the length register value
|
|
// extracting the exact number of bytes is handled by EXTRACT_TAILBYTES
|
|
__ cmpl(len_reg, 16);
|
|
__ jcc(Assembler::less, EXTRACT_TAILBYTES);
|
|
__ subl(len_reg, 16);
|
|
// After AES encode rounds, the encrypted block cipher lies in xmm0.
|
|
// If the length register is equal to 16 bytes, store CT in dest after XOR operation.
|
|
__ evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit);
|
|
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_128bit);
|
|
__ addl(pos, 16);
|
|
|
|
__ jmp(REMAINDER_LOOP);
|
|
|
|
__ bind(EXTRACT_TAILBYTES);
|
|
// Save encrypted counter value in xmm0 for next invocation, before XOR operation
|
|
__ movdqu(Address(saved_encCounter_start, 0), xmm0);
|
|
// XOR encryted block cipher in xmm0 with PT to produce CT
|
|
// extract up to 15 bytes of CT from xmm0 as specified by length register
|
|
__ testptr(len_reg, 8);
|
|
__ jcc(Assembler::zero, EXTRACT_TAIL_4BYTES);
|
|
__ pextrq(tail, xmm0, 0);
|
|
__ xorq(tail, Address(src_addr, pos, Address::times_1, 0));
|
|
__ movq(Address(dest_addr, pos), tail);
|
|
__ psrldq(xmm0, 8);
|
|
__ addl(pos, 8);
|
|
__ bind(EXTRACT_TAIL_4BYTES);
|
|
__ testptr(len_reg, 4);
|
|
__ jcc(Assembler::zero, EXTRACT_TAIL_2BYTES);
|
|
__ pextrd(tail, xmm0, 0);
|
|
__ xorl(tail, Address(src_addr, pos, Address::times_1, 0));
|
|
__ movl(Address(dest_addr, pos), tail);
|
|
__ psrldq(xmm0, 4);
|
|
__ addq(pos, 4);
|
|
__ bind(EXTRACT_TAIL_2BYTES);
|
|
__ testptr(len_reg, 2);
|
|
__ jcc(Assembler::zero, EXTRACT_TAIL_1BYTE);
|
|
__ pextrw(tail, xmm0, 0);
|
|
__ xorw(tail, Address(src_addr, pos, Address::times_1, 0));
|
|
__ movw(Address(dest_addr, pos), tail);
|
|
__ psrldq(xmm0, 2);
|
|
__ addl(pos, 2);
|
|
__ bind(EXTRACT_TAIL_1BYTE);
|
|
__ testptr(len_reg, 1);
|
|
__ jcc(Assembler::zero, END);
|
|
__ pextrb(tail, xmm0, 0);
|
|
__ xorb(tail, Address(src_addr, pos, Address::times_1, 0));
|
|
__ movb(Address(dest_addr, pos), tail);
|
|
__ addl(pos, 1);
|
|
|
|
__ bind(END);
|
|
// If there are no tail bytes, store counter value and exit
|
|
__ cmpl(len_reg, 0);
|
|
__ jcc(Assembler::equal, STORE_CTR);
|
|
__ movl(Address(used_addr, 0), len_reg);
|
|
|
|
__ bind(STORE_CTR);
|
|
//shuffle updated counter and store it
|
|
__ vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_128bit);
|
|
__ movdqu(Address(counter, 0), xmm8);
|
|
// Zero out counter and key registers
|
|
__ evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm23, xmm23, xmm23, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm24, xmm24, xmm24, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm25, xmm25, xmm25, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm26, xmm26, xmm26, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm27, xmm27, xmm27, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm28, xmm28, xmm28, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm29, xmm29, xmm29, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm30, xmm30, xmm30, Assembler::AVX_512bit);
|
|
__ cmpl(rounds, 44);
|
|
__ jcc(Assembler::belowEqual, EXIT);
|
|
__ evpxorq(xmm18, xmm18, xmm18, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm5, xmm5, xmm5, Assembler::AVX_512bit);
|
|
__ cmpl(rounds, 52);
|
|
__ jcc(Assembler::belowEqual, EXIT);
|
|
__ evpxorq(xmm6, xmm6, xmm6, Assembler::AVX_512bit);
|
|
__ evpxorq(xmm7, xmm7, xmm7, Assembler::AVX_512bit);
|
|
__ bind(EXIT);
|
|
}
|
|
|
|
// Carry-less multiply of GH by HK on all 512 bits, followed by a reduction that uses the
// constant at ghash_polynomial_reduction_addr(). The result replaces GH; HK is only read.
// Scratch: xmm0, xmm1, xmm2 are overwritten, and r15 is handed to evmovdquq as rscratch.
void StubGenerator::gfmul_avx512(XMMRegister GH, XMMRegister HK) {
  const XMMRegister prod_hi = xmm0;   // upper half of the double-width product
  const XMMRegister scratch = xmm1;   // low*low product first, reduction temporary afterwards
  const XMMRegister aux     = xmm2;   // one cross product first, reduction constant afterwards

  // Four partial products; bit 0 of the immediate selects the qword of GH, bit 4 the qword of HK
  __ evpclmulqdq(prod_hi, GH, HK, 0x11, Assembler::AVX_512bit);
  __ evpclmulqdq(scratch, GH, HK, 0x00, Assembler::AVX_512bit);
  __ evpclmulqdq(aux, GH, HK, 0x01, Assembler::AVX_512bit);
  __ evpclmulqdq(GH, GH, HK, 0x10, Assembler::AVX_512bit);

  // Sum the two cross products, then fold their upper qword into prod_hi and
  // their lower qword (moved up by 8 bytes) into the low half, which ends up in GH
  __ evpxorq(GH, GH, aux, Assembler::AVX_512bit);
  __ vpsrldq(aux, GH, 8, Assembler::AVX_512bit);
  __ vpslldq(GH, GH, 8, Assembler::AVX_512bit);
  __ evpxorq(prod_hi, prod_hi, aux, Assembler::AVX_512bit);
  __ evpxorq(GH, GH, scratch, Assembler::AVX_512bit);

  // Reduction: 'aux' now carries the reduction constant
  __ evmovdquq(aux, ExternalAddress(ghash_polynomial_reduction_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
  // first phase
  __ evpclmulqdq(scratch, aux, GH, 0x01, Assembler::AVX_512bit);
  __ vpslldq(scratch, scratch, 8, Assembler::AVX_512bit);
  __ evpxorq(GH, GH, scratch, Assembler::AVX_512bit);
  // second phase
  __ evpclmulqdq(scratch, aux, GH, 0x00, Assembler::AVX_512bit);
  __ vpsrldq(scratch, scratch, 4, Assembler::AVX_512bit);
  __ evpclmulqdq(GH, aux, GH, 0x10, Assembler::AVX_512bit);
  __ vpslldq(GH, GH, 4, Assembler::AVX_512bit);
  // truth table 0x96 is a three-way XOR: GH = GH ^ prod_hi ^ scratch
  __ vpternlogq(GH, 0x96, prod_hi, scratch, Assembler::AVX_512bit);
}
|
|
|
|
// Holds 64 Htbl entries, 32 HKey and 32 HkKey (derived from HKey)
|
|
void StubGenerator::generateHtbl_32_blocks_avx512(Register htbl, Register avx512_htbl) {
|
|
const XMMRegister HK = xmm6;
|
|
const XMMRegister ZT1 = xmm0, ZT2 = xmm1, ZT3 = xmm2, ZT4 = xmm3;
|
|
const XMMRegister ZT5 = xmm4, ZT6 = xmm5, ZT7 = xmm7, ZT8 = xmm8;
|
|
const XMMRegister ZT10 = xmm10, ZT11 = xmm11, ZT12 = xmm12;
|
|
|
|
__ movdqu(HK, Address(htbl, 0));
|
|
__ movdqu(ZT10, ExternalAddress(ghash_long_swap_mask_addr()), r15);
|
|
__ vpshufb(HK, HK, ZT10, Assembler::AVX_128bit);
|
|
__ movdqu(ZT11, ExternalAddress(ghash_polynomial_addr()), r15);
|
|
__ movdqu(ZT12, ExternalAddress(ghash_polynomial_two_one_addr()), r15);
|
|
// Compute H ^ 2 from the input subkeyH
|
|
__ movdqu(ZT3, HK);
|
|
__ vpsllq(HK, HK, 1, Assembler::AVX_128bit);
|
|
__ vpsrlq(ZT3, ZT3, 63, Assembler::AVX_128bit);
|
|
__ movdqu(ZT2, ZT3);
|
|
__ vpslldq(ZT3, ZT3, 8, Assembler::AVX_128bit);
|
|
__ vpsrldq(ZT2, ZT2, 8, Assembler::AVX_128bit);
|
|
__ vpor(HK, HK, ZT3, Assembler::AVX_128bit);
|
|
__ vpshufd(ZT3, ZT2, 0x24, Assembler::AVX_128bit);
|
|
__ vpcmpeqd(ZT3, ZT3, ZT12, Assembler::AVX_128bit);
|
|
__ vpand(ZT3, ZT3, ZT11, Assembler::AVX_128bit);
|
|
__ vpxor(HK, HK, ZT3, Assembler::AVX_128bit);
|
|
__ movdqu(Address(avx512_htbl, 16 * 31), HK); // H ^ 2
|
|
|
|
__ movdqu(ZT5, HK);
|
|
__ evinserti64x2(ZT7, ZT7, HK, 3, Assembler::AVX_512bit);
|
|
|
|
//calculate HashKey ^ 2 << 1 mod poly
|
|
gfmul_avx512(ZT5, HK);
|
|
__ movdqu(Address(avx512_htbl, 16 * 30), ZT5);
|
|
__ evinserti64x2(ZT7, ZT7, ZT5, 2, Assembler::AVX_512bit);
|
|
|
|
//calculate HashKey ^ 3 << 1 mod poly
|
|
gfmul_avx512(ZT5, HK);
|
|
__ movdqu(Address(avx512_htbl, 16 * 29), ZT5);
|
|
__ evinserti64x2(ZT7, ZT7, ZT5, 1, Assembler::AVX_512bit);
|
|
|
|
//calculate HashKey ^ 4 << 1 mod poly
|
|
gfmul_avx512(ZT5, HK);
|
|
__ movdqu(Address(avx512_htbl, 16 * 28), ZT5);
|
|
__ evinserti64x2(ZT7, ZT7, ZT5, 0, Assembler::AVX_512bit);
|
|
// ZT5 amd ZT7 to be cleared(hash key)
|
|
//calculate HashKeyK = HashKey x POLY
|
|
__ evmovdquq(xmm11, ExternalAddress(ghash_polynomial_addr()), Assembler::AVX_512bit, r15);
|
|
__ evpclmulqdq(ZT1, ZT7, xmm11, 0x10, Assembler::AVX_512bit);
|
|
__ vpshufd(ZT2, ZT7, 78, Assembler::AVX_512bit);
|
|
__ evpxorq(ZT1, ZT1, ZT2, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(avx512_htbl, 16 * 60), ZT1, Assembler::AVX_512bit);
|
|
//**ZT1 amd ZT2 to be cleared(hash key)
|
|
|
|
//switch to 4x128 - bit computations now
|
|
__ evshufi64x2(ZT5, ZT5, ZT5, 0x00, Assembler::AVX_512bit); //;; broadcast HashKey ^ 4 across all ZT5
|
|
__ evmovdquq(ZT8, ZT7, Assembler::AVX_512bit);//; save HashKey ^ 4 to HashKey ^ 1 in ZT8
|
|
//**ZT8 to be cleared(hash key)
|
|
|
|
//calculate HashKey ^ 5 << 1 mod poly, HashKey ^ 6 << 1 mod poly, ... HashKey ^ 8 << 1 mod poly
|
|
gfmul_avx512(ZT7, ZT5);
|
|
__ evmovdquq(Address(avx512_htbl, 16 * 24), ZT7, Assembler::AVX_512bit);//; HashKey ^ 8 to HashKey ^ 5 in ZT7 now
|
|
|
|
//calculate HashKeyX = HashKey x POLY
|
|
__ evpclmulqdq(ZT1, ZT7, xmm11, 0x10, Assembler::AVX_512bit);
|
|
__ vpshufd(ZT2, ZT7, 78, Assembler::AVX_512bit);
|
|
__ evpxorq(ZT1, ZT1, ZT2, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(avx512_htbl, 16 * 56), ZT1, Assembler::AVX_512bit);
|
|
|
|
__ evshufi64x2(ZT5, ZT7, ZT7, 0x00, Assembler::AVX_512bit);//;; broadcast HashKey ^ 8 across all ZT5
|
|
|
|
for (int i = 20, j = 52; i > 0;) {
|
|
gfmul_avx512(ZT8, ZT5);
|
|
__ evmovdquq(Address(avx512_htbl, 16 * i), ZT8, Assembler::AVX_512bit);
|
|
//calculate HashKeyK = HashKey x POLY
|
|
__ evpclmulqdq(ZT1, ZT8, xmm11, 0x10, Assembler::AVX_512bit);
|
|
__ vpshufd(ZT2, ZT8, 78, Assembler::AVX_512bit);
|
|
__ evpxorq(ZT1, ZT1, ZT2, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(avx512_htbl, 16 * j), ZT1, Assembler::AVX_512bit);
|
|
|
|
i -= 4;
|
|
j -= 4;
|
|
//compute HashKey ^ (8 + n), HashKey ^ (7 + n), ... HashKey ^ (5 + n)
|
|
gfmul_avx512(ZT7, ZT5);
|
|
__ evmovdquq(Address(avx512_htbl, 16 * i), ZT7, Assembler::AVX_512bit);
|
|
|
|
//calculate HashKeyK = HashKey x POLY
|
|
__ evpclmulqdq(ZT1, ZT7, xmm11, 0x10, Assembler::AVX_512bit);
|
|
__ vpshufd(ZT2, ZT7, 78, Assembler::AVX_512bit);
|
|
__ evpxorq(ZT1, ZT1, ZT2, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(avx512_htbl, 16 * j), ZT1, Assembler::AVX_512bit);
|
|
|
|
i -= 4;
|
|
j -= 4;
|
|
}
|
|
}
|
|
|
|
// XORs the four 128-bit lanes of 'reg' together; the result ends up in the low 128 bits of 'reg'.
// 'tmp' is overwritten. Expands to four statements, each already terminated by ';'.
#define vhpxori4x128(reg, tmp) \
  __ vextracti64x4(tmp, reg, 1); \
  __ evpxorq(reg, reg, tmp, Assembler::AVX_256bit); \
  __ vextracti32x4(tmp, reg, 1); \
  __ evpxorq(reg, reg, tmp, Assembler::AVX_128bit);
|
|
|
|
// Applies one vaesenc step with round key 'key' to each of the four 512-bit registers dst1..dst4.
// Expands to four statements, each already terminated by ';'.
#define roundEncode(key, dst1, dst2, dst3, dst4) \
  __ vaesenc(dst1, dst1, key, Assembler::AVX_512bit); \
  __ vaesenc(dst2, dst2, key, Assembler::AVX_512bit); \
  __ vaesenc(dst3, dst3, key, Assembler::AVX_512bit); \
  __ vaesenc(dst4, dst4, key, Assembler::AVX_512bit);
|
|
|
|
// Emits the final AES encryption round (vaesenclast) with round key 'key' on each
// of the four 512-bit registers dst1..dst4, in place.
// Wrapped in do { } while (0) so that the macro expands to a single statement.
#define lastroundEncode(key, dst1, dst2, dst3, dst4) \
  do { \
    __ vaesenclast(dst1, dst1, key, Assembler::AVX_512bit); \
    __ vaesenclast(dst2, dst2, key, Assembler::AVX_512bit); \
    __ vaesenclast(dst3, dst3, key, Assembler::AVX_512bit); \
    __ vaesenclast(dst4, dst4, key, Assembler::AVX_512bit); \
  } while (0)
|
|
|
|
// Stores the four 512-bit registers src1..src4 to 256 consecutive bytes starting
// at dst + position (64 bytes per register).
// Wrapped in do { } while (0) so that the macro expands to a single statement.
#define storeData(dst, position, src1, src2, src3, src4) \
  do { \
    __ evmovdquq(Address(dst, position, Address::times_1, 0 * 64), src1, Assembler::AVX_512bit); \
    __ evmovdquq(Address(dst, position, Address::times_1, 1 * 64), src2, Assembler::AVX_512bit); \
    __ evmovdquq(Address(dst, position, Address::times_1, 2 * 64), src3, Assembler::AVX_512bit); \
    __ evmovdquq(Address(dst, position, Address::times_1, 3 * 64), src4, Assembler::AVX_512bit); \
  } while (0)
|
|
|
|
// Loads 256 consecutive bytes starting at src + position into the four 512-bit
// registers dst1..dst4 (64 bytes per register).
// Wrapped in do { } while (0) so that the macro expands to a single statement.
#define loadData(src, position, dst1, dst2, dst3, dst4) \
  do { \
    __ evmovdquq(dst1, Address(src, position, Address::times_1, 0 * 64), Assembler::AVX_512bit); \
    __ evmovdquq(dst2, Address(src, position, Address::times_1, 1 * 64), Assembler::AVX_512bit); \
    __ evmovdquq(dst3, Address(src, position, Address::times_1, 2 * 64), Assembler::AVX_512bit); \
    __ evmovdquq(dst4, Address(src, position, Address::times_1, 3 * 64), Assembler::AVX_512bit); \
  } while (0)
|
|
|
|
// Emits the four carry-less multiplies used for one group of GHASH blocks:
//   dst00 = clmul(ghdata, hkey2, 0x00)    dst01 = clmul(ghdata, hkey2, 0x10)
//   dst10 = clmul(ghdata, hkey1, 0x01)    dst11 = clmul(ghdata, hkey1, 0x11)
// The immediate selects which 64-bit half of each source operand takes part in
// the multiplication (see the PCLMULQDQ instruction reference).
// Wrapped in do { } while (0) so that the macro expands to a single statement.
#define carrylessMultiply(dst00, dst01, dst10, dst11, ghdata, hkey2, hkey1) \
  do { \
    __ evpclmulqdq(dst00, ghdata, hkey2, 0x00, Assembler::AVX_512bit); \
    __ evpclmulqdq(dst01, ghdata, hkey2, 0x10, Assembler::AVX_512bit); \
    __ evpclmulqdq(dst10, ghdata, hkey1, 0x01, Assembler::AVX_512bit); \
    __ evpclmulqdq(dst11, ghdata, hkey1, 0x11, Assembler::AVX_512bit); \
  } while (0)
|
|
|
|
// Byte-shuffles src0..src3 with the mask register 'shufmask' into dst0..dst3
// (512-bit vpshufb; source and destination may be the same register).
// Wrapped in do { } while (0) so that the macro expands to a single statement.
#define shuffle(dst0, dst1, dst2, dst3, src0, src1, src2, src3, shufmask) \
  do { \
    __ vpshufb(dst0, src0, shufmask, Assembler::AVX_512bit); \
    __ vpshufb(dst1, src1, shufmask, Assembler::AVX_512bit); \
    __ vpshufb(dst2, src2, shufmask, Assembler::AVX_512bit); \
    __ vpshufb(dst3, src3, shufmask, Assembler::AVX_512bit); \
  } while (0)
|
|
|
|
// XORs src0..src3 into dst0..dst3 (dstN ^= srcN, 512 bits each).
// Wrapped in do { } while (0) so that the macro expands to a single statement.
#define xorBeforeStore(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \
  do { \
    __ evpxorq(dst0, dst0, src0, Assembler::AVX_512bit); \
    __ evpxorq(dst1, dst1, src1, Assembler::AVX_512bit); \
    __ evpxorq(dst2, dst2, src2, Assembler::AVX_512bit); \
    __ evpxorq(dst3, dst3, src3, Assembler::AVX_512bit); \
  } while (0)
|
|
|
|
// Three-way XOR accumulation: dstN ^= srcN2 ^ srcN3 for N = 0..3.
// vpternlogq with truth table 0x96 computes the XOR of its three operands.
// The four updates are emitted in order, so the same register may be named as
// more than one destination.
// Wrapped in do { } while (0) so that the macro expands to a single statement.
#define xorGHASH(dst0, dst1, dst2, dst3, src02, src03, src12, src13, src22, src23, src32, src33) \
  do { \
    __ vpternlogq(dst0, 0x96, src02, src03, Assembler::AVX_512bit); \
    __ vpternlogq(dst1, 0x96, src12, src13, Assembler::AVX_512bit); \
    __ vpternlogq(dst2, 0x96, src22, src23, Assembler::AVX_512bit); \
    __ vpternlogq(dst3, 0x96, src32, src33, Assembler::AVX_512bit); \
  } while (0)
|
|
|
|
//schoolbook multiply of 16 blocks(8 x 16 bytes)
|
|
//it is assumed that data read is already shuffledand
|
|
void StubGenerator::ghash16_avx512(bool start_ghash, bool do_reduction, bool uload_shuffle, bool hk_broadcast, bool do_hxor,
|
|
Register in, Register pos, Register subkeyHtbl, XMMRegister HASH, XMMRegister SHUFM, int in_offset,
|
|
int in_disp, int displacement, int hashkey_offset) {
|
|
const XMMRegister ZTMP0 = xmm0;
|
|
const XMMRegister ZTMP1 = xmm3;
|
|
const XMMRegister ZTMP2 = xmm4;
|
|
const XMMRegister ZTMP3 = xmm5;
|
|
const XMMRegister ZTMP4 = xmm6;
|
|
const XMMRegister ZTMP5 = xmm7;
|
|
const XMMRegister ZTMP6 = xmm10;
|
|
const XMMRegister ZTMP7 = xmm11;
|
|
const XMMRegister ZTMP8 = xmm12;
|
|
const XMMRegister ZTMP9 = xmm13;
|
|
const XMMRegister ZTMPA = xmm26;
|
|
const XMMRegister ZTMPB = xmm23;
|
|
const XMMRegister GH = xmm24;
|
|
const XMMRegister GL = xmm25;
|
|
const int hkey_gap = 16 * 32;
|
|
|
|
if (uload_shuffle) {
|
|
__ evmovdquq(ZTMP9, Address(subkeyHtbl, in_offset * 16 + in_disp), Assembler::AVX_512bit);
|
|
__ vpshufb(ZTMP9, ZTMP9, SHUFM, Assembler::AVX_512bit);
|
|
} else {
|
|
__ evmovdquq(ZTMP9, Address(subkeyHtbl, in_offset * 16 + in_disp), Assembler::AVX_512bit);
|
|
}
|
|
|
|
if (start_ghash) {
|
|
__ evpxorq(ZTMP9, ZTMP9, HASH, Assembler::AVX_512bit);
|
|
}
|
|
if (hk_broadcast) {
|
|
__ evbroadcastf64x2(ZTMP8, Address(subkeyHtbl, hashkey_offset + displacement + 0 * 64), Assembler::AVX_512bit);
|
|
__ evbroadcastf64x2(ZTMPA, Address(subkeyHtbl, hashkey_offset + displacement + hkey_gap + 0 * 64), Assembler::AVX_512bit);
|
|
} else {
|
|
__ evmovdquq(ZTMP8, Address(subkeyHtbl, hashkey_offset + displacement + 0 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(ZTMPA, Address(subkeyHtbl, hashkey_offset + displacement + hkey_gap + 0 * 64), Assembler::AVX_512bit);
|
|
}
|
|
|
|
carrylessMultiply(ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP9, ZTMPA, ZTMP8);
|
|
|
|
//ghash blocks 4 - 7
|
|
if (uload_shuffle) {
|
|
__ evmovdquq(ZTMP9, Address(subkeyHtbl, in_offset * 16 + in_disp + 64), Assembler::AVX_512bit);
|
|
__ vpshufb(ZTMP9, ZTMP9, SHUFM, Assembler::AVX_512bit);
|
|
} else {
|
|
__ evmovdquq(ZTMP9, Address(subkeyHtbl, in_offset * 16 + in_disp + 64), Assembler::AVX_512bit);
|
|
}
|
|
|
|
if (hk_broadcast) {
|
|
__ evbroadcastf64x2(ZTMP8, Address(subkeyHtbl, hashkey_offset + displacement + 1 * 64), Assembler::AVX_512bit);;
|
|
__ evbroadcastf64x2(ZTMPA, Address(subkeyHtbl, hashkey_offset + displacement + hkey_gap + 1 * 64), Assembler::AVX_512bit);
|
|
} else {
|
|
__ evmovdquq(ZTMP8, Address(subkeyHtbl, hashkey_offset + displacement + 1 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(ZTMPA, Address(subkeyHtbl, hashkey_offset + displacement + hkey_gap + 1 * 64), Assembler::AVX_512bit);
|
|
}
|
|
|
|
carrylessMultiply(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP9, ZTMPA, ZTMP8);
|
|
|
|
//update sums
|
|
if (start_ghash) {
|
|
__ evpxorq(GL, ZTMP0, ZTMP2, Assembler::AVX_512bit);//T2 = THL + TLL
|
|
__ evpxorq(GH, ZTMP1, ZTMP3, Assembler::AVX_512bit);//T1 = THH + TLH
|
|
} else { //mid, end, end_reduce
|
|
__ vpternlogq(GL, 0x96, ZTMP0, ZTMP2, Assembler::AVX_512bit);//T2 = THL + TLL
|
|
__ vpternlogq(GH, 0x96, ZTMP1, ZTMP3, Assembler::AVX_512bit);//T1 = THH + TLH
|
|
}
|
|
//ghash blocks 8 - 11
|
|
if (uload_shuffle) {
|
|
__ evmovdquq(ZTMP9, Address(subkeyHtbl, in_offset * 16 + in_disp + 128), Assembler::AVX_512bit);
|
|
__ vpshufb(ZTMP9, ZTMP9, SHUFM, Assembler::AVX_512bit);
|
|
} else {
|
|
__ evmovdquq(ZTMP9, Address(subkeyHtbl, in_offset * 16 + in_disp + 128), Assembler::AVX_512bit);
|
|
}
|
|
if (hk_broadcast) {
|
|
__ evbroadcastf64x2(ZTMP8, Address(subkeyHtbl, hashkey_offset + displacement + 2 * 64), Assembler::AVX_512bit);
|
|
__ evbroadcastf64x2(ZTMPA, Address(subkeyHtbl, hashkey_offset + displacement + hkey_gap + 2 * 64), Assembler::AVX_512bit);
|
|
} else {
|
|
__ evmovdquq(ZTMP8, Address(subkeyHtbl, hashkey_offset + displacement + 2 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(ZTMPA, Address(subkeyHtbl, hashkey_offset + displacement + hkey_gap + 2 * 64), Assembler::AVX_512bit);
|
|
}
|
|
|
|
carrylessMultiply(ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP9, ZTMPA, ZTMP8);
|
|
|
|
//update sums
|
|
__ vpternlogq(GL, 0x96, ZTMP6, ZTMP4, Assembler::AVX_512bit);//T2 = THL + TLL
|
|
__ vpternlogq(GH, 0x96, ZTMP7, ZTMP5, Assembler::AVX_512bit);//T1 = THH + TLH
|
|
//ghash blocks 12 - 15
|
|
if (uload_shuffle) {
|
|
__ evmovdquq(ZTMP9, Address(subkeyHtbl, in_offset * 16 + in_disp + 192), Assembler::AVX_512bit);
|
|
__ vpshufb(ZTMP9, ZTMP9, SHUFM, Assembler::AVX_512bit);
|
|
} else {
|
|
__ evmovdquq(ZTMP9, Address(subkeyHtbl, in_offset * 16 + in_disp + 192), Assembler::AVX_512bit);
|
|
}
|
|
|
|
if (hk_broadcast) {
|
|
__ evbroadcastf64x2(ZTMP8, Address(subkeyHtbl, hashkey_offset + displacement + 3 * 64), Assembler::AVX_512bit);
|
|
__ evbroadcastf64x2(ZTMPA, Address(subkeyHtbl, hashkey_offset + displacement + hkey_gap + 3 * 64), Assembler::AVX_512bit);
|
|
} else {
|
|
__ evmovdquq(ZTMP8, Address(subkeyHtbl, hashkey_offset + displacement + 3 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(ZTMPA, Address(subkeyHtbl, hashkey_offset + displacement + hkey_gap + 3 * 64), Assembler::AVX_512bit);
|
|
}
|
|
carrylessMultiply(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP9, ZTMPA, ZTMP8);
|
|
|
|
//update sums
|
|
xorGHASH(GL, GH, GL, GH, ZTMP0, ZTMP2, ZTMP1, ZTMP3, ZTMP6, ZTMP4, ZTMP7, ZTMP5);
|
|
|
|
if (do_reduction) {
|
|
//new reduction
|
|
__ evmovdquq(ZTMPB, ExternalAddress(ghash_polynomial_addr()), Assembler::AVX_512bit, rbx /*rscratch*/);
|
|
__ evpclmulqdq(HASH, GL, ZTMPB, 0x10, Assembler::AVX_512bit);
|
|
__ vpshufd(ZTMP0, GL, 78, Assembler::AVX_512bit);
|
|
__ vpternlogq(HASH, 0x96, GH, ZTMP0, Assembler::AVX_512bit);
|
|
if (do_hxor) {
|
|
vhpxori4x128(HASH, ZTMP0);
|
|
}
|
|
}
|
|
}
|
|
|
|
//Stitched GHASH of 16 blocks(with reduction) with encryption of 0 blocks
|
|
void StubGenerator::gcm_enc_dec_last_avx512(Register len, Register in, Register pos, XMMRegister HASH, XMMRegister SHUFM, Register subkeyHtbl,
|
|
int ghashin_offset, int hashkey_offset, bool start_ghash, bool do_reduction) {
|
|
//there is 0 blocks to cipher so there are only 16 blocks for ghash and reduction
|
|
ghash16_avx512(start_ghash, do_reduction, false, false, true, in, pos, subkeyHtbl, HASH, SHUFM, ghashin_offset, 0, 0, hashkey_offset);
|
|
}
|
|
|
|
//Main GCM macro stitching cipher with GHASH
|
|
//encrypts 16 blocks at a time
|
|
//ghash the 16 previously encrypted ciphertext blocks
|
|
void StubGenerator::ghash16_encrypt_parallel16_avx512(Register in, Register out, Register ct, Register pos, Register avx512_subkeyHtbl,
|
|
Register CTR_CHECK, Register NROUNDS, Register key, XMMRegister CTR_BE, XMMRegister GHASH_IN,
|
|
XMMRegister ADDBE_4x4, XMMRegister ADDBE_1234, XMMRegister ADD_1234, XMMRegister SHFMSK,
|
|
bool hk_broadcast, bool is_hash_start, bool do_hash_reduction, bool do_hash_hxor,
|
|
bool no_ghash_in, int ghashin_offset, int aesout_offset, int hashkey_offset) {
|
|
const XMMRegister B00_03 = xmm0;
|
|
const XMMRegister B04_07 = xmm3;
|
|
const XMMRegister B08_11 = xmm4;
|
|
const XMMRegister B12_15 = xmm5;
|
|
const XMMRegister THH1 = xmm6;
|
|
const XMMRegister THL1 = xmm7;
|
|
const XMMRegister TLH1 = xmm10;
|
|
const XMMRegister TLL1 = xmm11, THH2 = xmm12, THL2 = xmm13, TLH2 = xmm15;
|
|
const XMMRegister TLL2 = xmm16, THH3 = xmm17, THL3 = xmm19, TLH3 = xmm20;
|
|
const XMMRegister TLL3 = xmm21, DATA1 = xmm17, DATA2 = xmm19, DATA3 = xmm20, DATA4 = xmm21;
|
|
const XMMRegister AESKEY1 = xmm30, AESKEY2 = xmm31;
|
|
const XMMRegister GHKEY1 = xmm1, GHKEY2 = xmm18, GHDAT1 = xmm8, GHDAT2 = xmm22;
|
|
const XMMRegister ZT = xmm23, TO_REDUCE_L = xmm25, TO_REDUCE_H = xmm24;
|
|
const int hkey_gap = 16 * 32;
|
|
|
|
Label blocks_overflow, blocks_ok, skip_shuffle, cont, aes_256, aes_192, last_aes_rnd;
|
|
|
|
__ cmpb(CTR_CHECK, (256 - 16));
|
|
__ jcc(Assembler::aboveEqual, blocks_overflow);
|
|
__ vpaddd(B00_03, CTR_BE, ADDBE_1234, Assembler::AVX_512bit);
|
|
__ vpaddd(B04_07, B00_03, ADDBE_4x4, Assembler::AVX_512bit);
|
|
__ vpaddd(B08_11, B04_07, ADDBE_4x4, Assembler::AVX_512bit);
|
|
__ vpaddd(B12_15, B08_11, ADDBE_4x4, Assembler::AVX_512bit);
|
|
__ jmp(blocks_ok);
|
|
__ bind(blocks_overflow);
|
|
__ vpshufb(CTR_BE, CTR_BE, SHFMSK, Assembler::AVX_512bit);
|
|
__ evmovdquq(B12_15, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, rbx /*rscratch*/);
|
|
__ vpaddd(B00_03, CTR_BE, ADD_1234, Assembler::AVX_512bit);
|
|
__ vpaddd(B04_07, B00_03, B12_15, Assembler::AVX_512bit);
|
|
__ vpaddd(B08_11, B04_07, B12_15, Assembler::AVX_512bit);
|
|
__ vpaddd(B12_15, B08_11, B12_15, Assembler::AVX_512bit);
|
|
shuffle(B00_03, B04_07, B08_11, B12_15, B00_03, B04_07, B08_11, B12_15, SHFMSK);
|
|
|
|
__ bind(blocks_ok);
|
|
|
|
//pre - load constants
|
|
ev_load_key(AESKEY1, key, 0, rbx);
|
|
if (!no_ghash_in) {
|
|
__ evpxorq(GHDAT1, GHASH_IN, Address(avx512_subkeyHtbl, 16 * ghashin_offset), Assembler::AVX_512bit);
|
|
} else {
|
|
__ evmovdquq(GHDAT1, Address(avx512_subkeyHtbl, 16 * ghashin_offset), Assembler::AVX_512bit);
|
|
}
|
|
|
|
if (hk_broadcast) {
|
|
__ evbroadcastf64x2(GHKEY1, Address(avx512_subkeyHtbl, hashkey_offset + 0 * 64), Assembler::AVX_512bit);
|
|
__ evbroadcastf64x2(GHKEY2, Address(avx512_subkeyHtbl, hashkey_offset + hkey_gap + 0 * 64), Assembler::AVX_512bit);
|
|
} else {
|
|
__ evmovdquq(GHKEY1, Address(avx512_subkeyHtbl, hashkey_offset + 0 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(GHKEY2, Address(avx512_subkeyHtbl, hashkey_offset + hkey_gap + 0 * 64), Assembler::AVX_512bit);
|
|
}
|
|
|
|
//save counter for the next round
|
|
//increment counter overflow check register
|
|
__ evshufi64x2(CTR_BE, B12_15, B12_15, 255, Assembler::AVX_512bit);
|
|
__ addb(CTR_CHECK, 16);
|
|
|
|
//pre - load constants
|
|
ev_load_key(AESKEY2, key, 1 * 16, rbx);
|
|
__ evmovdquq(GHDAT2, Address(avx512_subkeyHtbl, 16 * (ghashin_offset +4)), Assembler::AVX_512bit);
|
|
|
|
//stitch AES rounds with GHASH
|
|
//AES round 0
|
|
__ evpxorq(B00_03, B00_03, AESKEY1, Assembler::AVX_512bit);
|
|
__ evpxorq(B04_07, B04_07, AESKEY1, Assembler::AVX_512bit);
|
|
__ evpxorq(B08_11, B08_11, AESKEY1, Assembler::AVX_512bit);
|
|
__ evpxorq(B12_15, B12_15, AESKEY1, Assembler::AVX_512bit);
|
|
ev_load_key(AESKEY1, key, 2 * 16, rbx);
|
|
|
|
//GHASH 4 blocks(15 to 12)
|
|
carrylessMultiply(TLL1, TLH1, THL1, THH1, GHDAT1, GHKEY2, GHKEY1);
|
|
|
|
if (hk_broadcast) {
|
|
__ evbroadcastf64x2(GHKEY1, Address(avx512_subkeyHtbl, hashkey_offset + 1 * 64), Assembler::AVX_512bit);
|
|
__ evbroadcastf64x2(GHKEY2, Address(avx512_subkeyHtbl, hashkey_offset + hkey_gap + 1 * 64), Assembler::AVX_512bit);
|
|
} else {
|
|
__ evmovdquq(GHKEY1, Address(avx512_subkeyHtbl, hashkey_offset + 1 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(GHKEY2, Address(avx512_subkeyHtbl, hashkey_offset + hkey_gap + 1 * 64), Assembler::AVX_512bit);
|
|
}
|
|
|
|
__ evmovdquq(GHDAT1, Address(avx512_subkeyHtbl, 16 * (ghashin_offset + 8)), Assembler::AVX_512bit);
|
|
|
|
//AES round 1
|
|
roundEncode(AESKEY2, B00_03, B04_07, B08_11, B12_15);
|
|
|
|
ev_load_key(AESKEY2, key, 3 * 16, rbx);
|
|
|
|
//GHASH 4 blocks(11 to 8)
|
|
carrylessMultiply(TLL2, TLH2, THL2, THH2, GHDAT2, GHKEY2, GHKEY1);
|
|
|
|
if (hk_broadcast) {
|
|
__ evbroadcastf64x2(GHKEY1, Address(avx512_subkeyHtbl, hashkey_offset + 2 * 64), Assembler::AVX_512bit);
|
|
__ evbroadcastf64x2(GHKEY2, Address(avx512_subkeyHtbl, hashkey_offset + hkey_gap + 2 * 64), Assembler::AVX_512bit);
|
|
} else {
|
|
__ evmovdquq(GHKEY1, Address(avx512_subkeyHtbl, hashkey_offset + 2 * 64 ), Assembler::AVX_512bit);
|
|
__ evmovdquq(GHKEY2, Address(avx512_subkeyHtbl, hashkey_offset + hkey_gap + 2 * 64), Assembler::AVX_512bit);
|
|
}
|
|
__ evmovdquq(GHDAT2, Address(avx512_subkeyHtbl, 16 * (ghashin_offset + 12)), Assembler::AVX_512bit);
|
|
|
|
//AES round 2
|
|
roundEncode(AESKEY1, B00_03, B04_07, B08_11, B12_15);
|
|
ev_load_key(AESKEY1, key, 4 * 16, rbx);
|
|
|
|
//GHASH 4 blocks(7 to 4)
|
|
carrylessMultiply(TLL3, TLH3, THL3, THH3, GHDAT1, GHKEY2, GHKEY1);
|
|
|
|
if (hk_broadcast) {
|
|
__ evbroadcastf64x2(GHKEY1, Address(avx512_subkeyHtbl, hashkey_offset + 3 * 64), Assembler::AVX_512bit);
|
|
__ evbroadcastf64x2(GHKEY2, Address(avx512_subkeyHtbl, hashkey_offset + hkey_gap + 3 * 64), Assembler::AVX_512bit);
|
|
} else {
|
|
__ evmovdquq(GHKEY1, Address(avx512_subkeyHtbl, hashkey_offset + 3 * 64), Assembler::AVX_512bit);
|
|
__ evmovdquq(GHKEY2, Address(avx512_subkeyHtbl, hashkey_offset + hkey_gap + 3 * 64), Assembler::AVX_512bit);
|
|
}
|
|
|
|
//AES rounds 3
|
|
roundEncode(AESKEY2, B00_03, B04_07, B08_11, B12_15);
|
|
ev_load_key(AESKEY2, key, 5 * 16, rbx);
|
|
|
|
//Gather(XOR) GHASH for 12 blocks
|
|
xorGHASH(TLL1, TLH1, THL1, THH1, TLL2, TLL3, TLH2, TLH3, THL2, THL3, THH2, THH3);
|
|
|
|
//AES rounds 4
|
|
roundEncode(AESKEY1, B00_03, B04_07, B08_11, B12_15);
|
|
ev_load_key(AESKEY1, key, 6 * 16, rbx);
|
|
|
|
//load plain / cipher text(recycle GH3xx registers)
|
|
loadData(in, pos, DATA1, DATA2, DATA3, DATA4);
|
|
|
|
//AES rounds 5
|
|
roundEncode(AESKEY2, B00_03, B04_07, B08_11, B12_15);
|
|
ev_load_key(AESKEY2, key, 7 * 16, rbx);
|
|
|
|
//GHASH 4 blocks(3 to 0)
|
|
carrylessMultiply(TLL2, TLH2, THL2, THH2, GHDAT2, GHKEY2, GHKEY1);
|
|
|
|
//AES round 6
|
|
roundEncode(AESKEY1, B00_03, B04_07, B08_11, B12_15);
|
|
ev_load_key(AESKEY1, key, 8 * 16, rbx);
|
|
|
|
//gather GHASH in TO_REDUCE_H / L
|
|
if (is_hash_start) {
|
|
__ evpxorq(TO_REDUCE_L, TLL2, THL2, Assembler::AVX_512bit);
|
|
__ evpxorq(TO_REDUCE_H, THH2, TLH2, Assembler::AVX_512bit);
|
|
__ vpternlogq(TO_REDUCE_L, 0x96, TLL1, THL1, Assembler::AVX_512bit);
|
|
__ vpternlogq(TO_REDUCE_H, 0x96, THH1, TLH1, Assembler::AVX_512bit);
|
|
} else {
|
|
//not the first round so sums need to be updated
|
|
xorGHASH(TO_REDUCE_L, TO_REDUCE_H, TO_REDUCE_L, TO_REDUCE_H, TLL2, THL2, THH2, TLH2, TLL1, THL1, THH1, TLH1);
|
|
}
|
|
|
|
//AES round 7
|
|
roundEncode(AESKEY2, B00_03, B04_07, B08_11, B12_15);
|
|
ev_load_key(AESKEY2, key, 9 * 16, rbx);
|
|
|
|
//new reduction
|
|
if (do_hash_reduction) {
|
|
__ evmovdquq(ZT, ExternalAddress(ghash_polynomial_reduction_addr()), Assembler::AVX_512bit, rbx /*rscratch*/);
|
|
__ evpclmulqdq(THH1, TO_REDUCE_L, ZT, 0x10, Assembler::AVX_512bit);
|
|
__ vpshufd(TO_REDUCE_L, TO_REDUCE_L, 78, Assembler::AVX_512bit);
|
|
__ vpternlogq(THH1, 0x96, TO_REDUCE_H, TO_REDUCE_L, Assembler::AVX_512bit);
|
|
}
|
|
|
|
//AES round 8
|
|
roundEncode(AESKEY1, B00_03, B04_07, B08_11, B12_15);
|
|
ev_load_key(AESKEY1, key, 10 * 16, rbx);
|
|
|
|
//horizontalxor of 4 reduced hashes
|
|
if (do_hash_hxor) {
|
|
vhpxori4x128(THH1, TLL1);
|
|
}
|
|
|
|
//AES round 9
|
|
roundEncode(AESKEY2, B00_03, B04_07, B08_11, B12_15);
|
|
//AES rounds up to 11 (AES192) or 13 (AES256)
|
|
//AES128 is done
|
|
__ cmpl(NROUNDS, 52);
|
|
__ jcc(Assembler::less, last_aes_rnd);
|
|
__ bind(aes_192);
|
|
ev_load_key(AESKEY2, key, 11 * 16, rbx);
|
|
roundEncode(AESKEY1, B00_03, B04_07, B08_11, B12_15);
|
|
ev_load_key(AESKEY1, key, 12 * 16, rbx);
|
|
roundEncode(AESKEY2, B00_03, B04_07, B08_11, B12_15);
|
|
__ cmpl(NROUNDS, 60);
|
|
__ jcc(Assembler::less, last_aes_rnd);
|
|
__ bind(aes_256);
|
|
ev_load_key(AESKEY2, key, 13 * 16, rbx);
|
|
roundEncode(AESKEY1, B00_03, B04_07, B08_11, B12_15);
|
|
ev_load_key(AESKEY1, key, 14 * 16, rbx);
|
|
roundEncode(AESKEY2, B00_03, B04_07, B08_11, B12_15);
|
|
|
|
__ bind(last_aes_rnd);
|
|
//the last AES round
|
|
lastroundEncode(AESKEY1, B00_03, B04_07, B08_11, B12_15);
|
|
//AESKEY1and AESKEY2 contain AES round keys
|
|
|
|
//XOR against plain / cipher text
|
|
xorBeforeStore(B00_03, B04_07, B08_11, B12_15, DATA1, DATA2, DATA3, DATA4);
|
|
|
|
//store cipher / plain text
|
|
storeData(out, pos, B00_03, B04_07, B08_11, B12_15);
|
|
//**B00_03, B04_07, B08_011, B12_B15 may contain sensitive data
|
|
|
|
//shuffle cipher text blocks for GHASH computation
|
|
__ cmpptr(ct, out);
|
|
__ jcc(Assembler::notEqual, skip_shuffle);
|
|
shuffle(B00_03, B04_07, B08_11, B12_15, B00_03, B04_07, B08_11, B12_15, SHFMSK);
|
|
__ jmp(cont);
|
|
__ bind(skip_shuffle);
|
|
shuffle(B00_03, B04_07, B08_11, B12_15, DATA1, DATA2, DATA3, DATA4, SHFMSK);
|
|
|
|
//**B00_03, B04_07, B08_011, B12_B15 overwritten with shuffled cipher text
|
|
__ bind(cont);
|
|
//store shuffled cipher text for ghashing
|
|
__ evmovdquq(Address(avx512_subkeyHtbl, 16 * aesout_offset), B00_03, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(avx512_subkeyHtbl, 16 * (aesout_offset + 4)), B04_07, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(avx512_subkeyHtbl, 16 * (aesout_offset + 8)), B08_11, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(avx512_subkeyHtbl, 16 * (aesout_offset + 12)), B12_15, Assembler::AVX_512bit);
|
|
}
|
|
|
|
|
|
//Encrypt / decrypt the initial 16 blocks
|
|
void StubGenerator::initial_blocks_16_avx512(Register in, Register out, Register ct, Register pos, Register key, Register avx512_subkeyHtbl,
|
|
Register CTR_CHECK, Register rounds, XMMRegister CTR, XMMRegister GHASH, XMMRegister ADDBE_4x4,
|
|
XMMRegister ADDBE_1234, XMMRegister ADD_1234, XMMRegister SHUF_MASK, int stack_offset) {
|
|
const XMMRegister B00_03 = xmm7;
|
|
const XMMRegister B04_07 = xmm10;
|
|
const XMMRegister B08_11 = xmm11;
|
|
const XMMRegister B12_15 = xmm12;
|
|
const XMMRegister T0 = xmm0;
|
|
const XMMRegister T1 = xmm3;
|
|
const XMMRegister T2 = xmm4;
|
|
const XMMRegister T3 = xmm5;
|
|
const XMMRegister T4 = xmm6;
|
|
const XMMRegister T5 = xmm30;
|
|
|
|
Label next_16_overflow, next_16_ok, cont, skip_shuffle, aes_256, aes_192, last_aes_rnd;
|
|
//prepare counter blocks
|
|
__ cmpb(CTR_CHECK, (256 - 16));
|
|
__ jcc(Assembler::aboveEqual, next_16_overflow);
|
|
__ vpaddd(B00_03, CTR, ADDBE_1234, Assembler::AVX_512bit);
|
|
__ vpaddd(B04_07, B00_03, ADDBE_4x4, Assembler::AVX_512bit);
|
|
__ vpaddd(B08_11, B04_07, ADDBE_4x4, Assembler::AVX_512bit);
|
|
__ vpaddd(B12_15, B08_11, ADDBE_4x4, Assembler::AVX_512bit);
|
|
__ jmp(next_16_ok);
|
|
__ bind(next_16_overflow);
|
|
__ vpshufb(CTR, CTR, SHUF_MASK, Assembler::AVX_512bit);
|
|
__ evmovdquq(B12_15, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, rbx);
|
|
__ vpaddd(B00_03, CTR, ADD_1234, Assembler::AVX_512bit);
|
|
__ vpaddd(B04_07, B00_03, B12_15, Assembler::AVX_512bit);
|
|
__ vpaddd(B08_11, B04_07, B12_15, Assembler::AVX_512bit);
|
|
__ vpaddd(B12_15, B08_11, B12_15, Assembler::AVX_512bit);
|
|
shuffle(B00_03, B04_07, B08_11, B12_15, B00_03, B04_07, B08_11, B12_15, SHUF_MASK);
|
|
__ bind(next_16_ok);
|
|
__ evshufi64x2(CTR, B12_15, B12_15, 255, Assembler::AVX_512bit);
|
|
__ addb(CTR_CHECK, 16);
|
|
|
|
//load 16 blocks of data
|
|
loadData(in, pos, T0, T1, T2, T3);
|
|
|
|
//move to AES encryption rounds
|
|
__ movdqu(T5, ExternalAddress(key_shuffle_mask_addr()), rbx /*rscratch*/);
|
|
ev_load_key(T4, key, 0, T5);
|
|
__ evpxorq(B00_03, B00_03, T4, Assembler::AVX_512bit);
|
|
__ evpxorq(B04_07, B04_07, T4, Assembler::AVX_512bit);
|
|
__ evpxorq(B08_11, B08_11, T4, Assembler::AVX_512bit);
|
|
__ evpxorq(B12_15, B12_15, T4, Assembler::AVX_512bit);
|
|
|
|
for (int i = 1; i < 10; i++) {
|
|
ev_load_key(T4, key, i * 16, T5);
|
|
roundEncode(T4, B00_03, B04_07, B08_11, B12_15);
|
|
}
|
|
|
|
ev_load_key(T4, key, 10 * 16, T5);
|
|
__ cmpl(rounds, 52);
|
|
__ jcc(Assembler::less, last_aes_rnd);
|
|
__ bind(aes_192);
|
|
roundEncode(T4, B00_03, B04_07, B08_11, B12_15);
|
|
ev_load_key(T4, key, 16 * 11, T5);
|
|
roundEncode(T4, B00_03, B04_07, B08_11, B12_15);
|
|
ev_load_key(T4, key, 16 * 12, T5);
|
|
__ cmpl(rounds, 60);
|
|
__ jcc(Assembler::less, last_aes_rnd);
|
|
__ bind(aes_256);
|
|
roundEncode(T4, B00_03, B04_07, B08_11, B12_15);
|
|
ev_load_key(T4, key, 16 * 13, T5);
|
|
roundEncode(T4, B00_03, B04_07, B08_11, B12_15);
|
|
ev_load_key(T4, key, 16 * 14, T5);
|
|
|
|
__ bind(last_aes_rnd);
|
|
lastroundEncode(T4, B00_03, B04_07, B08_11, B12_15);
|
|
|
|
//xor against text
|
|
xorBeforeStore(B00_03, B04_07, B08_11, B12_15, T0, T1, T2, T3);
|
|
|
|
//store
|
|
storeData(out, pos, B00_03, B04_07, B08_11, B12_15);
|
|
|
|
__ cmpptr(ct, out);
|
|
__ jcc(Assembler::equal, skip_shuffle);
|
|
//decryption - cipher text needs to go to GHASH phase
|
|
shuffle(B00_03, B04_07, B08_11, B12_15, T0, T1, T2, T3, SHUF_MASK);
|
|
__ jmp(cont);
|
|
__ bind(skip_shuffle);
|
|
shuffle(B00_03, B04_07, B08_11, B12_15, B00_03, B04_07, B08_11, B12_15, SHUF_MASK);
|
|
|
|
//B00_03, B04_07, B08_11, B12_15 overwritten with shuffled cipher text
|
|
__ bind(cont);
|
|
__ evmovdquq(Address(avx512_subkeyHtbl, 16 * stack_offset), B00_03, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(avx512_subkeyHtbl, 16 * (stack_offset + 4)), B04_07, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(avx512_subkeyHtbl, 16 * (stack_offset + 8)), B08_11, Assembler::AVX_512bit);
|
|
__ evmovdquq(Address(avx512_subkeyHtbl, 16 * (stack_offset + 12)), B12_15, Assembler::AVX_512bit);
|
|
}
|
|
|
|
// AES-GCM bulk encryption / decryption with stitched GHASH (AVX-512).
//
// Nothing is processed when len <= 256 (pos is left at 0). Otherwise the input
// is processed in chunks of 16 blocks (256 bytes): the counter at 'counter' and the
// GHASH state at 'state' are loaded, updated and written back, and pos (rax) is
// advanced by 256 for every chunk that has been processed.
//
//   in / out          - source and destination, indexed by pos
//   ct                - compared with 'out' by the helpers to decide which side
//                       holds the cipher text that has to be hashed
//   key               - expanded AES key; the number of rounds is derived from the
//                       length field of the key array
//   subkeyHtbl        - input of generateHtbl_32_blocks_avx512
//   avx512_subkeyHtbl - work table, layout described in the function body
//
// Register usage: rax (pos), r14 (CTR_CHECK), r15 (rounds), rbx (scratch);
// xmm2, xmm9, xmm14 and xmm27 - xmm29 are live across the helper calls, which use
// xmm0, xmm1, xmm3 - xmm8, xmm10 - xmm13, xmm15 - xmm26, xmm30 and xmm31 as scratch.
void StubGenerator::aesgcm_avx512(Register in, Register len, Register ct, Register out, Register key, Register state,
                                  Register subkeyHtbl, Register avx512_subkeyHtbl, Register counter) {
  Label ENC_DEC_DONE, MESG_BELOW_32_BLKS, NO_BIG_BLKS, ENCRYPT_BIG_BLKS_NO_HXOR,
        ENCRYPT_BIG_NBLKS, ENCRYPT_16_BLKS, ENCRYPT_N_GHASH_32_N_BLKS, GHASH_DONE;

  const XMMRegister CTR_BLOCKx = xmm2;
  const XMMRegister AAD_HASHx = xmm14;
  const XMMRegister ZTMP0 = xmm0;
  const XMMRegister ZTMP1 = xmm3;   //**sensitive
  const XMMRegister ZTMP2 = xmm4;   //**sensitive(small data)
  const XMMRegister ZTMP3 = xmm5;   //**sensitive(small data)
  const XMMRegister ZTMP4 = xmm6;   // reduced hash left by ghash16_encrypt_parallel16_avx512
  const XMMRegister ZTMP21 = xmm22;
  const XMMRegister LSWAP_MASK = xmm24;
  const XMMRegister SHUF_MASK = xmm29;
  const XMMRegister ADDBE_4x4 = xmm27;
  const XMMRegister ADDBE_1234 = xmm28;
  const XMMRegister ADD_1234 = xmm9;
  const Register pos = rax;
  const Register rounds = r15;
  const Register CTR_CHECK = r14;

  // table offsets in 16-byte entries
  const int stack_offset = 64;
  const int ghashin_offset = 64;
  const int aesout_offset = 64;
  // hash key offsets in bytes
  const int HashKey_32 = 0;
  const int HashKey_16 = 16 * 16;

  // Shorthand for the stitched 16-block routine: every call below only differs in
  // the five flags and the three offsets.
  auto encrypt16_ghash16 = [&](bool hk_broadcast, bool is_hash_start, bool do_hash_reduction, bool do_hash_hxor,
                               bool no_ghash_in, int ghash_in, int aes_out, int hkey) {
    ghash16_encrypt_parallel16_avx512(in, out, ct, pos, avx512_subkeyHtbl, CTR_CHECK, rounds, key, CTR_BLOCKx, AAD_HASHx,
                                      ADDBE_4x4, ADDBE_1234, ADD_1234, SHUF_MASK,
                                      hk_broadcast, is_hash_start, do_hash_reduction, do_hash_hxor,
                                      no_ghash_in, ghash_in, aes_out, hkey);
  };

  __ movl(pos, 0);
  __ cmpl(len, 256);
  __ jcc(Assembler::lessEqual, ENC_DEC_DONE);

  /* Structure of the Htbl is as follows (16-byte entries):
   * Entries 0 - 31 hold 32 HashKeys, entries 32 - 63 hold 32 HashKeyK values
   * (derived from the HashKeys). The following 32 entries hold two sets of 16
   * shuffled cipher text blocks that are kept for the GHASH computation.
   * ----------------------------------------------------------------------------------------
       Hashkey32  -> 16 * 0
       Hashkey31  -> 16 * 1
       Hashkey30  -> 16 * 2
       ........
       Hashkey1   -> 16 * 31
       ---------------------
       HashkeyK32 -> 16 * 32
       HashkeyK31 -> 16 * 33
       .........
       HashkeyK1  -> 16 * 63
       ---------------------
       1st set of AES Entries
       B00_03     -> 16 * 64
       B04_07     -> 16 * 68
       B08_11     -> 16 * 72
       B12_15     -> 16 * 76
       ---------------------
       2nd set of AES Entries
       B00_03     -> 16 * 80
       B04_07     -> 16 * 84
       B08_11     -> 16 * 88
       B12_15     -> 16 * 92
       ---------------------*/
  generateHtbl_32_blocks_avx512(subkeyHtbl, avx512_subkeyHtbl);

  //Move initial counter value and STATE value into variables
  __ movdqu(CTR_BLOCKx, Address(counter, 0));
  __ movdqu(AAD_HASHx, Address(state, 0));

  //Load lswap mask for ghash
  __ movdqu(LSWAP_MASK, ExternalAddress(ghash_long_swap_mask_addr()), rbx /*rscratch*/);
  //Shuffle input state using lswap mask
  __ vpshufb(AAD_HASHx, AAD_HASHx, LSWAP_MASK, Assembler::AVX_128bit);

  // Compute #rounds for AES based on the length of the key array
  __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

  __ evmovdquq(ADDBE_4x4, ExternalAddress(counter_mask_addbe_4444_addr()), Assembler::AVX_512bit, rbx /*rscratch*/);
  __ evmovdquq(ADDBE_1234, ExternalAddress(counter_mask_addbe_1234_addr()), Assembler::AVX_512bit, rbx /*rscratch*/);
  __ evmovdquq(SHUF_MASK, ExternalAddress(counter_shuffle_mask_addr()), Assembler::AVX_512bit, rbx /*rscratch*/);
  __ evmovdquq(ADD_1234, ExternalAddress(counter_mask_add_1234_addr()), Assembler::AVX_512bit, rbx /*rscratch*/);

  //Shuffle counter, subtract 1 from the pre-incremented counter value and broadcast counter value to 512 bit register
  __ vpshufb(CTR_BLOCKx, CTR_BLOCKx, SHUF_MASK, Assembler::AVX_128bit);
  __ vpsubd(CTR_BLOCKx, CTR_BLOCKx, ADD_1234, Assembler::AVX_128bit);
  __ evshufi64x2(CTR_BLOCKx, CTR_BLOCKx, CTR_BLOCKx, 0, Assembler::AVX_512bit);

  //Keep the low byte of the counter for the overflow check
  __ movdl(CTR_CHECK, CTR_BLOCKx);
  __ andl(CTR_CHECK, 255);

  // Reshuffle counter
  __ vpshufb(CTR_BLOCKx, CTR_BLOCKx, SHUF_MASK, Assembler::AVX_512bit);

  //First 16 blocks, cipher text kept in the 1st set of AES entries
  initial_blocks_16_avx512(in, out, ct, pos, key, avx512_subkeyHtbl, CTR_CHECK, rounds, CTR_BLOCKx, AAD_HASHx, ADDBE_4x4, ADDBE_1234, ADD_1234, SHUF_MASK, stack_offset);
  __ addl(pos, 16 * 16);
  __ cmpl(len, 32 * 16);
  __ jcc(Assembler::below, MESG_BELOW_32_BLKS);

  //Next 16 blocks, cipher text kept in the 2nd set of AES entries
  initial_blocks_16_avx512(in, out, ct, pos, key, avx512_subkeyHtbl, CTR_CHECK, rounds, CTR_BLOCKx, AAD_HASHx, ADDBE_4x4, ADDBE_1234, ADD_1234, SHUF_MASK, stack_offset + 16);
  __ addl(pos, 16 * 16);
  __ subl(len, 32 * 16);

  __ cmpl(len, 32 * 16);
  __ jcc(Assembler::below, NO_BIG_BLKS);

  //Loop over 32 blocks per iteration while at least 64 blocks remain; the hash is
  //reduced every iteration but the horizontal XOR is skipped
  __ bind(ENCRYPT_BIG_BLKS_NO_HXOR);
  __ cmpl(len, 2 * 32 * 16);
  __ jcc(Assembler::below, ENCRYPT_BIG_NBLKS);
  encrypt16_ghash16(true, true, false, false, false, ghashin_offset, aesout_offset, HashKey_32);
  __ addl(pos, 16 * 16);

  encrypt16_ghash16(true, false, true, false, true, ghashin_offset + 16, aesout_offset + 16, HashKey_16);
  __ evmovdquq(AAD_HASHx, ZTMP4, Assembler::AVX_512bit);
  __ addl(pos, 16 * 16);
  __ subl(len, 32 * 16);
  __ jmp(ENCRYPT_BIG_BLKS_NO_HXOR);

  //Last pass over 32 blocks; reduction and horizontal XOR are both done
  __ bind(ENCRYPT_BIG_NBLKS);
  encrypt16_ghash16(false, true, false, false, false, ghashin_offset, aesout_offset, HashKey_32);
  __ addl(pos, 16 * 16);
  encrypt16_ghash16(false, false, true, true, true, ghashin_offset + 16, aesout_offset + 16, HashKey_16);

  __ movdqu(AAD_HASHx, ZTMP4);
  __ addl(pos, 16 * 16);
  __ subl(len, 32 * 16);

  __ bind(NO_BIG_BLKS);
  __ cmpl(len, 16 * 16);
  __ jcc(Assembler::aboveEqual, ENCRYPT_16_BLKS);

  //Fewer than 16 blocks remain: only GHASH the 32 stored cipher text blocks
  __ bind(ENCRYPT_N_GHASH_32_N_BLKS);
  ghash16_avx512(true, false, false, false, true, in, pos, avx512_subkeyHtbl, AAD_HASHx, SHUF_MASK, stack_offset, 0, 0, HashKey_32);
  gcm_enc_dec_last_avx512(len, in, pos, AAD_HASHx, SHUF_MASK, avx512_subkeyHtbl, ghashin_offset + 16, HashKey_16, false, true);
  __ jmp(GHASH_DONE);

  //At least 16 blocks remain: encrypt 16 more while hashing the 1st stored set,
  //then hash the 2nd stored set
  __ bind(ENCRYPT_16_BLKS);
  encrypt16_ghash16(false, true, false, false, false, ghashin_offset, aesout_offset, HashKey_32);

  ghash16_avx512(false, true, false, false, true, in, pos, avx512_subkeyHtbl, AAD_HASHx, SHUF_MASK, stack_offset, 16 * 16, 0, HashKey_16);
  __ addl(pos, 16 * 16);

  //GHASH the last 16 stored cipher text blocks
  __ bind(MESG_BELOW_32_BLKS);
  __ subl(len, 16 * 16);
  gcm_enc_dec_last_avx512(len, in, pos, AAD_HASHx, SHUF_MASK, avx512_subkeyHtbl, ghashin_offset, HashKey_16, true, true);

  __ bind(GHASH_DONE);
  //Pre-increment counter for next operation, make sure that counter value is incremented on the LSB
  __ vpshufb(CTR_BLOCKx, CTR_BLOCKx, SHUF_MASK, Assembler::AVX_128bit);
  __ vpaddd(CTR_BLOCKx, CTR_BLOCKx, ADD_1234, Assembler::AVX_128bit);
  __ vpshufb(CTR_BLOCKx, CTR_BLOCKx, SHUF_MASK, Assembler::AVX_128bit);
  __ movdqu(Address(counter, 0), CTR_BLOCKx);
  //Load ghash lswap mask
  __ movdqu(LSWAP_MASK, ExternalAddress(ghash_long_swap_mask_addr()), rbx /*rscratch*/);
  //Shuffle ghash using lswap mask and store it
  __ vpshufb(AAD_HASHx, AAD_HASHx, LSWAP_MASK, Assembler::AVX_128bit);
  __ movdqu(Address(state, 0), AAD_HASHx);

  //Zero out sensitive data
  __ evpxorq(ZTMP21, ZTMP21, ZTMP21, Assembler::AVX_512bit);
  __ evpxorq(ZTMP0, ZTMP0, ZTMP0, Assembler::AVX_512bit);
  __ evpxorq(ZTMP1, ZTMP1, ZTMP1, Assembler::AVX_512bit);
  __ evpxorq(ZTMP2, ZTMP2, ZTMP2, Assembler::AVX_512bit);
  __ evpxorq(ZTMP3, ZTMP3, ZTMP3, Assembler::AVX_512bit);

  __ bind(ENC_DEC_DONE);
}
|
|
|
|
//Implements data * hashkey mod (128, 127, 126, 121, 0)
|
|
//Inputs:
|
|
//GH and HK - 128 bits each
|
|
//Output:
|
|
//GH = GH * Hashkey mod poly
|
|
//Temp registers: xmm1, xmm2, xmm3, r15
|
|
void StubGenerator::gfmul_avx2(XMMRegister GH, XMMRegister HK) {
|
|
const XMMRegister T1 = xmm1;
|
|
const XMMRegister T2 = xmm2;
|
|
const XMMRegister T3 = xmm3;
|
|
|
|
__ vpclmulqdq(T1, GH, HK, 0x11); // %%T1 = a1*b1
|
|
__ vpclmulqdq(T2, GH, HK, 0x00); // %%T2 = a0*b0
|
|
__ vpclmulqdq(T3, GH, HK, 0x01); // %%T3 = a1*b0
|
|
__ vpclmulqdq(GH, GH, HK, 0x10); // %%GH = a0*b1
|
|
__ vpxor(GH, GH, T3, Assembler::AVX_128bit);
|
|
|
|
__ vpsrldq(T3, GH, 8, Assembler::AVX_128bit); // shift-R %%GH 2 DWs
|
|
__ vpslldq(GH, GH, 8, Assembler::AVX_128bit); // shift-L %%GH 2 DWs
|
|
|
|
__ vpxor(T1, T1, T3, Assembler::AVX_128bit);
|
|
__ vpxor(GH, GH, T2, Assembler::AVX_128bit);
|
|
|
|
//first phase of the reduction
|
|
__ movdqu(T3, ExternalAddress(ghash_polynomial_reduction_addr()), r15 /*rscratch*/);
|
|
__ vpclmulqdq(T2, T3, GH, 0x01);
|
|
__ vpslldq(T2, T2, 8, Assembler::AVX_128bit); // shift-L %%T2 2 DWs
|
|
|
|
__ vpxor(GH, GH, T2, Assembler::AVX_128bit); // first phase of the reduction complete
|
|
//second phase of the reduction
|
|
__ vpclmulqdq(T2, T3, GH, 0x00);
|
|
__ vpsrldq(T2, T2, 4, Assembler::AVX_128bit); // shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
|
|
|
|
__ vpclmulqdq(GH, T3, GH, 0x10);
|
|
__ vpslldq(GH, GH, 4, Assembler::AVX_128bit); // shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
|
|
|
|
__ vpxor(GH, GH, T2, Assembler::AVX_128bit); // second phase of the reduction complete
|
|
__ vpxor(GH, GH, T1, Assembler::AVX_128bit); // the result is in %%GH
|
|
}
|
|
|
|
//Generate 8 constants from the given subkeyH.
|
|
//Input:
|
|
//htbl - table containing the initial subkeyH
|
|
//Output:
|
|
//htbl - containing 8 H constants
|
|
//Temp registers: xmm0, xmm1, xmm2, xmm3, xmm6, xmm11, xmm12, r15, rbx
|
|
void StubGenerator::generateHtbl_8_block_avx2(Register htbl) {
|
|
const XMMRegister HK = xmm6;
|
|
|
|
__ movdqu(HK, Address(htbl, 0));
|
|
__ movdqu(xmm1, ExternalAddress(ghash_long_swap_mask_addr()), rbx /*rscratch*/);
|
|
__ vpshufb(HK, HK, xmm1, Assembler::AVX_128bit);
|
|
|
|
__ movdqu(xmm11, ExternalAddress(ghash_polynomial_addr()), rbx /*rscratch*/);
|
|
__ movdqu(xmm12, ExternalAddress(ghash_polynomial_two_one_addr()), rbx /*rscratch*/);
|
|
// Compute H ^ 2 from the input subkeyH
|
|
__ vpsrlq(xmm1, xmm6, 63, Assembler::AVX_128bit);
|
|
__ vpsllq(xmm6, xmm6, 1, Assembler::AVX_128bit);
|
|
__ vpslldq(xmm2, xmm1, 8, Assembler::AVX_128bit);
|
|
__ vpsrldq(xmm1, xmm1, 8, Assembler::AVX_128bit);
|
|
|
|
__ vpor(xmm6, xmm6, xmm2, Assembler::AVX_128bit);
|
|
|
|
__ vpshufd(xmm2, xmm1, 0x24, Assembler::AVX_128bit);
|
|
__ vpcmpeqd(xmm2, xmm2, xmm12, Assembler::AVX_128bit);
|
|
__ vpand(xmm2, xmm2, xmm11, Assembler::AVX_128bit);
|
|
__ vpxor(xmm6, xmm6, xmm2, Assembler::AVX_128bit);
|
|
__ movdqu(Address(htbl, 1 * 16), xmm6); // H * 2
|
|
__ movdqu(xmm0, xmm6);
|
|
for (int i = 2; i < 9; i++) {
|
|
gfmul_avx2(xmm6, xmm0);
|
|
__ movdqu(Address(htbl, i * 16), xmm6);
|
|
}
|
|
}
|
|
|
|
// Emits one AES encryption round (aesenc) with round key t_key on each of the
// eight registers that the names xmm1..xmm8 resolve to at the point of
// expansion. t_key is expanded eight times, so pass a plain register name.
// The body is wrapped in do { } while (0) so that an expansion followed by ';'
// is exactly one statement, even under an unbraced if/for.
#define aesenc_step_avx2(t_key) \
  do {                          \
    __ aesenc(xmm1, t_key);     \
    __ aesenc(xmm2, t_key);     \
    __ aesenc(xmm3, t_key);     \
    __ aesenc(xmm4, t_key);     \
    __ aesenc(xmm5, t_key);     \
    __ aesenc(xmm6, t_key);     \
    __ aesenc(xmm7, t_key);     \
    __ aesenc(xmm8, t_key);     \
  } while (0)
|
|
|
|
// Emits the four carry-less partial products of ghdata and hkey and XORs them
// into fixed accumulator registers:
//   xmm12 ^= high x high (0x11)
//   xmm15 ^= low x low   (0x00)
//   xmm14 ^= both cross products (0x01 and 0x10)
// xmm11 is used as scratch. ghdata and hkey are expanded several times and
// must not be any of xmm11, xmm12, xmm14 or xmm15.
// The body is wrapped in do { } while (0) so that an expansion followed by ';'
// is exactly one statement, even under an unbraced if/for.
#define ghash_step_avx2(ghdata, hkey)                        \
  do {                                                       \
    __ vpclmulqdq(xmm11, ghdata, hkey, 0x11);                \
    __ vpxor(xmm12, xmm12, xmm11, Assembler::AVX_128bit);    \
    __ vpclmulqdq(xmm11, ghdata, hkey, 0x00);                \
    __ vpxor(xmm15, xmm15, xmm11, Assembler::AVX_128bit);    \
    __ vpclmulqdq(xmm11, ghdata, hkey, 0x01);                \
    __ vpxor(xmm14, xmm14, xmm11, Assembler::AVX_128bit);    \
    __ vpclmulqdq(xmm11, ghdata, hkey, 0x10);                \
    __ vpxor(xmm14, xmm14, xmm11, Assembler::AVX_128bit);    \
  } while (0)
|
|
|
|
//Encrypts and hashes 8 blocks in an interleaved fashion.
|
|
//Inputs:
|
|
//key - key for aes operations
|
|
//subkeyHtbl - table containing H constants
|
|
//ctr_blockx - counter for aes operations
|
|
//in - input buffer
|
|
//out - output buffer
|
|
//ct - ciphertext buffer
|
|
//pos - holds the length processed in this method
|
|
//in_order - boolean that indicates if incrementing counter without shuffling is needed
|
|
//rounds - number of aes rounds calculated based on key length
|
|
//xmm1-xmm8 - holds encrypted counter values
|
|
//Outputs:
|
|
//xmm1-xmm8 - updated encrypted counter values
|
|
//ctr_blockx - updated counter value
|
|
//out - updated output buffer
|
|
//Temp registers: xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, rbx
|
|
void StubGenerator::ghash8_encrypt8_parallel_avx2(Register key, Register subkeyHtbl, XMMRegister ctr_blockx, Register in,
|
|
Register out, Register ct, Register pos, bool in_order, Register rounds,
|
|
XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
|
|
XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, XMMRegister xmm8) {
|
|
const XMMRegister t1 = xmm0;
|
|
const XMMRegister t2 = xmm10;
|
|
const XMMRegister t3 = xmm11;
|
|
const XMMRegister t4 = xmm12;
|
|
const XMMRegister t5 = xmm13;
|
|
const XMMRegister t6 = xmm14;
|
|
const XMMRegister t7 = xmm15;
|
|
Label skip_reload, last_aes_rnd, aes_192, aes_256;
|
|
|
|
__ movdqu(t2, xmm1);
|
|
for (int i = 0; i <= 6; i++) {
|
|
__ movdqu(Address(rsp, 16 * i), as_XMMRegister(i + 2));
|
|
}
|
|
|
|
if (in_order) {
|
|
__ vpaddd(xmm1, ctr_blockx, ExternalAddress(counter_mask_linc1_addr()), Assembler::AVX_128bit, rbx /*rscratch*/); //Increment counter by 1
|
|
__ movdqu(t5, ExternalAddress(counter_mask_linc2_addr()), rbx /*rscratch*/);
|
|
__ vpaddd(xmm2, ctr_blockx, t5, Assembler::AVX_128bit);
|
|
for (int rnum = 1; rnum <= 6; rnum++) {
|
|
__ vpaddd(as_XMMRegister(rnum + 2), as_XMMRegister(rnum), t5, Assembler::AVX_128bit);
|
|
}
|
|
__ movdqu(ctr_blockx, xmm8);
|
|
|
|
__ movdqu(t5, ExternalAddress(counter_shuffle_mask_addr()), rbx /*rscratch*/);
|
|
for (int rnum = 1; rnum <= 8; rnum++) {
|
|
__ vpshufb(as_XMMRegister(rnum), as_XMMRegister(rnum), t5, Assembler::AVX_128bit); //perform a 16Byte swap
|
|
}
|
|
} else {
|
|
__ vpaddd(xmm1, ctr_blockx, ExternalAddress(counter_mask_linc1f_addr()), Assembler::AVX_128bit, rbx /*rscratch*/); //Increment counter by 1
|
|
__ vmovdqu(t5, ExternalAddress(counter_mask_linc2f_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
|
|
__ vpaddd(xmm2, ctr_blockx, t5, Assembler::AVX_128bit);
|
|
for (int rnum = 1; rnum <= 6; rnum++) {
|
|
__ vpaddd(as_XMMRegister(rnum + 2), as_XMMRegister(rnum), t5, Assembler::AVX_128bit);
|
|
}
|
|
__ movdqu(ctr_blockx, xmm8);
|
|
}
|
|
|
|
load_key(t1, key, 16 * 0, rbx /*rscratch*/);
|
|
for (int rnum = 1; rnum <= 8; rnum++) {
|
|
__ vpxor(as_XMMRegister(rnum), as_XMMRegister(rnum), t1, Assembler::AVX_128bit);
|
|
}
|
|
|
|
load_key(t1, key, 16 * 1, rbx /*rscratch*/);
|
|
aesenc_step_avx2(t1);
|
|
|
|
load_key(t1, key, 16 * 2, rbx /*rscratch*/);
|
|
aesenc_step_avx2(t1);
|
|
|
|
__ movdqu(t5, (Address(subkeyHtbl, 8 * 16)));
|
|
__ vpclmulqdq(t4, t2, t5, 0x11); //t4 = a1*b1
|
|
__ vpclmulqdq(t7, t2, t5, 0x00); //t7 = a0*b0
|
|
__ vpclmulqdq(t6, t2, t5, 0x01); //t6 = a1*b0
|
|
__ vpclmulqdq(t5, t2, t5, 0x10); //t5 = a0*b1
|
|
__ vpxor(t6, t6, t5, Assembler::AVX_128bit);
|
|
|
|
for (int i = 3, j = 0; i <= 8; i++, j++) {
|
|
load_key(t1, key, 16 * i, rbx /*rscratch*/);
|
|
aesenc_step_avx2(t1);
|
|
__ movdqu(t1, Address(rsp, 16 * j));
|
|
__ movdqu(t5, (Address(subkeyHtbl, (7 - j) * 16)));
|
|
ghash_step_avx2(t1, t5);
|
|
}
|
|
|
|
load_key(t1, key, 16 * 9, rbx /*rscratch*/);
|
|
aesenc_step_avx2(t1);
|
|
|
|
__ movdqu(t1, Address(rsp, 16 * 6));
|
|
__ movdqu(t5, (Address(subkeyHtbl, 1 * 16)));
|
|
|
|
__ vpclmulqdq(t3, t1, t5, 0x00);
|
|
__ vpxor(t7, t7, t3, Assembler::AVX_128bit);
|
|
|
|
__ vpclmulqdq(t3, t1, t5, 0x01);
|
|
__ vpxor(t6, t6, t3, Assembler::AVX_128bit);
|
|
|
|
__ vpclmulqdq(t3, t1, t5, 0x10);
|
|
__ vpxor(t6, t6, t3, Assembler::AVX_128bit);
|
|
|
|
__ vpclmulqdq(t3, t1, t5, 0x11);
|
|
__ vpxor(t1, t4, t3, Assembler::AVX_128bit);
|
|
|
|
__ vpslldq(t3, t6, 8, Assembler::AVX_128bit); //shift-L t3 2 DWs
|
|
__ vpsrldq(t6, t6, 8, Assembler::AVX_128bit); //shift-R t2 2 DWs
|
|
__ vpxor(t7, t7, t3, Assembler::AVX_128bit);
|
|
__ vpxor(t1, t1, t6, Assembler::AVX_128bit); // accumulate the results in t1:t7
|
|
|
|
load_key(t5, key, 16 * 10, rbx /*rscratch*/);
|
|
__ cmpl(rounds, 52);
|
|
__ jcc(Assembler::less, last_aes_rnd);
|
|
|
|
__ bind(aes_192);
|
|
aesenc_step_avx2(t5);
|
|
load_key(t5, key, 16 * 11, rbx /*rscratch*/);
|
|
aesenc_step_avx2(t5);
|
|
load_key(t5, key, 16 * 12, rbx /*rscratch*/);
|
|
__ cmpl(rounds, 60);
|
|
__ jcc(Assembler::less, last_aes_rnd);
|
|
|
|
__ bind(aes_256);
|
|
aesenc_step_avx2(t5);
|
|
load_key(t5, key, 16 * 13, rbx /*rscratch*/);
|
|
aesenc_step_avx2(t5);
|
|
load_key(t5, key, 16 * 14, rbx /*rscratch*/);
|
|
__ bind(last_aes_rnd);
|
|
for (int rnum = 1; rnum <= 8; rnum++) {
|
|
__ aesenclast(as_XMMRegister(rnum), t5);
|
|
}
|
|
|
|
for (int i = 0; i <= 7; i++) {
|
|
__ movdqu(t2, Address(in, pos, Address::times_1, 16 * i));
|
|
__ vpxor(as_XMMRegister(i + 1), as_XMMRegister(i + 1), t2, Assembler::AVX_128bit);
|
|
}
|
|
|
|
//first phase of the reduction
|
|
__ vmovdqu(t3, ExternalAddress(ghash_polynomial_reduction_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
|
|
|
|
__ vpclmulqdq(t2, t3, t7, 0x01);
|
|
__ vpslldq(t2, t2, 8, Assembler::AVX_128bit); //shift-L xmm2 2 DWs
|
|
|
|
__ vpxor(t7, t7, t2, Assembler::AVX_128bit); //first phase of the reduction complete
|
|
|
|
//Write to the Ciphertext buffer
|
|
for (int i = 0; i <= 7; i++) {
|
|
__ movdqu(Address(out, pos, Address::times_1, 16 * i), as_XMMRegister(i + 1));
|
|
}
|
|
|
|
__ cmpptr(ct, out);
|
|
__ jcc(Assembler::equal, skip_reload);
|
|
for (int i = 0; i <= 7; i++) {
|
|
__ movdqu(as_XMMRegister(i + 1), Address(in, pos, Address::times_1, 16 * i));
|
|
}
|
|
|
|
__ bind(skip_reload);
|
|
//second phase of the reduction
|
|
__ vpclmulqdq(t2, t3, t7, 0x00);
|
|
__ vpsrldq(t2, t2, 4, Assembler::AVX_128bit); //shift-R t2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
|
|
|
|
__ vpclmulqdq(t4, t3, t7, 0x10);
|
|
__ vpslldq(t4, t4, 4, Assembler::AVX_128bit); //shift-L t4 1 DW (Shift-L 1-DW to obtain result with no shifts)
|
|
__ vpxor(t4, t4, t2, Assembler::AVX_128bit); //second phase of the reduction complete
|
|
__ vpxor(t1, t1, t4, Assembler::AVX_128bit); //the result is in t1
|
|
|
|
//perform a 16Byte swap
|
|
__ movdqu(t7, ExternalAddress(counter_shuffle_mask_addr()), rbx /*rscratch*/);
|
|
for (int rnum = 1; rnum <= 8; rnum++) {
|
|
__ vpshufb(as_XMMRegister(rnum), as_XMMRegister(rnum), t7, Assembler::AVX_128bit);
|
|
}
|
|
__ vpxor(xmm1, xmm1, t1, Assembler::AVX_128bit);
|
|
}
|
|
|
|
//GHASH the last 8 ciphertext blocks.
|
|
//Input:
|
|
//subkeyHtbl - table containing H constants
|
|
//Output:
|
|
//xmm14 - calculated aad hash
|
|
//Temp registers: xmm0, xmm10, xmm11, xmm12, xmm13, xmm15, rbx
|
|
void StubGenerator::ghash_last_8_avx2(Register subkeyHtbl) {
|
|
const XMMRegister t1 = xmm0;
|
|
const XMMRegister t2 = xmm10;
|
|
const XMMRegister t3 = xmm11;
|
|
const XMMRegister t4 = xmm12;
|
|
const XMMRegister t5 = xmm13;
|
|
const XMMRegister t6 = xmm14;
|
|
const XMMRegister t7 = xmm15;
|
|
|
|
//Karatsuba Method
|
|
__ movdqu(t5, Address(subkeyHtbl, 8 * 16));
|
|
|
|
__ vpshufd(t2, xmm1, 78, Assembler::AVX_128bit);
|
|
__ vpshufd(t3, t5, 78, Assembler::AVX_128bit);
|
|
__ vpxor(t2, t2, xmm1, Assembler::AVX_128bit);
|
|
__ vpxor(t3, t3, t5, Assembler::AVX_128bit);
|
|
|
|
__ vpclmulqdq(t6, xmm1, t5, 0x11);
|
|
__ vpclmulqdq(t7, xmm1, t5, 0x00);
|
|
|
|
__ vpclmulqdq(xmm1, t2, t3, 0x00);
|
|
|
|
for (int i = 7, rnum = 2; rnum <= 8; i--, rnum++) {
|
|
__ movdqu(t5, Address(subkeyHtbl, i * 16));
|
|
__ vpshufd(t2, as_XMMRegister(rnum), 78, Assembler::AVX_128bit);
|
|
__ vpshufd(t3, t5, 78, Assembler::AVX_128bit);
|
|
__ vpxor(t2, t2, as_XMMRegister(rnum), Assembler::AVX_128bit);
|
|
__ vpxor(t3, t3, t5, Assembler::AVX_128bit);
|
|
__ vpclmulqdq(t4, as_XMMRegister(rnum), t5, 0x11);
|
|
__ vpxor(t6, t6, t4, Assembler::AVX_128bit);
|
|
__ vpclmulqdq(t4, as_XMMRegister(rnum), t5, 0x00);
|
|
__ vpxor(t7, t7, t4, Assembler::AVX_128bit);
|
|
__ vpclmulqdq(t2, t2, t3, 0x00);
|
|
__ vpxor(xmm1, xmm1, t2, Assembler::AVX_128bit);
|
|
}
|
|
|
|
__ vpxor(xmm1, xmm1, t6, Assembler::AVX_128bit);
|
|
__ vpxor(t2, xmm1, t7, Assembler::AVX_128bit);
|
|
|
|
__ vpslldq(t4, t2, 8, Assembler::AVX_128bit);
|
|
__ vpsrldq(t2, t2, 8, Assembler::AVX_128bit);
|
|
|
|
__ vpxor(t7, t7, t4, Assembler::AVX_128bit);
|
|
__ vpxor(t6, t6, t2, Assembler::AVX_128bit); //<t6:t7> holds the result of the accumulated carry-less multiplications
|
|
|
|
//first phase of the reduction
|
|
__ movdqu(t3, ExternalAddress(ghash_polynomial_reduction_addr()), rbx /*rscratch*/);
|
|
|
|
__ vpclmulqdq(t2, t3, t7, 0x01);
|
|
__ vpslldq(t2, t2, 8, Assembler::AVX_128bit); // shift-L t2 2 DWs
|
|
|
|
__ vpxor(t7, t7, t2, Assembler::AVX_128bit);//first phase of the reduction complete
|
|
|
|
//second phase of the reduction
|
|
__ vpclmulqdq(t2, t3, t7, 0x00);
|
|
__ vpsrldq(t2, t2, 4, Assembler::AVX_128bit); //shift-R t2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
|
|
|
|
__ vpclmulqdq(t4, t3, t7, 0x10);
|
|
__ vpslldq(t4, t4, 4, Assembler::AVX_128bit); //shift-L t4 1 DW (Shift-L 1-DW to obtain result with no shifts)
|
|
__ vpxor(t4, t4, t2, Assembler::AVX_128bit); //second phase of the reduction complete
|
|
__ vpxor(t6, t6, t4, Assembler::AVX_128bit); //the result is in t6
|
|
}
|
|
|
|
//Encrypt initial number of 8 blocks
|
|
//Inputs:
|
|
//ctr - counter for aes operations
|
|
//rounds - number of aes rounds calculated based on key length
|
|
//key - key for aes operations
|
|
//len - input length to be processed
|
|
//in - input buffer
|
|
//out - output buffer
|
|
//ct - ciphertext buffer
|
|
//aad_hashx - input aad hash
|
|
//pos - holds the length processed in this method
|
|
//Outputs:
|
|
//xmm1-xmm8 - holds updated encrypted counter values
|
|
//ctr - updated counter value
|
|
//pos - updated position
|
|
//len - updated length
|
|
//out - updated output buffer
|
|
//Temp registers: xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
|
|
void StubGenerator::initial_blocks_avx2(XMMRegister ctr, Register rounds, Register key, Register len, Register in,
|
|
Register out, Register ct, XMMRegister aad_hashx, Register pos) {
|
|
const XMMRegister t1 = xmm12;
|
|
const XMMRegister t2 = xmm13;
|
|
const XMMRegister t3 = xmm14;
|
|
const XMMRegister t4 = xmm15;
|
|
const XMMRegister t5 = xmm11;
|
|
const XMMRegister t6 = xmm10;
|
|
const XMMRegister t_key = xmm0;
|
|
|
|
Label skip_reload, last_aes_rnd, aes_192, aes_256;
|
|
//Move AAD_HASH to temp reg t3
|
|
__ movdqu(t3, aad_hashx);
|
|
//Prepare 8 counter blocks and perform rounds of AES cipher on
|
|
//them, load plain/cipher text and store cipher/plain text.
|
|
__ movdqu(xmm1, ctr);
|
|
__ movdqu(t5, ExternalAddress(counter_mask_linc1_addr()), rbx /*rscratch*/);
|
|
__ movdqu(t6, ExternalAddress(counter_mask_linc2_addr()), rbx /*rscratch*/ );
|
|
__ vpaddd(xmm2, xmm1, t5, Assembler::AVX_128bit);
|
|
for (int rnum = 1; rnum <= 6; rnum++) {
|
|
__ vpaddd(as_XMMRegister(rnum + 2), as_XMMRegister(rnum), t6, Assembler::AVX_128bit);
|
|
}
|
|
__ movdqu(ctr, xmm8);
|
|
|
|
__ movdqu(t5, ExternalAddress(counter_shuffle_mask_addr()), rbx /*rscratch*/);
|
|
for (int rnum = 1; rnum <= 8; rnum++) {
|
|
__ vpshufb(as_XMMRegister(rnum), as_XMMRegister(rnum), t5, Assembler::AVX_128bit); //perform a 16Byte swap
|
|
}
|
|
|
|
load_key(t_key, key, 16 * 0, rbx /*rscratch*/);
|
|
for (int rnum = 1; rnum <= 8; rnum++) {
|
|
__ vpxor(as_XMMRegister(rnum), as_XMMRegister(rnum), t_key, Assembler::AVX_128bit);
|
|
}
|
|
|
|
for (int i = 1; i <= 9; i++) {
|
|
load_key(t_key, key, 16 * i, rbx /*rscratch*/);
|
|
aesenc_step_avx2(t_key);
|
|
}
|
|
|
|
load_key(t_key, key, 16 * 10, rbx /*rscratch*/);
|
|
__ cmpl(rounds, 52);
|
|
__ jcc(Assembler::less, last_aes_rnd);
|
|
|
|
__ bind(aes_192);
|
|
aesenc_step_avx2(t_key);
|
|
load_key(t_key, key, 16 * 11, rbx /*rscratch*/);
|
|
aesenc_step_avx2(t_key);
|
|
load_key(t_key, key, 16 * 12, rbx /*rscratch*/);
|
|
__ cmpl(rounds, 60);
|
|
__ jcc(Assembler::less, last_aes_rnd);
|
|
|
|
__ bind(aes_256);
|
|
aesenc_step_avx2(t_key);
|
|
load_key(t_key, key, 16 * 13, rbx /*rscratch*/);
|
|
aesenc_step_avx2(t_key);
|
|
load_key(t_key, key, 16 * 14, rbx /*rscratch*/);
|
|
|
|
__ bind(last_aes_rnd);
|
|
for (int rnum = 1; rnum <= 8; rnum++) {
|
|
__ aesenclast(as_XMMRegister(rnum), t_key);
|
|
}
|
|
|
|
//XOR and store data
|
|
for (int i = 0; i <= 7; i++) {
|
|
__ movdqu(t1, Address(in, pos, Address::times_1, 16 * i));
|
|
__ vpxor(as_XMMRegister(i + 1), as_XMMRegister(i + 1), t1, Assembler::AVX_128bit);
|
|
__ movdqu(Address(out, pos, Address::times_1, 16 * i), as_XMMRegister(i + 1));
|
|
}
|
|
|
|
__ cmpptr(ct, out);
|
|
__ jcc(Assembler::equal, skip_reload);
|
|
for (int i = 0; i <= 7; i++) {
|
|
__ movdqu(as_XMMRegister(i + 1), Address(in, pos, Address::times_1, 16 * i));
|
|
}
|
|
|
|
__ bind(skip_reload);
|
|
//Update len with the number of blocks processed
|
|
__ subl(len, 128);
|
|
__ addl(pos, 128);
|
|
|
|
__ movdqu(t4, ExternalAddress(counter_shuffle_mask_addr()), rbx /*rscratch*/);
|
|
for (int rnum = 1; rnum <= 8; rnum++) {
|
|
__ vpshufb(as_XMMRegister(rnum), as_XMMRegister(rnum), t4, Assembler::AVX_128bit);
|
|
}
|
|
// Combine GHASHed value with the corresponding ciphertext
|
|
__ vpxor(xmm1, xmm1, t3, Assembler::AVX_128bit);
|
|
}
|
|
|
|
//AES-GCM interleaved implementation
|
|
//Inputs:
|
|
//in - input buffer
|
|
//len- message length to be processed
|
|
//ct - cipher text buffer
|
|
//out - output buffer
|
|
//key - key for aes operations
|
|
//state - address of aad hash for ghash computation
|
|
//subkeyHtbl- table consisting of H constants
|
|
//counter - address of counter for aes operations
|
|
//Output:
|
|
//(counter) - updated in memory counter value
|
|
//(state) - updated in memory aad hash
|
|
//rax - length processed
|
|
//(out) - output buffer updated
|
|
//len - updated length
|
|
//Temp registers: xmm0-xmm15, r10, r15, rbx
|
|
void StubGenerator::aesgcm_avx2(Register in, Register len, Register ct, Register out, Register key,
|
|
Register state, Register subkeyHtbl, Register counter) {
|
|
const Register pos = rax;
|
|
const Register rounds = r10;
|
|
const XMMRegister ctr_blockx = xmm9;
|
|
const XMMRegister aad_hashx = xmm8;
|
|
Label encrypt_done, encrypt_by_8_new, encrypt_by_8, exit;
|
|
|
|
//This routine should be called only for message sizes of 128 bytes or more.
|
|
//Macro flow:
|
|
//process 8 16 byte blocks in initial_num_blocks.
|
|
//process 8 16 byte blocks at a time until all are done 'encrypt_by_8_new followed by ghash_last_8'
|
|
__ xorl(pos, pos);
|
|
__ cmpl(len, 128);
|
|
__ jcc(Assembler::less, exit);
|
|
|
|
//Generate 8 constants for htbl
|
|
generateHtbl_8_block_avx2(subkeyHtbl);
|
|
|
|
//Compute #rounds for AES based on the length of the key array
|
|
__ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
|
|
|
|
//Load and shuffle state and counter values
|
|
__ movdqu(ctr_blockx, Address(counter, 0));
|
|
__ movdqu(aad_hashx, Address(state, 0));
|
|
__ vpshufb(ctr_blockx, ctr_blockx, ExternalAddress(counter_shuffle_mask_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
|
|
__ vpshufb(aad_hashx, aad_hashx, ExternalAddress(ghash_long_swap_mask_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
|
|
|
|
initial_blocks_avx2(ctr_blockx, rounds, key, len, in, out, ct, aad_hashx, pos);
|
|
|
|
//We need at least 128 bytes to proceed further.
|
|
__ cmpl(len, 128);
|
|
__ jcc(Assembler::less, encrypt_done);
|
|
|
|
//in_order vs. out_order is an optimization to increment the counter without shuffling
|
|
//it back into little endian. r15d keeps track of when we need to increment in order so
|
|
//that the carry is handled correctly.
|
|
__ movdl(r15, ctr_blockx);
|
|
__ andl(r15, 255);
|
|
__ vpshufb(ctr_blockx, ctr_blockx, ExternalAddress(counter_shuffle_mask_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
|
|
|
|
__ bind(encrypt_by_8_new);
|
|
__ cmpl(r15, 255 - 8);
|
|
__ jcc(Assembler::greater, encrypt_by_8);
|
|
|
|
__ addb(r15, 8);
|
|
ghash8_encrypt8_parallel_avx2(key, subkeyHtbl, ctr_blockx, in, out, ct, pos, false, rounds,
|
|
xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8);
|
|
__ addl(pos, 128);
|
|
__ subl(len, 128);
|
|
__ cmpl(len, 128);
|
|
__ jcc(Assembler::greaterEqual, encrypt_by_8_new);
|
|
|
|
__ vpshufb(ctr_blockx, ctr_blockx, ExternalAddress(counter_shuffle_mask_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
|
|
__ jmp(encrypt_done);
|
|
|
|
__ bind(encrypt_by_8);
|
|
__ vpshufb(ctr_blockx, ctr_blockx, ExternalAddress(counter_shuffle_mask_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
|
|
|
|
__ addb(r15, 8);
|
|
ghash8_encrypt8_parallel_avx2(key, subkeyHtbl, ctr_blockx, in, out, ct, pos, true, rounds,
|
|
xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8);
|
|
|
|
__ vpshufb(ctr_blockx, ctr_blockx, ExternalAddress(counter_shuffle_mask_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
|
|
__ addl(pos, 128);
|
|
__ subl(len, 128);
|
|
__ cmpl(len, 128);
|
|
__ jcc(Assembler::greaterEqual, encrypt_by_8_new);
|
|
__ vpshufb(ctr_blockx, ctr_blockx, ExternalAddress(counter_shuffle_mask_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
|
|
|
|
__ bind(encrypt_done);
|
|
ghash_last_8_avx2(subkeyHtbl);
|
|
|
|
__ vpaddd(ctr_blockx, ctr_blockx, ExternalAddress(counter_mask_linc1_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
|
|
__ vpshufb(ctr_blockx, ctr_blockx, ExternalAddress(counter_shuffle_mask_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
|
|
__ movdqu(Address(counter, 0), ctr_blockx); //current_counter = xmm9
|
|
__ vpshufb(xmm14, xmm14, ExternalAddress(ghash_long_swap_mask_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
|
|
__ movdqu(Address(state, 0), xmm14); //aad hash = xmm14
|
|
//Xor out round keys
|
|
__ vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
|
|
__ vpxor(xmm13, xmm13, xmm13, Assembler::AVX_128bit);
|
|
|
|
__ bind(exit);
|
|
}
|
|
|
|
// Drop the `__` shorthand (defined near the top of this file as `_masm->`).
#undef __
|