From 3e9fc5d49e52d79bcd2bb75068ff7efb31f768fd Mon Sep 17 00:00:00 2001 From: wuxinyang Date: Tue, 10 Mar 2026 02:15:25 +0000 Subject: [PATCH] 8376164: Optimize AES/ECB implementation using full-message intrinsic stub and parallel RoundKey addition Reviewed-by: sviswanathan, semery --- src/hotspot/cpu/x86/stubGenerator_x86_64.hpp | 15 +- .../cpu/x86/stubGenerator_x86_64_aes.cpp | 200 +++++++++++++++++- 2 files changed, 212 insertions(+), 3 deletions(-) diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp index 36315535d16..64b56442c90 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -330,6 +330,19 @@ class StubGenerator: public StubCodeGenerator { void aesecb_decrypt(Register source_addr, Register dest_addr, Register key, Register len); + // Shared generator for the ECB/AES Encrypt and Decrypt stubs: 4 blocks per + // loop iteration to hide instruction latency, then leftover blocks one at a + // time. Set is_encrypt=true for encryption, false for decryption. 
+ address generate_electronicCodeBook_AESCrypt_Parallel(bool is_encrypt); + + // ECB/AES Encrypt stub which does 4 blocks in a loop at a time to hide + // instruction latency (shared generator called with is_encrypt=true) + address generate_electronicCodeBook_encryptAESCrypt_Parallel(); + + // ECB/AES Decrypt stub which does 4 blocks in a loop at a time to hide + // instruction latency (shared generator called with is_encrypt=false) + address generate_electronicCodeBook_decryptAESCrypt_Parallel(); + // Vector AES Galois Counter Mode implementation address generate_galoisCounterMode_AESCrypt(); void aesgcm_encrypt(Register in, Register len, Register ct, Register out, Register key, diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_aes.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_aes.cpp index 24de32a6fe7..1fa80c9d967 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64_aes.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_aes.cpp @@ -1,5 +1,5 @@ /* -* Copyright (c) 2019, 2025, Intel Corporation. All rights reserved. +* Copyright (c) 2019, 2026, Intel Corporation. All rights reserved. * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * @@ -218,6 +218,8 @@ void StubGenerator::generate_aes_stubs() { StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt(); } else { StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); + StubRoutines::_electronicCodeBook_encryptAESCrypt = generate_electronicCodeBook_encryptAESCrypt_Parallel(); + StubRoutines::_electronicCodeBook_decryptAESCrypt = generate_electronicCodeBook_decryptAESCrypt_Parallel(); if (VM_Version::supports_avx2()) { StubRoutines::_galoisCounterMode_AESCrypt = generate_avx2_galoisCounterMode_AESCrypt(); } @@ -1399,6 +1401,200 @@ address StubGenerator::generate_cipherBlockChaining_encryptAESCrypt() { return start; } +// This is a version of ECB/AES Encrypt/Decrypt which does 4 blocks in a loop +// at a time to hide instruction latency. 
+// Leftover blocks (fewer than 4) are then processed one at a time. +// For encryption (is_encrypt=true), rounds = 10/12/14 for key_128/192/256: +// pxor key[0], aesenc key[1..rounds-1], aesenclast key[rounds] +// For decryption (is_encrypt=false), same rounds values: +// pxor key[1], aesdec key[2..rounds], aesdeclast key[0] +// +// Arguments: +// +// Inputs: +// c_rarg0 - source byte array address +// c_rarg1 - destination byte array address +// c_rarg2 - session key (Ke/Kd) in little endian int array +// c_rarg3 - input length in bytes (must be multiple of blocksize 16) +// +// Output: +// rax - input length (the value passed in c_rarg3) +// +address StubGenerator::generate_electronicCodeBook_AESCrypt_Parallel(bool is_encrypt) { + assert(UseAES, "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubId stub_id = is_encrypt ? StubId::stubgen_electronicCodeBook_encryptAESCrypt_id + : StubId::stubgen_electronicCodeBook_decryptAESCrypt_id; + StubCodeMark mark(this, stub_id); + address start = __ pc(); + + const Register from = c_rarg0; // source array address + const Register to = c_rarg1; // destination array address + const Register key = c_rarg2; // key array address + const Register len_reg = c_rarg3; // src len in bytes (must be multiple of blocksize 16); counted down as blocks are processed + const Register pos = rax; // running byte offset applied to both from and to + const Register keylen = r11; // length of the key int array (44, 52 or 60) + + const XMMRegister xmm_result0 = xmm0; + const XMMRegister xmm_result1 = xmm1; + const XMMRegister xmm_result2 = xmm2; + const XMMRegister xmm_result3 = xmm3; + const XMMRegister xmm_key_shuf_mask = xmm4; // loaded from key_shuffle_mask_addr() + const XMMRegister xmm_key_tmp = xmm5; // scratch for round keys 10 and up + // round keys 0-9 are pre-loaded into xmm6-xmm15; keys 10 and up go through xmm_key_tmp + const int XMM_REG_NUM_KEY_FIRST = 6; + const int XMM_REG_NUM_KEY_LAST = 15; + const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); + + // AES round counts for key_128, key_192, key_256 (indexed by k below) + const int ROUNDS[3] = {10, 12, 14}; + + Label L_exit; + Label L_loop4[3], L_single[3], L_done[3]; + +#ifdef DoFour +#undef DoFour +#endif +#ifdef DoOne +#undef DoOne +#endif + +#define DoFour(opc, reg) \ +__ opc(xmm_result0, reg); \ +__ opc(xmm_result1, reg); \ +__ opc(xmm_result2, reg); \ +__ opc(xmm_result3, 
reg); + +#define DoOne(opc, reg) \ +__ opc(xmm_result0, reg); + + __ enter(); // required for proper stackwalking of RuntimeStub frame + __ push(len_reg); // save original length; popped into rax at L_exit as the return value + + __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); // length field of the key array, addressed relative to its element base + + __ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), r10 /*rscratch*/); + // load up xmm regs 6 thru 15 with the round keys at byte offsets 0x00 - 0x90 + for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++, offset += 0x10) { + load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); + } + __ xorptr(pos, pos); // pos = 0 + + // key length could be only {11, 13, 15} * 4 = {44, 52, 60}; anything other than 52 or 60 falls through to the key_128 code (k == 0) + __ cmpl(keylen, 52); + __ jcc(Assembler::equal, L_loop4[1]); + __ cmpl(keylen, 60); + __ jcc(Assembler::equal, L_loop4[2]); + + // k == 0: generate code for key_128 + // k == 1: generate code for key_192 + // k == 2: generate code for key_256 + for (int k = 0; k < 3; ++k) { + __ align(OptoLoopAlignment); + __ BIND(L_loop4[k]); + __ cmpptr(len_reg, 4 * AESBlockSize); + __ jcc(Assembler::less, L_single[k]); // fewer than 4 blocks left: switch to the one-block loop + + __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); + __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize)); + __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize)); + __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize)); + + if (is_encrypt) { + DoFour(pxor, xmm_key_first); + for (int rnum = 1; rnum < 10; rnum++) { + DoFour(aesenc, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); + } + for (int i = 10; i < ROUNDS[k]; i++) { + load_key(xmm_key_tmp, key, i * 0x10, xmm_key_shuf_mask); + DoFour(aesenc, xmm_key_tmp); + } + load_key(xmm_key_tmp, key, ROUNDS[k] * 0x10, xmm_key_shuf_mask); + DoFour(aesenclast, xmm_key_tmp); + } else { + DoFour(pxor, as_XMMRegister(1 + XMM_REG_NUM_KEY_FIRST)); + for (int rnum = 2; rnum < 10; rnum++) { + 
DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); + } + for (int i = 10; i <= ROUNDS[k]; i++) { + load_key(xmm_key_tmp, key, i * 0x10, xmm_key_shuf_mask); + DoFour(aesdec, xmm_key_tmp); + } + DoFour(aesdeclast, xmm_key_first); + } + + __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); + __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1); + __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2); + __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3); + + __ addptr(pos, 4 * AESBlockSize); + __ subptr(len_reg, 4 * AESBlockSize); + __ jmp(L_loop4[k]); + + __ align(OptoLoopAlignment); + __ BIND(L_single[k]); + __ cmpptr(len_reg, AESBlockSize); + __ jcc(Assembler::less, L_done[k]); + + __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0)); + + if (is_encrypt) { + DoOne(pxor, xmm_key_first); + for (int rnum = 1; rnum < 10; rnum++) { + DoOne(aesenc, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); + } + for (int i = 10; i < ROUNDS[k]; i++) { + load_key(xmm_key_tmp, key, i * 0x10, xmm_key_shuf_mask); + DoOne(aesenc, xmm_key_tmp); + } + load_key(xmm_key_tmp, key, ROUNDS[k] * 0x10, xmm_key_shuf_mask); + DoOne(aesenclast, xmm_key_tmp); + } else { + DoOne(pxor, as_XMMRegister(1 + XMM_REG_NUM_KEY_FIRST)); + for (int rnum = 2; rnum < 10; rnum++) { + DoOne(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); + } + for (int i = 10; i <= ROUNDS[k]; i++) { + load_key(xmm_key_tmp, key, i * 0x10, xmm_key_shuf_mask); + DoOne(aesdec, xmm_key_tmp); + } + DoOne(aesdeclast, xmm_key_first); + } + + __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result0); + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jmp(L_single[k]); + + __ BIND(L_done[k]); + if (k < 2) __ jmp(L_exit); + } // for key_128/192/256; the last variant (k == 2) falls through to L_exit + + __ BIND(L_exit); + // Clear the XMM registers that held round keys (xmm_key_tmp and xmm6-xmm15) before returning + __ pxor(xmm_key_tmp, xmm_key_tmp); + for (int 
rnum = XMM_REG_NUM_KEY_FIRST; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { + __ pxor(as_XMMRegister(rnum), as_XMMRegister(rnum)); + } + __ pop(rax); + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); // rax holds the length that was pushed on entry + + return start; + +#undef DoFour +#undef DoOne +} + +address StubGenerator::generate_electronicCodeBook_encryptAESCrypt_Parallel() { + return generate_electronicCodeBook_AESCrypt_Parallel(true); // is_encrypt = true +} + +address StubGenerator::generate_electronicCodeBook_decryptAESCrypt_Parallel() { + return generate_electronicCodeBook_AESCrypt_Parallel(false); // is_encrypt = false +} + // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time // to hide instruction latency // @@ -1493,7 +1689,7 @@ address StubGenerator::generate_cipherBlockChaining_decryptAESCrypt_Parallel() { __ opc(xmm_result0, src_reg); \ __ opc(xmm_result1, src_reg); \ __ opc(xmm_result2, src_reg); \ -__ opc(xmm_result3, src_reg); \ +__ opc(xmm_result3, src_reg); for (int k = 0; k < 3; ++k) { __ BIND(L_multiBlock_loopTopHead[k]);