mirror of
https://github.com/openjdk/jdk.git
synced 2026-01-28 12:09:14 +00:00
8350126: Regression ~3% on Crypto-ChaCha20Poly1305.encrypt for MacOSX aarch64
Reviewed-by: aph
This commit is contained in:
parent
d783a94098
commit
594b26516e
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
@ -1611,11 +1611,15 @@ public:
|
||||
void aes_round(FloatRegister input, FloatRegister subkey);
|
||||
|
||||
// ChaCha20 functions support block
|
||||
void cc20_quarter_round(FloatRegister aVec, FloatRegister bVec,
|
||||
FloatRegister cVec, FloatRegister dVec, FloatRegister scratch,
|
||||
FloatRegister tbl);
|
||||
void cc20_shift_lane_org(FloatRegister bVec, FloatRegister cVec,
|
||||
FloatRegister dVec, bool colToDiag);
|
||||
void cc20_qr_add4(FloatRegister (&addFirst)[4],
|
||||
FloatRegister (&addSecond)[4]);
|
||||
void cc20_qr_xor4(FloatRegister (&firstElem)[4],
|
||||
FloatRegister (&secondElem)[4], FloatRegister (&result)[4]);
|
||||
void cc20_qr_lrot4(FloatRegister (&sourceReg)[4],
|
||||
FloatRegister (&destReg)[4], int bits, FloatRegister table);
|
||||
void cc20_set_qr_registers(FloatRegister (&vectorSet)[4],
|
||||
const FloatRegister (&stateVectors)[16], int idx1, int idx2,
|
||||
int idx3, int idx4);
|
||||
|
||||
// Place an ISB after code may have been modified due to a safepoint.
|
||||
void safepoint_isb();
|
||||
|
||||
@ -28,60 +28,119 @@
|
||||
#include "runtime/stubRoutines.hpp"
|
||||
|
||||
/**
|
||||
* Perform the quarter round calculations on values contained within
|
||||
* four SIMD registers.
|
||||
* Perform the vectorized add for a group of 4 quarter round operations.
|
||||
* In the ChaCha20 quarter round, there are two add ops: a += b and c += d.
|
||||
* Each parameter is a set of 4 registers representing the 4 registers
|
||||
* for each addend in the add operation for each of the quarter rounds.
|
||||
* (e.g. for "a" it would consist of v0/v1/v2/v3). The result of the add
|
||||
* is placed into the vectors in the "addFirst" array.
|
||||
*
|
||||
* @param aVec the SIMD register containing only the "a" values
|
||||
* @param bVec the SIMD register containing only the "b" values
|
||||
* @param cVec the SIMD register containing only the "c" values
|
||||
* @param dVec the SIMD register containing only the "d" values
|
||||
* @param scratch scratch SIMD register used for 12 and 7 bit left rotations
|
||||
* @param table the SIMD register used as a table for 8 bit left rotations
|
||||
* @param addFirst array of SIMD registers representing the first addend.
|
||||
* @param addSecond array of SIMD registers representing the second addend.
|
||||
*/
|
||||
void MacroAssembler::cc20_quarter_round(FloatRegister aVec, FloatRegister bVec,
|
||||
FloatRegister cVec, FloatRegister dVec, FloatRegister scratch,
|
||||
FloatRegister table) {
|
||||
void MacroAssembler::cc20_qr_add4(FloatRegister (&addFirst)[4],
|
||||
FloatRegister (&addSecond)[4]) {
|
||||
// One addv is emitted per quarter round. The destination is the first
// addend, so each step is addFirst[i] += addSecond[i] on the T4S
// (4 x 32-bit lane) arrangement; addSecond is only read.
for (int i = 0; i < 4; i++) {
|
||||
addv(addFirst[i], T4S, addFirst[i], addSecond[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// a += b, d ^= a, d <<<= 16
|
||||
addv(aVec, T4S, aVec, bVec);
|
||||
eor(dVec, T16B, dVec, aVec);
|
||||
rev32(dVec, T8H, dVec);
|
||||
|
||||
// c += d, b ^= c, b <<<= 12
|
||||
addv(cVec, T4S, cVec, dVec);
|
||||
eor(scratch, T16B, bVec, cVec);
|
||||
ushr(bVec, T4S, scratch, 20);
|
||||
sli(bVec, T4S, scratch, 12);
|
||||
|
||||
// a += b, d ^= a, d <<<= 8
|
||||
addv(aVec, T4S, aVec, bVec);
|
||||
eor(dVec, T16B, dVec, aVec);
|
||||
tbl(dVec, T16B, dVec, 1, table);
|
||||
|
||||
// c += d, b ^= c, b <<<= 7
|
||||
addv(cVec, T4S, cVec, dVec);
|
||||
eor(scratch, T16B, bVec, cVec);
|
||||
ushr(bVec, T4S, scratch, 25);
|
||||
sli(bVec, T4S, scratch, 7);
|
||||
/**
|
||||
* Perform the vectorized XOR for a group of 4 quarter round operations.
|
||||
* In the ChaCha20 quarter round, there are two XOR ops: d ^= a and b ^= c
|
||||
* Each parameter is a set of 4 registers representing the 4 registers
|
||||
* for each element in the xor operation for each of the quarter rounds.
|
||||
* (e.g. for "a" it would consist of v0/v1/v2/v3)
|
||||
* Note: because the b ^= c ops precede a non-byte-aligned left-rotation,
|
||||
* there is a third parameter which can take a set of scratch registers
|
||||
* for the result, which facilitates doing the subsequent operations for
|
||||
* the left rotation.
|
||||
*
|
||||
* @param firstElem array of SIMD registers representing the first element.
|
||||
* @param secondElem array of SIMD registers representing the second element.
|
||||
* @param result array of SIMD registers representing the destination.
|
||||
* May be the same as firstElem or secondElem, or a separate array.
|
||||
*/
|
||||
void MacroAssembler::cc20_qr_xor4(FloatRegister (&firstElem)[4],
|
||||
FloatRegister (&secondElem)[4], FloatRegister (&result)[4]) {
|
||||
// One eor is emitted per quarter round:
// result[i] = firstElem[i] ^ secondElem[i].
// The destination is a separate array so the caller can either overwrite
// an input (d ^= a passes the "d" set as result) or leave both inputs
// untouched and collect the XOR in scratch registers (b ^= c).
for (int i = 0; i < 4; i++) {
|
||||
eor(result[i], T16B, firstElem[i], secondElem[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Shift the b, c, and d vectors between columnar and diagonal representations.
|
||||
* Note that the "a" vector does not shift.
|
||||
* Perform the vectorized left-rotation on 32-bit lanes for a group of
|
||||
* 4 quarter round operations.
|
||||
* Each parameter is a set of 4 registers representing the 4 registers
|
||||
* for each element in the source and destination for each of the quarter
|
||||
* rounds (e.g. for "d" it would consist of v12/v13/v14/v15 on columns and
|
||||
* v15/v12/v13/v14 on diagonal alignments).
|
||||
*
|
||||
* @param bVec the SIMD register containing only the "b" values
|
||||
* @param cVec the SIMD register containing only the "c" values
|
||||
* @param dVec the SIMD register containing only the "d" values
|
||||
* @param colToDiag true if moving columnar to diagonal, false if
|
||||
* moving diagonal back to columnar.
|
||||
* @param sourceReg array of SIMD registers representing the source
|
||||
* @param destReg array of SIMD registers representing the destination
|
||||
* @param bits the distance of the rotation in bits, must be 16/12/8/7 per
|
||||
* the ChaCha20 specification.
* @param table the SIMD register used as a table for 8 bit left rotations
|
||||
*/
|
||||
void MacroAssembler::cc20_shift_lane_org(FloatRegister bVec, FloatRegister cVec,
|
||||
FloatRegister dVec, bool colToDiag) {
|
||||
int bShift = colToDiag ? 4 : 12;
|
||||
int cShift = 8;
|
||||
int dShift = colToDiag ? 12 : 4;
|
||||
void MacroAssembler::cc20_qr_lrot4(FloatRegister (&sourceReg)[4],
|
||||
FloatRegister (&destReg)[4], int bits, FloatRegister table) {
|
||||
// Dispatch on the rotation distance. Every case emits its instruction(s)
// for all 4 registers of the set; "table" is only read by the 8-bit case.
switch (bits) {
|
||||
case 16: // reg <<<= 16, in-place swap of half-words
|
||||
for (int i = 0; i < 4; i++) {
|
||||
rev32(destReg[i], T8H, sourceReg[i]);
|
||||
}
|
||||
break;
|
||||
|
||||
// NOTE(review): the three ext(...) lines below use bVec/cVec/dVec and
// bShift/cShift/dShift, none of which are parameters or locals of this
// function. They look like lines of the removed cc20_shift_lane_org body
// that the diff view interleaves here - confirm against the real file.
ext(bVec, T16B, bVec, bVec, bShift);
|
||||
ext(cVec, T16B, cVec, cVec, cShift);
|
||||
ext(dVec, T16B, dVec, dVec, dShift);
|
||||
case 7: // reg <<<= (12 || 7)
|
||||
case 12: // r-shift src -> dest, l-shift src & ins to dest
|
||||
// Two passes: all 4 right shifts by (32 - bits) first, then all 4
// shift-left-and-insert ops. Because ushr writes destReg[i] before sli
// reads sourceReg[i], source and destination must be different
// registers in this case (the stub passes its scratch set as source).
for (int i = 0; i < 4; i++) {
|
||||
ushr(destReg[i], T4S, sourceReg[i], 32 - bits);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
sli(destReg[i], T4S, sourceReg[i], bits);
|
||||
}
|
||||
break;
|
||||
|
||||
case 8: // reg <<<= 8, simulate left rotation with table reorg
|
||||
for (int i = 0; i < 4; i++) {
|
||||
tbl(destReg[i], T16B, sourceReg[i], 1, table);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
// The caller shouldn't be sending bit rotation values outside
|
||||
// of the 16/12/8/7 as defined in the specification.
|
||||
ShouldNotReachHere();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the FloatRegisters for a 4-vector register set. These will be used
|
||||
* during various quarter round transformations (adds, xors and left-rotations).
|
||||
* This method itself does not result in the output of any assembly
|
||||
* instructions. It just organizes the vectors so they can be in columnar or
|
||||
* diagonal alignments.
|
||||
*
|
||||
* @param vectorSet a 4-vector array to be altered into a new alignment
|
||||
* @param stateVectors the 16-vector array that represents the current
|
||||
* working state. The indices of this array match up with the
|
||||
* organization of the ChaCha20 state per RFC 7539 (e.g. stateVectors[12]
|
||||
* would contain the vector that holds the 32-bit counter, etc.)
|
||||
* @param idx1 the index of the stateVectors array to be assigned to the
|
||||
* first vectorSet element.
|
||||
* @param idx2 the index of the stateVectors array to be assigned to the
|
||||
* second vectorSet element.
|
||||
* @param idx3 the index of the stateVectors array to be assigned to the
|
||||
* third vectorSet element.
|
||||
* @param idx4 the index of the stateVectors array to be assigned to the
|
||||
* fourth vectorSet element.
|
||||
*/
|
||||
void MacroAssembler::cc20_set_qr_registers(FloatRegister (&vectorSet)[4],
|
||||
const FloatRegister (&stateVectors)[16], int idx1, int idx2,
|
||||
int idx3, int idx4) {
|
||||
// Plain array assignments: only the register handles are copied from
// stateVectors into vectorSet, and no instruction-emitting call is made.
// idx1..idx4 are used as given; no range check is performed here.
vectorSet[0] = stateVectors[idx1];
|
||||
vectorSet[1] = stateVectors[idx2];
|
||||
vectorSet[2] = stateVectors[idx3];
|
||||
vectorSet[3] = stateVectors[idx4];
|
||||
}
|
||||
|
||||
@ -4405,89 +4405,44 @@ class StubGenerator: public StubCodeGenerator {
|
||||
return start;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate the stub that updates a CRC32 value over a byte buffer.
* Arguments:
|
||||
*
|
||||
* Inputs:
|
||||
* c_rarg0 - int crc
|
||||
* c_rarg1 - byte* buf
|
||||
* c_rarg2 - int length
|
||||
*
|
||||
* Output:
|
||||
* c_rarg0 - int crc result
* NOTE(review): this line used to name rax, which is an x86 register and
* is not used anywhere in this stub; the result is presumably left in the
* crc register (c_rarg0) by kernel_crc32 - confirm.
*
* Returns the address at which the generated stub code starts.
|
||||
*/
|
||||
address generate_updateBytesCRC32() {
|
||||
assert(UseCRC32Intrinsics, "what are we doing here?");
|
||||
|
||||
// Align the entry point, then open a StubCodeMark for this stub id.
__ align(CodeEntryAlignment);
|
||||
StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
|
||||
StubCodeMark mark(this, stub_id);
|
||||
|
||||
// pc of the first emitted instruction; this is what the generator returns.
address start = __ pc();
|
||||
|
||||
const Register crc = c_rarg0; // crc
|
||||
const Register buf = c_rarg1; // source java byte array address
|
||||
const Register len = c_rarg2; // length
|
||||
// c_rarg3..c_rarg7 are not stub inputs; they are handed to kernel_crc32
// below together with rscratch1/rscratch2.
const Register table0 = c_rarg3; // crc_table address
|
||||
const Register table1 = c_rarg4;
|
||||
const Register table2 = c_rarg5;
|
||||
const Register table3 = c_rarg6;
|
||||
const Register tmp3 = c_rarg7;
|
||||
|
||||
BLOCK_COMMENT("Entry:");
|
||||
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
||||
|
||||
// The CRC computation itself is emitted by kernel_crc32.
__ kernel_crc32(crc, buf, len,
|
||||
table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
|
||||
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
__ ret(lr);
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
// ChaCha20 block function. This version parallelizes 4 quarter
|
||||
// round operations at a time. It uses 16 SIMD registers to
|
||||
// produce 4 blocks of key stream.
|
||||
// ChaCha20 block function. This version parallelizes the 32-bit
|
||||
// state elements on each of 16 vectors, producing 4 blocks of
|
||||
// keystream at a time.
|
||||
//
|
||||
// state (int[16]) = c_rarg0
|
||||
// keystream (byte[256]) = c_rarg1
|
||||
// return - number of bytes of keystream (always 256)
|
||||
// return - number of bytes of produced keystream (always 256)
|
||||
//
|
||||
// In this approach, we load the 512-bit start state sequentially into
|
||||
// 4 128-bit vectors. We then make 4 4-vector copies of that starting
|
||||
// state, with each successive set of 4 vectors having a +1 added into
|
||||
// the first 32-bit lane of the 4th vector in that group (the counter).
|
||||
// By doing this, we can perform the block function on 4 512-bit blocks
|
||||
// within one run of this intrinsic.
|
||||
// The alignment of the data across the 4-vector group is such that at
|
||||
// the start it is already aligned for the first round of each two-round
|
||||
// loop iteration. In other words, the corresponding lanes of each vector
|
||||
// will contain the values needed for that quarter round operation (e.g.
|
||||
// elements 0/4/8/12, 1/5/9/13, 2/6/10/14, etc.).
|
||||
// In between each full round, a lane shift must occur. Within a loop
|
||||
// iteration, between the first and second rounds, the 2nd, 3rd, and 4th
|
||||
// vectors are rotated left 32, 64 and 96 bits, respectively. The result
|
||||
// is effectively a diagonal orientation in columnar form. After the
|
||||
// second full round, those registers are left-rotated again, this time
|
||||
// 96, 64, and 32 bits - returning the vectors to their columnar organization.
|
||||
// After all 10 iterations, the original state is added to each 4-vector
|
||||
// working state along with the add mask, and the 4 vector groups are
|
||||
// sequentially written to the memory dedicated for the output key stream.
|
||||
//
|
||||
// For a more detailed explanation, see Goll and Gueron, "Vectorization of
|
||||
// ChaCha Stream Cipher", 2014 11th Int. Conf. on Information Technology:
|
||||
// New Generations, Las Vegas, NV, USA, April 2014, DOI: 10.1109/ITNG.2014.33
|
||||
address generate_chacha20Block_qrpar() {
|
||||
Label L_Q_twoRounds, L_Q_cc20_const;
|
||||
// This implementation takes each 32-bit integer from the state
|
||||
// array and broadcasts it across all 4 32-bit lanes of a vector register
|
||||
// (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
|
||||
// of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors,
|
||||
// the quarter round schedule is implemented as outlined in RFC 7539 section
|
||||
// 2.3. However, instead of sequentially processing the 3 quarter round
|
||||
// operations represented by one QUARTERROUND function, we instead stack all
|
||||
// the adds, xors and left-rotations from the first 4 quarter rounds together
|
||||
// and then do the same for the second set of 4 quarter rounds. This removes
|
||||
// some latency that would otherwise be incurred by waiting for an add to
|
||||
// complete before performing an xor (which depends on the result of the
|
||||
// add), etc. An adjustment happens between the first and second groups of 4
|
||||
// quarter rounds, but this is done only in the inputs to the macro functions
|
||||
// that generate the assembly instructions - these adjustments themselves are
|
||||
// not part of the resulting assembly.
|
||||
// The 4 registers v0-v3 are used during the quarter round operations as
|
||||
// scratch registers. Once the 20 rounds are complete, these 4 scratch
|
||||
// registers become the vectors involved in adding the start state back onto
|
||||
// the post-QR working state. After the adds are complete, each of the 16
|
||||
// vectors write their first lane back to the keystream buffer, followed
|
||||
// by the second lane from all vectors and so on.
|
||||
address generate_chacha20Block_blockpar() {
|
||||
Label L_twoRounds, L_cc20_const;
|
||||
// The constant data is broken into two 128-bit segments to be loaded
|
||||
// onto SIMD registers. The first 128 bits are a counter add overlay
|
||||
// that adds +1/+0/+0/+0 to the vectors holding replicated state[12].
|
||||
// onto FloatRegisters. The first 128 bits are a counter add overlay
|
||||
// that adds +0/+1/+2/+3 to the vector holding replicated state[12].
|
||||
// The second 128-bits is a table constant used for 8-bit left rotations.
|
||||
// on 32-bit lanes within a SIMD register.
|
||||
__ BIND(L_Q_cc20_const);
|
||||
__ emit_int64(0x0000000000000001UL);
|
||||
__ emit_int64(0x0000000000000000UL);
|
||||
__ BIND(L_cc20_const);
|
||||
__ emit_int64(0x0000000100000000UL);
|
||||
__ emit_int64(0x0000000300000002UL);
|
||||
__ emit_int64(0x0605040702010003UL);
|
||||
__ emit_int64(0x0E0D0C0F0A09080BUL);
|
||||
|
||||
@ -4497,144 +4452,142 @@ class StubGenerator: public StubCodeGenerator {
|
||||
address start = __ pc();
|
||||
__ enter();
|
||||
|
||||
int i, j;
|
||||
const Register state = c_rarg0;
|
||||
const Register keystream = c_rarg1;
|
||||
const Register loopCtr = r10;
|
||||
const Register tmpAddr = r11;
|
||||
const FloatRegister ctrAddOverlay = v28;
|
||||
const FloatRegister lrot8Tbl = v29;
|
||||
|
||||
const FloatRegister aState = v0;
|
||||
const FloatRegister bState = v1;
|
||||
const FloatRegister cState = v2;
|
||||
const FloatRegister dState = v3;
|
||||
const FloatRegister a1Vec = v4;
|
||||
const FloatRegister b1Vec = v5;
|
||||
const FloatRegister c1Vec = v6;
|
||||
const FloatRegister d1Vec = v7;
|
||||
// Skip the callee-saved registers v8 - v15
|
||||
const FloatRegister a2Vec = v16;
|
||||
const FloatRegister b2Vec = v17;
|
||||
const FloatRegister c2Vec = v18;
|
||||
const FloatRegister d2Vec = v19;
|
||||
const FloatRegister a3Vec = v20;
|
||||
const FloatRegister b3Vec = v21;
|
||||
const FloatRegister c3Vec = v22;
|
||||
const FloatRegister d3Vec = v23;
|
||||
const FloatRegister a4Vec = v24;
|
||||
const FloatRegister b4Vec = v25;
|
||||
const FloatRegister c4Vec = v26;
|
||||
const FloatRegister d4Vec = v27;
|
||||
const FloatRegister scratch = v28;
|
||||
const FloatRegister addMask = v29;
|
||||
const FloatRegister lrot8Tbl = v30;
|
||||
// Organize SIMD registers in an array that facilitates
|
||||
// putting repetitive opcodes into loop structures. It is
|
||||
// important that each grouping of 4 registers is monotonically
|
||||
// increasing to support the requirements of multi-register
|
||||
// instructions (e.g. ld4r, st4, etc.)
|
||||
const FloatRegister workSt[16] = {
|
||||
v4, v5, v6, v7, v16, v17, v18, v19,
|
||||
v20, v21, v22, v23, v24, v25, v26, v27
|
||||
};
|
||||
|
||||
// Load the initial state in the first 4 quadword registers,
|
||||
// then copy the initial state into the next 4 quadword registers
|
||||
// that will be used for the working state.
|
||||
__ ld1(aState, bState, cState, dState, __ T16B, Address(state));
|
||||
// Pull in constant data. The first 16 bytes are the add overlay
|
||||
// which is applied to the vector holding the counter (state[12]).
|
||||
// The second 16 bytes is the index register for the 8-bit left
|
||||
// rotation tbl instruction.
|
||||
__ adr(tmpAddr, L_cc20_const);
|
||||
__ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
|
||||
|
||||
// Load the index register for 2 constant 128-bit data fields.
|
||||
// The first represents the +1/+0/+0/+0 add mask. The second is
|
||||
// the 8-bit left rotation.
|
||||
__ adr(tmpAddr, L_Q_cc20_const);
|
||||
__ ldpq(addMask, lrot8Tbl, Address(tmpAddr));
|
||||
// Load from memory and interlace across 16 SIMD registers,
|
||||
// With each word from memory being broadcast to all lanes of
|
||||
// each successive SIMD register.
|
||||
// Addr(0) -> All lanes in workSt[i]
|
||||
// Addr(4) -> All lanes workSt[i + 1], etc.
|
||||
__ mov(tmpAddr, state);
|
||||
for (i = 0; i < 16; i += 4) {
|
||||
__ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
|
||||
__ post(tmpAddr, 16));
|
||||
}
|
||||
__ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
|
||||
|
||||
__ mov(a1Vec, __ T16B, aState);
|
||||
__ mov(b1Vec, __ T16B, bState);
|
||||
__ mov(c1Vec, __ T16B, cState);
|
||||
__ mov(d1Vec, __ T16B, dState);
|
||||
// Before entering the loop, create 5 4-register arrays. These
|
||||
// will hold the 4 registers that represent the a/b/c/d fields
|
||||
// in the quarter round operation. For instance the "b" field
|
||||
// for the first 4 quarter round operations is the set of v16/v17/v18/v19,
|
||||
// but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
|
||||
// since it is part of a diagonal organization. The aSet and scratch
|
||||
// register sets are defined at declaration time because they do not change
|
||||
// organization at any point during the 20-round processing.
|
||||
FloatRegister aSet[4] = { v4, v5, v6, v7 };
|
||||
FloatRegister bSet[4];
|
||||
FloatRegister cSet[4];
|
||||
FloatRegister dSet[4];
|
||||
FloatRegister scratch[4] = { v0, v1, v2, v3 };
|
||||
|
||||
__ mov(a2Vec, __ T16B, aState);
|
||||
__ mov(b2Vec, __ T16B, bState);
|
||||
__ mov(c2Vec, __ T16B, cState);
|
||||
__ addv(d2Vec, __ T4S, d1Vec, addMask);
|
||||
|
||||
__ mov(a3Vec, __ T16B, aState);
|
||||
__ mov(b3Vec, __ T16B, bState);
|
||||
__ mov(c3Vec, __ T16B, cState);
|
||||
__ addv(d3Vec, __ T4S, d2Vec, addMask);
|
||||
|
||||
__ mov(a4Vec, __ T16B, aState);
|
||||
__ mov(b4Vec, __ T16B, bState);
|
||||
__ mov(c4Vec, __ T16B, cState);
|
||||
__ addv(d4Vec, __ T4S, d3Vec, addMask);
|
||||
|
||||
// Set up the 10 iteration loop
|
||||
// Set up the 10 iteration loop and perform all 8 quarter round ops
|
||||
__ mov(loopCtr, 10);
|
||||
__ BIND(L_Q_twoRounds);
|
||||
__ BIND(L_twoRounds);
|
||||
|
||||
// The first set of operations on the vectors covers the first 4 quarter
|
||||
// round operations:
|
||||
// Qround(state, 0, 4, 8,12)
|
||||
// Qround(state, 1, 5, 9,13)
|
||||
// Qround(state, 2, 6,10,14)
|
||||
// Qround(state, 3, 7,11,15)
|
||||
__ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
|
||||
__ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
|
||||
__ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
|
||||
__ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
|
||||
// Set to columnar organization and do the following 4 quarter-rounds:
|
||||
// QUARTERROUND(0, 4, 8, 12)
|
||||
// QUARTERROUND(1, 5, 9, 13)
|
||||
// QUARTERROUND(2, 6, 10, 14)
|
||||
// QUARTERROUND(3, 7, 11, 15)
|
||||
__ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
|
||||
__ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
|
||||
__ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
|
||||
|
||||
// Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors to
|
||||
// diagonals. The a1Vec does not need to change orientation.
|
||||
__ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, true);
|
||||
__ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, true);
|
||||
__ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, true);
|
||||
__ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, true);
|
||||
__ cc20_qr_add4(aSet, bSet); // a += b
|
||||
__ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
|
||||
__ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
|
||||
|
||||
// The second set of operations on the vectors covers the second 4 quarter
|
||||
// round operations, now acting on the diagonals:
|
||||
// Qround(state, 0, 5,10,15)
|
||||
// Qround(state, 1, 6,11,12)
|
||||
// Qround(state, 2, 7, 8,13)
|
||||
// Qround(state, 3, 4, 9,14)
|
||||
__ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
|
||||
__ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
|
||||
__ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
|
||||
__ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
|
||||
__ cc20_qr_add4(cSet, dSet); // c += d
|
||||
__ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
|
||||
__ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
|
||||
|
||||
// Before we start the next iteration, we need to perform shuffles
|
||||
// on the b/c/d vectors to move them back to columnar organizations
|
||||
// from their current diagonal orientation.
|
||||
__ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, false);
|
||||
__ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, false);
|
||||
__ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, false);
|
||||
__ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, false);
|
||||
__ cc20_qr_add4(aSet, bSet); // a += b
|
||||
__ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
|
||||
__ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
|
||||
|
||||
__ cc20_qr_add4(cSet, dSet); // c += d
|
||||
__ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
|
||||
__ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
|
||||
|
||||
// Set to diagonal organization and do the next 4 quarter-rounds:
|
||||
// QUARTERROUND(0, 5, 10, 15)
|
||||
// QUARTERROUND(1, 6, 11, 12)
|
||||
// QUARTERROUND(2, 7, 8, 13)
|
||||
// QUARTERROUND(3, 4, 9, 14)
|
||||
__ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
|
||||
__ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
|
||||
__ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
|
||||
|
||||
__ cc20_qr_add4(aSet, bSet); // a += b
|
||||
__ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
|
||||
__ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
|
||||
|
||||
__ cc20_qr_add4(cSet, dSet); // c += d
|
||||
__ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
|
||||
__ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
|
||||
|
||||
__ cc20_qr_add4(aSet, bSet); // a += b
|
||||
__ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
|
||||
__ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
|
||||
|
||||
__ cc20_qr_add4(cSet, dSet); // c += d
|
||||
__ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
|
||||
__ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
|
||||
|
||||
// Decrement and iterate
|
||||
__ sub(loopCtr, loopCtr, 1);
|
||||
__ cbnz(loopCtr, L_Q_twoRounds);
|
||||
__ cbnz(loopCtr, L_twoRounds);
|
||||
|
||||
// Once the counter reaches zero, we fall out of the loop
|
||||
// and need to add the initial state back into the working state
|
||||
// represented by the a/b/c/d1Vec registers. This is destructive
|
||||
// on the dState register but we no longer will need it.
|
||||
__ addv(a1Vec, __ T4S, a1Vec, aState);
|
||||
__ addv(b1Vec, __ T4S, b1Vec, bState);
|
||||
__ addv(c1Vec, __ T4S, c1Vec, cState);
|
||||
__ addv(d1Vec, __ T4S, d1Vec, dState);
|
||||
__ mov(tmpAddr, state);
|
||||
|
||||
__ addv(a2Vec, __ T4S, a2Vec, aState);
|
||||
__ addv(b2Vec, __ T4S, b2Vec, bState);
|
||||
__ addv(c2Vec, __ T4S, c2Vec, cState);
|
||||
__ addv(dState, __ T4S, dState, addMask);
|
||||
__ addv(d2Vec, __ T4S, d2Vec, dState);
|
||||
// Add the starting state back to the post-loop keystream
|
||||
// state. We read/interlace the state array from memory into
|
||||
// 4 registers similar to what we did in the beginning. Then
|
||||
// add the counter overlay onto workSt[12] at the end.
|
||||
for (i = 0; i < 16; i += 4) {
|
||||
__ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
|
||||
__ addv(workSt[i], __ T4S, workSt[i], v0);
|
||||
__ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
|
||||
__ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
|
||||
__ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
|
||||
}
|
||||
__ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
|
||||
|
||||
__ addv(a3Vec, __ T4S, a3Vec, aState);
|
||||
__ addv(b3Vec, __ T4S, b3Vec, bState);
|
||||
__ addv(c3Vec, __ T4S, c3Vec, cState);
|
||||
__ addv(dState, __ T4S, dState, addMask);
|
||||
__ addv(d3Vec, __ T4S, d3Vec, dState);
|
||||
|
||||
__ addv(a4Vec, __ T4S, a4Vec, aState);
|
||||
__ addv(b4Vec, __ T4S, b4Vec, bState);
|
||||
__ addv(c4Vec, __ T4S, c4Vec, cState);
|
||||
__ addv(dState, __ T4S, dState, addMask);
|
||||
__ addv(d4Vec, __ T4S, d4Vec, dState);
|
||||
|
||||
// Write the final state back to the result buffer
|
||||
__ st1(a1Vec, b1Vec, c1Vec, d1Vec, __ T16B, __ post(keystream, 64));
|
||||
__ st1(a2Vec, b2Vec, c2Vec, d2Vec, __ T16B, __ post(keystream, 64));
|
||||
__ st1(a3Vec, b3Vec, c3Vec, d3Vec, __ T16B, __ post(keystream, 64));
|
||||
__ st1(a4Vec, b4Vec, c4Vec, d4Vec, __ T16B, __ post(keystream, 64));
|
||||
// Write working state into the keystream buffer. This is accomplished
|
||||
// by taking the lane "i" from each of the four vectors and writing
|
||||
// it to consecutive 4-byte offsets, then post-incrementing by 16 and
|
||||
// repeating with the next 4 vectors until all 16 vectors have been used.
|
||||
// Then move to the next lane and repeat the process until all lanes have
|
||||
// been written.
|
||||
for (i = 0; i < 4; i++) {
|
||||
for (j = 0; j < 16; j += 4) {
|
||||
__ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
|
||||
__ post(keystream, 16));
|
||||
}
|
||||
}
|
||||
|
||||
__ mov(r0, 256); // Return length of output keystream
|
||||
__ leave();
|
||||
@ -7008,6 +6961,47 @@ class StubGenerator: public StubCodeGenerator {
|
||||
return start;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate the stub that updates a CRC32 value over a byte buffer.
* Arguments:
|
||||
*
|
||||
* Inputs:
|
||||
* c_rarg0 - int crc
|
||||
* c_rarg1 - byte* buf
|
||||
* c_rarg2 - int length
|
||||
*
|
||||
* Output:
|
||||
* c_rarg0 - int crc result
* NOTE(review): this line used to name rax, which is an x86 register and
* is not used anywhere in this stub; the result is presumably left in the
* crc register (c_rarg0) by kernel_crc32 - confirm.
*
* Returns the address at which the generated stub code starts.
|
||||
*/
|
||||
address generate_updateBytesCRC32() {
|
||||
assert(UseCRC32Intrinsics, "what are we doing here?");
|
||||
|
||||
// Align the entry point, then open a StubCodeMark for this stub id.
__ align(CodeEntryAlignment);
|
||||
StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
|
||||
StubCodeMark mark(this, stub_id);
|
||||
|
||||
// pc of the first emitted instruction; this is what the generator returns.
address start = __ pc();
|
||||
|
||||
const Register crc = c_rarg0; // crc
|
||||
const Register buf = c_rarg1; // source java byte array address
|
||||
const Register len = c_rarg2; // length
|
||||
// c_rarg3..c_rarg7 are not stub inputs; they are handed to kernel_crc32
// below together with rscratch1/rscratch2.
const Register table0 = c_rarg3; // crc_table address
|
||||
const Register table1 = c_rarg4;
|
||||
const Register table2 = c_rarg5;
|
||||
const Register table3 = c_rarg6;
|
||||
const Register tmp3 = c_rarg7;
|
||||
|
||||
BLOCK_COMMENT("Entry:");
|
||||
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
||||
|
||||
// The CRC computation itself is emitted by kernel_crc32.
__ kernel_crc32(crc, buf, len,
|
||||
table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
|
||||
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
__ ret(lr);
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
/**
|
||||
* Arguments:
|
||||
*
|
||||
@ -11403,7 +11397,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
#endif // COMPILER2
|
||||
|
||||
if (UseChaCha20Intrinsics) {
|
||||
StubRoutines::_chacha20Block = generate_chacha20Block_qrpar();
|
||||
StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
|
||||
}
|
||||
|
||||
if (UseKyberIntrinsics) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user