8288012: AArch64: unnecessary macro expansion in stubGenerator_aarch64

Reviewed-by: aph, njian
This commit is contained in:
Hao Sun 2022-09-02 02:43:07 +00:00 committed by Ningsheng Jian
parent 99c3ab0177
commit e0168a0eb0

View File

@ -3224,6 +3224,74 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
// Utility routines for md5.
// Each helper emits the instructions for one MD5 step.
// Clobbers rscratch1, rscratch2, r10 and r11.

// Emits one FF step. Provided the argument registers are distinct from the
// clobbered registers, the emitted code computes
//   r1 = r2 + rol32(r1 + (((r3 ^ r4) & r2) ^ r4) + X + t, s)
// where X is the 32-bit word loaded from byte offset k*4 of buf.
void md5_FF(Register buf, Register r1, Register r2, Register r3, Register r4,
            int k, int s, int t) {
  Register mix = r10;  // boolean-function value, then the running sum
  Register acc = r11;  // r1 + t
  int word_offset = k * 4;
  int ror_amount = 32 - s;  // rotate right by 32 - s == rotate left by s

  __ eorw(mix, r3, r4);
  __ movw(rscratch2, t);
  __ andw(mix, mix, r2);
  __ addw(acc, r1, rscratch2);
  __ ldrw(rscratch1, Address(buf, word_offset));
  __ eorw(mix, mix, r4);
  __ addw(mix, mix, rscratch1);
  __ addw(mix, mix, acc);
  __ rorw(rscratch2, mix, ror_amount);
  __ addw(r1, rscratch2, r2);
}
// Emits one GG step. Provided the argument registers are distinct from the
// clobbered registers (rscratch1, rscratch2, r10, r11), the emitted code
// computes
//   r1 = r2 + rol32(r1 + (((r2 ^ r3) & r4) ^ r3) + X + t, s)
// where X is the 32-bit word loaded from byte offset k*4 of buf.
void md5_GG(Register buf, Register r1, Register r2, Register r3, Register r4,
            int k, int s, int t) {
  Register mix = r10;  // boolean-function value, then the running sum
  Register acc = r11;  // r1 + t
  int word_offset = k * 4;
  int ror_amount = 32 - s;  // rotate right by 32 - s == rotate left by s

  __ eorw(rscratch2, r2, r3);
  __ ldrw(rscratch1, Address(buf, word_offset));
  __ andw(mix, rscratch2, r4);
  // rscratch2 is free again once the AND above has consumed it.
  __ movw(rscratch2, t);
  __ eorw(mix, mix, r3);
  __ addw(acc, r1, rscratch2);
  __ addw(mix, mix, rscratch1);
  __ addw(mix, mix, acc);
  __ rorw(rscratch2, mix, ror_amount);
  __ addw(r1, rscratch2, r2);
}
// Emits one HH step. Provided the argument registers are distinct from the
// clobbered registers (rscratch1, rscratch2, r10, r11), the emitted code
// computes
//   r1 = r2 + rol32(r1 + (r2 ^ r3 ^ r4) + X + t, s)
// where X is the 32-bit word loaded from byte offset k*4 of buf.
void md5_HH(Register buf, Register r1, Register r2, Register r3, Register r4,
            int k, int s, int t) {
  Register mix = r10;  // boolean-function value, then the running sum
  Register acc = r11;  // r1 + t
  int word_offset = k * 4;
  int ror_amount = 32 - s;  // rotate right by 32 - s == rotate left by s

  __ eorw(mix, r3, r4);
  __ movw(rscratch2, t);
  __ addw(acc, r1, rscratch2);
  __ ldrw(rscratch1, Address(buf, word_offset));
  __ eorw(mix, mix, r2);
  __ addw(mix, mix, rscratch1);
  __ addw(mix, mix, acc);
  __ rorw(rscratch2, mix, ror_amount);
  __ addw(r1, rscratch2, r2);
}
// Emits one II step. Provided the argument registers are distinct from the
// clobbered registers (rscratch1, rscratch2, r10, r11), the emitted code
// computes
//   r1 = r2 + rol32(r1 + ((r2 | ~r4) ^ r3) + X + t, s)
// where X is the 32-bit word loaded from byte offset k*4 of buf.
void md5_II(Register buf, Register r1, Register r2, Register r3, Register r4,
            int k, int s, int t) {
  Register mix = r10;  // holds t first, then the running sum
  Register acc = r11;  // r1 + t
  int word_offset = k * 4;
  int ror_amount = 32 - s;  // rotate right by 32 - s == rotate left by s

  // Unlike the other steps, t goes through r10 here because rscratch2
  // receives the OR-NOT result on the next instruction.
  __ movw(mix, t);
  __ ornw(rscratch2, r2, r4);
  __ addw(acc, r1, mix);
  __ ldrw(rscratch1, Address(buf, word_offset));
  __ eorw(mix, rscratch2, r3);
  __ addw(mix, mix, rscratch1);
  __ addw(mix, mix, acc);
  __ rorw(rscratch2, mix, ror_amount);
  __ addw(r1, rscratch2, r2);
}
// Arguments:
//
// Inputs:
@ -3248,9 +3316,7 @@ class StubGenerator: public StubCodeGenerator {
Register rscratch3 = r10;
Register rscratch4 = r11;
Label keys;
Label md5_loop;
__ BIND(md5_loop);
// Save hash values for addition after rounds
@ -3259,128 +3325,77 @@ class StubGenerator: public StubCodeGenerator {
__ ldrw(c, Address(state, 8));
__ ldrw(d, Address(state, 12));
#define FF(r1, r2, r3, r4, k, s, t) \
__ eorw(rscratch3, r3, r4); \
__ movw(rscratch2, t); \
__ andw(rscratch3, rscratch3, r2); \
__ addw(rscratch4, r1, rscratch2); \
__ ldrw(rscratch1, Address(buf, k*4)); \
__ eorw(rscratch3, rscratch3, r4); \
__ addw(rscratch3, rscratch3, rscratch1); \
__ addw(rscratch3, rscratch3, rscratch4); \
__ rorw(rscratch2, rscratch3, 32 - s); \
__ addw(r1, rscratch2, r2);
#define GG(r1, r2, r3, r4, k, s, t) \
__ eorw(rscratch2, r2, r3); \
__ ldrw(rscratch1, Address(buf, k*4)); \
__ andw(rscratch3, rscratch2, r4); \
__ movw(rscratch2, t); \
__ eorw(rscratch3, rscratch3, r3); \
__ addw(rscratch4, r1, rscratch2); \
__ addw(rscratch3, rscratch3, rscratch1); \
__ addw(rscratch3, rscratch3, rscratch4); \
__ rorw(rscratch2, rscratch3, 32 - s); \
__ addw(r1, rscratch2, r2);
#define HH(r1, r2, r3, r4, k, s, t) \
__ eorw(rscratch3, r3, r4); \
__ movw(rscratch2, t); \
__ addw(rscratch4, r1, rscratch2); \
__ ldrw(rscratch1, Address(buf, k*4)); \
__ eorw(rscratch3, rscratch3, r2); \
__ addw(rscratch3, rscratch3, rscratch1); \
__ addw(rscratch3, rscratch3, rscratch4); \
__ rorw(rscratch2, rscratch3, 32 - s); \
__ addw(r1, rscratch2, r2);
#define II(r1, r2, r3, r4, k, s, t) \
__ movw(rscratch3, t); \
__ ornw(rscratch2, r2, r4); \
__ addw(rscratch4, r1, rscratch3); \
__ ldrw(rscratch1, Address(buf, k*4)); \
__ eorw(rscratch3, rscratch2, r3); \
__ addw(rscratch3, rscratch3, rscratch1); \
__ addw(rscratch3, rscratch3, rscratch4); \
__ rorw(rscratch2, rscratch3, 32 - s); \
__ addw(r1, rscratch2, r2);
// Round 1
FF(a, b, c, d, 0, 7, 0xd76aa478)
FF(d, a, b, c, 1, 12, 0xe8c7b756)
FF(c, d, a, b, 2, 17, 0x242070db)
FF(b, c, d, a, 3, 22, 0xc1bdceee)
FF(a, b, c, d, 4, 7, 0xf57c0faf)
FF(d, a, b, c, 5, 12, 0x4787c62a)
FF(c, d, a, b, 6, 17, 0xa8304613)
FF(b, c, d, a, 7, 22, 0xfd469501)
FF(a, b, c, d, 8, 7, 0x698098d8)
FF(d, a, b, c, 9, 12, 0x8b44f7af)
FF(c, d, a, b, 10, 17, 0xffff5bb1)
FF(b, c, d, a, 11, 22, 0x895cd7be)
FF(a, b, c, d, 12, 7, 0x6b901122)
FF(d, a, b, c, 13, 12, 0xfd987193)
FF(c, d, a, b, 14, 17, 0xa679438e)
FF(b, c, d, a, 15, 22, 0x49b40821)
md5_FF(buf, a, b, c, d, 0, 7, 0xd76aa478);
md5_FF(buf, d, a, b, c, 1, 12, 0xe8c7b756);
md5_FF(buf, c, d, a, b, 2, 17, 0x242070db);
md5_FF(buf, b, c, d, a, 3, 22, 0xc1bdceee);
md5_FF(buf, a, b, c, d, 4, 7, 0xf57c0faf);
md5_FF(buf, d, a, b, c, 5, 12, 0x4787c62a);
md5_FF(buf, c, d, a, b, 6, 17, 0xa8304613);
md5_FF(buf, b, c, d, a, 7, 22, 0xfd469501);
md5_FF(buf, a, b, c, d, 8, 7, 0x698098d8);
md5_FF(buf, d, a, b, c, 9, 12, 0x8b44f7af);
md5_FF(buf, c, d, a, b, 10, 17, 0xffff5bb1);
md5_FF(buf, b, c, d, a, 11, 22, 0x895cd7be);
md5_FF(buf, a, b, c, d, 12, 7, 0x6b901122);
md5_FF(buf, d, a, b, c, 13, 12, 0xfd987193);
md5_FF(buf, c, d, a, b, 14, 17, 0xa679438e);
md5_FF(buf, b, c, d, a, 15, 22, 0x49b40821);
// Round 2
GG(a, b, c, d, 1, 5, 0xf61e2562)
GG(d, a, b, c, 6, 9, 0xc040b340)
GG(c, d, a, b, 11, 14, 0x265e5a51)
GG(b, c, d, a, 0, 20, 0xe9b6c7aa)
GG(a, b, c, d, 5, 5, 0xd62f105d)
GG(d, a, b, c, 10, 9, 0x02441453)
GG(c, d, a, b, 15, 14, 0xd8a1e681)
GG(b, c, d, a, 4, 20, 0xe7d3fbc8)
GG(a, b, c, d, 9, 5, 0x21e1cde6)
GG(d, a, b, c, 14, 9, 0xc33707d6)
GG(c, d, a, b, 3, 14, 0xf4d50d87)
GG(b, c, d, a, 8, 20, 0x455a14ed)
GG(a, b, c, d, 13, 5, 0xa9e3e905)
GG(d, a, b, c, 2, 9, 0xfcefa3f8)
GG(c, d, a, b, 7, 14, 0x676f02d9)
GG(b, c, d, a, 12, 20, 0x8d2a4c8a)
md5_GG(buf, a, b, c, d, 1, 5, 0xf61e2562);
md5_GG(buf, d, a, b, c, 6, 9, 0xc040b340);
md5_GG(buf, c, d, a, b, 11, 14, 0x265e5a51);
md5_GG(buf, b, c, d, a, 0, 20, 0xe9b6c7aa);
md5_GG(buf, a, b, c, d, 5, 5, 0xd62f105d);
md5_GG(buf, d, a, b, c, 10, 9, 0x02441453);
md5_GG(buf, c, d, a, b, 15, 14, 0xd8a1e681);
md5_GG(buf, b, c, d, a, 4, 20, 0xe7d3fbc8);
md5_GG(buf, a, b, c, d, 9, 5, 0x21e1cde6);
md5_GG(buf, d, a, b, c, 14, 9, 0xc33707d6);
md5_GG(buf, c, d, a, b, 3, 14, 0xf4d50d87);
md5_GG(buf, b, c, d, a, 8, 20, 0x455a14ed);
md5_GG(buf, a, b, c, d, 13, 5, 0xa9e3e905);
md5_GG(buf, d, a, b, c, 2, 9, 0xfcefa3f8);
md5_GG(buf, c, d, a, b, 7, 14, 0x676f02d9);
md5_GG(buf, b, c, d, a, 12, 20, 0x8d2a4c8a);
// Round 3
HH(a, b, c, d, 5, 4, 0xfffa3942)
HH(d, a, b, c, 8, 11, 0x8771f681)
HH(c, d, a, b, 11, 16, 0x6d9d6122)
HH(b, c, d, a, 14, 23, 0xfde5380c)
HH(a, b, c, d, 1, 4, 0xa4beea44)
HH(d, a, b, c, 4, 11, 0x4bdecfa9)
HH(c, d, a, b, 7, 16, 0xf6bb4b60)
HH(b, c, d, a, 10, 23, 0xbebfbc70)
HH(a, b, c, d, 13, 4, 0x289b7ec6)
HH(d, a, b, c, 0, 11, 0xeaa127fa)
HH(c, d, a, b, 3, 16, 0xd4ef3085)
HH(b, c, d, a, 6, 23, 0x04881d05)
HH(a, b, c, d, 9, 4, 0xd9d4d039)
HH(d, a, b, c, 12, 11, 0xe6db99e5)
HH(c, d, a, b, 15, 16, 0x1fa27cf8)
HH(b, c, d, a, 2, 23, 0xc4ac5665)
md5_HH(buf, a, b, c, d, 5, 4, 0xfffa3942);
md5_HH(buf, d, a, b, c, 8, 11, 0x8771f681);
md5_HH(buf, c, d, a, b, 11, 16, 0x6d9d6122);
md5_HH(buf, b, c, d, a, 14, 23, 0xfde5380c);
md5_HH(buf, a, b, c, d, 1, 4, 0xa4beea44);
md5_HH(buf, d, a, b, c, 4, 11, 0x4bdecfa9);
md5_HH(buf, c, d, a, b, 7, 16, 0xf6bb4b60);
md5_HH(buf, b, c, d, a, 10, 23, 0xbebfbc70);
md5_HH(buf, a, b, c, d, 13, 4, 0x289b7ec6);
md5_HH(buf, d, a, b, c, 0, 11, 0xeaa127fa);
md5_HH(buf, c, d, a, b, 3, 16, 0xd4ef3085);
md5_HH(buf, b, c, d, a, 6, 23, 0x04881d05);
md5_HH(buf, a, b, c, d, 9, 4, 0xd9d4d039);
md5_HH(buf, d, a, b, c, 12, 11, 0xe6db99e5);
md5_HH(buf, c, d, a, b, 15, 16, 0x1fa27cf8);
md5_HH(buf, b, c, d, a, 2, 23, 0xc4ac5665);
// Round 4
II(a, b, c, d, 0, 6, 0xf4292244)
II(d, a, b, c, 7, 10, 0x432aff97)
II(c, d, a, b, 14, 15, 0xab9423a7)
II(b, c, d, a, 5, 21, 0xfc93a039)
II(a, b, c, d, 12, 6, 0x655b59c3)
II(d, a, b, c, 3, 10, 0x8f0ccc92)
II(c, d, a, b, 10, 15, 0xffeff47d)
II(b, c, d, a, 1, 21, 0x85845dd1)
II(a, b, c, d, 8, 6, 0x6fa87e4f)
II(d, a, b, c, 15, 10, 0xfe2ce6e0)
II(c, d, a, b, 6, 15, 0xa3014314)
II(b, c, d, a, 13, 21, 0x4e0811a1)
II(a, b, c, d, 4, 6, 0xf7537e82)
II(d, a, b, c, 11, 10, 0xbd3af235)
II(c, d, a, b, 2, 15, 0x2ad7d2bb)
II(b, c, d, a, 9, 21, 0xeb86d391)
#undef FF
#undef GG
#undef HH
#undef II
md5_II(buf, a, b, c, d, 0, 6, 0xf4292244);
md5_II(buf, d, a, b, c, 7, 10, 0x432aff97);
md5_II(buf, c, d, a, b, 14, 15, 0xab9423a7);
md5_II(buf, b, c, d, a, 5, 21, 0xfc93a039);
md5_II(buf, a, b, c, d, 12, 6, 0x655b59c3);
md5_II(buf, d, a, b, c, 3, 10, 0x8f0ccc92);
md5_II(buf, c, d, a, b, 10, 15, 0xffeff47d);
md5_II(buf, b, c, d, a, 1, 21, 0x85845dd1);
md5_II(buf, a, b, c, d, 8, 6, 0x6fa87e4f);
md5_II(buf, d, a, b, c, 15, 10, 0xfe2ce6e0);
md5_II(buf, c, d, a, b, 6, 15, 0xa3014314);
md5_II(buf, b, c, d, a, 13, 21, 0x4e0811a1);
md5_II(buf, a, b, c, d, 4, 6, 0xf7537e82);
md5_II(buf, d, a, b, c, 11, 10, 0xbd3af235);
md5_II(buf, c, d, a, b, 2, 15, 0x2ad7d2bb);
md5_II(buf, b, c, d, a, 9, 21, 0xeb86d391);
// write hash values back in the correct order
__ ldrw(rscratch1, Address(state, 0));
@ -3618,6 +3633,34 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
// Double rounds for sha512.
void sha512_dround(int dr,
FloatRegister vi0, FloatRegister vi1,
FloatRegister vi2, FloatRegister vi3,
FloatRegister vi4, FloatRegister vrc0,
FloatRegister vrc1, FloatRegister vin0,
FloatRegister vin1, FloatRegister vin2,
FloatRegister vin3, FloatRegister vin4) {
if (dr < 36) {
__ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
}
__ addv(v5, __ T2D, vrc0, vin0);
__ ext(v6, __ T16B, vi2, vi3, 8);
__ ext(v5, __ T16B, v5, v5, 8);
__ ext(v7, __ T16B, vi1, vi2, 8);
__ addv(vi3, __ T2D, vi3, v5);
if (dr < 32) {
__ ext(v5, __ T16B, vin3, vin4, 8);
__ sha512su0(vin0, __ T2D, vin1);
}
__ sha512h(vi3, __ T2D, v6, v7);
if (dr < 32) {
__ sha512su1(vin0, __ T2D, vin2, v5);
}
__ addv(vi4, __ T2D, vi1, vi3);
__ sha512h2(vi3, __ T2D, vi1, vi0);
}
// Arguments:
//
// Inputs:
@ -3657,25 +3700,6 @@ class StubGenerator: public StubCodeGenerator {
0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
};
// Double rounds for sha512.
#define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
if (dr < 36) \
__ ld1(v##rc1, __ T2D, __ post(rscratch2, 16)); \
__ addv(v5, __ T2D, v##rc0, v##in0); \
__ ext(v6, __ T16B, v##i2, v##i3, 8); \
__ ext(v5, __ T16B, v5, v5, 8); \
__ ext(v7, __ T16B, v##i1, v##i2, 8); \
__ addv(v##i3, __ T2D, v##i3, v5); \
if (dr < 32) { \
__ ext(v5, __ T16B, v##in3, v##in4, 8); \
__ sha512su0(v##in0, __ T2D, v##in1); \
} \
__ sha512h(v##i3, __ T2D, v6, v7); \
if (dr < 32) \
__ sha512su1(v##in0, __ T2D, v##in2, v5); \
__ addv(v##i4, __ T2D, v##i1, v##i3); \
__ sha512h2(v##i3, __ T2D, v##i1, v##i0); \
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
@ -3719,46 +3743,46 @@ class StubGenerator: public StubCodeGenerator {
__ mov(v2, __ T16B, v10);
__ mov(v3, __ T16B, v11);
sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17);
sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18);
sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19);
sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12);
sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13);
sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14);
sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15);
sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16);
sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17);
sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18);
sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19);
sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12);
sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13);
sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14);
sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15);
sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16);
sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17);
sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18);
sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19);
sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12);
sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13);
sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14);
sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15);
sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16);
sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17);
sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18);
sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19);
sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12);
sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13);
sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14);
sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15);
sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16);
sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12, 0, 0, 0, 0);
sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13, 0, 0, 0, 0);
sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14, 0, 0, 0, 0);
sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15, 0, 0, 0, 0);
sha512_dround(36, 3, 0, 4, 2, 1, 24, 0, 16, 0, 0, 0, 0);
sha512_dround(37, 2, 3, 1, 4, 0, 25, 0, 17, 0, 0, 0, 0);
sha512_dround(38, 4, 2, 0, 1, 3, 26, 0, 18, 0, 0, 0, 0);
sha512_dround(39, 1, 4, 3, 0, 2, 27, 0, 19, 0, 0, 0, 0);
sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0);
sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);
__ addv(v8, __ T2D, v8, v0);
__ addv(v9, __ T2D, v9, v1);