/*
* Copyright (c) 2019, 2026, Intel Corporation. All rights reserved.
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"
#include "stubGenerator_x86_64.hpp"
#define __ _masm->
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif // PRODUCT
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// Constants
const int AESBlockSize = 16;
// Shuffle mask for fixing up 128-bit words consisting of big-endian 32-bit integers.
ATTRIBUTE_ALIGNED(16) static const uint64_t KEY_SHUFFLE_MASK[] = {
0x0405060700010203UL, 0x0C0D0E0F08090A0BUL
};
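// Example (a reading of the byte layout above): pshufb with KEY_SHUFFLE_MASK
// sends source byte 3 to destination byte 0, byte 2 to byte 1, and so on,
// i.e. it reverses the byte order inside each 32-bit word:
// {b3 b2 b1 b0} -> {b0 b1 b2 b3}. This converts the big-endian ints produced
// by the Java key schedule into the layout the AES-NI round keys expect.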
static address key_shuffle_mask_addr() {
return (address)KEY_SHUFFLE_MASK;
}
// Shuffle mask for big-endian 128-bit integers.
ATTRIBUTE_ALIGNED(64) static const uint64_t COUNTER_SHUFFLE_MASK[] = {
0x08090A0B0C0D0E0FUL, 0x0001020304050607UL,
0x08090A0B0C0D0E0FUL, 0x0001020304050607UL,
0x08090A0B0C0D0E0FUL, 0x0001020304050607UL,
0x08090A0B0C0D0E0FUL, 0x0001020304050607UL,
};
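// Laid out in memory this mask reads 0F 0E ... 01 00 per 128-bit lane, so
// pshufb with it reverses all 16 bytes of each lane: the big-endian counter
// block becomes a little-endian integer that plain SIMD adds can increment.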
static address counter_shuffle_mask_addr() {
return (address)COUNTER_SHUFFLE_MASK;
}
// This mask is used for incrementing counter value
ATTRIBUTE_ALIGNED(64) static const uint64_t COUNTER_MASK_LINC0[] = {
0x0000000000000000UL, 0x0000000000000000UL,
0x0000000000000001UL, 0x0000000000000000UL,
0x0000000000000002UL, 0x0000000000000000UL,
0x0000000000000003UL, 0x0000000000000000UL,
};
static address counter_mask_linc0_addr() {
return (address)COUNTER_MASK_LINC0;
}
ATTRIBUTE_ALIGNED(16) static const uint64_t COUNTER_MASK_LINC1[] = {
0x0000000000000001UL, 0x0000000000000000UL,
};
static address counter_mask_linc1_addr() {
return (address)COUNTER_MASK_LINC1;
}
ATTRIBUTE_ALIGNED(16) static const uint64_t COUNTER_MASK_LINC1F[] = {
0x0000000000000000UL, 0x0100000000000000UL,
};
static address counter_mask_linc1f_addr() {
return (address)COUNTER_MASK_LINC1F;
}
ATTRIBUTE_ALIGNED(16) static const uint64_t COUNTER_MASK_LINC2[] = {
0x0000000000000002UL, 0x0000000000000000UL,
};
static address counter_mask_linc2_addr() {
return (address)COUNTER_MASK_LINC2;
}
ATTRIBUTE_ALIGNED(16) static const uint64_t COUNTER_MASK_LINC2F[] = {
0x0000000000000000UL, 0x0200000000000000UL,
};
static address counter_mask_linc2f_addr() {
return (address)COUNTER_MASK_LINC2F;
}
ATTRIBUTE_ALIGNED(64) static const uint64_t COUNTER_MASK_LINC4[] = {
0x0000000000000004UL, 0x0000000000000000UL,
0x0000000000000004UL, 0x0000000000000000UL,
0x0000000000000004UL, 0x0000000000000000UL,
0x0000000000000004UL, 0x0000000000000000UL,
};
static address counter_mask_linc4_addr() {
return (address)COUNTER_MASK_LINC4;
}
ATTRIBUTE_ALIGNED(64) static const uint64_t COUNTER_MASK_LINC8[] = {
0x0000000000000008UL, 0x0000000000000000UL,
0x0000000000000008UL, 0x0000000000000000UL,
0x0000000000000008UL, 0x0000000000000000UL,
0x0000000000000008UL, 0x0000000000000000UL,
};
static address counter_mask_linc8_addr() {
return (address)COUNTER_MASK_LINC8;
}
ATTRIBUTE_ALIGNED(64) static const uint64_t COUNTER_MASK_LINC16[] = {
0x0000000000000010UL, 0x0000000000000000UL,
0x0000000000000010UL, 0x0000000000000000UL,
0x0000000000000010UL, 0x0000000000000000UL,
0x0000000000000010UL, 0x0000000000000000UL,
};
static address counter_mask_linc16_addr() {
return (address)COUNTER_MASK_LINC16;
}
ATTRIBUTE_ALIGNED(64) static const uint64_t COUNTER_MASK_LINC32[] = {
0x0000000000000020UL, 0x0000000000000000UL,
0x0000000000000020UL, 0x0000000000000000UL,
0x0000000000000020UL, 0x0000000000000000UL,
0x0000000000000020UL, 0x0000000000000000UL,
};
static address counter_mask_linc32_addr() {
return (address)COUNTER_MASK_LINC32;
}
ATTRIBUTE_ALIGNED(64) static const uint64_t COUNTER_MASK_ONES[] = {
0x0000000000000000UL, 0x0000000000000001UL,
0x0000000000000000UL, 0x0000000000000001UL,
0x0000000000000000UL, 0x0000000000000001UL,
0x0000000000000000UL, 0x0000000000000001UL,
};
static address counter_mask_ones_addr() {
return (address)COUNTER_MASK_ONES;
}
ATTRIBUTE_ALIGNED(64) static const uint64_t GHASH_POLYNOMIAL_REDUCTION[] = {
0x00000001C2000000UL, 0xC200000000000000UL,
0x00000001C2000000UL, 0xC200000000000000UL,
0x00000001C2000000UL, 0xC200000000000000UL,
0x00000001C2000000UL, 0xC200000000000000UL,
};
static address ghash_polynomial_reduction_addr() {
return (address)GHASH_POLYNOMIAL_REDUCTION;
}
ATTRIBUTE_ALIGNED(16) static const uint64_t GHASH_POLYNOMIAL_TWO_ONE[] = {
0x0000000000000001UL, 0x0000000100000000UL,
};
static address ghash_polynomial_two_one_addr() {
return (address)GHASH_POLYNOMIAL_TWO_ONE;
}
// This mask is used for incrementing counter value
ATTRIBUTE_ALIGNED(64) static const uint64_t COUNTER_MASK_ADDBE_4444[] = {
0x0000000000000000ULL, 0x0400000000000000ULL,
0x0000000000000000ULL, 0x0400000000000000ULL,
0x0000000000000000ULL, 0x0400000000000000ULL,
0x0000000000000000ULL, 0x0400000000000000ULL,
};
static address counter_mask_addbe_4444_addr() {
return (address)COUNTER_MASK_ADDBE_4444;
}
// This mask is used for incrementing counter value
ATTRIBUTE_ALIGNED(64) static const uint64_t COUNTER_MASK_ADDBE_1234[] = {
0x0000000000000000ULL, 0x0100000000000000ULL,
0x0000000000000000ULL, 0x0200000000000000ULL,
0x0000000000000000ULL, 0x0300000000000000ULL,
0x0000000000000000ULL, 0x0400000000000000ULL,
};
static address counter_mask_addbe_1234_addr() {
return (address)COUNTER_MASK_ADDBE_1234;
}
// This mask is used for incrementing counter value
ATTRIBUTE_ALIGNED(64) static const uint64_t COUNTER_MASK_ADD_1234[] = {
0x0000000000000001ULL, 0x0000000000000000ULL,
0x0000000000000002ULL, 0x0000000000000000ULL,
0x0000000000000003ULL, 0x0000000000000000ULL,
0x0000000000000004ULL, 0x0000000000000000ULL,
};
static address counter_mask_add_1234_addr() {
return (address)COUNTER_MASK_ADD_1234;
}
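// A note on the two mask families above (a reading of their byte layouts, not
// authoritative): the plain LINC masks hold small integers in the low qword of
// each 128-bit lane and are added to counters that were first byte-reversed
// with COUNTER_SHUFFLE_MASK, while the *F and ADDBE masks place the increment
// in the most significant byte of the lane so a big-endian counter can be
// bumped in place without byte-swapping. The in-place form is only safe while
// the counter's low byte does not wrap; the GCM stub below keeps a CTR_CHECK
// value in r14 to detect that overflow.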
// AES intrinsic stubs
void StubGenerator::generate_aes_stubs() {
if (UseAESIntrinsics) {
StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
if (VM_Version::supports_avx512_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq()) {
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
StubRoutines::_electronicCodeBook_encryptAESCrypt = generate_electronicCodeBook_encryptAESCrypt();
StubRoutines::_electronicCodeBook_decryptAESCrypt = generate_electronicCodeBook_decryptAESCrypt();
StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
} else {
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
StubRoutines::_electronicCodeBook_encryptAESCrypt = generate_electronicCodeBook_encryptAESCrypt_Parallel();
StubRoutines::_electronicCodeBook_decryptAESCrypt = generate_electronicCodeBook_decryptAESCrypt_Parallel();
if (VM_Version::supports_avx2()) {
StubRoutines::_galoisCounterMode_AESCrypt = generate_avx2_galoisCounterMode_AESCrypt();
}
}
}
if (UseAESCTRIntrinsics) {
if (VM_Version::supports_avx512_vaes() && VM_Version::supports_avx512bw() && VM_Version::supports_avx512vl()) {
StubRoutines::_counterMode_AESCrypt = generate_counterMode_VectorAESCrypt();
} else {
StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
}
}
}
// Vector AES Galois Counter Mode implementation.
//
// Inputs: Windows | Linux
// in = rcx (c_rarg0) | rdi (c_rarg0)
// len = rdx (c_rarg1) | rsi (c_rarg1)
// ct = r8 (c_rarg2) | rdx (c_rarg2)
// out = r9 (c_rarg3) | rcx (c_rarg3)
// key = rsi | r8 (c_rarg4)
// state = rdi | r9 (c_rarg5)
// subkeyHtbl = r10 | r10
// counter = r11 | r11
//
// Output:
// rax - number of processed bytes
address StubGenerator::generate_galoisCounterMode_AESCrypt() {
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
const Register in = c_rarg0;
const Register len = c_rarg1;
const Register ct = c_rarg2;
const Register out = c_rarg3;
// (the counter argument, loaded below, is updated with the incremented value on return)
#ifndef _WIN64
const Register key = c_rarg4;
const Register state = c_rarg5;
const Address subkeyH_mem(rbp, 2 * wordSize);
const Register subkeyHtbl = r10;
const Register avx512_subkeyHtbl = r12;
const Address counter_mem(rbp, 3 * wordSize);
const Register counter = r11;
#else
const Address key_mem(rbp, 6 * wordSize);
const Register key = rsi;
const Address state_mem(rbp, 7 * wordSize);
const Register state = rdi;
const Address subkeyH_mem(rbp, 8 * wordSize);
const Register subkeyHtbl = r10;
const Register avx512_subkeyHtbl = r12;
const Address counter_mem(rbp, 9 * wordSize);
const Register counter = r11;
#endif
__ enter();
// Save state before entering routine
__ push_ppx(r12); // holds pointer to avx512_subkeyHtbl
__ push_ppx(r14); // holds CTR_CHECK value to check for overflow
__ push_ppx(r15); // holds number of rounds
__ push_ppx(rbx); // scratch register
#ifdef _WIN64
// on win64, key and state are passed on the stack; load them into registers
__ push_ppx(rsi);
__ push_ppx(rdi);
__ movptr(key, key_mem);
__ movptr(state, state_mem);
#endif
__ movptr(subkeyHtbl, subkeyH_mem);
__ movptr(counter, counter_mem);
// Align stack
__ andq(rsp, -64);
__ subptr(rsp, 200 * longSize); // Create space on the stack for 64 htbl entries and 8 zmm AES entries
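// Sketch of the arithmetic (longSize == 8): 64 htbl entries * 16 bytes plus
// 8 zmm spills * 64 bytes is 1536 bytes; 200 longs == 1600 bytes leaves a
// little slack on top of the 64-byte alignment established above.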
__ movptr(avx512_subkeyHtbl, rsp);
aesgcm_avx512(in, len, ct, out, key, state, subkeyHtbl, avx512_subkeyHtbl, counter);
__ vzeroupper();
// Restore state before leaving routine
#ifdef _WIN64
__ lea(rsp, Address(rbp, -6 * wordSize));
__ pop_ppx(rdi);
__ pop_ppx(rsi);
#else
__ lea(rsp, Address(rbp, -4 * wordSize));
#endif
__ pop_ppx(rbx);
__ pop_ppx(r15);
__ pop_ppx(r14);
__ pop_ppx(r12);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
// AVX2 Vector AES Galois Counter Mode implementation.
//
// Inputs: Windows | Linux
// in = rcx (c_rarg0) | rdi (c_rarg0)
// len = rdx (c_rarg1) | rsi (c_rarg1)
// ct = r8 (c_rarg2) | rdx (c_rarg2)
// out = r9 (c_rarg3) | rcx (c_rarg3)
// key = rdi | r8 (c_rarg4)
// state = r13 | r9 (c_rarg5)
// subkeyHtbl = r11 | r11
// counter = rsi | r12
//
// Output:
// rax - number of processed bytes
address StubGenerator::generate_avx2_galoisCounterMode_AESCrypt() {
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
const Register in = c_rarg0;
const Register len = c_rarg1;
const Register ct = c_rarg2;
const Register out = c_rarg3;
// (the counter argument, loaded below, is updated with the incremented value on return)
#ifndef _WIN64
const Register key = c_rarg4;
const Register state = c_rarg5;
const Address subkeyH_mem(rbp, 2 * wordSize);
const Register subkeyHtbl = r11;
const Address counter_mem(rbp, 3 * wordSize);
const Register counter = r12;
#else
const Address key_mem(rbp, 6 * wordSize);
const Register key = rdi;
const Address state_mem(rbp, 7 * wordSize);
const Register state = r13;
const Address subkeyH_mem(rbp, 8 * wordSize);
const Register subkeyHtbl = r11;
const Address counter_mem(rbp, 9 * wordSize);
const Register counter = rsi;
#endif
__ enter();
// Save state before entering routine
__ push_ppx(r12);
__ push_ppx(r13);
__ push_ppx(r14);
__ push_ppx(r15);
__ push_ppx(rbx);
#ifdef _WIN64
// on win64, key and state are passed on the stack; load them into registers
__ push_ppx(rsi);
__ push_ppx(rdi);
__ movptr(key, key_mem);
__ movptr(state, state_mem);
#endif
__ movptr(subkeyHtbl, subkeyH_mem);
__ movptr(counter, counter_mem);
// Save rsp
__ movq(r14, rsp);
// Align stack
__ andq(rsp, -64);
__ subptr(rsp, 16 * longSize); // Create space on the stack for saving AES entries
aesgcm_avx2(in, len, ct, out, key, state, subkeyHtbl, counter);
__ vzeroupper();
__ movq(rsp, r14);
// Restore state before leaving routine
#ifdef _WIN64
__ pop_ppx(rdi);
__ pop_ppx(rsi);
#endif
__ pop_ppx(rbx);
__ pop_ppx(r15);
__ pop_ppx(r14);
__ pop_ppx(r13);
__ pop_ppx(r12);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
// Vector AES Counter implementation
address StubGenerator::generate_counterMode_VectorAESCrypt() {
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
const Register from = c_rarg0; // source array address
const Register to = c_rarg1; // destination array address
const Register key = c_rarg2; // key array address r8
const Register counter = c_rarg3; // counter byte array initialized from counter array address
// and updated with the incremented counter in the end
#ifndef _WIN64
const Register len_reg = c_rarg4;
const Register saved_encCounter_start = c_rarg5;
const Register used_addr = r10;
const Address used_mem(rbp, 2 * wordSize);
const Register used = r11;
#else
const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
const Register len_reg = r10; // pick the first volatile windows register
const Register saved_encCounter_start = r11;
const Register used_addr = r13;
const Register used = r14;
#endif
__ enter();
// Save state before entering routine
__ push_ppx(r12);
__ push_ppx(r13);
__ push_ppx(r14);
__ push_ppx(r15);
#ifdef _WIN64
// on win64, fill len_reg from stack position
__ movl(len_reg, len_mem);
__ movptr(saved_encCounter_start, saved_encCounter_mem);
__ movptr(used_addr, used_mem);
__ movl(used, Address(used_addr, 0));
#else
__ push_ppx(len_reg); // Save
__ movptr(used_addr, used_mem);
__ movl(used, Address(used_addr, 0));
#endif
__ push_ppx(rbx);
aesctr_encrypt(from, to, key, counter, len_reg, used, used_addr, saved_encCounter_start);
__ vzeroupper();
// Restore state before leaving routine
__ pop_ppx(rbx);
#ifdef _WIN64
__ movl(rax, len_mem); // return length
#else
__ pop_ppx(rax); // return length
#endif
__ pop_ppx(r15);
__ pop_ppx(r14);
__ pop_ppx(r13);
__ pop_ppx(r12);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
// This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
// to hide instruction latency
//
// Arguments:
//
// Inputs:
// c_rarg0 - source byte array address
// c_rarg1 - destination byte array address
// c_rarg2 - session key (Ke) in little endian int array
// c_rarg3 - counter vector byte array address
// Linux
// c_rarg4 - input length
// c_rarg5 - saved encryptedCounter start
// rbp + 6 * wordSize - saved used length
// Windows
// rbp + 6 * wordSize - input length
// rbp + 7 * wordSize - saved encryptedCounter start
// rbp + 8 * wordSize - saved used length
//
// Output:
// rax - input length
//
address StubGenerator::generate_counterMode_AESCrypt_Parallel() {
assert(UseAES, "need AES instructions and misaligned SSE support");
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
const Register from = c_rarg0; // source array address
const Register to = c_rarg1; // destination array address
const Register key = c_rarg2; // key array address
const Register counter = c_rarg3; // counter byte array initialized from counter array address
// and updated with the incremented counter in the end
#ifndef _WIN64
const Register len_reg = c_rarg4;
const Register saved_encCounter_start = c_rarg5;
const Register used_addr = r10;
const Address used_mem(rbp, 2 * wordSize);
const Register used = r11;
#else
const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
const Register len_reg = r10; // pick the first volatile windows register
const Register saved_encCounter_start = r11;
const Register used_addr = r13;
const Register used = r14;
#endif
const Register pos = rax;
const int PARALLEL_FACTOR = 6;
const XMMRegister xmm_counter_shuf_mask = xmm0;
const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
const XMMRegister xmm_curr_counter = xmm2;
const XMMRegister xmm_key_tmp0 = xmm3;
const XMMRegister xmm_key_tmp1 = xmm4;
// registers holding the six results in the parallelized loop
const XMMRegister xmm_result0 = xmm5;
const XMMRegister xmm_result1 = xmm6;
const XMMRegister xmm_result2 = xmm7;
const XMMRegister xmm_result3 = xmm8;
const XMMRegister xmm_result4 = xmm9;
const XMMRegister xmm_result5 = xmm10;
const XMMRegister xmm_from0 = xmm11;
const XMMRegister xmm_from1 = xmm12;
const XMMRegister xmm_from2 = xmm13;
const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; xmm6-xmm15 must be preserved on Win64
const XMMRegister xmm_from4 = xmm3; // reuse xmm3 and xmm4: xmm_key_tmp0/1 are no longer needed
const XMMRegister xmm_from5 = xmm4; // once the AES rounds are done and the input text is loaded
// for key_128, key_192, key_256
const int rounds[3] = {10, 12, 14};
Label L_exit_preLoop, L_preLoop_start;
Label L_multiBlock_loopTop[3];
Label L_singleBlockLoopTop[3];
Label L__incCounter[3][6]; // for 6 blocks
Label L__incCounter_single[3]; // for single block: key128, key192, key256
Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
Label L_exit;
__ enter(); // required for proper stackwalking of RuntimeStub frame
#ifdef _WIN64
// allocate spill slots for r13, r14
enum {
saved_r13_offset,
saved_r14_offset
};
__ subptr(rsp, 2 * wordSize);
__ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
__ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
// on win64, fill len_reg from stack position
__ movl(len_reg, len_mem);
__ movptr(saved_encCounter_start, saved_encCounter_mem);
__ movptr(used_addr, used_mem);
__ movl(used, Address(used_addr, 0));
#else
__ push_ppx(len_reg); // Save
__ movptr(used_addr, used_mem);
__ movl(used, Address(used_addr, 0));
#endif
__ push_ppx(rbx); // Save RBX
__ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
__ movdqu(xmm_counter_shuf_mask, ExternalAddress(counter_shuffle_mask_addr()), pos /*rscratch*/);
__ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
__ movptr(pos, 0);
// Use the partially used encrypted counter from the last invocation
__ BIND(L_preLoop_start);
__ cmpptr(used, 16);
__ jcc(Assembler::aboveEqual, L_exit_preLoop);
__ cmpptr(len_reg, 0);
__ jcc(Assembler::lessEqual, L_exit_preLoop);
__ movb(rbx, Address(saved_encCounter_start, used));
__ xorb(rbx, Address(from, pos));
__ movb(Address(to, pos), rbx);
__ addptr(pos, 1);
__ addptr(used, 1);
__ subptr(len_reg, 1);
__ jmp(L_preLoop_start);
__ BIND(L_exit_preLoop);
__ movl(Address(used_addr, 0), used);
// key length could be only {11, 13, 15} * 4 = {44, 52, 60}
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), rbx /*rscratch*/);
__ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ cmpl(rbx, 52);
__ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
__ cmpl(rbx, 60);
__ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
#define CTR_DoSix(opc, src_reg) \
__ opc(xmm_result0, src_reg); \
__ opc(xmm_result1, src_reg); \
__ opc(xmm_result2, src_reg); \
__ opc(xmm_result3, src_reg); \
__ opc(xmm_result4, src_reg); \
__ opc(xmm_result5, src_reg);
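// CTR_DoSix emits the same instruction once per result register, creating six
// independent dependency chains so the pipelined AES unit stays busy; this is
// how the "6 blocks at a time" latency hiding described above is realized.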
// k == 0 : generate code for key_128
// k == 1 : generate code for key_192
// k == 2 : generate code for key_256
for (int k = 0; k < 3; ++k) {
// multi-block loop starts here
__ align(OptoLoopAlignment);
__ BIND(L_multiBlock_loopTop[k]);
__ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
__ jcc(Assembler::less, L_singleBlockLoopTop[k]);
load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
// load, then increment counters
CTR_DoSix(movdqa, xmm_curr_counter);
inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]);
inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
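// At this point xmm_result0 holds counter+0 (the unincremented copy),
// xmm_result1..5 hold counter+1..+5, and xmm_curr_counter has advanced to
// counter+6 for the next iteration.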
CTR_DoSix(pshufb, xmm_counter_shuf_mask); // counters incremented; shuffle back to big-endian byte order before encryption
CTR_DoSix(pxor, xmm_key_tmp0); // XOR with round 0 key
//load two ROUND_KEYs at a time
for (int i = 1; i < rounds[k]; ) {
load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
CTR_DoSix(aesenc, xmm_key_tmp1);
i++;
if (i != rounds[k]) {
CTR_DoSix(aesenc, xmm_key_tmp0);
} else {
CTR_DoSix(aesenclast, xmm_key_tmp0);
}
i++;
}
// get next PARALLEL_FACTOR blocks into xmm_result registers
__ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
__ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
__ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
__ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
__ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
__ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
__ pxor(xmm_result0, xmm_from0);
__ pxor(xmm_result1, xmm_from1);
__ pxor(xmm_result2, xmm_from2);
__ pxor(xmm_result3, xmm_from3);
__ pxor(xmm_result4, xmm_from4);
__ pxor(xmm_result5, xmm_from5);
// store 6 results into the next 64 bytes of output
__ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
__ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
__ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
__ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
__ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
__ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
__ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance position past the blocks just processed
__ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
__ jmp(L_multiBlock_loopTop[k]);
// singleBlock starts here
__ align(OptoLoopAlignment);
__ BIND(L_singleBlockLoopTop[k]);
__ cmpptr(len_reg, 0);
__ jcc(Assembler::lessEqual, L_exit);
load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
__ movdqa(xmm_result0, xmm_curr_counter);
inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
__ pshufb(xmm_result0, xmm_counter_shuf_mask);
__ pxor(xmm_result0, xmm_key_tmp0);
for (int i = 1; i < rounds[k]; i++) {
load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
__ aesenc(xmm_result0, xmm_key_tmp0);
}
load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
__ aesenclast(xmm_result0, xmm_key_tmp0);
__ cmpptr(len_reg, AESBlockSize);
__ jcc(Assembler::less, L_processTail_insr[k]);
__ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
__ pxor(xmm_result0, xmm_from0);
__ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
__ addptr(pos, AESBlockSize);
__ subptr(len_reg, AESBlockSize);
__ jmp(L_singleBlockLoopTop[k]);
__ BIND(L_processTail_insr[k]); // Process the tail part of the input array
__ addptr(pos, len_reg); // 1. Insert bytes from src array into xmm_from0 register
__ testptr(len_reg, 8);
__ jcc(Assembler::zero, L_processTail_4_insr[k]);
__ subptr(pos,8);
__ pinsrq(xmm_from0, Address(from, pos), 0);
__ BIND(L_processTail_4_insr[k]);
__ testptr(len_reg, 4);
__ jcc(Assembler::zero, L_processTail_2_insr[k]);
__ subptr(pos,4);
__ pslldq(xmm_from0, 4);
__ pinsrd(xmm_from0, Address(from, pos), 0);
__ BIND(L_processTail_2_insr[k]);
__ testptr(len_reg, 2);
__ jcc(Assembler::zero, L_processTail_1_insr[k]);
__ subptr(pos, 2);
__ pslldq(xmm_from0, 2);
__ pinsrw(xmm_from0, Address(from, pos), 0);
__ BIND(L_processTail_1_insr[k]);
__ testptr(len_reg, 1);
__ jcc(Assembler::zero, L_processTail_exit_insr[k]);
__ subptr(pos, 1);
__ pslldq(xmm_from0, 1);
__ pinsrb(xmm_from0, Address(from, pos), 0);
__ BIND(L_processTail_exit_insr[k]);
__ movdqu(Address(saved_encCounter_start, 0), xmm_result0); // 2. Save the encrypted counter for the next invocation,
__ pxor(xmm_result0, xmm_from0); //    then XOR it with the plaintext bytes
__ testptr(len_reg, 8);
__ jcc(Assembler::zero, L_processTail_4_extr[k]); // 3. Extract bytes from xmm_result0 into the dest. array
__ pextrq(Address(to, pos), xmm_result0, 0);
__ psrldq(xmm_result0, 8);
__ addptr(pos, 8);
__ BIND(L_processTail_4_extr[k]);
__ testptr(len_reg, 4);
__ jcc(Assembler::zero, L_processTail_2_extr[k]);
__ pextrd(Address(to, pos), xmm_result0, 0);
__ psrldq(xmm_result0, 4);
__ addptr(pos, 4);
__ BIND(L_processTail_2_extr[k]);
__ testptr(len_reg, 2);
__ jcc(Assembler::zero, L_processTail_1_extr[k]);
__ pextrw(Address(to, pos), xmm_result0, 0);
__ psrldq(xmm_result0, 2);
__ addptr(pos, 2);
__ BIND(L_processTail_1_extr[k]);
__ testptr(len_reg, 1);
__ jcc(Assembler::zero, L_processTail_exit_extr[k]);
__ pextrb(Address(to, pos), xmm_result0, 0);
__ BIND(L_processTail_exit_extr[k]);
__ movl(Address(used_addr, 0), len_reg);
__ jmp(L_exit);
}
__ BIND(L_exit);
__ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); // counter is shuffled back to big-endian
__ movdqu(Address(counter, 0), xmm_curr_counter); // save counter back
__ pop_ppx(rbx); // pop the saved RBX.
#ifdef _WIN64
__ movl(rax, len_mem);
__ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
__ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
__ addptr(rsp, 2 * wordSize);
#else
__ pop_ppx(rax); // return 'len'
#endif
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
address StubGenerator::generate_cipherBlockChaining_decryptVectorAESCrypt() {
assert(VM_Version::supports_avx512_vaes(), "need AVX-512 VAES instructions");
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
const Register from = c_rarg0; // source array address
const Register to = c_rarg1; // destination array address
const Register key = c_rarg2; // key array address
const Register rvec = c_rarg3; // r byte array initialized from initvector array address
// and left with the results of the last encryption block
#ifndef _WIN64
const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
#else
const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
const Register len_reg = r11; // pick the volatile windows register
#endif
Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop,
Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit;
__ enter();
#ifdef _WIN64
// on win64, fill len_reg from stack position
__ movl(len_reg, len_mem);
#else
__ push_ppx(len_reg); // Save
#endif
__ push_ppx(rbx);
__ vzeroupper();
// Temporary variable declaration for swapping key bytes
const XMMRegister xmm_key_shuf_mask = xmm1;
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), rbx /*rscratch*/);
// Calculate number of rounds from key size: 44 for 10-rounds, 52 for 12-rounds, 60 for 14-rounds
const Register rounds = rbx;
__ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
const XMMRegister IV = xmm0;
// Load IV and broadcast value to 512-bits
__ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);
// Temporary variables for storing round keys
const XMMRegister RK0 = xmm30;
const XMMRegister RK1 = xmm9;
const XMMRegister RK2 = xmm18;
const XMMRegister RK3 = xmm19;
const XMMRegister RK4 = xmm20;
const XMMRegister RK5 = xmm21;
const XMMRegister RK6 = xmm22;
const XMMRegister RK7 = xmm23;
const XMMRegister RK8 = xmm24;
const XMMRegister RK9 = xmm25;
const XMMRegister RK10 = xmm26;
// Load and shuffle key
// the java expanded key ordering is rotated one position from what we want
// so we start from 1*16 here and hit 0*16 last
ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask);
ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask);
ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask);
ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask);
ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask);
ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask);
ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask);
ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask);
ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask);
ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask);
ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask);
// Variables for storing source cipher text
const XMMRegister S0 = xmm10;
const XMMRegister S1 = xmm11;
const XMMRegister S2 = xmm12;
const XMMRegister S3 = xmm13;
const XMMRegister S4 = xmm14;
const XMMRegister S5 = xmm15;
const XMMRegister S6 = xmm16;
const XMMRegister S7 = xmm17;
// Variables for storing decrypted text
const XMMRegister B0 = xmm1;
const XMMRegister B1 = xmm2;
const XMMRegister B2 = xmm3;
const XMMRegister B3 = xmm4;
const XMMRegister B4 = xmm5;
const XMMRegister B5 = xmm6;
const XMMRegister B6 = xmm7;
const XMMRegister B7 = xmm8;
__ cmpl(rounds, 44);
__ jcc(Assembler::greater, KEY_192);
__ jmp(Loop);
__ BIND(KEY_192);
const XMMRegister RK11 = xmm27;
const XMMRegister RK12 = xmm28;
ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask);
ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask);
__ cmpl(rounds, 52);
__ jcc(Assembler::greater, KEY_256);
__ jmp(Loop);
__ BIND(KEY_256);
const XMMRegister RK13 = xmm29;
const XMMRegister RK14 = xmm31;
ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask);
ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask);
__ BIND(Loop);
__ cmpl(len_reg, 512);
__ jcc(Assembler::below, Lcbc_dec_rem);
__ BIND(Loop1);
__ subl(len_reg, 512);
__ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit);
__ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit);
__ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit);
__ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit);
__ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit);
__ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit);
__ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit);
__ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit);
__ leaq(from, Address(from, 8 * 64));
__ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
__ evpxorq(B1, S1, RK1, Assembler::AVX_512bit);
__ evpxorq(B2, S2, RK1, Assembler::AVX_512bit);
__ evpxorq(B3, S3, RK1, Assembler::AVX_512bit);
__ evpxorq(B4, S4, RK1, Assembler::AVX_512bit);
__ evpxorq(B5, S5, RK1, Assembler::AVX_512bit);
__ evpxorq(B6, S6, RK1, Assembler::AVX_512bit);
__ evpxorq(B7, S7, RK1, Assembler::AVX_512bit);
__ evalignq(IV, S0, IV, 0x06);
__ evalignq(S0, S1, S0, 0x06);
__ evalignq(S1, S2, S1, 0x06);
__ evalignq(S2, S3, S2, 0x06);
__ evalignq(S3, S4, S3, 0x06);
__ evalignq(S4, S5, S4, 0x06);
__ evalignq(S5, S6, S5, 0x06);
__ evalignq(S6, S7, S6, 0x06);
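// valignq concatenates {src1:src2} and extracts qwords 6..13, i.e. each
// result keeps the top 128-bit block of the older register and pulls in the
// first three blocks of the newer one. After the chain: IV = [chaining block,
// C0..C2], S0 = [C3..C6], ..., S6 = [C27..C30], so the XORs at Loop2 pair
// every decrypted block in B0..B7 with the ciphertext block immediately
// before it, exactly the CBC recurrence P_i = Dec(C_i) ^ C_{i-1}.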
roundDec(RK2);
roundDec(RK3);
roundDec(RK4);
roundDec(RK5);
roundDec(RK6);
roundDec(RK7);
roundDec(RK8);
roundDec(RK9);
roundDec(RK10);
__ cmpl(rounds, 44);
__ jcc(Assembler::belowEqual, L_128);
roundDec(RK11);
roundDec(RK12);
__ cmpl(rounds, 52);
__ jcc(Assembler::belowEqual, L_192);
roundDec(RK13);
roundDec(RK14);
__ BIND(L_256);
roundDeclast(RK0);
__ jmp(Loop2);
__ BIND(L_128);
roundDeclast(RK0);
__ jmp(Loop2);
__ BIND(L_192);
roundDeclast(RK0);
__ BIND(Loop2);
__ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
__ evpxorq(B1, B1, S0, Assembler::AVX_512bit);
__ evpxorq(B2, B2, S1, Assembler::AVX_512bit);
__ evpxorq(B3, B3, S2, Assembler::AVX_512bit);
__ evpxorq(B4, B4, S3, Assembler::AVX_512bit);
__ evpxorq(B5, B5, S4, Assembler::AVX_512bit);
__ evpxorq(B6, B6, S5, Assembler::AVX_512bit);
__ evpxorq(B7, B7, S6, Assembler::AVX_512bit);
__ evmovdquq(IV, S7, Assembler::AVX_512bit);
__ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit);
__ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit);
__ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit);
__ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit);
__ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit);
__ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit);
__ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit);
__ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit);
__ leaq(to, Address(to, 8 * 64));
__ jmp(Loop);
__ BIND(Lcbc_dec_rem);
__ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit);
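// imm8 0x03 selects 128-bit lane 3 of IV as lane 0 of the result, bringing
// the most recent chaining block down to the low 128 bits where the
// xmm-sized remainder loop below consumes it.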
__ BIND(Lcbc_dec_rem_loop);
__ subl(len_reg, 16);
__ jcc(Assembler::carrySet, Lcbc_dec_ret);
__ movdqu(S0, Address(from, 0));
__ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK2, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK3, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK4, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK5, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK6, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK7, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK8, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK9, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK10, Assembler::AVX_512bit);
__ cmpl(rounds, 44);
__ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
__ vaesdec(B0, B0, RK11, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK12, Assembler::AVX_512bit);
__ cmpl(rounds, 52);
__ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
__ vaesdec(B0, B0, RK13, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK14, Assembler::AVX_512bit);
__ BIND(Lcbc_dec_rem_last);
__ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit);
__ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
__ evmovdquq(IV, S0, Assembler::AVX_512bit);
__ movdqu(Address(to, 0), B0);
__ leaq(from, Address(from, 16));
__ leaq(to, Address(to, 16));
__ jmp(Lcbc_dec_rem_loop);
__ BIND(Lcbc_dec_ret);
__ movdqu(Address(rvec, 0), IV);
// Zero out the round keys
__ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit);
__ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit);
__ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit);
__ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit);
__ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit);
__ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit);
__ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit);
__ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit);
__ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit);
__ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit);
__ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit);
__ cmpl(rounds, 44);
__ jcc(Assembler::belowEqual, Lcbc_exit);
__ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit);
__ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit);
__ cmpl(rounds, 52);
__ jcc(Assembler::belowEqual, Lcbc_exit);
__ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit);
__ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit);
__ BIND(Lcbc_exit);
__ vzeroupper();
__ pop_ppx(rbx);
#ifdef _WIN64
__ movl(rax, len_mem);
#else
__ pop_ppx(rax); // return length
#endif
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
// Arguments:
//
// Inputs:
// c_rarg0 - source byte array address
// c_rarg1 - destination byte array address
// c_rarg2 - session key (Ke) in little endian int array
//
address StubGenerator::generate_aescrypt_encryptBlock() {
assert(UseAES, "need AES instructions and misaligned SSE support");
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
StubCodeMark mark(this, stub_id);
Label L_doLast;
address start = __ pc();
const Register from = c_rarg0; // source array address
const Register to = c_rarg1; // destination array address
const Register key = c_rarg2; // key array address
const Register keylen = rax;
const XMMRegister xmm_result = xmm0;
const XMMRegister xmm_key_shuf_mask = xmm1;
// On win64 xmm6-xmm15 must be preserved so don't use them.
const XMMRegister xmm_temp1 = xmm2;
const XMMRegister xmm_temp2 = xmm3;
const XMMRegister xmm_temp3 = xmm4;
const XMMRegister xmm_temp4 = xmm5;
__ enter(); // required for proper stackwalking of RuntimeStub frame
// keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), r10 /*rscratch*/);
__ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input
// For encryption, the java expanded key ordering is just what we need
// we don't know if the key is aligned, hence not using load-execute form
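// (A note on the helper: load_key, defined elsewhere in this stub generator,
// loads 16 key bytes at the given offset and, when a shuffle mask is supplied,
// byte-swaps each 32-bit word with it, the same fix-up KEY_SHUFFLE_MASK
// describes at the top of this file.)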
load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
__ pxor(xmm_result, xmm_temp1);
load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
__ aesenc(xmm_result, xmm_temp1);
__ aesenc(xmm_result, xmm_temp2);
__ aesenc(xmm_result, xmm_temp3);
__ aesenc(xmm_result, xmm_temp4);
load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
__ aesenc(xmm_result, xmm_temp1);
__ aesenc(xmm_result, xmm_temp2);
__ aesenc(xmm_result, xmm_temp3);
__ aesenc(xmm_result, xmm_temp4);
load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
__ cmpl(keylen, 44);
__ jccb(Assembler::equal, L_doLast);
__ aesenc(xmm_result, xmm_temp1);
__ aesenc(xmm_result, xmm_temp2);
load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
__ cmpl(keylen, 52);
__ jccb(Assembler::equal, L_doLast);
__ aesenc(xmm_result, xmm_temp1);
__ aesenc(xmm_result, xmm_temp2);
load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
__ BIND(L_doLast);
__ aesenc(xmm_result, xmm_temp1);
__ aesenclast(xmm_result, xmm_temp2);
__ movdqu(Address(to, 0), xmm_result); // store the result
__ xorptr(rax, rax); // return 0
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
// Arguments:
//
// Inputs:
// c_rarg0 - source byte array address
// c_rarg1 - destination byte array address
// c_rarg2 - session key (Kd) in little endian int array
//
address StubGenerator::generate_aescrypt_decryptBlock() {
assert(UseAES, "need AES instructions and misaligned SSE support");
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
StubCodeMark mark(this, stub_id);
Label L_doLast;
address start = __ pc();
const Register from = c_rarg0; // source array address
const Register to = c_rarg1; // destination array address
const Register key = c_rarg2; // key array address
const Register keylen = rax;
const XMMRegister xmm_result = xmm0;
const XMMRegister xmm_key_shuf_mask = xmm1;
// On win64 xmm6-xmm15 must be preserved so don't use them.
const XMMRegister xmm_temp1 = xmm2;
const XMMRegister xmm_temp2 = xmm3;
const XMMRegister xmm_temp3 = xmm4;
const XMMRegister xmm_temp4 = xmm5;
__ enter(); // required for proper stackwalking of RuntimeStub frame
// keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), r10 /*rscratch*/);
__ movdqu(xmm_result, Address(from, 0));
// for decryption java expanded key ordering is rotated one position from what we want
// so we start from 0x10 here and hit 0x00 last
// we don't know if the key is aligned, hence not using load-execute form
load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
__ pxor (xmm_result, xmm_temp1);
__ aesdec(xmm_result, xmm_temp2);
__ aesdec(xmm_result, xmm_temp3);
__ aesdec(xmm_result, xmm_temp4);
load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
__ aesdec(xmm_result, xmm_temp1);
__ aesdec(xmm_result, xmm_temp2);
__ aesdec(xmm_result, xmm_temp3);
__ aesdec(xmm_result, xmm_temp4);
load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
__ cmpl(keylen, 44);
__ jccb(Assembler::equal, L_doLast);
__ aesdec(xmm_result, xmm_temp1);
__ aesdec(xmm_result, xmm_temp2);
load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
__ cmpl(keylen, 52);
__ jccb(Assembler::equal, L_doLast);
__ aesdec(xmm_result, xmm_temp1);
__ aesdec(xmm_result, xmm_temp2);
load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
__ BIND(L_doLast);
__ aesdec(xmm_result, xmm_temp1);
__ aesdec(xmm_result, xmm_temp2);
// for decryption the aesdeclast operation is always on key+0x00
__ aesdeclast(xmm_result, xmm_temp3);
__ movdqu(Address(to, 0), xmm_result); // store the result
__ xorptr(rax, rax); // return 0
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
// Arguments:
//
// Inputs:
// c_rarg0 - source byte array address
// c_rarg1 - destination byte array address
// c_rarg2 - session key (Ke) in little endian int array
// c_rarg3 - r vector byte array address
// c_rarg4 - input length
//
// Output:
// rax - input length
//
address StubGenerator::generate_cipherBlockChaining_encryptAESCrypt() {
assert(UseAES, "need AES instructions and misaligned SSE support");
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
const Register from = c_rarg0; // source array address
const Register to = c_rarg1; // destination array address
const Register key = c_rarg2; // key array address
const Register rvec = c_rarg3; // r byte array initialized from initvector array address
// and left with the results of the last encryption block
#ifdef _WIN64
const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
const Register len_reg = r11; // pick the volatile windows register
#else
const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
#endif
const Register pos = rax;
// xmm register assignments for the loops below
const XMMRegister xmm_result = xmm0;
const XMMRegister xmm_temp = xmm1;
// keys 0-10 preloaded into xmm2-xmm12; xmm13-xmm15 take keys 11-13 for 192/256-bit keys
const int XMM_REG_NUM_KEY_FIRST = 2;
const int XMM_REG_NUM_KEY_LAST = 15;
const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
const XMMRegister xmm_key11 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
const XMMRegister xmm_key12 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
const XMMRegister xmm_key13 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
__ enter(); // required for proper stackwalking of RuntimeStub frame
#ifdef _WIN64
// on win64, fill len_reg from stack position
__ movl(len_reg, len_mem);
#else
__ push_ppx(len_reg); // Save
#endif
const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), r10 /*rscratch*/);
// load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
offset += 0x10;
}
__ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec
// now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
__ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ cmpl(rax, 44);
__ jcc(Assembler::notEqual, L_key_192_256);
// 128 bit code follows here
__ movptr(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_loopTop_128);
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
__ pxor (xmm_result, xmm_key0); // do the aes rounds
for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
__ aesenc(xmm_result, as_XMMRegister(rnum));
}
__ aesenclast(xmm_result, xmm_key10);
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
// no need to store r to memory until we exit
__ addptr(pos, AESBlockSize);
__ subptr(len_reg, AESBlockSize);
__ jcc(Assembler::notEqual, L_loopTop_128);
__ BIND(L_exit);
__ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object
#ifdef _WIN64
__ movl(rax, len_mem);
#else
__ pop_ppx(rax); // return length
#endif
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
__ BIND(L_key_192_256);
// here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
__ cmpl(rax, 52);
__ jcc(Assembler::notEqual, L_key_256);
// 192-bit code follows here (could be changed to use more xmm registers)
__ movptr(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_loopTop_192);
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
__ pxor (xmm_result, xmm_key0); // do the aes rounds
for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
__ aesenc(xmm_result, as_XMMRegister(rnum));
}
__ aesenclast(xmm_result, xmm_key12);
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
// no need to store r to memory until we exit
__ addptr(pos, AESBlockSize);
__ subptr(len_reg, AESBlockSize);
__ jcc(Assembler::notEqual, L_loopTop_192);
__ jmp(L_exit);
__ BIND(L_key_256);
// 256-bit code follows here (could be changed to use more xmm registers)
load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
__ movptr(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_loopTop_256);
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
__ pxor (xmm_result, xmm_key0); // do the aes rounds
for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
__ aesenc(xmm_result, as_XMMRegister(rnum));
}
load_key(xmm_temp, key, 0xe0, r10 /*rscratch*/);
__ aesenclast(xmm_result, xmm_temp);
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
// no need to store r to memory until we exit
__ addptr(pos, AESBlockSize);
__ subptr(len_reg, AESBlockSize);
__ jcc(Assembler::notEqual, L_loopTop_256);
__ jmp(L_exit);
return start;
}
// This is a version of ECB/AES Encrypt/Decrypt which does 4 blocks in a loop
// at a time to hide instruction latency.
//
// For encryption (is_encrypt=true):
// pxor key[0], aesenc key[1..rounds-1], aesenclast key[rounds]
// For decryption (is_encrypt=false):
// pxor key[1], aesdec key[2..rounds], aesdeclast key[0]
//
// Arguments:
//
// Inputs:
// c_rarg0 - source byte array address
// c_rarg1 - destination byte array address
// c_rarg2 - session key (Ke/Kd) in little endian int array
// c_rarg3 - input length (must be multiple of blocksize 16)
//
// Output:
// rax - input length
//
address StubGenerator::generate_electronicCodeBook_AESCrypt_Parallel(bool is_encrypt) {
assert(UseAES, "need AES instructions and misaligned SSE support");
__ align(CodeEntryAlignment);
StubId stub_id = is_encrypt ? StubId::stubgen_electronicCodeBook_encryptAESCrypt_id
: StubId::stubgen_electronicCodeBook_decryptAESCrypt_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
const Register from = c_rarg0; // source array address
const Register to = c_rarg1; // destination array address
const Register key = c_rarg2; // key array address
const Register len_reg = c_rarg3; // src len (must be multiple of blocksize 16)
const Register pos = rax;
const Register keylen = r11;
const XMMRegister xmm_result0 = xmm0;
const XMMRegister xmm_result1 = xmm1;
const XMMRegister xmm_result2 = xmm2;
const XMMRegister xmm_result3 = xmm3;
const XMMRegister xmm_key_shuf_mask = xmm4;
const XMMRegister xmm_key_tmp = xmm5;
// keys 0-9 pre-loaded into xmm6-xmm15
const int XMM_REG_NUM_KEY_FIRST = 6;
const int XMM_REG_NUM_KEY_LAST = 15;
const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
// for key_128, key_192, key_256
const int ROUNDS[3] = {10, 12, 14};
Label L_exit;
Label L_loop4[3], L_single[3], L_done[3];
#ifdef DoFour
#undef DoFour
#endif
#ifdef DoOne
#undef DoOne
#endif
#define DoFour(opc, reg) \
__ opc(xmm_result0, reg); \
__ opc(xmm_result1, reg); \
__ opc(xmm_result2, reg); \
__ opc(xmm_result3, reg);
#define DoOne(opc, reg) \
__ opc(xmm_result0, reg);
__ enter(); // required for proper stackwalking of RuntimeStub frame
__ push(len_reg); // save original length for return value
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), r10 /*rscratch*/);
// load up xmm regs 6 thru 15 with keys 0x00 - 0x90
for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++, offset += 0x10) {
load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
}
__ xorptr(pos, pos);
// key length could be only {11, 13, 15} * 4 = {44, 52, 60}
__ cmpl(keylen, 52);
__ jcc(Assembler::equal, L_loop4[1]);
__ cmpl(keylen, 60);
__ jcc(Assembler::equal, L_loop4[2]);
// k == 0: generate code for key_128
// k == 1: generate code for key_192
// k == 2: generate code for key_256
for (int k = 0; k < 3; ++k) {
__ align(OptoLoopAlignment);
__ BIND(L_loop4[k]);
__ cmpptr(len_reg, 4 * AESBlockSize);
__ jcc(Assembler::less, L_single[k]);
__ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
__ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
__ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
__ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
if (is_encrypt) {
DoFour(pxor, xmm_key_first);
for (int rnum = 1; rnum < 10; rnum++) {
DoFour(aesenc, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
}
for (int i = 10; i < ROUNDS[k]; i++) {
load_key(xmm_key_tmp, key, i * 0x10, xmm_key_shuf_mask);
DoFour(aesenc, xmm_key_tmp);
}
load_key(xmm_key_tmp, key, ROUNDS[k] * 0x10, xmm_key_shuf_mask);
DoFour(aesenclast, xmm_key_tmp);
} else {
DoFour(pxor, as_XMMRegister(1 + XMM_REG_NUM_KEY_FIRST));
for (int rnum = 2; rnum < 10; rnum++) {
DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
}
for (int i = 10; i <= ROUNDS[k]; i++) {
load_key(xmm_key_tmp, key, i * 0x10, xmm_key_shuf_mask);
DoFour(aesdec, xmm_key_tmp);
}
DoFour(aesdeclast, xmm_key_first);
}
__ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
__ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
__ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
__ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
__ addptr(pos, 4 * AESBlockSize);
__ subptr(len_reg, 4 * AESBlockSize);
__ jmp(L_loop4[k]);
__ align(OptoLoopAlignment);
__ BIND(L_single[k]);
__ cmpptr(len_reg, AESBlockSize);
__ jcc(Assembler::less, L_done[k]);
__ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0));
if (is_encrypt) {
DoOne(pxor, xmm_key_first);
for (int rnum = 1; rnum < 10; rnum++) {
DoOne(aesenc, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
}
for (int i = 10; i < ROUNDS[k]; i++) {
load_key(xmm_key_tmp, key, i * 0x10, xmm_key_shuf_mask);
DoOne(aesenc, xmm_key_tmp);
}
load_key(xmm_key_tmp, key, ROUNDS[k] * 0x10, xmm_key_shuf_mask);
DoOne(aesenclast, xmm_key_tmp);
} else {
DoOne(pxor, as_XMMRegister(1 + XMM_REG_NUM_KEY_FIRST));
for (int rnum = 2; rnum < 10; rnum++) {
DoOne(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
}
for (int i = 10; i <= ROUNDS[k]; i++) {
load_key(xmm_key_tmp, key, i * 0x10, xmm_key_shuf_mask);
DoOne(aesdec, xmm_key_tmp);
}
DoOne(aesdeclast, xmm_key_first);
}
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result0);
__ addptr(pos, AESBlockSize);
__ subptr(len_reg, AESBlockSize);
__ jmp(L_single[k]);
__ BIND(L_done[k]);
if (k < 2) __ jmp(L_exit);
} //for key_128/192/256
__ BIND(L_exit);
// Clear all XMM registers holding sensitive key material before returning
__ pxor(xmm_key_tmp, xmm_key_tmp);
for (int rnum = XMM_REG_NUM_KEY_FIRST; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
__ pxor(as_XMMRegister(rnum), as_XMMRegister(rnum));
}
__ pop(rax);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
#undef DoFour
#undef DoOne
}
address StubGenerator::generate_electronicCodeBook_encryptAESCrypt_Parallel() {
return generate_electronicCodeBook_AESCrypt_Parallel(true);
}
address StubGenerator::generate_electronicCodeBook_decryptAESCrypt_Parallel() {
return generate_electronicCodeBook_AESCrypt_Parallel(false);
}
// This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
// to hide instruction latency
//
// Arguments:
//
// Inputs:
// c_rarg0 - source byte array address
// c_rarg1 - destination byte array address
// c_rarg2 - sessionKd (key) in little endian int array
// c_rarg3 - r vector byte array address
// c_rarg4 - input length
//
// Output:
// rax - input length
//
address StubGenerator::generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
assert(UseAES, "need AES instructions and misaligned SSE support");
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
const Register from = c_rarg0; // source array address
const Register to = c_rarg1; // destination array address
const Register key = c_rarg2; // key array address
const Register rvec = c_rarg3; // r byte array initialized from initvector array address
// and left with the results of the last encryption block
#ifndef _WIN64
const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
#else
const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
const Register len_reg = r11; // pick the volatile windows register
#endif
const Register pos = rax;
const int PARALLEL_FACTOR = 4;
const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
Label L_exit;
Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
Label L_singleBlock_loopTop[3]; // 128, 192, 256
Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
Label L_multiBlock_loopTop[3]; // 128, 192, 256
// keys 0-10 preloaded into xmm5-xmm15
const int XMM_REG_NUM_KEY_FIRST = 5;
const int XMM_REG_NUM_KEY_LAST = 15;
const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
const XMMRegister xmm_key_last = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
__ enter(); // required for proper stackwalking of RuntimeStub frame
#ifdef _WIN64
// on win64, fill len_reg from stack position
__ movl(len_reg, len_mem);
#else
__ push_ppx(len_reg); // Save
#endif
__ push_ppx(rbx);
// the java expanded key ordering is rotated one position from what we want
// so we start from 0x10 here and hit 0x00 last
const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), rbx /*rscratch*/);
// load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
offset += 0x10;
}
load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
const XMMRegister xmm_prev_block_cipher = xmm1; // holds cipher of previous block
// registers holding the four results in the parallelized loop
const XMMRegister xmm_result0 = xmm0;
const XMMRegister xmm_result1 = xmm2;
const XMMRegister xmm_result2 = xmm3;
const XMMRegister xmm_result3 = xmm4;
__ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec
__ xorptr(pos, pos);
// now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
__ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ cmpl(rbx, 52);
__ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
__ cmpl(rbx, 60);
__ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
#define DoFour(opc, src_reg) \
__ opc(xmm_result0, src_reg); \
__ opc(xmm_result1, src_reg); \
__ opc(xmm_result2, src_reg); \
__ opc(xmm_result3, src_reg);
for (int k = 0; k < 3; ++k) {
__ BIND(L_multiBlock_loopTopHead[k]);
if (k != 0) {
__ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
__ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
}
if (k == 1) {
__ subptr(rsp, 6 * wordSize);
__ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
load_key(xmm15, key, 0xb0, rbx /*rscratch*/); // 0xb0; 192-bit key goes up to 0xc0
__ movdqu(Address(rsp, 2 * wordSize), xmm15);
load_key(xmm1, key, 0xc0, rbx /*rscratch*/); // 0xc0;
__ movdqu(Address(rsp, 4 * wordSize), xmm1);
} else if (k == 2) {
__ subptr(rsp, 10 * wordSize);
__ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
load_key(xmm15, key, 0xd0, rbx /*rscratch*/); // 0xd0; 256-bit key goes up to 0xe0
__ movdqu(Address(rsp, 6 * wordSize), xmm15);
load_key(xmm1, key, 0xe0, rbx /*rscratch*/); // 0xe0;
__ movdqu(Address(rsp, 8 * wordSize), xmm1);
load_key(xmm15, key, 0xb0, rbx /*rscratch*/); // 0xb0;
__ movdqu(Address(rsp, 2 * wordSize), xmm15);
load_key(xmm1, key, 0xc0, rbx /*rscratch*/); // 0xc0;
__ movdqu(Address(rsp, 4 * wordSize), xmm1);
}
__ align(OptoLoopAlignment);
__ BIND(L_multiBlock_loopTop[k]);
__ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
__ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
if (k != 0) {
__ movdqu(xmm15, Address(rsp, 2 * wordSize));
__ movdqu(xmm1, Address(rsp, 4 * wordSize));
}
__ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
__ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
__ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
__ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
DoFour(pxor, xmm_key_first);
if (k == 0) {
for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
}
DoFour(aesdeclast, xmm_key_last);
} else if (k == 1) {
for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
}
__ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
DoFour(aesdec, xmm1); // key : 0xc0
__ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // xmm1 needs to be loaded again
DoFour(aesdeclast, xmm_key_last);
} else if (k == 2) {
for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
}
DoFour(aesdec, xmm1); // key : 0xc0
__ movdqu(xmm15, Address(rsp, 6 * wordSize));
__ movdqu(xmm1, Address(rsp, 8 * wordSize));
DoFour(aesdec, xmm15); // key : 0xd0
__ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
DoFour(aesdec, xmm1); // key : 0xe0
__ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // xmm1 needs to be loaded again
DoFour(aesdeclast, xmm_key_last);
}
// for each result, xor with the r vector of previous cipher block
__ pxor(xmm_result0, xmm_prev_block_cipher);
__ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
__ pxor(xmm_result1, xmm_prev_block_cipher);
__ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
__ pxor(xmm_result2, xmm_prev_block_cipher);
__ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
__ pxor(xmm_result3, xmm_prev_block_cipher);
__ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks
if (k != 0) {
__ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
}
__ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); // store 4 results into the next 64 bytes of output
__ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
__ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
__ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
__ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
__ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
__ jmp(L_multiBlock_loopTop[k]);
// xmm register assignments for the non-parallelized loops below
const XMMRegister xmm_result = xmm0;
const XMMRegister xmm_prev_block_cipher_save = xmm2;
const XMMRegister xmm_key11 = xmm3;
const XMMRegister xmm_key12 = xmm4;
const XMMRegister key_tmp = xmm4;
__ BIND(L_singleBlock_loopTopHead[k]);
if (k == 1) {
__ addptr(rsp, 6 * wordSize);
} else if (k == 2) {
__ addptr(rsp, 10 * wordSize);
}
__ cmpptr(len_reg, 0); // any blocks left??
__ jcc(Assembler::equal, L_exit);
__ BIND(L_singleBlock_loopTopHead2[k]);
if (k == 1) {
load_key(xmm_key11, key, 0xb0, rbx /*rscratch*/); // 0xb0; 192-bit key goes up to 0xc0
load_key(xmm_key12, key, 0xc0, rbx /*rscratch*/); // 0xc0; 192-bit key goes up to 0xc0
}
if (k == 2) {
load_key(xmm_key11, key, 0xb0, rbx /*rscratch*/); // 0xb0; 256-bit key goes up to 0xe0
}
__ align(OptoLoopAlignment);
__ BIND(L_singleBlock_loopTop[k]);
__ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
__ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
__ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
for (int rnum = 1; rnum <= 9; rnum++) {
__ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
}
if (k == 1) {
__ aesdec(xmm_result, xmm_key11);
__ aesdec(xmm_result, xmm_key12);
}
if (k == 2) {
__ aesdec(xmm_result, xmm_key11);
load_key(key_tmp, key, 0xc0, rbx /*rscratch*/);
__ aesdec(xmm_result, key_tmp);
load_key(key_tmp, key, 0xd0, rbx /*rscratch*/);
__ aesdec(xmm_result, key_tmp);
load_key(key_tmp, key, 0xe0, rbx /*rscratch*/);
__ aesdec(xmm_result, key_tmp);
}
__ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
__ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
// no need to store r to memory until we exit
__ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
__ addptr(pos, AESBlockSize);
__ subptr(len_reg, AESBlockSize);
__ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
if (k != 2) {
__ jmp(L_exit);
}
} //for 128/192/256
__ BIND(L_exit);
__ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object
__ pop_ppx(rbx);
#ifdef _WIN64
__ movl(rax, len_mem);
#else
__ pop_ppx(rax); // return length
#endif
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
address StubGenerator::generate_electronicCodeBook_encryptAESCrypt() {
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_electronicCodeBook_encryptAESCrypt_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
const Register from = c_rarg0; // source array address
const Register to = c_rarg1; // destination array address
const Register key = c_rarg2; // key array address
const Register len = c_rarg3; // src len (must be multiple of blocksize 16)
__ enter(); // required for proper stackwalking of RuntimeStub frame
aesecb_encrypt(from, to, key, len);
__ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
address StubGenerator::generate_electronicCodeBook_decryptAESCrypt() {
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_electronicCodeBook_decryptAESCrypt_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
const Register from = c_rarg0; // source array address
const Register to = c_rarg1; // destination array address
const Register key = c_rarg2; // key array address
const Register len = c_rarg3; // src len (must be multiple of blocksize 16)
__ enter(); // required for proper stackwalking of RuntimeStub frame
aesecb_decrypt(from, to, key, len);
__ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
// Utility routine for incrementing the 128-bit counter (the IV in CTR mode)
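// Performs a 128-bit increment using 64-bit halves: inc_delta is added to the
// low qword and, if that addition carries, 1 is propagated into the high qword.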
void StubGenerator::inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
__ pextrq(reg, xmmdst, 0x0);
__ addq(reg, inc_delta);
__ pinsrq(xmmdst, reg, 0x0);
__ jcc(Assembler::carryClear, next_block); // jump if no carry
__ pextrq(reg, xmmdst, 0x01); // Carry
__ addq(reg, 0x01);
__ pinsrq(xmmdst, reg, 0x01); //Carry end
__ BIND(next_block); // next instruction
}
void StubGenerator::roundEnc(XMMRegister key, int rnum) {
for (int xmm_reg_no = 0; xmm_reg_no <= rnum; xmm_reg_no++) {
__ vaesenc(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit);
}
}
void StubGenerator::lastroundEnc(XMMRegister key, int rnum) {
for (int xmm_reg_no = 0; xmm_reg_no <= rnum; xmm_reg_no++) {
__ vaesenclast(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit);
}
}
void StubGenerator::roundDec(XMMRegister key, int rnum) {
for (int xmm_reg_no = 0; xmm_reg_no <= rnum; xmm_reg_no++) {
__ vaesdec(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit);
}
}
void StubGenerator::lastroundDec(XMMRegister key, int rnum) {
for (int xmm_reg_no = 0; xmm_reg_no <= rnum; xmm_reg_no++) {
__ vaesdeclast(as_XMMRegister(xmm_reg_no), as_XMMRegister(xmm_reg_no), key, Assembler::AVX_512bit);
}
}
void StubGenerator::roundDec(XMMRegister xmm_reg) {
__ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
__ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
__ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
__ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
__ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
__ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
__ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
__ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
}
void StubGenerator::roundDeclast(XMMRegister xmm_reg) {
__ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
__ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
__ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
__ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
__ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
__ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
__ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
__ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
}
// Check incoming byte offset against the int[] len. key is the pointer to the int[0].
// This check happens often, so it is important for it to be very compact.
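// Example (illustrative): a 192-bit key expands to 52 ints, so a 16-byte load
// at byte offset 0xb0 checks (0xb0 + 16) / 4 = 48 <= 52 and passes, while the
// same load at offset 0xd0 would check 56 <= 52 and hit the hlt().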
void StubGenerator::check_key_offset(Register key, int offset, int load_size) {
#ifdef ASSERT
Address key_length(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT));
assert((offset + load_size) % 4 == 0, "offset and load size must be int-aligned: %d + %d", offset, load_size);
int end_offset = (offset + load_size) / 4;
Label L_good;
__ cmpl(key_length, end_offset);
__ jccb(Assembler::greaterEqual, L_good);
__ hlt();
__ bind(L_good);
#endif
}
// Utility routine for loading a 128-bit key word in little endian format
void StubGenerator::load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask) {
check_key_offset(key, offset, 16);
__ movdqu(xmmdst, Address(key, offset));
__ pshufb(xmmdst, xmm_shuf_mask);
}
void StubGenerator::load_key(XMMRegister xmmdst, Register key, int offset, Register rscratch) {
check_key_offset(key, offset, 16);
__ movdqu(xmmdst, Address(key, offset));
__ pshufb(xmmdst, ExternalAddress(key_shuffle_mask_addr()), rscratch);
}
void StubGenerator::ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask) {
check_key_offset(key, offset, 16);
__ movdqu(xmmdst, Address(key, offset));
__ pshufb(xmmdst, xmm_shuf_mask);
__ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
}
void StubGenerator::ev_load_key(XMMRegister xmmdst, Register key, int offset, Register rscratch) {
check_key_offset(key, offset, 16);
__ movdqu(xmmdst, Address(key, offset));
__ pshufb(xmmdst, ExternalAddress(key_shuffle_mask_addr()), rscratch);
__ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
}
// Add 128-bit integers in xmmsrc1 to xmmsrc2, then place the result in xmmdst.
// Clobbers ktmp.
// Used by aesctr_encrypt.
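// Carry propagation sketch for one 128-bit lane (illustrative):
//   dst = src1 + src2, per 64-bit qword;
//   if dst[0] < src2[0] the low-qword add wrapped, setting that lane's mask bit;
//   kshiftl moves the bit over the high qword, and the masked evpaddq then adds
//   1 to dst[1], completing the 128-bit add. The increment vectors used here
//   have zero high qwords, so no carry bit ever crosses into the next 128-bit lane.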
void StubGenerator::ev_add128(XMMRegister xmmdst, XMMRegister xmmsrc1, XMMRegister xmmsrc2,
int vector_len, KRegister ktmp, XMMRegister ones) {
__ vpaddq(xmmdst, xmmsrc1, xmmsrc2, vector_len);
__ evpcmpuq(ktmp, xmmdst, xmmsrc2, __ lt, vector_len); // set mask[0/1] bit if addq to dst[0/1] wraps
__ kshiftlbl(ktmp, ktmp, 1); // mask[1] <- mask[0], mask[0] <- 0, etc
__ evpaddq(xmmdst, ktmp, xmmdst, ones, /*merge*/true, vector_len); // dst[1]++ if mask[1] set
}
// AES-ECB Encrypt Operation
void StubGenerator::aesecb_encrypt(Register src_addr, Register dest_addr, Register key, Register len) {
const Register pos = rax;
const Register rounds = r12;
Label NO_PARTS, LOOP, Loop_start, LOOP2, AES192, END_LOOP, AES256, REMAINDER, LAST2, END, KEY_192, KEY_256, EXIT;
__ push_ppx(r13);
__ push_ppx(r12);
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
__ push_ppx(len); // Save
__ push_ppx(rbx);
__ vzeroupper();
__ xorptr(pos, pos);
// Determine the number of rounds from the expanded key length in ints: 44 for 10 rounds (128-bit key), 52 for 12 rounds (192-bit), 60 for 14 rounds (256-bit)
__ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
// Load Key shuf mask
const XMMRegister xmm_key_shuf_mask = xmm31; // used temporarily to swap key bytes up front
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), rbx /*rscratch*/);
// Load and shuffle key based on number of rounds
ev_load_key(xmm8, key, 0 * 16, xmm_key_shuf_mask);
ev_load_key(xmm9, key, 1 * 16, xmm_key_shuf_mask);
ev_load_key(xmm10, key, 2 * 16, xmm_key_shuf_mask);
ev_load_key(xmm23, key, 3 * 16, xmm_key_shuf_mask);
ev_load_key(xmm12, key, 4 * 16, xmm_key_shuf_mask);
ev_load_key(xmm13, key, 5 * 16, xmm_key_shuf_mask);
ev_load_key(xmm14, key, 6 * 16, xmm_key_shuf_mask);
ev_load_key(xmm15, key, 7 * 16, xmm_key_shuf_mask);
ev_load_key(xmm16, key, 8 * 16, xmm_key_shuf_mask);
ev_load_key(xmm17, key, 9 * 16, xmm_key_shuf_mask);
ev_load_key(xmm24, key, 10 * 16, xmm_key_shuf_mask);
__ cmpl(rounds, 52);
__ jcc(Assembler::greaterEqual, KEY_192);
__ jmp(Loop_start);
__ bind(KEY_192);
ev_load_key(xmm19, key, 11 * 16, xmm_key_shuf_mask);
ev_load_key(xmm20, key, 12 * 16, xmm_key_shuf_mask);
__ cmpl(rounds, 60);
__ jcc(Assembler::equal, KEY_256);
__ jmp(Loop_start);
__ bind(KEY_256);
ev_load_key(xmm21, key, 13 * 16, xmm_key_shuf_mask);
ev_load_key(xmm22, key, 14 * 16, xmm_key_shuf_mask);
__ bind(Loop_start);
__ movq(rbx, len);
// Divide length by 16 to convert it to number of blocks
__ shrq(len, 4);
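// shlq by 60 below keeps only the low 4 bits of len: ZF is set iff the byte
// length was an exact multiple of 16; otherwise the addq counts the partial block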
__ shlq(rbx, 60);
__ jcc(Assembler::equal, NO_PARTS);
__ addq(len, 1);
// Check if number of blocks is greater than or equal to 32
// If true, 512 bytes are processed at a time (code marked by label LOOP)
// If not, 16 bytes are processed (code marked by REMAINDER label)
__ bind(NO_PARTS);
__ movq(rbx, len);
__ shrq(len, 5);
__ jcc(Assembler::equal, REMAINDER);
__ movl(r13, len);
// Compute number of blocks that will be processed 512 bytes at a time
// Subtract this from the total number of blocks which will then be processed by REMAINDER loop
__ shlq(r13, 5);
__ subq(rbx, r13);
//Begin processing 512 bytes
__ bind(LOOP);
// Move 64 bytes of PT data into each zmm register; as a result 512 bytes of PT are loaded into zmm0-7
__ evmovdquq(xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
__ evmovdquq(xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
__ evmovdquq(xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
__ evmovdquq(xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
__ evmovdquq(xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit);
__ evmovdquq(xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit);
__ evmovdquq(xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit);
__ evmovdquq(xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit);
// Xor with the first round key
__ evpxorq(xmm0, xmm0, xmm8, Assembler::AVX_512bit);
__ evpxorq(xmm1, xmm1, xmm8, Assembler::AVX_512bit);
__ evpxorq(xmm2, xmm2, xmm8, Assembler::AVX_512bit);
__ evpxorq(xmm3, xmm3, xmm8, Assembler::AVX_512bit);
__ evpxorq(xmm4, xmm4, xmm8, Assembler::AVX_512bit);
__ evpxorq(xmm5, xmm5, xmm8, Assembler::AVX_512bit);
__ evpxorq(xmm6, xmm6, xmm8, Assembler::AVX_512bit);
__ evpxorq(xmm7, xmm7, xmm8, Assembler::AVX_512bit);
// 9 AES encode round operations
roundEnc(xmm9, 7);
roundEnc(xmm10, 7);
roundEnc(xmm23, 7);
roundEnc(xmm12, 7);
roundEnc(xmm13, 7);
roundEnc(xmm14, 7);
roundEnc(xmm15, 7);
roundEnc(xmm16, 7);
roundEnc(xmm17, 7);
__ cmpl(rounds, 52);
__ jcc(Assembler::aboveEqual, AES192);
// Aesenclast round operation for keysize = 128
lastroundEnc(xmm24, 7);
__ jmp(END_LOOP);
// 2 additional rounds of Aesenc operation for keysize = 192
__ bind(AES192);
roundEnc(xmm24, 7);
roundEnc(xmm19, 7);
__ cmpl(rounds, 60);
__ jcc(Assembler::aboveEqual, AES256);
// Aesenclast round for keysize = 192
lastroundEnc(xmm20, 7);
__ jmp(END_LOOP);
// 2 rounds of Aesenc operation and Aesenclast for keysize = 256
__ bind(AES256);
roundEnc(xmm20, 7);
roundEnc(xmm21, 7);
lastroundEnc(xmm22, 7);
__ bind(END_LOOP);
// Move 512 bytes of CT to destination
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit);
__ addq(pos, 512);
__ decq(len);
__ jcc(Assembler::notEqual, LOOP);
__ bind(REMAINDER);
__ vzeroupper();
__ cmpq(rbx, 0);
__ jcc(Assembler::equal, END);
// Process 16 bytes at a time
__ bind(LOOP2);
__ movdqu(xmm1, Address(src_addr, pos, Address::times_1, 0));
__ vpxor(xmm1, xmm1, xmm8, Assembler::AVX_128bit);
// xmm2 contains shuffled key for Aesenclast operation.
__ vmovdqu(xmm2, xmm24);
__ vaesenc(xmm1, xmm1, xmm9, Assembler::AVX_128bit);
__ vaesenc(xmm1, xmm1, xmm10, Assembler::AVX_128bit);
__ vaesenc(xmm1, xmm1, xmm23, Assembler::AVX_128bit);
__ vaesenc(xmm1, xmm1, xmm12, Assembler::AVX_128bit);
__ vaesenc(xmm1, xmm1, xmm13, Assembler::AVX_128bit);
__ vaesenc(xmm1, xmm1, xmm14, Assembler::AVX_128bit);
__ vaesenc(xmm1, xmm1, xmm15, Assembler::AVX_128bit);
__ vaesenc(xmm1, xmm1, xmm16, Assembler::AVX_128bit);
__ vaesenc(xmm1, xmm1, xmm17, Assembler::AVX_128bit);
__ cmpl(rounds, 52);
__ jcc(Assembler::below, LAST2);
__ vmovdqu(xmm2, xmm20);
__ vaesenc(xmm1, xmm1, xmm24, Assembler::AVX_128bit);
__ vaesenc(xmm1, xmm1, xmm19, Assembler::AVX_128bit);
__ cmpl(rounds, 60);
__ jcc(Assembler::below, LAST2);
__ vmovdqu(xmm2, xmm22);
__ vaesenc(xmm1, xmm1, xmm20, Assembler::AVX_128bit);
__ vaesenc(xmm1, xmm1, xmm21, Assembler::AVX_128bit);
__ bind(LAST2);
// Aesenclast round
__ vaesenclast(xmm1, xmm1, xmm2, Assembler::AVX_128bit);
// Write 16 bytes of CT to destination
__ movdqu(Address(dest_addr, pos, Address::times_1, 0), xmm1);
__ addq(pos, 16);
__ decq(rbx);
__ jcc(Assembler::notEqual, LOOP2);
__ bind(END);
// Zero out the round keys
__ evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit);
__ evpxorq(xmm9, xmm9, xmm9, Assembler::AVX_512bit);
__ evpxorq(xmm10, xmm10, xmm10, Assembler::AVX_512bit);
__ evpxorq(xmm23, xmm23, xmm23, Assembler::AVX_512bit);
__ evpxorq(xmm12, xmm12, xmm12, Assembler::AVX_512bit);
__ evpxorq(xmm13, xmm13, xmm13, Assembler::AVX_512bit);
__ evpxorq(xmm14, xmm14, xmm14, Assembler::AVX_512bit);
__ evpxorq(xmm15, xmm15, xmm15, Assembler::AVX_512bit);
__ evpxorq(xmm16, xmm16, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm17, xmm17, xmm17, Assembler::AVX_512bit);
__ evpxorq(xmm24, xmm24, xmm24, Assembler::AVX_512bit);
__ cmpl(rounds, 44);
__ jcc(Assembler::belowEqual, EXIT);
__ evpxorq(xmm19, xmm19, xmm19, Assembler::AVX_512bit);
__ evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit);
__ cmpl(rounds, 52);
__ jcc(Assembler::belowEqual, EXIT);
__ evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit);
__ evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit);
__ bind(EXIT);
__ pop_ppx(rbx);
__ pop_ppx(rax); // return length
__ pop_ppx(r12);
__ pop_ppx(r13);
}
// AES-ECB Decrypt Operation
void StubGenerator::aesecb_decrypt(Register src_addr, Register dest_addr, Register key, Register len) {
Label NO_PARTS, LOOP, Loop_start, LOOP2, AES192, END_LOOP, AES256, REMAINDER, LAST2, END, KEY_192, KEY_256, EXIT;
const Register pos = rax;
const Register rounds = r12;
__ push_ppx(r13);
__ push_ppx(r12);
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
__ push_ppx(len); // Save
__ push_ppx(rbx);
__ vzeroupper();
__ xorptr(pos, pos);
// Determine the number of rounds from the expanded key length in ints: 44 for 10 rounds, 52 for 12 rounds, 60 for 14 rounds
__ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
// Load Key shuf mask
const XMMRegister xmm_key_shuf_mask = xmm31; // used temporarily to swap key bytes up front
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()), rbx /*rscratch*/);
// Load and shuffle round keys. The java expanded key ordering is rotated one position in decryption.
// So the first round key is loaded from 1*16 here and last round key is loaded from 0*16
ev_load_key(xmm9, key, 1 * 16, xmm_key_shuf_mask);
ev_load_key(xmm10, key, 2 * 16, xmm_key_shuf_mask);
ev_load_key(xmm11, key, 3 * 16, xmm_key_shuf_mask);
ev_load_key(xmm12, key, 4 * 16, xmm_key_shuf_mask);
ev_load_key(xmm13, key, 5 * 16, xmm_key_shuf_mask);
ev_load_key(xmm14, key, 6 * 16, xmm_key_shuf_mask);
ev_load_key(xmm15, key, 7 * 16, xmm_key_shuf_mask);
ev_load_key(xmm16, key, 8 * 16, xmm_key_shuf_mask);
ev_load_key(xmm17, key, 9 * 16, xmm_key_shuf_mask);
ev_load_key(xmm18, key, 10 * 16, xmm_key_shuf_mask);
ev_load_key(xmm27, key, 0 * 16, xmm_key_shuf_mask);
__ cmpl(rounds, 52);
__ jcc(Assembler::greaterEqual, KEY_192);
__ jmp(Loop_start);
__ bind(KEY_192);
ev_load_key(xmm19, key, 11 * 16, xmm_key_shuf_mask);
ev_load_key(xmm20, key, 12 * 16, xmm_key_shuf_mask);
__ cmpl(rounds, 60);
__ jcc(Assembler::equal, KEY_256);
__ jmp(Loop_start);
__ bind(KEY_256);
ev_load_key(xmm21, key, 13 * 16, xmm_key_shuf_mask);
ev_load_key(xmm22, key, 14 * 16, xmm_key_shuf_mask);
__ bind(Loop_start);
__ movq(rbx, len);
// Convert input length to number of blocks
__ shrq(len, 4);
__ shlq(rbx, 60);
__ jcc(Assembler::equal, NO_PARTS);
__ addq(len, 1);
// Check if the number of blocks is greater than or equal to 32
// If true, 512 bytes are processed at a time (code marked by label LOOP)
// If not, 16 bytes are processed at a time (code marked by label REMAINDER)
__ bind(NO_PARTS);
__ movq(rbx, len);
__ shrq(len, 5);
__ jcc(Assembler::equal, REMAINDER);
__ movl(r13, len);
// Compute number of blocks that will be processed as 512 bytes at a time
// Subtract this from the total number of blocks, which will then be processed by REMAINDER loop.
__ shlq(r13, 5);
__ subq(rbx, r13);
__ bind(LOOP);
// Move 64 bytes of CT data into each zmm register; as a result 512 bytes of CT are loaded into zmm0-7
__ evmovdquq(xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
__ evmovdquq(xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
__ evmovdquq(xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
__ evmovdquq(xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
__ evmovdquq(xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit);
__ evmovdquq(xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit);
__ evmovdquq(xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit);
__ evmovdquq(xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit);
// Xor with the first round key
__ evpxorq(xmm0, xmm0, xmm9, Assembler::AVX_512bit);
__ evpxorq(xmm1, xmm1, xmm9, Assembler::AVX_512bit);
__ evpxorq(xmm2, xmm2, xmm9, Assembler::AVX_512bit);
__ evpxorq(xmm3, xmm3, xmm9, Assembler::AVX_512bit);
__ evpxorq(xmm4, xmm4, xmm9, Assembler::AVX_512bit);
__ evpxorq(xmm5, xmm5, xmm9, Assembler::AVX_512bit);
__ evpxorq(xmm6, xmm6, xmm9, Assembler::AVX_512bit);
__ evpxorq(xmm7, xmm7, xmm9, Assembler::AVX_512bit);
// 9 rounds of Aesdec
roundDec(xmm10, 7);
roundDec(xmm11, 7);
roundDec(xmm12, 7);
roundDec(xmm13, 7);
roundDec(xmm14, 7);
roundDec(xmm15, 7);
roundDec(xmm16, 7);
roundDec(xmm17, 7);
roundDec(xmm18, 7);
__ cmpl(rounds, 52);
__ jcc(Assembler::aboveEqual, AES192);
// Aesdeclast round for keysize = 128
lastroundDec(xmm27, 7);
__ jmp(END_LOOP);
__ bind(AES192);
// 2 Additional rounds for keysize = 192
roundDec(xmm19, 7);
roundDec(xmm20, 7);
__ cmpl(rounds, 60);
__ jcc(Assembler::aboveEqual, AES256);
// Aesdeclast round for keysize = 192
lastroundDec(xmm27, 7);
__ jmp(END_LOOP);
__ bind(AES256);
// 2 Additional rounds and Aesdeclast for keysize = 256
roundDec(xmm21, 7);
roundDec(xmm22, 7);
lastroundDec(xmm27, 7);
__ bind(END_LOOP);
// Write 512 bytes of PT to the destination
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit);
__ addq(pos, 512);
__ decq(len);
__ jcc(Assembler::notEqual, LOOP);
__ bind(REMAINDER);
__ vzeroupper();
__ cmpq(rbx, 0);
__ jcc(Assembler::equal, END);
// Process 16 bytes at a time
__ bind(LOOP2);
__ movdqu(xmm1, Address(src_addr, pos, Address::times_1, 0));
__ vpxor(xmm1, xmm1, xmm9, Assembler::AVX_128bit);
// xmm2 contains shuffled key for Aesdeclast operation.
__ vmovdqu(xmm2, xmm27);
__ vaesdec(xmm1, xmm1, xmm10, Assembler::AVX_128bit);
__ vaesdec(xmm1, xmm1, xmm11, Assembler::AVX_128bit);
__ vaesdec(xmm1, xmm1, xmm12, Assembler::AVX_128bit);
__ vaesdec(xmm1, xmm1, xmm13, Assembler::AVX_128bit);
__ vaesdec(xmm1, xmm1, xmm14, Assembler::AVX_128bit);
__ vaesdec(xmm1, xmm1, xmm15, Assembler::AVX_128bit);
__ vaesdec(xmm1, xmm1, xmm16, Assembler::AVX_128bit);
__ vaesdec(xmm1, xmm1, xmm17, Assembler::AVX_128bit);
__ vaesdec(xmm1, xmm1, xmm18, Assembler::AVX_128bit);
__ cmpl(rounds, 52);
__ jcc(Assembler::below, LAST2);
__ vaesdec(xmm1, xmm1, xmm19, Assembler::AVX_128bit);
__ vaesdec(xmm1, xmm1, xmm20, Assembler::AVX_128bit);
__ cmpl(rounds, 60);
__ jcc(Assembler::below, LAST2);
__ vaesdec(xmm1, xmm1, xmm21, Assembler::AVX_128bit);
__ vaesdec(xmm1, xmm1, xmm22, Assembler::AVX_128bit);
__ bind(LAST2);
// Aesdeclast round
__ vaesdeclast(xmm1, xmm1, xmm2, Assembler::AVX_128bit);
// Write 16 bytes of PT to destination
__ movdqu(Address(dest_addr, pos, Address::times_1, 0), xmm1);
__ addq(pos, 16);
__ decq(rbx);
__ jcc(Assembler::notEqual, LOOP2);
__ bind(END);
// Zero out the round keys
__ evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit);
__ evpxorq(xmm9, xmm9, xmm9, Assembler::AVX_512bit);
__ evpxorq(xmm10, xmm10, xmm10, Assembler::AVX_512bit);
__ evpxorq(xmm11, xmm11, xmm11, Assembler::AVX_512bit);
__ evpxorq(xmm12, xmm12, xmm12, Assembler::AVX_512bit);
__ evpxorq(xmm13, xmm13, xmm13, Assembler::AVX_512bit);
__ evpxorq(xmm14, xmm14, xmm14, Assembler::AVX_512bit);
__ evpxorq(xmm15, xmm15, xmm15, Assembler::AVX_512bit);
__ evpxorq(xmm16, xmm16, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm17, xmm17, xmm17, Assembler::AVX_512bit);
__ evpxorq(xmm18, xmm18, xmm18, Assembler::AVX_512bit);
__ evpxorq(xmm27, xmm27, xmm27, Assembler::AVX_512bit);
__ cmpl(rounds, 44);
__ jcc(Assembler::belowEqual, EXIT);
__ evpxorq(xmm19, xmm19, xmm19, Assembler::AVX_512bit);
__ evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit);
__ cmpl(rounds, 52);
__ jcc(Assembler::belowEqual, EXIT);
__ evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit);
__ evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit);
__ bind(EXIT);
__ pop_ppx(rbx);
__ pop_ppx(rax); // return length
__ pop_ppx(r12);
__ pop_ppx(r13);
}
// AES Counter Mode using VAES instructions
void StubGenerator::aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter,
Register len_reg, Register used, Register used_addr, Register saved_encCounter_start) {
const Register rounds = rax;
const Register pos = r12;
const Register tail = r15;
Label PRELOOP_START, EXIT_PRELOOP, REMAINDER, REMAINDER_16, LOOP, END, EXIT, END_LOOP,
AES192, AES256, AES192_REMAINDER16, REMAINDER16_END_LOOP, AES256_REMAINDER16,
REMAINDER_8, REMAINDER_4, AES192_REMAINDER8, REMAINDER_LOOP,
AES192_REMAINDER, END_REMAINDER_LOOP, AES256_REMAINDER8, REMAINDER8_END_LOOP,
AES192_REMAINDER4, AES256_REMAINDER4, AES256_REMAINDER, END_REMAINDER4, EXTRACT_TAILBYTES,
EXTRACT_TAIL_4BYTES, EXTRACT_TAIL_2BYTES, EXTRACT_TAIL_1BYTE, STORE_CTR;
__ cmpl(len_reg, 0);
__ jcc(Assembler::belowEqual, EXIT);
__ movl(pos, 0);
// if the number of used encrypted counter bytes < 16,
// XOR PT with saved encrypted counter to obtain CT
__ bind(PRELOOP_START);
__ cmpl(used, 16);
__ jcc(Assembler::aboveEqual, EXIT_PRELOOP);
__ movb(rbx, Address(saved_encCounter_start, used));
__ xorb(rbx, Address(src_addr, pos));
__ movb(Address(dest_addr, pos), rbx);
__ addptr(pos, 1);
__ addptr(used, 1);
__ decrement(len_reg);
__ jcc(Assembler::notEqual, PRELOOP_START);
__ bind(EXIT_PRELOOP);
__ movl(Address(used_addr, 0), used);
__ cmpl(len_reg, 0);
__ jcc(Assembler::equal, EXIT);
// Determine the number of rounds (10, 12, 14) from the expanded key length in ints (44, 52, 60)
__ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
// Move initial counter value in xmm0
__ movdqu(xmm0, Address(counter, 0));
// broadcast counter value to zmm8
__ evshufi64x2(xmm8, xmm0, xmm0, 0, Assembler::AVX_512bit);
// load lbswap mask
__ evmovdquq(xmm16, ExternalAddress(counter_shuffle_mask_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
//shuffle counter using lbswap_mask
__ vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_512bit);
// pre-increment and propagate counter values to zmm8-zmm15 registers.
// Linc0 sets the four 128-bit lanes of zmm8 to counter+0..counter+3,
// then Linc4 derives zmm9-zmm15, each register 4 counters ahead of the previous one.
// The counter is incremented after each block, i.e. after every 16 bytes processed;
// each zmm register holds 4 counter values and all counters are incremented in parallel
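// Resulting lane layout after the additions below (illustrative), with ctr = initial counter:
//   zmm8  = { ctr+0,  ctr+1,  ctr+2,  ctr+3  }
//   zmm9  = { ctr+4,  ctr+5,  ctr+6,  ctr+7  }
//   ...
//   zmm15 = { ctr+28, ctr+29, ctr+30, ctr+31 }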
const XMMRegister ones = xmm17;
// Vector value to propagate carries
__ evmovdquq(ones, ExternalAddress(counter_mask_ones_addr()), Assembler::AVX_512bit, r15);
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc0_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
ev_add128(xmm9, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
ev_add128(xmm10, xmm9, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
ev_add128(xmm11, xmm10, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
ev_add128(xmm12, xmm11, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
ev_add128(xmm13, xmm12, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
ev_add128(xmm14, xmm13, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
ev_add128(xmm15, xmm14, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
// load the linc32 mask into a zmm register; linc32 increments the counter by 32
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc32_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
// xmm31 contains the key shuffle mask.
__ movdqu(xmm31, ExternalAddress(key_shuffle_mask_addr()), r15 /*rscratch*/);
// Load key function loads 128 bit key and shuffles it. Then we broadcast the shuffled key to convert it into a 512 bit value.
// For broadcasting the values to ZMM, vshufi64 is used instead of evbroadcasti64x2 as the source in this case is ZMM register
// that holds shuffled key value.
ev_load_key(xmm20, key, 0, xmm31);
ev_load_key(xmm21, key, 1 * 16, xmm31);
ev_load_key(xmm22, key, 2 * 16, xmm31);
ev_load_key(xmm23, key, 3 * 16, xmm31);
ev_load_key(xmm24, key, 4 * 16, xmm31);
ev_load_key(xmm25, key, 5 * 16, xmm31);
ev_load_key(xmm26, key, 6 * 16, xmm31);
ev_load_key(xmm27, key, 7 * 16, xmm31);
ev_load_key(xmm28, key, 8 * 16, xmm31);
ev_load_key(xmm29, key, 9 * 16, xmm31);
ev_load_key(xmm30, key, 10 * 16, xmm31);
// Process 32 blocks or 512 bytes of data
__ bind(LOOP);
__ cmpl(len_reg, 512);
__ jcc(Assembler::less, REMAINDER);
__ subq(len_reg, 512);
// Shuffle the counters and XOR with roundkey1. Results are stored in zmm0-7
__ vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm4, xmm12, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm4, xmm4, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm5, xmm13, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm5, xmm5, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm6, xmm14, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm6, xmm6, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm7, xmm15, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm7, xmm7, xmm20, Assembler::AVX_512bit);
// Perform AES encode operations and put results in zmm0-zmm7.
// This is followed by incrementing counter values in zmm8-zmm15.
// Since we will be processing 32 blocks at a time, the counter is incremented by 32.
roundEnc(xmm21, 7);
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
roundEnc(xmm22, 7);
ev_add128(xmm9, xmm9, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
roundEnc(xmm23, 7);
ev_add128(xmm10, xmm10, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
roundEnc(xmm24, 7);
ev_add128(xmm11, xmm11, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
roundEnc(xmm25, 7);
ev_add128(xmm12, xmm12, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
roundEnc(xmm26, 7);
ev_add128(xmm13, xmm13, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
roundEnc(xmm27, 7);
ev_add128(xmm14, xmm14, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
roundEnc(xmm28, 7);
ev_add128(xmm15, xmm15, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
roundEnc(xmm29, 7);
__ cmpl(rounds, 52);
__ jcc(Assembler::aboveEqual, AES192);
lastroundEnc(xmm30, 7);
__ jmp(END_LOOP);
__ bind(AES192);
roundEnc(xmm30, 7);
ev_load_key(xmm18, key, 11 * 16, xmm31);
roundEnc(xmm18, 7);
__ cmpl(rounds, 60);
__ jcc(Assembler::aboveEqual, AES256);
ev_load_key(xmm18, key, 12 * 16, xmm31);
lastroundEnc(xmm18, 7);
__ jmp(END_LOOP);
__ bind(AES256);
ev_load_key(xmm18, key, 12 * 16, xmm31);
roundEnc(xmm18, 7);
ev_load_key(xmm18, key, 13 * 16, xmm31);
roundEnc(xmm18, 7);
ev_load_key(xmm18, key, 14 * 16, xmm31);
lastroundEnc(xmm18, 7);
// After AES encode rounds, the encrypted block cipher lies in zmm0-zmm7
// xor encrypted block cipher and input plaintext and store resultant ciphertext
__ bind(END_LOOP);
__ evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
__ evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 64), xmm1, Assembler::AVX_512bit);
__ evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
__ evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
__ evpxorq(xmm4, xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit);
__ evpxorq(xmm5, xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit);
__ evpxorq(xmm6, xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit);
__ evpxorq(xmm7, xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit);
__ addq(pos, 512);
__ jmp(LOOP);
// Encode 256, 128, 64 or 16 bytes at a time if length is less than 512 bytes
__ bind(REMAINDER);
__ cmpl(len_reg, 0);
__ jcc(Assembler::equal, END);
__ cmpl(len_reg, 256);
__ jcc(Assembler::aboveEqual, REMAINDER_16);
__ cmpl(len_reg, 128);
__ jcc(Assembler::aboveEqual, REMAINDER_8);
__ cmpl(len_reg, 64);
__ jcc(Assembler::aboveEqual, REMAINDER_4);
// At this point, we will process 16 bytes of data at a time.
// So load xmm19 with a counter increment value of 1
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc1_addr()), Assembler::AVX_128bit, r15 /*rscratch*/);
__ jmp(REMAINDER_LOOP);
// Each ZMM register can be used to encode 64 bytes of data, so we have 4 ZMM registers to encode 256 bytes of data
__ bind(REMAINDER_16);
__ subq(len_reg, 256);
// As we process 16 blocks at a time, load mask for incrementing the counter value by 16
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc16_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
// shuffle counter and XOR counter with roundkey1
__ vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit);
// Increment counter values by 16
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
ev_add128(xmm9, xmm9, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
// AES encode rounds
roundEnc(xmm21, 3);
roundEnc(xmm22, 3);
roundEnc(xmm23, 3);
roundEnc(xmm24, 3);
roundEnc(xmm25, 3);
roundEnc(xmm26, 3);
roundEnc(xmm27, 3);
roundEnc(xmm28, 3);
roundEnc(xmm29, 3);
__ cmpl(rounds, 52);
__ jcc(Assembler::aboveEqual, AES192_REMAINDER16);
lastroundEnc(xmm30, 3);
__ jmp(REMAINDER16_END_LOOP);
__ bind(AES192_REMAINDER16);
roundEnc(xmm30, 3);
ev_load_key(xmm18, key, 11 * 16, xmm31);
roundEnc(xmm18, 3);
ev_load_key(xmm5, key, 12 * 16, xmm31);
__ cmpl(rounds, 60);
__ jcc(Assembler::aboveEqual, AES256_REMAINDER16);
lastroundEnc(xmm5, 3);
__ jmp(REMAINDER16_END_LOOP);
__ bind(AES256_REMAINDER16);
roundEnc(xmm5, 3);
ev_load_key(xmm6, key, 13 * 16, xmm31);
roundEnc(xmm6, 3);
ev_load_key(xmm7, key, 14 * 16, xmm31);
lastroundEnc(xmm7, 3);
// After AES encode rounds, the encrypted block cipher lies in zmm0-zmm3
// xor 256 bytes of PT with the encrypted counters to produce CT.
__ bind(REMAINDER16_END_LOOP);
__ evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
__ evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
__ evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
__ evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
__ addq(pos, 256);
__ cmpl(len_reg, 128);
__ jcc(Assembler::aboveEqual, REMAINDER_8);
__ cmpl(len_reg, 64);
__ jcc(Assembler::aboveEqual, REMAINDER_4);
//load mask for incrementing the counter value by 1
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc1_addr()), Assembler::AVX_128bit, r15 /*rscratch*/);
__ jmp(REMAINDER_LOOP);
// Each ZMM register can be used to encode 64 bytes of data, so we have 2 ZMM registers to encode 128 bytes of data
__ bind(REMAINDER_8);
__ subq(len_reg, 128);
// As we process 8 blocks at a time, load mask for incrementing the counter value by 8
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc8_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
// shuffle counters and xor with roundkey1
__ vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
__ vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
// increment counter by 8
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
// AES encode
roundEnc(xmm21, 1);
roundEnc(xmm22, 1);
roundEnc(xmm23, 1);
roundEnc(xmm24, 1);
roundEnc(xmm25, 1);
roundEnc(xmm26, 1);
roundEnc(xmm27, 1);
roundEnc(xmm28, 1);
roundEnc(xmm29, 1);
__ cmpl(rounds, 52);
__ jcc(Assembler::aboveEqual, AES192_REMAINDER8);
lastroundEnc(xmm30, 1);
__ jmp(REMAINDER8_END_LOOP);
__ bind(AES192_REMAINDER8);
roundEnc(xmm30, 1);
ev_load_key(xmm18, key, 11 * 16, xmm31);
roundEnc(xmm18, 1);
ev_load_key(xmm5, key, 12 * 16, xmm31);
__ cmpl(rounds, 60);
__ jcc(Assembler::aboveEqual, AES256_REMAINDER8);
lastroundEnc(xmm5, 1);
__ jmp(REMAINDER8_END_LOOP);
__ bind(AES256_REMAINDER8);
roundEnc(xmm5, 1);
ev_load_key(xmm6, key, 13 * 16, xmm31);
roundEnc(xmm6, 1);
ev_load_key(xmm7, key, 14 * 16, xmm31);
lastroundEnc(xmm7, 1);
__ bind(REMAINDER8_END_LOOP);
// After AES encode rounds, the encrypted block cipher lies in zmm0-zmm1
// XOR PT with the encrypted counter and store as CT
__ evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit);
__ evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
__ addq(pos, 128);
__ cmpl(len_reg, 64);
__ jcc(Assembler::aboveEqual, REMAINDER_4);
// load mask for incrementing the counter value by 1
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc1_addr()), Assembler::AVX_128bit, r15 /*rscratch*/);
__ jmp(REMAINDER_LOOP);
// Each ZMM register can be used to encode 64 bytes of data, so we have 1 ZMM register used in this block of code
__ bind(REMAINDER_4);
__ subq(len_reg, 64);
// As we process 4 blocks at a time, load mask for incrementing the counter value by 4
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
// XOR counter with first roundkey
__ vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
__ evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
// Increment counter
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, ones);
__ vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_512bit);
__ vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_512bit);
__ vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_512bit);
__ vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_512bit);
__ vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_512bit);
__ vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_512bit);
__ vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_512bit);
__ vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_512bit);
__ vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_512bit);
__ cmpl(rounds, 52);
__ jcc(Assembler::aboveEqual, AES192_REMAINDER4);
__ vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_512bit);
__ jmp(END_REMAINDER4);
__ bind(AES192_REMAINDER4);
__ vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_512bit);
ev_load_key(xmm18, key, 11 * 16, xmm31);
__ vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_512bit);
ev_load_key(xmm5, key, 12 * 16, xmm31);
__ cmpl(rounds, 60);
__ jcc(Assembler::aboveEqual, AES256_REMAINDER4);
__ vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_512bit);
__ jmp(END_REMAINDER4);
__ bind(AES256_REMAINDER4);
__ vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_512bit);
ev_load_key(xmm6, key, 13 * 16, xmm31);
__ vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_512bit);
ev_load_key(xmm7, key, 14 * 16, xmm31);
__ vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_512bit);
// After AES encode rounds, the encrypted block cipher lies in zmm0.
// XOR encrypted block cipher with PT and store 64 bytes of ciphertext
__ bind(END_REMAINDER4);
__ evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
__ addq(pos, 64);
// load mask for incrementing the counter value by 1
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc1_addr()), Assembler::AVX_128bit, r15 /*rscratch*/);
// For a single block, the AES rounds start here.
__ bind(REMAINDER_LOOP);
__ cmpl(len_reg, 0);
__ jcc(Assembler::belowEqual, END);
// XOR counter with first roundkey
__ vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_128bit);
__ evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_128bit);
__ vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_128bit);
// Increment counter by 1
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_128bit, /*ktmp*/k1, ones);
__ vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_128bit);
__ vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_128bit);
__ vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_128bit);
__ vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_128bit);
__ vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_128bit);
__ vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_128bit);
__ vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_128bit);
__ vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_128bit);
__ cmpl(rounds, 52);
__ jcc(Assembler::aboveEqual, AES192_REMAINDER);
__ vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_128bit);
__ jmp(END_REMAINDER_LOOP);
__ bind(AES192_REMAINDER);
__ vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_128bit);
ev_load_key(xmm18, key, 11 * 16, xmm31);
__ vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_128bit);
ev_load_key(xmm5, key, 12 * 16, xmm31);
__ cmpl(rounds, 60);
__ jcc(Assembler::aboveEqual, AES256_REMAINDER);
__ vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_128bit);
__ jmp(END_REMAINDER_LOOP);
__ bind(AES256_REMAINDER);
__ vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_128bit);
ev_load_key(xmm6, key, 13 * 16, xmm31);
__ vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_128bit);
ev_load_key(xmm7, key, 14 * 16, xmm31);
__ vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_128bit);
__ bind(END_REMAINDER_LOOP);
// If the length register is less than the block size (16 bytes), store only
// as many CT bytes to the destination as the length register specifies;
// extracting the exact number of bytes is handled by EXTRACT_TAILBYTES
__ cmpl(len_reg, 16);
__ jcc(Assembler::less, EXTRACT_TAILBYTES);
__ subl(len_reg, 16);
// After AES encode rounds, the encrypted block cipher lies in xmm0.
// If the length register is equal to 16 bytes, store CT in dest after XOR operation.
__ evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit);
__ evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_128bit);
__ addl(pos, 16);
__ jmp(REMAINDER_LOOP);
__ bind(EXTRACT_TAILBYTES);
// Save encrypted counter value in xmm0 for next invocation, before XOR operation
__ movdqu(Address(saved_encCounter_start, 0), xmm0);
// XOR encrypted block cipher in xmm0 with PT to produce CT
// extract up to 15 bytes of CT from xmm0 as specified by the length register
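// The tail length (1..15) is consumed bit by bit: e.g. a 13-byte tail
// (8 + 4 + 1) stores a qword, then a dword, then a byte, shifting xmm0
// right by the stored amount after each partial store.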
__ testptr(len_reg, 8);
__ jcc(Assembler::zero, EXTRACT_TAIL_4BYTES);
__ pextrq(tail, xmm0, 0);
__ xorq(tail, Address(src_addr, pos, Address::times_1, 0));
__ movq(Address(dest_addr, pos), tail);
__ psrldq(xmm0, 8);
__ addl(pos, 8);
__ bind(EXTRACT_TAIL_4BYTES);
__ testptr(len_reg, 4);
__ jcc(Assembler::zero, EXTRACT_TAIL_2BYTES);
__ pextrd(tail, xmm0, 0);
__ xorl(tail, Address(src_addr, pos, Address::times_1, 0));
__ movl(Address(dest_addr, pos), tail);
__ psrldq(xmm0, 4);
__ addq(pos, 4);
__ bind(EXTRACT_TAIL_2BYTES);
__ testptr(len_reg, 2);
__ jcc(Assembler::zero, EXTRACT_TAIL_1BYTE);
__ pextrw(tail, xmm0, 0);
__ xorw(tail, Address(src_addr, pos, Address::times_1, 0));
__ movw(Address(dest_addr, pos), tail);
__ psrldq(xmm0, 2);
__ addl(pos, 2);
__ bind(EXTRACT_TAIL_1BYTE);
__ testptr(len_reg, 1);
__ jcc(Assembler::zero, END);
__ pextrb(tail, xmm0, 0);
__ xorb(tail, Address(src_addr, pos, Address::times_1, 0));
__ movb(Address(dest_addr, pos), tail);
__ addl(pos, 1);
__ bind(END);
// If there are no tail bytes, store counter value and exit
__ cmpl(len_reg, 0);
__ jcc(Assembler::equal, STORE_CTR);
__ movl(Address(used_addr, 0), len_reg);
__ bind(STORE_CTR);
//shuffle updated counter and store it
__ vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_128bit);
__ movdqu(Address(counter, 0), xmm8);
// Zero out counter and key registers
__ evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit);
__ evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit);
__ evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit);
__ evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit);
__ evpxorq(xmm23, xmm23, xmm23, Assembler::AVX_512bit);
__ evpxorq(xmm24, xmm24, xmm24, Assembler::AVX_512bit);
__ evpxorq(xmm25, xmm25, xmm25, Assembler::AVX_512bit);
__ evpxorq(xmm26, xmm26, xmm26, Assembler::AVX_512bit);
__ evpxorq(xmm27, xmm27, xmm27, Assembler::AVX_512bit);
__ evpxorq(xmm28, xmm28, xmm28, Assembler::AVX_512bit);
__ evpxorq(xmm29, xmm29, xmm29, Assembler::AVX_512bit);
__ evpxorq(xmm30, xmm30, xmm30, Assembler::AVX_512bit);
__ cmpl(rounds, 44);
__ jcc(Assembler::belowEqual, EXIT);
__ evpxorq(xmm18, xmm18, xmm18, Assembler::AVX_512bit);
__ evpxorq(xmm5, xmm5, xmm5, Assembler::AVX_512bit);
__ cmpl(rounds, 52);
__ jcc(Assembler::belowEqual, EXIT);
__ evpxorq(xmm6, xmm6, xmm6, Assembler::AVX_512bit);
__ evpxorq(xmm7, xmm7, xmm7, Assembler::AVX_512bit);
__ bind(EXIT);
}
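// Carry-less multiply GH = (GH * HK) mod g(x) in GF(2^128) for GHASH, applied
// independently to all four 128-bit lanes of the ZMM registers. The four
// vpclmulqdq ops (imm 0x00/0x01/0x10/0x11 select low/high source qwords) build
// the 256-bit product, the two middle partials are folded into its halves, and
// the result is reduced modulo the GHASH polynomial
// g(x) = x^128 + x^7 + x^2 + x + 1 using the precomputed reduction constant.
// Clobbers xmm0-xmm2.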
void StubGenerator::gfmul_avx512(XMMRegister GH, XMMRegister HK) {
const XMMRegister TMP1 = xmm0;
const XMMRegister TMP2 = xmm1;
const XMMRegister TMP3 = xmm2;
__ evpclmulqdq(TMP1, GH, HK, 0x11, Assembler::AVX_512bit);
__ evpclmulqdq(TMP2, GH, HK, 0x00, Assembler::AVX_512bit);
__ evpclmulqdq(TMP3, GH, HK, 0x01, Assembler::AVX_512bit);
__ evpclmulqdq(GH, GH, HK, 0x10, Assembler::AVX_512bit);
__ evpxorq(GH, GH, TMP3, Assembler::AVX_512bit);
__ vpsrldq(TMP3, GH, 8, Assembler::AVX_512bit);
__ vpslldq(GH, GH, 8, Assembler::AVX_512bit);
__ evpxorq(TMP1, TMP1, TMP3, Assembler::AVX_512bit);
__ evpxorq(GH, GH, TMP2, Assembler::AVX_512bit);
__ evmovdquq(TMP3, ExternalAddress(ghash_polynomial_reduction_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
__ evpclmulqdq(TMP2, TMP3, GH, 0x01, Assembler::AVX_512bit);
__ vpslldq(TMP2, TMP2, 8, Assembler::AVX_512bit);
__ evpxorq(GH, GH, TMP2, Assembler::AVX_512bit);
__ evpclmulqdq(TMP2, TMP3, GH, 0x00, Assembler::AVX_512bit);
__ vpsrldq(TMP2, TMP2, 4, Assembler::AVX_512bit);
__ evpclmulqdq(GH, TMP3, GH, 0x10, Assembler::AVX_512bit);
__ vpslldq(GH, GH, 4, Assembler::AVX_512bit);
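// ternary logic imm 0x96 is a three-way XOR: GH = GH ^ TMP1 ^ TMP2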
__ vpternlogq(GH, 0x96, TMP1, TMP2, Assembler::AVX_512bit);
}
// Holds 64 Htbl entries: 32 HashKey powers and 32 HashKeyK values (HashKeyK = HashKey x POLY, derived from HashKey)
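// Layout as written below (slot = 16 bytes, illustrative): successive HashKey
// powers fill slots 31 down to 0, and the matching HashKeyK values fill
// slots 63 down to 32.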
void StubGenerator::generateHtbl_32_blocks_avx512(Register htbl, Register avx512_htbl) {
const XMMRegister HK = xmm6;
const XMMRegister ZT1 = xmm0, ZT2 = xmm1, ZT3 = xmm2, ZT4 = xmm3;
const XMMRegister ZT5 = xmm4, ZT6 = xmm5, ZT7 = xmm7, ZT8 = xmm8;
const XMMRegister ZT10 = xmm10, ZT11 = xmm11, ZT12 = xmm12;
__ movdqu(HK, Address(htbl, 0));
__ movdqu(ZT10, ExternalAddress(ghash_long_swap_mask_addr()), r15);
__ vpshufb(HK, HK, ZT10, Assembler::AVX_128bit);
__ movdqu(ZT11, ExternalAddress(ghash_polynomial_addr()), r15);
__ movdqu(ZT12, ExternalAddress(ghash_polynomial_two_one_addr()), r15);
// Compute H ^ 2 from the input subkeyH
__ movdqu(ZT3, HK);
__ vpsllq(HK, HK, 1, Assembler::AVX_128bit);
__ vpsrlq(ZT3, ZT3, 63, Assembler::AVX_128bit);
__ movdqu(ZT2, ZT3);
__ vpslldq(ZT3, ZT3, 8, Assembler::AVX_128bit);
__ vpsrldq(ZT2, ZT2, 8, Assembler::AVX_128bit);
__ vpor(HK, HK, ZT3, Assembler::AVX_128bit);
__ vpshufd(ZT3, ZT2, 0x24, Assembler::AVX_128bit);
__ vpcmpeqd(ZT3, ZT3, ZT12, Assembler::AVX_128bit);
__ vpand(ZT3, ZT3, ZT11, Assembler::AVX_128bit);
__ vpxor(HK, HK, ZT3, Assembler::AVX_128bit);
__ movdqu(Address(avx512_htbl, 16 * 31), HK); // H ^ 2
__ movdqu(ZT5, HK);
__ evinserti64x2(ZT7, ZT7, HK, 3, Assembler::AVX_512bit);
//calculate HashKey ^ 2 << 1 mod poly
gfmul_avx512(ZT5, HK);
__ movdqu(Address(avx512_htbl, 16 * 30), ZT5);
__ evinserti64x2(ZT7, ZT7, ZT5, 2, Assembler::AVX_512bit);
//calculate HashKey ^ 3 << 1 mod poly
gfmul_avx512(ZT5, HK);
__ movdqu(Address(avx512_htbl, 16 * 29), ZT5);
__ evinserti64x2(ZT7, ZT7, ZT5, 1, Assembler::AVX_512bit);
//calculate HashKey ^ 4 << 1 mod poly
gfmul_avx512(ZT5, HK);
__ movdqu(Address(avx512_htbl, 16 * 28), ZT5);
__ evinserti64x2(ZT7, ZT7, ZT5, 0, Assembler::AVX_512bit);
// ZT5 and ZT7 to be cleared (hash key material)
//calculate HashKeyK = HashKey x POLY
__ evmovdquq(xmm11, ExternalAddress(ghash_polynomial_addr()), Assembler::AVX_512bit, r15);
__ evpclmulqdq(ZT1, ZT7, xmm11, 0x10, Assembler::AVX_512bit);
__ vpshufd(ZT2, ZT7, 78, Assembler::AVX_512bit);
__ evpxorq(ZT1, ZT1, ZT2, Assembler::AVX_512bit);
__ evmovdquq(Address(avx512_htbl, 16 * 60), ZT1, Assembler::AVX_512bit);
// ZT1 and ZT2 to be cleared (hash key material)
// switch to 4x128-bit computations now
__ evshufi64x2(ZT5, ZT5, ZT5, 0x00, Assembler::AVX_512bit); // broadcast HashKey ^ 4 across all lanes of ZT5
__ evmovdquq(ZT8, ZT7, Assembler::AVX_512bit); // save HashKey ^ 4 to HashKey ^ 1 in ZT8
// ZT8 to be cleared (hash key material)
//calculate HashKey ^ 5 << 1 mod poly, HashKey ^ 6 << 1 mod poly, ... HashKey ^ 8 << 1 mod poly
gfmul_avx512(ZT7, ZT5);
  __ evmovdquq(Address(avx512_htbl, 16 * 24), ZT7, Assembler::AVX_512bit); // HashKey ^ 8 through HashKey ^ 5 now in ZT7
  //calculate HashKeyK = HashKey x POLY
__ evpclmulqdq(ZT1, ZT7, xmm11, 0x10, Assembler::AVX_512bit);
__ vpshufd(ZT2, ZT7, 78, Assembler::AVX_512bit);
__ evpxorq(ZT1, ZT1, ZT2, Assembler::AVX_512bit);
__ evmovdquq(Address(avx512_htbl, 16 * 56), ZT1, Assembler::AVX_512bit);
  __ evshufi64x2(ZT5, ZT7, ZT7, 0x00, Assembler::AVX_512bit); // broadcast HashKey ^ 8 across all lanes of ZT5
for (int i = 20, j = 52; i > 0;) {
gfmul_avx512(ZT8, ZT5);
__ evmovdquq(Address(avx512_htbl, 16 * i), ZT8, Assembler::AVX_512bit);
//calculate HashKeyK = HashKey x POLY
__ evpclmulqdq(ZT1, ZT8, xmm11, 0x10, Assembler::AVX_512bit);
__ vpshufd(ZT2, ZT8, 78, Assembler::AVX_512bit);
__ evpxorq(ZT1, ZT1, ZT2, Assembler::AVX_512bit);
__ evmovdquq(Address(avx512_htbl, 16 * j), ZT1, Assembler::AVX_512bit);
i -= 4;
j -= 4;
//compute HashKey ^ (8 + n), HashKey ^ (7 + n), ... HashKey ^ (5 + n)
gfmul_avx512(ZT7, ZT5);
__ evmovdquq(Address(avx512_htbl, 16 * i), ZT7, Assembler::AVX_512bit);
//calculate HashKeyK = HashKey x POLY
__ evpclmulqdq(ZT1, ZT7, xmm11, 0x10, Assembler::AVX_512bit);
__ vpshufd(ZT2, ZT7, 78, Assembler::AVX_512bit);
__ evpxorq(ZT1, ZT1, ZT2, Assembler::AVX_512bit);
__ evmovdquq(Address(avx512_htbl, 16 * j), ZT1, Assembler::AVX_512bit);
i -= 4;
j -= 4;
}
}
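// The table built above exists because GHASH over n blocks expands to
// (Y0 ^ X1)*H^n ^ X2*H^(n-1) ^ ... ^ Xn*H (all in GF(2^128)), where Y0 is the
// running hash. With H^1..H^32 (plus the derived HashKeyK values)
// precomputed, the stitched loops can multiply 16 or 32 blocks by independent
// key powers in parallel and pay for only one reduction per batch.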
#define vhpxori4x128(reg, tmp) \
__ vextracti64x4(tmp, reg, 1); \
__ evpxorq(reg, reg, tmp, Assembler::AVX_256bit); \
__ vextracti32x4(tmp, reg, 1); \
__ evpxorq(reg, reg, tmp, Assembler::AVX_128bit);
#define roundEncode(key, dst1, dst2, dst3, dst4) \
__ vaesenc(dst1, dst1, key, Assembler::AVX_512bit); \
__ vaesenc(dst2, dst2, key, Assembler::AVX_512bit); \
__ vaesenc(dst3, dst3, key, Assembler::AVX_512bit); \
__ vaesenc(dst4, dst4, key, Assembler::AVX_512bit);
#define lastroundEncode(key, dst1, dst2, dst3, dst4) \
__ vaesenclast(dst1, dst1, key, Assembler::AVX_512bit); \
__ vaesenclast(dst2, dst2, key, Assembler::AVX_512bit); \
__ vaesenclast(dst3, dst3, key, Assembler::AVX_512bit); \
__ vaesenclast(dst4, dst4, key, Assembler::AVX_512bit);
#define storeData(dst, position, src1, src2, src3, src4) \
__ evmovdquq(Address(dst, position, Address::times_1, 0 * 64), src1, Assembler::AVX_512bit); \
__ evmovdquq(Address(dst, position, Address::times_1, 1 * 64), src2, Assembler::AVX_512bit); \
__ evmovdquq(Address(dst, position, Address::times_1, 2 * 64), src3, Assembler::AVX_512bit); \
__ evmovdquq(Address(dst, position, Address::times_1, 3 * 64), src4, Assembler::AVX_512bit);
#define loadData(src, position, dst1, dst2, dst3, dst4) \
__ evmovdquq(dst1, Address(src, position, Address::times_1, 0 * 64), Assembler::AVX_512bit); \
__ evmovdquq(dst2, Address(src, position, Address::times_1, 1 * 64), Assembler::AVX_512bit); \
__ evmovdquq(dst3, Address(src, position, Address::times_1, 2 * 64), Assembler::AVX_512bit); \
__ evmovdquq(dst4, Address(src, position, Address::times_1, 3 * 64), Assembler::AVX_512bit);
#define carrylessMultiply(dst00, dst01, dst10, dst11, ghdata, hkey2, hkey1) \
__ evpclmulqdq(dst00, ghdata, hkey2, 0x00, Assembler::AVX_512bit); \
__ evpclmulqdq(dst01, ghdata, hkey2, 0x10, Assembler::AVX_512bit); \
__ evpclmulqdq(dst10, ghdata, hkey1, 0x01, Assembler::AVX_512bit); \
__ evpclmulqdq(dst11, ghdata, hkey1, 0x11, Assembler::AVX_512bit);
#define shuffle(dst0, dst1, dst2, dst3, src0, src1, src2, src3, shufmask) \
__ vpshufb(dst0, src0, shufmask, Assembler::AVX_512bit); \
__ vpshufb(dst1, src1, shufmask, Assembler::AVX_512bit); \
__ vpshufb(dst2, src2, shufmask, Assembler::AVX_512bit); \
__ vpshufb(dst3, src3, shufmask, Assembler::AVX_512bit);
#define xorBeforeStore(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \
__ evpxorq(dst0, dst0, src0, Assembler::AVX_512bit); \
__ evpxorq(dst1, dst1, src1, Assembler::AVX_512bit); \
__ evpxorq(dst2, dst2, src2, Assembler::AVX_512bit); \
__ evpxorq(dst3, dst3, src3, Assembler::AVX_512bit);
#define xorGHASH(dst0, dst1, dst2, dst3, src02, src03, src12, src13, src22, src23, src32, src33) \
__ vpternlogq(dst0, 0x96, src02, src03, Assembler::AVX_512bit); \
__ vpternlogq(dst1, 0x96, src12, src13, Assembler::AVX_512bit); \
__ vpternlogq(dst2, 0x96, src22, src23, Assembler::AVX_512bit); \
__ vpternlogq(dst3, 0x96, src32, src33, Assembler::AVX_512bit);
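// Notes on the helpers above: vpclmulqdq's imm8 picks the 64-bit halves to
// multiply (bit 0 selects the qword of the first source, bit 4 of the
// second), so 0x00/0x11 are the lo*lo / hi*hi products and 0x01/0x10 the
// cross terms used by carrylessMultiply. vpternlogq with imm8 0x96 evaluates
// the truth table of a ^ b ^ c, i.e. a three-input XOR in one instruction,
// which is what xorGHASH uses to accumulate partial products.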
//schoolbook multiply of 16 blocks (16 x 16 bytes)
//it is assumed that the data read in has already been shuffled
void StubGenerator::ghash16_avx512(bool start_ghash, bool do_reduction, bool uload_shuffle, bool hk_broadcast, bool do_hxor,
Register in, Register pos, Register subkeyHtbl, XMMRegister HASH, XMMRegister SHUFM, int in_offset,
int in_disp, int displacement, int hashkey_offset) {
const XMMRegister ZTMP0 = xmm0;
const XMMRegister ZTMP1 = xmm3;
const XMMRegister ZTMP2 = xmm4;
const XMMRegister ZTMP3 = xmm5;
const XMMRegister ZTMP4 = xmm6;
const XMMRegister ZTMP5 = xmm7;
const XMMRegister ZTMP6 = xmm10;
const XMMRegister ZTMP7 = xmm11;
const XMMRegister ZTMP8 = xmm12;
const XMMRegister ZTMP9 = xmm13;
const XMMRegister ZTMPA = xmm26;
const XMMRegister ZTMPB = xmm23;
const XMMRegister GH = xmm24;
const XMMRegister GL = xmm25;
const int hkey_gap = 16 * 32;
  __ evmovdquq(ZTMP9, Address(subkeyHtbl, in_offset * 16 + in_disp), Assembler::AVX_512bit);
  if (uload_shuffle) {
    __ vpshufb(ZTMP9, ZTMP9, SHUFM, Assembler::AVX_512bit);
  }
if (start_ghash) {
__ evpxorq(ZTMP9, ZTMP9, HASH, Assembler::AVX_512bit);
}
if (hk_broadcast) {
__ evbroadcastf64x2(ZTMP8, Address(subkeyHtbl, hashkey_offset + displacement + 0 * 64), Assembler::AVX_512bit);
__ evbroadcastf64x2(ZTMPA, Address(subkeyHtbl, hashkey_offset + displacement + hkey_gap + 0 * 64), Assembler::AVX_512bit);
} else {
__ evmovdquq(ZTMP8, Address(subkeyHtbl, hashkey_offset + displacement + 0 * 64), Assembler::AVX_512bit);
__ evmovdquq(ZTMPA, Address(subkeyHtbl, hashkey_offset + displacement + hkey_gap + 0 * 64), Assembler::AVX_512bit);
}
carrylessMultiply(ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP9, ZTMPA, ZTMP8);
//ghash blocks 4 - 7
  __ evmovdquq(ZTMP9, Address(subkeyHtbl, in_offset * 16 + in_disp + 64), Assembler::AVX_512bit);
  if (uload_shuffle) {
    __ vpshufb(ZTMP9, ZTMP9, SHUFM, Assembler::AVX_512bit);
  }
if (hk_broadcast) {
    __ evbroadcastf64x2(ZTMP8, Address(subkeyHtbl, hashkey_offset + displacement + 1 * 64), Assembler::AVX_512bit);
__ evbroadcastf64x2(ZTMPA, Address(subkeyHtbl, hashkey_offset + displacement + hkey_gap + 1 * 64), Assembler::AVX_512bit);
} else {
__ evmovdquq(ZTMP8, Address(subkeyHtbl, hashkey_offset + displacement + 1 * 64), Assembler::AVX_512bit);
__ evmovdquq(ZTMPA, Address(subkeyHtbl, hashkey_offset + displacement + hkey_gap + 1 * 64), Assembler::AVX_512bit);
}
carrylessMultiply(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP9, ZTMPA, ZTMP8);
//update sums
if (start_ghash) {
__ evpxorq(GL, ZTMP0, ZTMP2, Assembler::AVX_512bit);//T2 = THL + TLL
__ evpxorq(GH, ZTMP1, ZTMP3, Assembler::AVX_512bit);//T1 = THH + TLH
} else { //mid, end, end_reduce
__ vpternlogq(GL, 0x96, ZTMP0, ZTMP2, Assembler::AVX_512bit);//T2 = THL + TLL
__ vpternlogq(GH, 0x96, ZTMP1, ZTMP3, Assembler::AVX_512bit);//T1 = THH + TLH
}
//ghash blocks 8 - 11
  __ evmovdquq(ZTMP9, Address(subkeyHtbl, in_offset * 16 + in_disp + 128), Assembler::AVX_512bit);
  if (uload_shuffle) {
    __ vpshufb(ZTMP9, ZTMP9, SHUFM, Assembler::AVX_512bit);
  }
if (hk_broadcast) {
__ evbroadcastf64x2(ZTMP8, Address(subkeyHtbl, hashkey_offset + displacement + 2 * 64), Assembler::AVX_512bit);
__ evbroadcastf64x2(ZTMPA, Address(subkeyHtbl, hashkey_offset + displacement + hkey_gap + 2 * 64), Assembler::AVX_512bit);
} else {
__ evmovdquq(ZTMP8, Address(subkeyHtbl, hashkey_offset + displacement + 2 * 64), Assembler::AVX_512bit);
__ evmovdquq(ZTMPA, Address(subkeyHtbl, hashkey_offset + displacement + hkey_gap + 2 * 64), Assembler::AVX_512bit);
}
carrylessMultiply(ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP9, ZTMPA, ZTMP8);
//update sums
__ vpternlogq(GL, 0x96, ZTMP6, ZTMP4, Assembler::AVX_512bit);//T2 = THL + TLL
__ vpternlogq(GH, 0x96, ZTMP7, ZTMP5, Assembler::AVX_512bit);//T1 = THH + TLH
//ghash blocks 12 - 15
  __ evmovdquq(ZTMP9, Address(subkeyHtbl, in_offset * 16 + in_disp + 192), Assembler::AVX_512bit);
  if (uload_shuffle) {
    __ vpshufb(ZTMP9, ZTMP9, SHUFM, Assembler::AVX_512bit);
  }
if (hk_broadcast) {
__ evbroadcastf64x2(ZTMP8, Address(subkeyHtbl, hashkey_offset + displacement + 3 * 64), Assembler::AVX_512bit);
__ evbroadcastf64x2(ZTMPA, Address(subkeyHtbl, hashkey_offset + displacement + hkey_gap + 3 * 64), Assembler::AVX_512bit);
} else {
__ evmovdquq(ZTMP8, Address(subkeyHtbl, hashkey_offset + displacement + 3 * 64), Assembler::AVX_512bit);
__ evmovdquq(ZTMPA, Address(subkeyHtbl, hashkey_offset + displacement + hkey_gap + 3 * 64), Assembler::AVX_512bit);
}
carrylessMultiply(ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP9, ZTMPA, ZTMP8);
//update sums
xorGHASH(GL, GH, GL, GH, ZTMP0, ZTMP2, ZTMP1, ZTMP3, ZTMP6, ZTMP4, ZTMP7, ZTMP5);
if (do_reduction) {
//new reduction
__ evmovdquq(ZTMPB, ExternalAddress(ghash_polynomial_addr()), Assembler::AVX_512bit, rbx /*rscratch*/);
__ evpclmulqdq(HASH, GL, ZTMPB, 0x10, Assembler::AVX_512bit);
__ vpshufd(ZTMP0, GL, 78, Assembler::AVX_512bit);
__ vpternlogq(HASH, 0x96, GH, ZTMP0, Assembler::AVX_512bit);
if (do_hxor) {
vhpxori4x128(HASH, ZTMP0);
}
}
}
//Stitched GHASH of 16 blocks (with reduction) with encryption of 0 blocks
void StubGenerator::gcm_enc_dec_last_avx512(Register len, Register in, Register pos, XMMRegister HASH, XMMRegister SHUFM, Register subkeyHtbl,
int ghashin_offset, int hashkey_offset, bool start_ghash, bool do_reduction) {
  //there are no blocks to cipher, so only 16 blocks remain for ghash and reduction
ghash16_avx512(start_ghash, do_reduction, false, false, true, in, pos, subkeyHtbl, HASH, SHUFM, ghashin_offset, 0, 0, hashkey_offset);
}
//Main GCM macro stitching cipher with GHASH
//encrypts 16 blocks at a time
//ghash the 16 previously encrypted ciphertext blocks
void StubGenerator::ghash16_encrypt_parallel16_avx512(Register in, Register out, Register ct, Register pos, Register avx512_subkeyHtbl,
Register CTR_CHECK, Register NROUNDS, Register key, XMMRegister CTR_BE, XMMRegister GHASH_IN,
XMMRegister ADDBE_4x4, XMMRegister ADDBE_1234, XMMRegister ADD_1234, XMMRegister SHFMSK,
bool hk_broadcast, bool is_hash_start, bool do_hash_reduction, bool do_hash_hxor,
bool no_ghash_in, int ghashin_offset, int aesout_offset, int hashkey_offset) {
const XMMRegister B00_03 = xmm0;
const XMMRegister B04_07 = xmm3;
const XMMRegister B08_11 = xmm4;
const XMMRegister B12_15 = xmm5;
const XMMRegister THH1 = xmm6;
const XMMRegister THL1 = xmm7;
const XMMRegister TLH1 = xmm10;
const XMMRegister TLL1 = xmm11, THH2 = xmm12, THL2 = xmm13, TLH2 = xmm15;
const XMMRegister TLL2 = xmm16, THH3 = xmm17, THL3 = xmm19, TLH3 = xmm20;
const XMMRegister TLL3 = xmm21, DATA1 = xmm17, DATA2 = xmm19, DATA3 = xmm20, DATA4 = xmm21;
const XMMRegister AESKEY1 = xmm30, AESKEY2 = xmm31;
const XMMRegister GHKEY1 = xmm1, GHKEY2 = xmm18, GHDAT1 = xmm8, GHDAT2 = xmm22;
const XMMRegister ZT = xmm23, TO_REDUCE_L = xmm25, TO_REDUCE_H = xmm24;
const int hkey_gap = 16 * 32;
Label blocks_overflow, blocks_ok, skip_shuffle, cont, aes_256, aes_192, last_aes_rnd;
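  // Counter handling: the counter blocks are kept byte-reflected (big-endian)
  // so they can be bumped directly with the ADDBE constants. That shortcut is
  // only safe while the low counter byte does not wrap, so CTR_CHECK mirrors
  // that byte: if it would cross 256 within the next 16 blocks, the counters
  // are shuffled to little-endian, incremented with ADD_1234 / linc4, and
  // shuffled back (the blocks_overflow path below).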
__ cmpb(CTR_CHECK, (256 - 16));
__ jcc(Assembler::aboveEqual, blocks_overflow);
__ vpaddd(B00_03, CTR_BE, ADDBE_1234, Assembler::AVX_512bit);
__ vpaddd(B04_07, B00_03, ADDBE_4x4, Assembler::AVX_512bit);
__ vpaddd(B08_11, B04_07, ADDBE_4x4, Assembler::AVX_512bit);
__ vpaddd(B12_15, B08_11, ADDBE_4x4, Assembler::AVX_512bit);
__ jmp(blocks_ok);
__ bind(blocks_overflow);
__ vpshufb(CTR_BE, CTR_BE, SHFMSK, Assembler::AVX_512bit);
__ evmovdquq(B12_15, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, rbx /*rscratch*/);
__ vpaddd(B00_03, CTR_BE, ADD_1234, Assembler::AVX_512bit);
__ vpaddd(B04_07, B00_03, B12_15, Assembler::AVX_512bit);
__ vpaddd(B08_11, B04_07, B12_15, Assembler::AVX_512bit);
__ vpaddd(B12_15, B08_11, B12_15, Assembler::AVX_512bit);
shuffle(B00_03, B04_07, B08_11, B12_15, B00_03, B04_07, B08_11, B12_15, SHFMSK);
__ bind(blocks_ok);
  //pre-load constants
ev_load_key(AESKEY1, key, 0, rbx);
if (!no_ghash_in) {
__ evpxorq(GHDAT1, GHASH_IN, Address(avx512_subkeyHtbl, 16 * ghashin_offset), Assembler::AVX_512bit);
} else {
__ evmovdquq(GHDAT1, Address(avx512_subkeyHtbl, 16 * ghashin_offset), Assembler::AVX_512bit);
}
if (hk_broadcast) {
__ evbroadcastf64x2(GHKEY1, Address(avx512_subkeyHtbl, hashkey_offset + 0 * 64), Assembler::AVX_512bit);
__ evbroadcastf64x2(GHKEY2, Address(avx512_subkeyHtbl, hashkey_offset + hkey_gap + 0 * 64), Assembler::AVX_512bit);
} else {
__ evmovdquq(GHKEY1, Address(avx512_subkeyHtbl, hashkey_offset + 0 * 64), Assembler::AVX_512bit);
__ evmovdquq(GHKEY2, Address(avx512_subkeyHtbl, hashkey_offset + hkey_gap + 0 * 64), Assembler::AVX_512bit);
}
//save counter for the next round
//increment counter overflow check register
__ evshufi64x2(CTR_BE, B12_15, B12_15, 255, Assembler::AVX_512bit);
__ addb(CTR_CHECK, 16);
  //pre-load constants
ev_load_key(AESKEY2, key, 1 * 16, rbx);
  __ evmovdquq(GHDAT2, Address(avx512_subkeyHtbl, 16 * (ghashin_offset + 4)), Assembler::AVX_512bit);
//stitch AES rounds with GHASH
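  //(interleaving the vaesenc chain with the vpclmulqdq chain keeps the AES
  // and carry-less multiply execution units busy at the same time, hiding
  // their latencies)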
//AES round 0
__ evpxorq(B00_03, B00_03, AESKEY1, Assembler::AVX_512bit);
__ evpxorq(B04_07, B04_07, AESKEY1, Assembler::AVX_512bit);
__ evpxorq(B08_11, B08_11, AESKEY1, Assembler::AVX_512bit);
__ evpxorq(B12_15, B12_15, AESKEY1, Assembler::AVX_512bit);
ev_load_key(AESKEY1, key, 2 * 16, rbx);
  //GHASH 4 blocks (15 to 12)
carrylessMultiply(TLL1, TLH1, THL1, THH1, GHDAT1, GHKEY2, GHKEY1);
if (hk_broadcast) {
__ evbroadcastf64x2(GHKEY1, Address(avx512_subkeyHtbl, hashkey_offset + 1 * 64), Assembler::AVX_512bit);
__ evbroadcastf64x2(GHKEY2, Address(avx512_subkeyHtbl, hashkey_offset + hkey_gap + 1 * 64), Assembler::AVX_512bit);
} else {
__ evmovdquq(GHKEY1, Address(avx512_subkeyHtbl, hashkey_offset + 1 * 64), Assembler::AVX_512bit);
__ evmovdquq(GHKEY2, Address(avx512_subkeyHtbl, hashkey_offset + hkey_gap + 1 * 64), Assembler::AVX_512bit);
}
__ evmovdquq(GHDAT1, Address(avx512_subkeyHtbl, 16 * (ghashin_offset + 8)), Assembler::AVX_512bit);
//AES round 1
roundEncode(AESKEY2, B00_03, B04_07, B08_11, B12_15);
ev_load_key(AESKEY2, key, 3 * 16, rbx);
  //GHASH 4 blocks (11 to 8)
carrylessMultiply(TLL2, TLH2, THL2, THH2, GHDAT2, GHKEY2, GHKEY1);
if (hk_broadcast) {
__ evbroadcastf64x2(GHKEY1, Address(avx512_subkeyHtbl, hashkey_offset + 2 * 64), Assembler::AVX_512bit);
__ evbroadcastf64x2(GHKEY2, Address(avx512_subkeyHtbl, hashkey_offset + hkey_gap + 2 * 64), Assembler::AVX_512bit);
} else {
    __ evmovdquq(GHKEY1, Address(avx512_subkeyHtbl, hashkey_offset + 2 * 64), Assembler::AVX_512bit);
__ evmovdquq(GHKEY2, Address(avx512_subkeyHtbl, hashkey_offset + hkey_gap + 2 * 64), Assembler::AVX_512bit);
}
__ evmovdquq(GHDAT2, Address(avx512_subkeyHtbl, 16 * (ghashin_offset + 12)), Assembler::AVX_512bit);
//AES round 2
roundEncode(AESKEY1, B00_03, B04_07, B08_11, B12_15);
ev_load_key(AESKEY1, key, 4 * 16, rbx);
  //GHASH 4 blocks (7 to 4)
carrylessMultiply(TLL3, TLH3, THL3, THH3, GHDAT1, GHKEY2, GHKEY1);
if (hk_broadcast) {
__ evbroadcastf64x2(GHKEY1, Address(avx512_subkeyHtbl, hashkey_offset + 3 * 64), Assembler::AVX_512bit);
__ evbroadcastf64x2(GHKEY2, Address(avx512_subkeyHtbl, hashkey_offset + hkey_gap + 3 * 64), Assembler::AVX_512bit);
} else {
__ evmovdquq(GHKEY1, Address(avx512_subkeyHtbl, hashkey_offset + 3 * 64), Assembler::AVX_512bit);
__ evmovdquq(GHKEY2, Address(avx512_subkeyHtbl, hashkey_offset + hkey_gap + 3 * 64), Assembler::AVX_512bit);
}
  //AES round 3
roundEncode(AESKEY2, B00_03, B04_07, B08_11, B12_15);
ev_load_key(AESKEY2, key, 5 * 16, rbx);
  //Gather (XOR) GHASH for 12 blocks
xorGHASH(TLL1, TLH1, THL1, THH1, TLL2, TLL3, TLH2, TLH3, THL2, THL3, THH2, THH3);
  //AES round 4
roundEncode(AESKEY1, B00_03, B04_07, B08_11, B12_15);
ev_load_key(AESKEY1, key, 6 * 16, rbx);
  //load plain / cipher text (recycle GH3xx registers)
loadData(in, pos, DATA1, DATA2, DATA3, DATA4);
  //AES round 5
roundEncode(AESKEY2, B00_03, B04_07, B08_11, B12_15);
ev_load_key(AESKEY2, key, 7 * 16, rbx);
  //GHASH 4 blocks (3 to 0)
carrylessMultiply(TLL2, TLH2, THL2, THH2, GHDAT2, GHKEY2, GHKEY1);
//AES round 6
roundEncode(AESKEY1, B00_03, B04_07, B08_11, B12_15);
ev_load_key(AESKEY1, key, 8 * 16, rbx);
//gather GHASH in TO_REDUCE_H / L
if (is_hash_start) {
__ evpxorq(TO_REDUCE_L, TLL2, THL2, Assembler::AVX_512bit);
__ evpxorq(TO_REDUCE_H, THH2, TLH2, Assembler::AVX_512bit);
__ vpternlogq(TO_REDUCE_L, 0x96, TLL1, THL1, Assembler::AVX_512bit);
__ vpternlogq(TO_REDUCE_H, 0x96, THH1, TLH1, Assembler::AVX_512bit);
} else {
//not the first round so sums need to be updated
xorGHASH(TO_REDUCE_L, TO_REDUCE_H, TO_REDUCE_L, TO_REDUCE_H, TLL2, THL2, THH2, TLH2, TLL1, THL1, THH1, TLH1);
}
//AES round 7
roundEncode(AESKEY2, B00_03, B04_07, B08_11, B12_15);
ev_load_key(AESKEY2, key, 9 * 16, rbx);
//new reduction
if (do_hash_reduction) {
__ evmovdquq(ZT, ExternalAddress(ghash_polynomial_reduction_addr()), Assembler::AVX_512bit, rbx /*rscratch*/);
__ evpclmulqdq(THH1, TO_REDUCE_L, ZT, 0x10, Assembler::AVX_512bit);
__ vpshufd(TO_REDUCE_L, TO_REDUCE_L, 78, Assembler::AVX_512bit);
__ vpternlogq(THH1, 0x96, TO_REDUCE_H, TO_REDUCE_L, Assembler::AVX_512bit);
}
//AES round 8
roundEncode(AESKEY1, B00_03, B04_07, B08_11, B12_15);
ev_load_key(AESKEY1, key, 10 * 16, rbx);
  //horizontal xor of 4 reduced hashes
if (do_hash_hxor) {
vhpxori4x128(THH1, TLL1);
}
//AES round 9
roundEncode(AESKEY2, B00_03, B04_07, B08_11, B12_15);
//AES rounds up to 11 (AES192) or 13 (AES256)
//AES128 is done
__ cmpl(NROUNDS, 52);
__ jcc(Assembler::less, last_aes_rnd);
__ bind(aes_192);
ev_load_key(AESKEY2, key, 11 * 16, rbx);
roundEncode(AESKEY1, B00_03, B04_07, B08_11, B12_15);
ev_load_key(AESKEY1, key, 12 * 16, rbx);
roundEncode(AESKEY2, B00_03, B04_07, B08_11, B12_15);
__ cmpl(NROUNDS, 60);
__ jcc(Assembler::less, last_aes_rnd);
__ bind(aes_256);
ev_load_key(AESKEY2, key, 13 * 16, rbx);
roundEncode(AESKEY1, B00_03, B04_07, B08_11, B12_15);
ev_load_key(AESKEY1, key, 14 * 16, rbx);
roundEncode(AESKEY2, B00_03, B04_07, B08_11, B12_15);
__ bind(last_aes_rnd);
//the last AES round
lastroundEncode(AESKEY1, B00_03, B04_07, B08_11, B12_15);
  //AESKEY1 and AESKEY2 contain AES round keys
//XOR against plain / cipher text
xorBeforeStore(B00_03, B04_07, B08_11, B12_15, DATA1, DATA2, DATA3, DATA4);
//store cipher / plain text
storeData(out, pos, B00_03, B04_07, B08_11, B12_15);
  //**B00_03, B04_07, B08_11, B12_15 may contain sensitive data
//shuffle cipher text blocks for GHASH computation
__ cmpptr(ct, out);
__ jcc(Assembler::notEqual, skip_shuffle);
shuffle(B00_03, B04_07, B08_11, B12_15, B00_03, B04_07, B08_11, B12_15, SHFMSK);
__ jmp(cont);
__ bind(skip_shuffle);
shuffle(B00_03, B04_07, B08_11, B12_15, DATA1, DATA2, DATA3, DATA4, SHFMSK);
  //**B00_03, B04_07, B08_11, B12_15 overwritten with shuffled cipher text
__ bind(cont);
//store shuffled cipher text for ghashing
__ evmovdquq(Address(avx512_subkeyHtbl, 16 * aesout_offset), B00_03, Assembler::AVX_512bit);
__ evmovdquq(Address(avx512_subkeyHtbl, 16 * (aesout_offset + 4)), B04_07, Assembler::AVX_512bit);
__ evmovdquq(Address(avx512_subkeyHtbl, 16 * (aesout_offset + 8)), B08_11, Assembler::AVX_512bit);
__ evmovdquq(Address(avx512_subkeyHtbl, 16 * (aesout_offset + 12)), B12_15, Assembler::AVX_512bit);
}
//Encrypt / decrypt the initial 16 blocks
void StubGenerator::initial_blocks_16_avx512(Register in, Register out, Register ct, Register pos, Register key, Register avx512_subkeyHtbl,
Register CTR_CHECK, Register rounds, XMMRegister CTR, XMMRegister GHASH, XMMRegister ADDBE_4x4,
XMMRegister ADDBE_1234, XMMRegister ADD_1234, XMMRegister SHUF_MASK, int stack_offset) {
const XMMRegister B00_03 = xmm7;
const XMMRegister B04_07 = xmm10;
const XMMRegister B08_11 = xmm11;
const XMMRegister B12_15 = xmm12;
const XMMRegister T0 = xmm0;
const XMMRegister T1 = xmm3;
const XMMRegister T2 = xmm4;
const XMMRegister T3 = xmm5;
const XMMRegister T4 = xmm6;
const XMMRegister T5 = xmm30;
Label next_16_overflow, next_16_ok, cont, skip_shuffle, aes_256, aes_192, last_aes_rnd;
//prepare counter blocks
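  //(same counter-overflow scheme as in ghash16_encrypt_parallel16_avx512:
  // add to the big-endian counters directly unless the low byte would wrap
  // within the next 16 blocks, in which case shuffle, add, shuffle back)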
__ cmpb(CTR_CHECK, (256 - 16));
__ jcc(Assembler::aboveEqual, next_16_overflow);
__ vpaddd(B00_03, CTR, ADDBE_1234, Assembler::AVX_512bit);
__ vpaddd(B04_07, B00_03, ADDBE_4x4, Assembler::AVX_512bit);
__ vpaddd(B08_11, B04_07, ADDBE_4x4, Assembler::AVX_512bit);
__ vpaddd(B12_15, B08_11, ADDBE_4x4, Assembler::AVX_512bit);
__ jmp(next_16_ok);
__ bind(next_16_overflow);
__ vpshufb(CTR, CTR, SHUF_MASK, Assembler::AVX_512bit);
__ evmovdquq(B12_15, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, rbx);
__ vpaddd(B00_03, CTR, ADD_1234, Assembler::AVX_512bit);
__ vpaddd(B04_07, B00_03, B12_15, Assembler::AVX_512bit);
__ vpaddd(B08_11, B04_07, B12_15, Assembler::AVX_512bit);
__ vpaddd(B12_15, B08_11, B12_15, Assembler::AVX_512bit);
shuffle(B00_03, B04_07, B08_11, B12_15, B00_03, B04_07, B08_11, B12_15, SHUF_MASK);
__ bind(next_16_ok);
__ evshufi64x2(CTR, B12_15, B12_15, 255, Assembler::AVX_512bit);
__ addb(CTR_CHECK, 16);
//load 16 blocks of data
loadData(in, pos, T0, T1, T2, T3);
//move to AES encryption rounds
__ movdqu(T5, ExternalAddress(key_shuffle_mask_addr()), rbx /*rscratch*/);
ev_load_key(T4, key, 0, T5);
__ evpxorq(B00_03, B00_03, T4, Assembler::AVX_512bit);
__ evpxorq(B04_07, B04_07, T4, Assembler::AVX_512bit);
__ evpxorq(B08_11, B08_11, T4, Assembler::AVX_512bit);
__ evpxorq(B12_15, B12_15, T4, Assembler::AVX_512bit);
for (int i = 1; i < 10; i++) {
ev_load_key(T4, key, i * 16, T5);
roundEncode(T4, B00_03, B04_07, B08_11, B12_15);
}
ev_load_key(T4, key, 10 * 16, T5);
__ cmpl(rounds, 52);
__ jcc(Assembler::less, last_aes_rnd);
__ bind(aes_192);
roundEncode(T4, B00_03, B04_07, B08_11, B12_15);
ev_load_key(T4, key, 16 * 11, T5);
roundEncode(T4, B00_03, B04_07, B08_11, B12_15);
ev_load_key(T4, key, 16 * 12, T5);
__ cmpl(rounds, 60);
__ jcc(Assembler::less, last_aes_rnd);
__ bind(aes_256);
roundEncode(T4, B00_03, B04_07, B08_11, B12_15);
ev_load_key(T4, key, 16 * 13, T5);
roundEncode(T4, B00_03, B04_07, B08_11, B12_15);
ev_load_key(T4, key, 16 * 14, T5);
__ bind(last_aes_rnd);
lastroundEncode(T4, B00_03, B04_07, B08_11, B12_15);
//xor against text
xorBeforeStore(B00_03, B04_07, B08_11, B12_15, T0, T1, T2, T3);
//store
storeData(out, pos, B00_03, B04_07, B08_11, B12_15);
__ cmpptr(ct, out);
__ jcc(Assembler::equal, skip_shuffle);
//decryption - cipher text needs to go to GHASH phase
shuffle(B00_03, B04_07, B08_11, B12_15, T0, T1, T2, T3, SHUF_MASK);
__ jmp(cont);
__ bind(skip_shuffle);
shuffle(B00_03, B04_07, B08_11, B12_15, B00_03, B04_07, B08_11, B12_15, SHUF_MASK);
//B00_03, B04_07, B08_11, B12_15 overwritten with shuffled cipher text
__ bind(cont);
__ evmovdquq(Address(avx512_subkeyHtbl, 16 * stack_offset), B00_03, Assembler::AVX_512bit);
__ evmovdquq(Address(avx512_subkeyHtbl, 16 * (stack_offset + 4)), B04_07, Assembler::AVX_512bit);
__ evmovdquq(Address(avx512_subkeyHtbl, 16 * (stack_offset + 8)), B08_11, Assembler::AVX_512bit);
__ evmovdquq(Address(avx512_subkeyHtbl, 16 * (stack_offset + 12)), B12_15, Assembler::AVX_512bit);
}
void StubGenerator::aesgcm_avx512(Register in, Register len, Register ct, Register out, Register key, Register state,
Register subkeyHtbl, Register avx512_subkeyHtbl, Register counter) {
Label ENC_DEC_DONE, MESG_BELOW_32_BLKS, NO_BIG_BLKS, ENCRYPT_BIG_BLKS_NO_HXOR,
ENCRYPT_BIG_NBLKS, ENCRYPT_16_BLKS, ENCRYPT_N_GHASH_32_N_BLKS, GHASH_DONE;
const XMMRegister CTR_BLOCKx = xmm2;
const XMMRegister AAD_HASHx = xmm14;
const XMMRegister ZTMP0 = xmm0;
const XMMRegister ZTMP1 = xmm3; //**sensitive
const XMMRegister ZTMP2 = xmm4; //**sensitive(small data)
const XMMRegister ZTMP3 = xmm5; //**sensitive(small data)
const XMMRegister ZTMP4 = xmm6;
const XMMRegister ZTMP5 = xmm7;
const XMMRegister ZTMP6 = xmm10;
const XMMRegister ZTMP7 = xmm11;
const XMMRegister ZTMP8 = xmm12;
const XMMRegister ZTMP9 = xmm13;
const XMMRegister ZTMP10 = xmm15;
const XMMRegister ZTMP11 = xmm16;
const XMMRegister ZTMP12 = xmm17;
const XMMRegister ZTMP13 = xmm19;
const XMMRegister ZTMP14 = xmm20;
const XMMRegister ZTMP15 = xmm21;
const XMMRegister ZTMP16 = xmm30;
const XMMRegister ZTMP17 = xmm31;
const XMMRegister ZTMP18 = xmm1;
const XMMRegister ZTMP19 = xmm18;
const XMMRegister ZTMP20 = xmm8;
const XMMRegister ZTMP21 = xmm22;
const XMMRegister ZTMP22 = xmm23;
const XMMRegister ZTMP23 = xmm26;
const XMMRegister GH = xmm24;
const XMMRegister GL = xmm25;
const XMMRegister SHUF_MASK = xmm29;
const XMMRegister ADDBE_4x4 = xmm27;
const XMMRegister ADDBE_1234 = xmm28;
const XMMRegister ADD_1234 = xmm9;
const KRegister MASKREG = k1;
const Register pos = rax;
const Register rounds = r15;
const Register CTR_CHECK = r14;
const int stack_offset = 64;
const int ghashin_offset = 64;
const int aesout_offset = 64;
const int hashkey_offset = 0;
const int hashkey_gap = 16 * 32;
const int HashKey_32 = 0;
const int HashKey_16 = 16 * 16;
__ movl(pos, 0);
__ cmpl(len, 256);
__ jcc(Assembler::lessEqual, ENC_DEC_DONE);
  /* Structure of the Htbl is as follows:
   * Entries 0 - 31 hold the 32 HashKey powers and entries 32 - 63 hold the 32 HashKeyK values (derived from HashKey)
   * The remaining 32 16-byte slots (eight ZMM-sized entries) store the shuffled CTR values post AES rounds for the GHASH passes
   * ----------------------------------------------------------------------------------------
     Hashkey32 -> 16 * 0
     Hashkey31 -> 16 * 1
     Hashkey30 -> 16 * 2
     ........
     Hashkey1 -> 16 * 31
     ---------------------
     HashkeyK32 -> 16 * 32
     HashkeyK31 -> 16 * 33
     .........
     HashkeyK1 -> 16 * 63
     ---------------------
     1st set of AES entries
     B00_03 -> 16 * 64
     B04_07 -> 16 * 68
     B08_11 -> 16 * 72
     B12_15 -> 16 * 76
     ---------------------
     2nd set of AES entries
     B00_03 -> 16 * 80
     B04_07 -> 16 * 84
     B08_11 -> 16 * 88
     B12_15 -> 16 * 92
     ---------------------*/
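  // Worked offset example: HashKey_16 = 16 * 16 points at Hashkey16 above, and
  // the matching HashkeyK16 sits hashkey_gap = 16 * 32 bytes further on, at 16 * 48.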
generateHtbl_32_blocks_avx512(subkeyHtbl, avx512_subkeyHtbl);
//Move initial counter value and STATE value into variables
__ movdqu(CTR_BLOCKx, Address(counter, 0));
__ movdqu(AAD_HASHx, Address(state, 0));
//Load lswap mask for ghash
__ movdqu(xmm24, ExternalAddress(ghash_long_swap_mask_addr()), rbx /*rscratch*/);
//Shuffle input state using lswap mask
__ vpshufb(AAD_HASHx, AAD_HASHx, xmm24, Assembler::AVX_128bit);
// Compute #rounds for AES based on the length of the key array
__ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
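  // The key array holds the expanded AES key schedule, 4 * (rounds + 1) ints:
  // 44 ints for AES-128 (10 rounds), 52 for AES-192 (12) and 60 for AES-256 (14).
  // The cmpl(rounds, 52) / cmpl(rounds, 60) checks here and in the helpers use
  // that length to select the extra AES-192/256 rounds.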
__ evmovdquq(ADDBE_4x4, ExternalAddress(counter_mask_addbe_4444_addr()), Assembler::AVX_512bit, rbx /*rscratch*/);
__ evmovdquq(ADDBE_1234, ExternalAddress(counter_mask_addbe_1234_addr()), Assembler::AVX_512bit, rbx /*rscratch*/);
__ evmovdquq(SHUF_MASK, ExternalAddress(counter_shuffle_mask_addr()), Assembler::AVX_512bit, rbx /*rscratch*/);
__ evmovdquq(ADD_1234, ExternalAddress(counter_mask_add_1234_addr()), Assembler::AVX_512bit, rbx /*rscratch*/);
//Shuffle counter, subtract 1 from the pre-incremented counter value and broadcast counter value to 512 bit register
__ vpshufb(CTR_BLOCKx, CTR_BLOCKx, SHUF_MASK, Assembler::AVX_128bit);
__ vpsubd(CTR_BLOCKx, CTR_BLOCKx, ADD_1234, Assembler::AVX_128bit);
__ evshufi64x2(CTR_BLOCKx, CTR_BLOCKx, CTR_BLOCKx, 0, Assembler::AVX_512bit);
__ movdl(CTR_CHECK, CTR_BLOCKx);
__ andl(CTR_CHECK, 255);
// Reshuffle counter
__ vpshufb(CTR_BLOCKx, CTR_BLOCKx, SHUF_MASK, Assembler::AVX_512bit);
initial_blocks_16_avx512(in, out, ct, pos, key, avx512_subkeyHtbl, CTR_CHECK, rounds, CTR_BLOCKx, AAD_HASHx, ADDBE_4x4, ADDBE_1234, ADD_1234, SHUF_MASK, stack_offset);
__ addl(pos, 16 * 16);
__ cmpl(len, 32 * 16);
__ jcc(Assembler::below, MESG_BELOW_32_BLKS);
initial_blocks_16_avx512(in, out, ct, pos, key, avx512_subkeyHtbl, CTR_CHECK, rounds, CTR_BLOCKx, AAD_HASHx, ADDBE_4x4, ADDBE_1234, ADD_1234, SHUF_MASK, stack_offset + 16);
__ addl(pos, 16 * 16);
__ subl(len, 32 * 16);
__ cmpl(len, 32 * 16);
__ jcc(Assembler::below, NO_BIG_BLKS);
__ bind(ENCRYPT_BIG_BLKS_NO_HXOR);
__ cmpl(len, 2 * 32 * 16);
__ jcc(Assembler::below, ENCRYPT_BIG_NBLKS);
ghash16_encrypt_parallel16_avx512(in, out, ct, pos, avx512_subkeyHtbl, CTR_CHECK, rounds, key, CTR_BLOCKx, AAD_HASHx, ADDBE_4x4, ADDBE_1234, ADD_1234, SHUF_MASK,
true, true, false, false, false, ghashin_offset, aesout_offset, HashKey_32);
__ addl(pos, 16 * 16);
ghash16_encrypt_parallel16_avx512(in, out, ct, pos, avx512_subkeyHtbl, CTR_CHECK, rounds, key, CTR_BLOCKx, AAD_HASHx, ADDBE_4x4, ADDBE_1234, ADD_1234, SHUF_MASK,
true, false, true, false, true, ghashin_offset + 16, aesout_offset + 16, HashKey_16);
__ evmovdquq(AAD_HASHx, ZTMP4, Assembler::AVX_512bit);
__ addl(pos, 16 * 16);
__ subl(len, 32 * 16);
__ jmp(ENCRYPT_BIG_BLKS_NO_HXOR);
__ bind(ENCRYPT_BIG_NBLKS);
ghash16_encrypt_parallel16_avx512(in, out, ct, pos, avx512_subkeyHtbl, CTR_CHECK, rounds, key, CTR_BLOCKx, AAD_HASHx, ADDBE_4x4, ADDBE_1234, ADD_1234, SHUF_MASK,
false, true, false, false, false, ghashin_offset, aesout_offset, HashKey_32);
__ addl(pos, 16 * 16);
ghash16_encrypt_parallel16_avx512(in, out, ct, pos, avx512_subkeyHtbl, CTR_CHECK, rounds, key, CTR_BLOCKx, AAD_HASHx, ADDBE_4x4, ADDBE_1234, ADD_1234, SHUF_MASK,
false, false, true, true, true, ghashin_offset + 16, aesout_offset + 16, HashKey_16);
__ movdqu(AAD_HASHx, ZTMP4);
__ addl(pos, 16 * 16);
__ subl(len, 32 * 16);
__ bind(NO_BIG_BLKS);
__ cmpl(len, 16 * 16);
__ jcc(Assembler::aboveEqual, ENCRYPT_16_BLKS);
__ bind(ENCRYPT_N_GHASH_32_N_BLKS);
ghash16_avx512(true, false, false, false, true, in, pos, avx512_subkeyHtbl, AAD_HASHx, SHUF_MASK, stack_offset, 0, 0, HashKey_32);
gcm_enc_dec_last_avx512(len, in, pos, AAD_HASHx, SHUF_MASK, avx512_subkeyHtbl, ghashin_offset + 16, HashKey_16, false, true);
__ jmp(GHASH_DONE);
__ bind(ENCRYPT_16_BLKS);
ghash16_encrypt_parallel16_avx512(in, out, ct, pos, avx512_subkeyHtbl, CTR_CHECK, rounds, key, CTR_BLOCKx, AAD_HASHx, ADDBE_4x4, ADDBE_1234, ADD_1234, SHUF_MASK,
false, true, false, false, false, ghashin_offset, aesout_offset, HashKey_32);
ghash16_avx512(false, true, false, false, true, in, pos, avx512_subkeyHtbl, AAD_HASHx, SHUF_MASK, stack_offset, 16 * 16, 0, HashKey_16);
__ addl(pos, 16 * 16);
__ bind(MESG_BELOW_32_BLKS);
__ subl(len, 16 * 16);
gcm_enc_dec_last_avx512(len, in, pos, AAD_HASHx, SHUF_MASK, avx512_subkeyHtbl, ghashin_offset, HashKey_16, true, true);
__ bind(GHASH_DONE);
//Pre-increment counter for next operation, make sure that counter value is incremented on the LSB
__ vpshufb(CTR_BLOCKx, CTR_BLOCKx, SHUF_MASK, Assembler::AVX_128bit);
__ vpaddd(CTR_BLOCKx, CTR_BLOCKx, ADD_1234, Assembler::AVX_128bit);
__ vpshufb(CTR_BLOCKx, CTR_BLOCKx, SHUF_MASK, Assembler::AVX_128bit);
__ movdqu(Address(counter, 0), CTR_BLOCKx);
//Load ghash lswap mask
__ movdqu(xmm24, ExternalAddress(ghash_long_swap_mask_addr()), rbx /*rscratch*/);
  //Shuffle ghash using the lswap mask and store it
__ vpshufb(AAD_HASHx, AAD_HASHx, xmm24, Assembler::AVX_128bit);
__ movdqu(Address(state, 0), AAD_HASHx);
//Zero out sensitive data
__ evpxorq(ZTMP21, ZTMP21, ZTMP21, Assembler::AVX_512bit);
__ evpxorq(ZTMP0, ZTMP0, ZTMP0, Assembler::AVX_512bit);
__ evpxorq(ZTMP1, ZTMP1, ZTMP1, Assembler::AVX_512bit);
__ evpxorq(ZTMP2, ZTMP2, ZTMP2, Assembler::AVX_512bit);
__ evpxorq(ZTMP3, ZTMP3, ZTMP3, Assembler::AVX_512bit);
__ bind(ENC_DEC_DONE);
}
//Implements data * hashkey mod (128, 127, 126, 121, 0)
//Inputs:
//GH and HK - 128 bits each
//Output:
//GH = GH * Hashkey mod poly
//Temp registers: xmm1, xmm2, xmm3, r15
void StubGenerator::gfmul_avx2(XMMRegister GH, XMMRegister HK) {
const XMMRegister T1 = xmm1;
const XMMRegister T2 = xmm2;
const XMMRegister T3 = xmm3;
  __ vpclmulqdq(T1, GH, HK, 0x11); // T1 = a1*b1
  __ vpclmulqdq(T2, GH, HK, 0x00); // T2 = a0*b0
  __ vpclmulqdq(T3, GH, HK, 0x01); // T3 = a1*b0
  __ vpclmulqdq(GH, GH, HK, 0x10); // GH = a0*b1
  __ vpxor(GH, GH, T3, Assembler::AVX_128bit);
  __ vpsrldq(T3, GH, 8, Assembler::AVX_128bit); // shift-R GH 2 DWs
  __ vpslldq(GH, GH, 8, Assembler::AVX_128bit); // shift-L GH 2 DWs
  __ vpxor(T1, T1, T3, Assembler::AVX_128bit);
  __ vpxor(GH, GH, T2, Assembler::AVX_128bit);
  //first phase of the reduction
  __ movdqu(T3, ExternalAddress(ghash_polynomial_reduction_addr()), r15 /*rscratch*/);
  __ vpclmulqdq(T2, T3, GH, 0x01);
  __ vpslldq(T2, T2, 8, Assembler::AVX_128bit); // shift-L T2 2 DWs
  __ vpxor(GH, GH, T2, Assembler::AVX_128bit); // first phase of the reduction complete
  //second phase of the reduction
  __ vpclmulqdq(T2, T3, GH, 0x00);
  __ vpsrldq(T2, T2, 4, Assembler::AVX_128bit); // shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
  __ vpclmulqdq(GH, T3, GH, 0x10);
  __ vpslldq(GH, GH, 4, Assembler::AVX_128bit); // shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
  __ vpxor(GH, GH, T2, Assembler::AVX_128bit); // second phase of the reduction complete
  __ vpxor(GH, GH, T1, Assembler::AVX_128bit); // the result is in GH
}
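// For reference, the schoolbook split used above: with GH = (a1:a0) and
// HK = (b1:b0) as 64-bit halves,
//   GH * HK = a1*b1 << 128  ^  (a1*b0 ^ a0*b1) << 64  ^  a0*b0
// The 8-byte vpslldq/vpsrldq pair splits the middle term between the low
// 128-bit half (merged with a0*b0 in GH) and the high half (merged into T1)
// before the two-phase reduction.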
//Generate 8 constants from the given subkeyH.
//Input:
//htbl - table containing the initial subkeyH
//Output:
//htbl - containing 8 H constants
//Temp registers: xmm0, xmm1, xmm2, xmm3, xmm6, xmm11, xmm12, r15, rbx
void StubGenerator::generateHtbl_8_block_avx2(Register htbl) {
const XMMRegister HK = xmm6;
__ movdqu(HK, Address(htbl, 0));
__ movdqu(xmm1, ExternalAddress(ghash_long_swap_mask_addr()), rbx /*rscratch*/);
__ vpshufb(HK, HK, xmm1, Assembler::AVX_128bit);
__ movdqu(xmm11, ExternalAddress(ghash_polynomial_addr()), rbx /*rscratch*/);
__ movdqu(xmm12, ExternalAddress(ghash_polynomial_two_one_addr()), rbx /*rscratch*/);
  // Compute H * 2 (H << 1 mod poly) from the input subkeyH
__ vpsrlq(xmm1, xmm6, 63, Assembler::AVX_128bit);
__ vpsllq(xmm6, xmm6, 1, Assembler::AVX_128bit);
__ vpslldq(xmm2, xmm1, 8, Assembler::AVX_128bit);
__ vpsrldq(xmm1, xmm1, 8, Assembler::AVX_128bit);
__ vpor(xmm6, xmm6, xmm2, Assembler::AVX_128bit);
__ vpshufd(xmm2, xmm1, 0x24, Assembler::AVX_128bit);
__ vpcmpeqd(xmm2, xmm2, xmm12, Assembler::AVX_128bit);
__ vpand(xmm2, xmm2, xmm11, Assembler::AVX_128bit);
__ vpxor(xmm6, xmm6, xmm2, Assembler::AVX_128bit);
__ movdqu(Address(htbl, 1 * 16), xmm6); // H * 2
__ movdqu(xmm0, xmm6);
for (int i = 2; i < 9; i++) {
gfmul_avx2(xmm6, xmm0);
__ movdqu(Address(htbl, i * 16), xmm6);
}
}
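// After this runs, htbl slot i (i = 1..8) holds the i-th power of the
// pre-shifted hash key (H * 2 mod poly), while slot 0 still holds the raw
// subkeyH passed in from Java; ghash_last_8_avx2 pairs ciphertext block j
// with the power read from slot (9 - j).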
#define aesenc_step_avx2(t_key)\
__ aesenc(xmm1, t_key);\
__ aesenc(xmm2, t_key);\
__ aesenc(xmm3, t_key);\
__ aesenc(xmm4, t_key);\
__ aesenc(xmm5, t_key);\
__ aesenc(xmm6, t_key);\
__ aesenc(xmm7, t_key);\
__ aesenc(xmm8, t_key);\
#define ghash_step_avx2(ghdata, hkey) \
__ vpclmulqdq(xmm11, ghdata, hkey, 0x11);\
__ vpxor(xmm12, xmm12, xmm11, Assembler::AVX_128bit);\
__ vpclmulqdq(xmm11, ghdata, hkey, 0x00);\
__ vpxor(xmm15, xmm15, xmm11, Assembler::AVX_128bit);\
__ vpclmulqdq(xmm11, ghdata, hkey, 0x01);\
__ vpxor(xmm14, xmm14, xmm11, Assembler::AVX_128bit);\
__ vpclmulqdq(xmm11, ghdata, hkey, 0x10);\
__ vpxor(xmm14, xmm14, xmm11, Assembler::AVX_128bit);
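// ghash_step_avx2 accumulates one block's four partial products without
// reducing: xmm12 collects the hi*hi terms, xmm15 the lo*lo terms and xmm14
// both cross terms. Summing all eight blocks this way defers the polynomial
// reduction to a single pass after the loop.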
//Encrypts and hashes 8 blocks in an interleaved fashion.
//Inputs:
//key - key for aes operations
//subkeyHtbl - table containing H constants
//ctr_blockx - counter for aes operations
//in - input buffer
//out - output buffer
//ct - ciphertext buffer
//pos - holds the length processed in this method
//in_order - boolean that indicates if incrementing counter without shuffling is needed
//rounds - number of aes rounds calculated based on key length
//xmm1-xmm8 - holds encrypted counter values
//Outputs:
//xmm1-xmm8 - updated encrypted counter values
//ctr_blockx - updated counter value
//out - updated output buffer
//Temp registers: xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, rbx
void StubGenerator::ghash8_encrypt8_parallel_avx2(Register key, Register subkeyHtbl, XMMRegister ctr_blockx, Register in,
Register out, Register ct, Register pos, bool in_order, Register rounds,
XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, XMMRegister xmm8) {
const XMMRegister t1 = xmm0;
const XMMRegister t2 = xmm10;
const XMMRegister t3 = xmm11;
const XMMRegister t4 = xmm12;
const XMMRegister t5 = xmm13;
const XMMRegister t6 = xmm14;
const XMMRegister t7 = xmm15;
Label skip_reload, last_aes_rnd, aes_192, aes_256;
__ movdqu(t2, xmm1);
for (int i = 0; i <= 6; i++) {
__ movdqu(Address(rsp, 16 * i), as_XMMRegister(i + 2));
}
if (in_order) {
__ vpaddd(xmm1, ctr_blockx, ExternalAddress(counter_mask_linc1_addr()), Assembler::AVX_128bit, rbx /*rscratch*/); //Increment counter by 1
__ movdqu(t5, ExternalAddress(counter_mask_linc2_addr()), rbx /*rscratch*/);
__ vpaddd(xmm2, ctr_blockx, t5, Assembler::AVX_128bit);
for (int rnum = 1; rnum <= 6; rnum++) {
__ vpaddd(as_XMMRegister(rnum + 2), as_XMMRegister(rnum), t5, Assembler::AVX_128bit);
}
__ movdqu(ctr_blockx, xmm8);
__ movdqu(t5, ExternalAddress(counter_shuffle_mask_addr()), rbx /*rscratch*/);
for (int rnum = 1; rnum <= 8; rnum++) {
__ vpshufb(as_XMMRegister(rnum), as_XMMRegister(rnum), t5, Assembler::AVX_128bit); //perform a 16Byte swap
}
} else {
__ vpaddd(xmm1, ctr_blockx, ExternalAddress(counter_mask_linc1f_addr()), Assembler::AVX_128bit, rbx /*rscratch*/); //Increment counter by 1
__ vmovdqu(t5, ExternalAddress(counter_mask_linc2f_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
__ vpaddd(xmm2, ctr_blockx, t5, Assembler::AVX_128bit);
for (int rnum = 1; rnum <= 6; rnum++) {
__ vpaddd(as_XMMRegister(rnum + 2), as_XMMRegister(rnum), t5, Assembler::AVX_128bit);
}
__ movdqu(ctr_blockx, xmm8);
}
load_key(t1, key, 16 * 0, rbx /*rscratch*/);
for (int rnum = 1; rnum <= 8; rnum++) {
__ vpxor(as_XMMRegister(rnum), as_XMMRegister(rnum), t1, Assembler::AVX_128bit);
}
load_key(t1, key, 16 * 1, rbx /*rscratch*/);
aesenc_step_avx2(t1);
load_key(t1, key, 16 * 2, rbx /*rscratch*/);
aesenc_step_avx2(t1);
  __ movdqu(t5, Address(subkeyHtbl, 8 * 16));
__ vpclmulqdq(t4, t2, t5, 0x11); //t4 = a1*b1
__ vpclmulqdq(t7, t2, t5, 0x00); //t7 = a0*b0
__ vpclmulqdq(t6, t2, t5, 0x01); //t6 = a1*b0
__ vpclmulqdq(t5, t2, t5, 0x10); //t5 = a0*b1
__ vpxor(t6, t6, t5, Assembler::AVX_128bit);
for (int i = 3, j = 0; i <= 8; i++, j++) {
load_key(t1, key, 16 * i, rbx /*rscratch*/);
aesenc_step_avx2(t1);
__ movdqu(t1, Address(rsp, 16 * j));
    __ movdqu(t5, Address(subkeyHtbl, (7 - j) * 16));
ghash_step_avx2(t1, t5);
}
load_key(t1, key, 16 * 9, rbx /*rscratch*/);
aesenc_step_avx2(t1);
__ movdqu(t1, Address(rsp, 16 * 6));
  __ movdqu(t5, Address(subkeyHtbl, 1 * 16));
__ vpclmulqdq(t3, t1, t5, 0x00);
__ vpxor(t7, t7, t3, Assembler::AVX_128bit);
__ vpclmulqdq(t3, t1, t5, 0x01);
__ vpxor(t6, t6, t3, Assembler::AVX_128bit);
__ vpclmulqdq(t3, t1, t5, 0x10);
__ vpxor(t6, t6, t3, Assembler::AVX_128bit);
__ vpclmulqdq(t3, t1, t5, 0x11);
__ vpxor(t1, t4, t3, Assembler::AVX_128bit);
  __ vpslldq(t3, t6, 8, Assembler::AVX_128bit); //shift-L t6 2 DWs into t3
  __ vpsrldq(t6, t6, 8, Assembler::AVX_128bit); //shift-R t6 2 DWs
__ vpxor(t7, t7, t3, Assembler::AVX_128bit);
__ vpxor(t1, t1, t6, Assembler::AVX_128bit); // accumulate the results in t1:t7
load_key(t5, key, 16 * 10, rbx /*rscratch*/);
__ cmpl(rounds, 52);
__ jcc(Assembler::less, last_aes_rnd);
__ bind(aes_192);
aesenc_step_avx2(t5);
load_key(t5, key, 16 * 11, rbx /*rscratch*/);
aesenc_step_avx2(t5);
load_key(t5, key, 16 * 12, rbx /*rscratch*/);
__ cmpl(rounds, 60);
__ jcc(Assembler::less, last_aes_rnd);
__ bind(aes_256);
aesenc_step_avx2(t5);
load_key(t5, key, 16 * 13, rbx /*rscratch*/);
aesenc_step_avx2(t5);
load_key(t5, key, 16 * 14, rbx /*rscratch*/);
__ bind(last_aes_rnd);
for (int rnum = 1; rnum <= 8; rnum++) {
__ aesenclast(as_XMMRegister(rnum), t5);
}
for (int i = 0; i <= 7; i++) {
__ movdqu(t2, Address(in, pos, Address::times_1, 16 * i));
__ vpxor(as_XMMRegister(i + 1), as_XMMRegister(i + 1), t2, Assembler::AVX_128bit);
}
//first phase of the reduction
__ vmovdqu(t3, ExternalAddress(ghash_polynomial_reduction_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
__ vpclmulqdq(t2, t3, t7, 0x01);
  __ vpslldq(t2, t2, 8, Assembler::AVX_128bit); //shift-L t2 2 DWs
__ vpxor(t7, t7, t2, Assembler::AVX_128bit); //first phase of the reduction complete
//Write to the Ciphertext buffer
for (int i = 0; i <= 7; i++) {
__ movdqu(Address(out, pos, Address::times_1, 16 * i), as_XMMRegister(i + 1));
}
__ cmpptr(ct, out);
__ jcc(Assembler::equal, skip_reload);
for (int i = 0; i <= 7; i++) {
__ movdqu(as_XMMRegister(i + 1), Address(in, pos, Address::times_1, 16 * i));
}
__ bind(skip_reload);
//second phase of the reduction
__ vpclmulqdq(t2, t3, t7, 0x00);
__ vpsrldq(t2, t2, 4, Assembler::AVX_128bit); //shift-R t2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
__ vpclmulqdq(t4, t3, t7, 0x10);
__ vpslldq(t4, t4, 4, Assembler::AVX_128bit); //shift-L t4 1 DW (Shift-L 1-DW to obtain result with no shifts)
__ vpxor(t4, t4, t2, Assembler::AVX_128bit); //second phase of the reduction complete
__ vpxor(t1, t1, t4, Assembler::AVX_128bit); //the result is in t1
//perform a 16Byte swap
__ movdqu(t7, ExternalAddress(counter_shuffle_mask_addr()), rbx /*rscratch*/);
for (int rnum = 1; rnum <= 8; rnum++) {
__ vpshufb(as_XMMRegister(rnum), as_XMMRegister(rnum), t7, Assembler::AVX_128bit);
}
__ vpxor(xmm1, xmm1, t1, Assembler::AVX_128bit);
}
//GHASH the last 8 ciphertext blocks.
//Inputs:
//subkeyHtbl - table containing H constants
//xmm1-xmm8 - shuffled ciphertext blocks to be hashed
//Output:
//xmm14 - calculated aad hash
//Temp registers: xmm0, xmm10, xmm11, xmm12, xmm13, xmm15, rbx
void StubGenerator::ghash_last_8_avx2(Register subkeyHtbl) {
const XMMRegister t1 = xmm0;
const XMMRegister t2 = xmm10;
const XMMRegister t3 = xmm11;
const XMMRegister t4 = xmm12;
const XMMRegister t5 = xmm13;
const XMMRegister t6 = xmm14;
const XMMRegister t7 = xmm15;
//Karatsuba Method
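  // Karatsuba saves one multiply per block: with qword halves a1:a0 and b1:b0,
  //   (a1 ^ a0) * (b1 ^ b0) = a1*b1 ^ a0*b0 ^ (a1*b0 ^ a0*b1)
  // so the middle term costs one vpclmulqdq plus the two products that are
  // needed anyway. vpshufd with imm 78 (0b01001110) swaps the two qword halves
  // so the vpxor that follows forms (a1 ^ a0) in both halves.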
__ movdqu(t5, Address(subkeyHtbl, 8 * 16));
__ vpshufd(t2, xmm1, 78, Assembler::AVX_128bit);
__ vpshufd(t3, t5, 78, Assembler::AVX_128bit);
__ vpxor(t2, t2, xmm1, Assembler::AVX_128bit);
__ vpxor(t3, t3, t5, Assembler::AVX_128bit);
__ vpclmulqdq(t6, xmm1, t5, 0x11);
__ vpclmulqdq(t7, xmm1, t5, 0x00);
__ vpclmulqdq(xmm1, t2, t3, 0x00);
for (int i = 7, rnum = 2; rnum <= 8; i--, rnum++) {
__ movdqu(t5, Address(subkeyHtbl, i * 16));
__ vpshufd(t2, as_XMMRegister(rnum), 78, Assembler::AVX_128bit);
__ vpshufd(t3, t5, 78, Assembler::AVX_128bit);
__ vpxor(t2, t2, as_XMMRegister(rnum), Assembler::AVX_128bit);
__ vpxor(t3, t3, t5, Assembler::AVX_128bit);
__ vpclmulqdq(t4, as_XMMRegister(rnum), t5, 0x11);
__ vpxor(t6, t6, t4, Assembler::AVX_128bit);
__ vpclmulqdq(t4, as_XMMRegister(rnum), t5, 0x00);
__ vpxor(t7, t7, t4, Assembler::AVX_128bit);
__ vpclmulqdq(t2, t2, t3, 0x00);
__ vpxor(xmm1, xmm1, t2, Assembler::AVX_128bit);
}
__ vpxor(xmm1, xmm1, t6, Assembler::AVX_128bit);
__ vpxor(t2, xmm1, t7, Assembler::AVX_128bit);
__ vpslldq(t4, t2, 8, Assembler::AVX_128bit);
__ vpsrldq(t2, t2, 8, Assembler::AVX_128bit);
__ vpxor(t7, t7, t4, Assembler::AVX_128bit);
__ vpxor(t6, t6, t2, Assembler::AVX_128bit); //<t6:t7> holds the result of the accumulated carry-less multiplications
//first phase of the reduction
__ movdqu(t3, ExternalAddress(ghash_polynomial_reduction_addr()), rbx /*rscratch*/);
__ vpclmulqdq(t2, t3, t7, 0x01);
__ vpslldq(t2, t2, 8, Assembler::AVX_128bit); // shift-L t2 2 DWs
__ vpxor(t7, t7, t2, Assembler::AVX_128bit);//first phase of the reduction complete
//second phase of the reduction
__ vpclmulqdq(t2, t3, t7, 0x00);
__ vpsrldq(t2, t2, 4, Assembler::AVX_128bit); //shift-R t2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
__ vpclmulqdq(t4, t3, t7, 0x10);
__ vpslldq(t4, t4, 4, Assembler::AVX_128bit); //shift-L t4 1 DW (Shift-L 1-DW to obtain result with no shifts)
__ vpxor(t4, t4, t2, Assembler::AVX_128bit); //second phase of the reduction complete
__ vpxor(t6, t6, t4, Assembler::AVX_128bit); //the result is in t6
}
//Encrypt the initial 8 blocks
//Inputs:
//ctr - counter for aes operations
//rounds - number of aes rounds calculated based on key length
//key - key for aes operations
//len - input length to be processed
//in - input buffer
//out - output buffer
//ct - ciphertext buffer
//aad_hashx - input aad hash
//pos - holds the length processed in this method
//Outputs:
//xmm1-xmm8 - holds updated encrypted counter values
//ctr - updated counter value
//pos - updated position
//len - updated length
//out - updated output buffer
//Temp registers: xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
void StubGenerator::initial_blocks_avx2(XMMRegister ctr, Register rounds, Register key, Register len, Register in,
Register out, Register ct, XMMRegister aad_hashx, Register pos) {
const XMMRegister t1 = xmm12;
const XMMRegister t2 = xmm13;
const XMMRegister t3 = xmm14;
const XMMRegister t4 = xmm15;
const XMMRegister t5 = xmm11;
const XMMRegister t6 = xmm10;
const XMMRegister t_key = xmm0;
Label skip_reload, last_aes_rnd, aes_192, aes_256;
//Move AAD_HASH to temp reg t3
__ movdqu(t3, aad_hashx);
//Prepare 8 counter blocks and perform rounds of AES cipher on
//them, load plain/cipher text and store cipher/plain text.
__ movdqu(xmm1, ctr);
__ movdqu(t5, ExternalAddress(counter_mask_linc1_addr()), rbx /*rscratch*/);
  __ movdqu(t6, ExternalAddress(counter_mask_linc2_addr()), rbx /*rscratch*/);
__ vpaddd(xmm2, xmm1, t5, Assembler::AVX_128bit);
for (int rnum = 1; rnum <= 6; rnum++) {
__ vpaddd(as_XMMRegister(rnum + 2), as_XMMRegister(rnum), t6, Assembler::AVX_128bit);
}
__ movdqu(ctr, xmm8);
__ movdqu(t5, ExternalAddress(counter_shuffle_mask_addr()), rbx /*rscratch*/);
for (int rnum = 1; rnum <= 8; rnum++) {
__ vpshufb(as_XMMRegister(rnum), as_XMMRegister(rnum), t5, Assembler::AVX_128bit); //perform a 16Byte swap
}
load_key(t_key, key, 16 * 0, rbx /*rscratch*/);
for (int rnum = 1; rnum <= 8; rnum++) {
__ vpxor(as_XMMRegister(rnum), as_XMMRegister(rnum), t_key, Assembler::AVX_128bit);
}
for (int i = 1; i <= 9; i++) {
load_key(t_key, key, 16 * i, rbx /*rscratch*/);
aesenc_step_avx2(t_key);
}
load_key(t_key, key, 16 * 10, rbx /*rscratch*/);
__ cmpl(rounds, 52);
__ jcc(Assembler::less, last_aes_rnd);
__ bind(aes_192);
aesenc_step_avx2(t_key);
load_key(t_key, key, 16 * 11, rbx /*rscratch*/);
aesenc_step_avx2(t_key);
load_key(t_key, key, 16 * 12, rbx /*rscratch*/);
__ cmpl(rounds, 60);
__ jcc(Assembler::less, last_aes_rnd);
__ bind(aes_256);
aesenc_step_avx2(t_key);
load_key(t_key, key, 16 * 13, rbx /*rscratch*/);
aesenc_step_avx2(t_key);
load_key(t_key, key, 16 * 14, rbx /*rscratch*/);
__ bind(last_aes_rnd);
for (int rnum = 1; rnum <= 8; rnum++) {
__ aesenclast(as_XMMRegister(rnum), t_key);
}
//XOR and store data
for (int i = 0; i <= 7; i++) {
__ movdqu(t1, Address(in, pos, Address::times_1, 16 * i));
__ vpxor(as_XMMRegister(i + 1), as_XMMRegister(i + 1), t1, Assembler::AVX_128bit);
__ movdqu(Address(out, pos, Address::times_1, 16 * i), as_XMMRegister(i + 1));
}
__ cmpptr(ct, out);
__ jcc(Assembler::equal, skip_reload);
for (int i = 0; i <= 7; i++) {
__ movdqu(as_XMMRegister(i + 1), Address(in, pos, Address::times_1, 16 * i));
}
__ bind(skip_reload);
//Update len with the number of blocks processed
__ subl(len, 128);
__ addl(pos, 128);
__ movdqu(t4, ExternalAddress(counter_shuffle_mask_addr()), rbx /*rscratch*/);
for (int rnum = 1; rnum <= 8; rnum++) {
__ vpshufb(as_XMMRegister(rnum), as_XMMRegister(rnum), t4, Assembler::AVX_128bit);
}
// Combine GHASHed value with the corresponding ciphertext
__ vpxor(xmm1, xmm1, t3, Assembler::AVX_128bit);
}
//AES-GCM interleaved implementation
//Inputs:
//in - input buffer
//len- message length to be processed
//ct - cipher text buffer
//out - output buffer
//key - key for aes operations
//state - address of aad hash for ghash computation
//subkeyHtbl- table consisting of H constants
//counter - address of counter for aes operations
//Output:
//(counter) - updated in memory counter value
//(state) - updated in memory aad hash
//rax - length processed
//(out) - output buffer updated
//len - updated length
//Temp registers: xmm0-xmm15, r10, r15, rbx
void StubGenerator::aesgcm_avx2(Register in, Register len, Register ct, Register out, Register key,
Register state, Register subkeyHtbl, Register counter) {
const Register pos = rax;
const Register rounds = r10;
const XMMRegister ctr_blockx = xmm9;
const XMMRegister aad_hashx = xmm8;
Label encrypt_done, encrypt_by_8_new, encrypt_by_8, exit;
  //This routine should be called only for message sizes of 128 bytes or more.
  //Macro flow:
  //process eight 16-byte blocks in initial_blocks_avx2.
  //process eight 16-byte blocks at a time until fewer than 128 bytes remain ('encrypt_by_8_new'), then hash the last 8 with 'ghash_last_8_avx2'
__ xorl(pos, pos);
__ cmpl(len, 128);
__ jcc(Assembler::less, exit);
//Generate 8 constants for htbl
generateHtbl_8_block_avx2(subkeyHtbl);
//Compute #rounds for AES based on the length of the key array
__ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
//Load and shuffle state and counter values
__ movdqu(ctr_blockx, Address(counter, 0));
__ movdqu(aad_hashx, Address(state, 0));
__ vpshufb(ctr_blockx, ctr_blockx, ExternalAddress(counter_shuffle_mask_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
__ vpshufb(aad_hashx, aad_hashx, ExternalAddress(ghash_long_swap_mask_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
initial_blocks_avx2(ctr_blockx, rounds, key, len, in, out, ct, aad_hashx, pos);
//We need at least 128 bytes to proceed further.
__ cmpl(len, 128);
__ jcc(Assembler::less, encrypt_done);
//in_order vs. out_order is an optimization to increment the counter without shuffling
//it back into little endian. r15d keeps track of when we need to increment in order so
//that the carry is handled correctly.
__ movdl(r15, ctr_blockx);
__ andl(r15, 255);
__ vpshufb(ctr_blockx, ctr_blockx, ExternalAddress(counter_shuffle_mask_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
__ bind(encrypt_by_8_new);
__ cmpl(r15, 255 - 8);
__ jcc(Assembler::greater, encrypt_by_8);
__ addb(r15, 8);
ghash8_encrypt8_parallel_avx2(key, subkeyHtbl, ctr_blockx, in, out, ct, pos, false, rounds,
xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8);
__ addl(pos, 128);
__ subl(len, 128);
__ cmpl(len, 128);
__ jcc(Assembler::greaterEqual, encrypt_by_8_new);
__ vpshufb(ctr_blockx, ctr_blockx, ExternalAddress(counter_shuffle_mask_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
__ jmp(encrypt_done);
__ bind(encrypt_by_8);
__ vpshufb(ctr_blockx, ctr_blockx, ExternalAddress(counter_shuffle_mask_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
__ addb(r15, 8);
ghash8_encrypt8_parallel_avx2(key, subkeyHtbl, ctr_blockx, in, out, ct, pos, true, rounds,
xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8);
__ vpshufb(ctr_blockx, ctr_blockx, ExternalAddress(counter_shuffle_mask_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
__ addl(pos, 128);
__ subl(len, 128);
__ cmpl(len, 128);
__ jcc(Assembler::greaterEqual, encrypt_by_8_new);
__ vpshufb(ctr_blockx, ctr_blockx, ExternalAddress(counter_shuffle_mask_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
__ bind(encrypt_done);
ghash_last_8_avx2(subkeyHtbl);
__ vpaddd(ctr_blockx, ctr_blockx, ExternalAddress(counter_mask_linc1_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
__ vpshufb(ctr_blockx, ctr_blockx, ExternalAddress(counter_shuffle_mask_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
__ movdqu(Address(counter, 0), ctr_blockx); //current_counter = xmm9
__ vpshufb(xmm14, xmm14, ExternalAddress(ghash_long_swap_mask_addr()), Assembler::AVX_128bit, rbx /*rscratch*/);
__ movdqu(Address(state, 0), xmm14); //aad hash = xmm14
  //Zero out registers that held AES round key material
__ vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
__ vpxor(xmm13, xmm13, xmm13, Assembler::AVX_128bit);
__ bind(exit);
}
#undef __