jdk/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

/*
 * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "asm/register.hpp"
#include "atomic_aarch64.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/atomic.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread (r7)          ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methdoOop, and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(rthread);

    // restore callee-save registers
    __ ldpd(v15, v14,  d15_save);
    __ ldpd(v13, v12,  d13_save);
    __ ldpd(v11, v10,  d11_save);
    __ ldpd(v9,  v8,   d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if mask is good.
      // verifies that ZAddressBadMask & r0 == 0
      __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
      __ andr(c_rarg2, r0, c_rarg3);
      __ cbnz(c_rarg2, error);
    }
#endif

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0);  // get klass
    __ cbz(r0, error);      // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    return start;
  }

  // Generate indices for iota vector.
  address generate_iota_indices(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
    // B
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    // H
    __ emit_data64(0x0003000200010000, relocInfo::none);
    __ emit_data64(0x0007000600050004, relocInfo::none);
    // S
    __ emit_data64(0x0000000100000000, relocInfo::none);
    __ emit_data64(0x0000000300000002, relocInfo::none);
    // D
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000001, relocInfo::none);
    // S - FP
    __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
    __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
    // D - FP
    __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
    __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
    return start;
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("genrate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
       use_stride = prefetch > 256;
       prefetch = -prefetch;
       if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset We adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copyng we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
         use_stride = prefetch > 256;
         prefetch = -prefetch;
         if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
       // allowing for the offset of -8 the store instructions place
       // registers into the target 64 bit block at the following
       // offsets
       //
       // t0 at offset 0
       // t1 at offset 8,  t2 at offset 16
       // t3 at offset 24, t4 at offset 32
       // t5 at offset 40, t6 at offset 48
       // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
       // d was not offset when we started so the registers are
       // written into the 64 bit block preceding d with the following
       // offsets
       //
       // t1 at offset -8
       // t3 at offset -24, t0 at offset -16
       // t5 at offset -48, t2 at offset -32
       // t7 at offset -56, t4 at offset -48
       //                   t6 at offset -64
       //
       // note that this matches the offsets previously noted for the
       // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
       // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
       // but note that the offsets and registers still follow the
       // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
       // this is the same as above but copying only 2 longs hence
       // there is no intervening stp between the str instructions
       // but note that the offset and register patterns are still
       // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

       // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d are follow the last words written

       if (direction == copy_forwards) {
         __ add(s, s, 16);
         __ add(d, d, 8);
       }

      }

      __ ret(lr);
      }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    unsigned int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r16;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 byes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue has more chances to happen when granularity of data is
      // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The most performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        __ ldp(t0, t1, Address(send, -16));

        __ stpq(v0, v1, Address(d, 0));
        __ stpq(v2, v3, Address(d, 32));
        __ stp(t0, t1, Address(dend, -16));
        __ b(finish);

        __ bind(copy96);
      }
      __ ldpq(v4, v5, Address(send, -32));

      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(d, 32));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does means in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    RegSet clobbered
      = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
      __ mov(*it, rscratch1);
    }
#endif

  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (int size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, 1);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
                                  const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, rscratch1, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, rscratch1, -size);
    }
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
}

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);

  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                         const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                          const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1, rscratch2.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elementscount
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
    const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elementscount
    const Register start_to    = r20;       // destination array start address
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r19_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      __ b(L);                  // conjoint check not yet implemented
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    if (entry != NULL) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

     // Empty array:  Nothing to do.
    __ cbz(count, L_done);
    __ push(RegSet::of(r19, r20, r21, r22), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(start_to, Address(ckval, sco_offset));
      __ cmpw(ckoff, start_to);
      __ br(Assembler::EQ, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
    bool is_oop = true;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);

    // save the original count
    __ mov(count_save, count);

    // Copy from low to high addresses
    __ mov(start_to, to);              // Save destination array start address
    __ b(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (; count != 0; count--) {
    //     copied_oop = load_heap_oop(from++);
    //     ... generate_type_check ...;
    //     store_heap_oop(to++, copied_oop);
    //   }
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, noreg, AS_RAW);  // store the oop
    __ sub(count, count, 1);
    __ cbz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
    __ cbz(copied_oop, L_store_element);

    __ load_klass(r19_klass, copied_oop);// query the object klass
    generate_type_check(r19_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_orig = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ subs(count, count_save, count);     // K = partially copied oop count
    __ eon(count, count, zr);                   // report (-1^K) to caller
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r19, r20, r21, r22), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oo (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    //  if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  //  Generate 'unsafe' array copy stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t argument instead of an element count.
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
  //
  address generate_unsafe_copy(const char *name,
                               address byte_copy_entry,
                               address short_copy_entry,
                               address int_copy_entry,
                               address long_copy_entry) {
    Label L_long_aligned, L_int_aligned, L_short_aligned;
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

    __ orr(rscratch1, s, d);
    __ orr(rscratch1, rscratch1, count);

    __ andr(rscratch1, rscratch1, BytesPerLong-1);
    __ cbz(rscratch1, L_long_aligned);
    __ andr(rscratch1, rscratch1, BytesPerInt-1);
    __ cbz(rscratch1, L_int_aligned);
    __ tbz(rscratch1, 0, L_short_aligned);
    __ b(RuntimeAddress(byte_copy_entry));

    __ BIND(L_short_aligned);
    __ lsr(count, count, LogBytesPerShort);  // size => short_count
    __ b(RuntimeAddress(short_copy_entry));
    __ BIND(L_int_aligned);
    __ lsr(count, count, LogBytesPerInt);    // size => int_count
    __ b(RuntimeAddress(int_copy_entry));
    __ BIND(L_long_aligned);
    __ lsr(count, count, LogBytesPerLong);   // size => long_count
    __ b(RuntimeAddress(long_copy_entry));

    return start;
  }

  //
  //  Generate generic array copy stubs
  //
  //  Input:
  //    c_rarg0    -  src oop
  //    c_rarg1    -  src_pos (32-bits)
  //    c_rarg2    -  dst oop
  //    c_rarg3    -  dst_pos (32-bits)
  //    c_rarg4    -  element count (32-bits)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_generic_copy(const char *name,
                                address byte_copy_entry, address short_copy_entry,
                                address int_copy_entry, address oop_copy_entry,
                                address long_copy_entry, address checkcast_copy_entry) {

    Label L_failed, L_objArray;
    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

    // Input registers
    const Register src        = c_rarg0;  // source array oop
    const Register src_pos    = c_rarg1;  // source position
    const Register dst        = c_rarg2;  // destination array oop
    const Register dst_pos    = c_rarg3;  // destination position
    const Register length     = c_rarg4;


    // Registers used as temps
    const Register dst_klass  = c_rarg5;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", name);

    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    //-----------------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length  must not be negative.
    // (5) src klass and dst klass should be the same and not NULL.
    // (6) src and dst should be arrays.
    // (7) src_pos + length must not exceed length of src.
    // (8) dst_pos + length must not exceed length of dst.
    //

    //  if (src == NULL) return -1;
    __ cbz(src, L_failed);

    //  if (src_pos < 0) return -1;
    __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set

    //  if (dst == NULL) return -1;
    __ cbz(dst, L_failed);

    //  if (dst_pos < 0) return -1;
    __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set

    // registers used as temp
    const Register scratch_length    = r16; // elements count to copy
    const Register scratch_src_klass = r17; // array klass
    const Register lh                = r15; // layout helper

    //  if (length < 0) return -1;
    __ movw(scratch_length, length);        // length (elements count, 32-bits value)
    __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set

    __ load_klass(scratch_src_klass, src);
#ifdef ASSERT
    //  assert(src->klass() != NULL);
    {
      BLOCK_COMMENT("assert klasses not null {");
      Label L1, L2;
      __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
      __ bind(L1);
      __ stop("broken null klass");
      __ bind(L2);
      __ load_klass(rscratch1, dst);
      __ cbz(rscratch1, L1);     // this would be broken also
      BLOCK_COMMENT("} assert klasses not null done");
    }
#endif

    // Load layout helper (32-bits)
    //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
    //
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

    const int lh_offset = in_bytes(Klass::layout_helper_offset());

    // Handle objArrays completely differently...
    const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
    __ ldrw(lh, Address(scratch_src_klass, lh_offset));
    __ movw(rscratch1, objArray_lh);
    __ eorw(rscratch2, lh, rscratch1);
    __ cbzw(rscratch2, L_objArray);

    //  if (src->klass() != dst->klass()) return -1;
    __ load_klass(rscratch2, dst);
    __ eor(rscratch2, rscratch2, scratch_src_klass);
    __ cbnz(rscratch2, L_failed);

    //  if (!src->is_Array()) return -1;
    __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)

    // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert primitive array {");
      Label L;
      __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
      __ cmpw(lh, rscratch2);
      __ br(Assembler::GE, L);
      __ stop("must be a primitive array");
      __ bind(L);
      BLOCK_COMMENT("} assert primitive array done");
    }
#endif

    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    // TypeArrayKlass
    //
    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
    //

    const Register rscratch1_offset = rscratch1;    // array offset
    const Register r15_elsize = lh; // element size

    __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
           exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
    __ add(src, src, rscratch1_offset);           // src array offset
    __ add(dst, dst, rscratch1_offset);           // dst array offset
    BLOCK_COMMENT("choose copy loop based on element size");

    // next registers should be set before the jump to corresponding stub
    const Register from     = c_rarg0;  // source array address
    const Register to       = c_rarg1;  // destination array address
    const Register count    = c_rarg2;  // elements count

    // 'from', 'to', 'count' registers should be set in such order
    // since they are the same as 'src', 'src_pos', 'dst'.

    assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");

    // The possible values of elsize are 0-3, i.e. exact_log2(element
    // size in bytes).  We do a simple bitwise binary search.
  __ BIND(L_copy_bytes);
    __ tbnz(r15_elsize, 1, L_copy_ints);
    __ tbnz(r15_elsize, 0, L_copy_shorts);
    __ lea(from, Address(src, src_pos));// src_addr
    __ lea(to,   Address(dst, dst_pos));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(byte_copy_entry));

  __ BIND(L_copy_shorts);
    __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(short_copy_entry));

  __ BIND(L_copy_ints);
    __ tbnz(r15_elsize, 0, L_copy_longs);
    __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(int_copy_entry));

  __ BIND(L_copy_longs);
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert long copy {");
      Label L;
      __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
      __ cmpw(r15_elsize, LogBytesPerLong);
      __ br(Assembler::EQ, L);
      __ stop("must be long copy, but elsize is wrong");
      __ bind(L);
      BLOCK_COMMENT("} assert long copy done");
    }
#endif
    __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(long_copy_entry));

    // ObjArrayKlass
  __ BIND(L_objArray);
    // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]

    Label L_plain_copy, L_checkcast_copy;
    //  test array classes for subtyping
    __ load_klass(r15, dst);
    __ cmp(scratch_src_klass, r15); // usual case is exact equality
    __ br(Assembler::NE, L_checkcast_copy);

    // Identically typed arrays can be copied without element-wise checks.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
    __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
    __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ movw(count, scratch_length); // length
  __ BIND(L_plain_copy);
    __ b(RuntimeAddress(oop_copy_entry));

  __ BIND(L_checkcast_copy);
    // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
    {
      // Before looking at dst.length, make sure dst is also an objArray.
      __ ldrw(rscratch1, Address(r15, lh_offset));
      __ movw(rscratch2, objArray_lh);
      __ eorw(rscratch1, rscratch1, rscratch2);
      __ cbnzw(rscratch1, L_failed);

      // It is safe to examine both src.length and dst.length.
      arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                             r15, L_failed);

      __ load_klass(dst_klass, dst); // reload

      // Marshal the base address arguments now, freeing registers.
      __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ movw(count, length);           // length (reloaded)
      Register sco_temp = c_rarg3;      // this register is free now
      assert_different_registers(from, to, count, sco_temp,
                                 dst_klass, scratch_src_klass);
      // assert_clean_int(count, sco_temp);

      // Generate the type check.
      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(sco_temp, Address(dst_klass, sco_offset));

      // Smashes rscratch1, rscratch2
      generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);

      // Fetch destination element klass from the ObjArrayKlass header.
      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
      __ ldr(dst_klass, Address(dst_klass, ek_offset));
      __ ldrw(sco_temp, Address(dst_klass, sco_offset));

      // the checkcast_copy loop needs two extra arguments:
      assert(c_rarg3 == sco_temp, "#3 already in place");
      // Set up arguments for checkcast_copy_entry.
      __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
      __ b(RuntimeAddress(checkcast_copy_entry));
    }

  __ BIND(L_failed);
    __ mov(r0, -1);
    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  //
  // Generate stub for array fill. If "aligned" is true, the
  // "to" address is assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //   to:    c_rarg0
  //   value: c_rarg1
  //   count: c_rarg2 treated as signed
  //
  address generate_fill(BasicType t, bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    BLOCK_COMMENT("Entry:");

    const Register to        = c_rarg0;  // source array address
    const Register value     = c_rarg1;  // value
    const Register count     = c_rarg2;  // elements count

    const Register bz_base = r10;        // base for block_zero routine
    const Register cnt_words = r11;      // temp register

    __ enter();

    Label L_fill_elements, L_exit1;

    int shift = -1;
    switch (t) {
      case T_BYTE:
        shift = 0;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
        __ br(Assembler::LO, L_fill_elements);
        break;
      case T_SHORT:
        shift = 1;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
        __ br(Assembler::LO, L_fill_elements);
        break;
      case T_INT:
        shift = 2;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ br(Assembler::LO, L_fill_elements);
        break;
      default: ShouldNotReachHere();
    }

    // Align source address at 8 bytes address boundary.
    Label L_skip_align1, L_skip_align2, L_skip_align4;
    if (!aligned) {
      switch (t) {
        case T_BYTE:
          // One byte misalignment happens only for byte arrays.
          __ tbz(to, 0, L_skip_align1);
          __ strb(value, Address(__ post(to, 1)));
          __ subw(count, count, 1);
          __ bind(L_skip_align1);
          // Fallthrough
        case T_SHORT:
          // Two bytes misalignment happens only for byte and short (char) arrays.
          __ tbz(to, 1, L_skip_align2);
          __ strh(value, Address(__ post(to, 2)));
          __ subw(count, count, 2 >> shift);
          __ bind(L_skip_align2);
          // Fallthrough
        case T_INT:
          // Align to 8 bytes, we know we are 4 byte aligned to start.
          __ tbz(to, 2, L_skip_align4);
          __ strw(value, Address(__ post(to, 4)));
          __ subw(count, count, 4 >> shift);
          __ bind(L_skip_align4);
          break;
        default: ShouldNotReachHere();
      }
    }

    //
    //  Fill large chunks
    //
    __ lsrw(cnt_words, count, 3 - shift); // number of words
    __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
    __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
    if (UseBlockZeroing) {
      Label non_block_zeroing, rest;
      // If the fill value is zero we can use the fast zero_words().
      __ cbnz(value, non_block_zeroing);
      __ mov(bz_base, to);
      __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
      __ zero_words(bz_base, cnt_words);
      __ b(rest);
      __ bind(non_block_zeroing);
      __ fill_words(to, cnt_words, value);
      __ bind(rest);
    } else {
      __ fill_words(to, cnt_words, value);
    }

    // Remaining count is less than 8 bytes. Fill it by a single store.
    // Note that the total length is no less than 8 bytes.
    if (t == T_BYTE || t == T_SHORT) {
      Label L_exit1;
      __ cbzw(count, L_exit1);
      __ add(to, to, count, Assembler::LSL, shift); // points to the end
      __ str(value, Address(to, -8));    // overwrite some elements
      __ bind(L_exit1);
      __ leave();
      __ ret(lr);
    }

    // Handle copies less than 8 bytes.
    Label L_fill_2, L_fill_4, L_exit2;
    __ bind(L_fill_elements);
    switch (t) {
      case T_BYTE:
        __ tbz(count, 0, L_fill_2);
        __ strb(value, Address(__ post(to, 1)));
        __ bind(L_fill_2);
        __ tbz(count, 1, L_fill_4);
        __ strh(value, Address(__ post(to, 2)));
        __ bind(L_fill_4);
        __ tbz(count, 2, L_exit2);
        __ strw(value, Address(to));
        break;
      case T_SHORT:
        __ tbz(count, 0, L_fill_4);
        __ strh(value, Address(__ post(to, 2)));
        __ bind(L_fill_4);
        __ tbz(count, 1, L_exit2);
        __ strw(value, Address(to));
        break;
      case T_INT:
        __ cbzw(count, L_exit2);
        __ strw(value, Address(to));
        break;
      default: ShouldNotReachHere();
    }
    __ bind(L_exit2);
    __ leave();
    __ ret(lr);
    return start;
  }

  address generate_data_cache_writeback() {
    const Register line        = c_rarg0;  // address of line to write back

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");

    address start = __ pc();
    __ enter();
    __ cache_wb(Address(line, 0));
    __ leave();
    __ ret(lr);

    return start;
  }

  address generate_data_cache_writeback_sync() {
    const Register is_pre     = c_rarg0;  // pre or post sync

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");

    // pre wbsync is a no-op
    // post wbsync translates to an sfence

    Label skip;
    address start = __ pc();
    __ enter();
    __ cbnz(is_pre, skip);
    __ cache_wbsync(false);
    __ bind(skip);
    __ leave();
    __ ret(lr);

    return start;
  }

  void generate_arraycopy_stubs() {
    address entry;
    address entry_jbyte_arraycopy;
    address entry_jshort_arraycopy;
    address entry_jint_arraycopy;
    address entry_oop_arraycopy;
    address entry_jlong_arraycopy;
    address entry_checkcast_arraycopy;

    generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
    generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);

    StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();

    //*** jbyte
    // Always need aligned and unaligned versions
    StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
                                                                                  "jbyte_disjoint_arraycopy");
    StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
                                                                                  &entry_jbyte_arraycopy,
                                                                                  "jbyte_arraycopy");
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
                                                                                  "arrayof_jbyte_disjoint_arraycopy");
    StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
                                                                                  "arrayof_jbyte_arraycopy");

    //*** jshort
    // Always need aligned and unaligned versions
    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
                                                                                    "jshort_disjoint_arraycopy");
    StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
                                                                                    &entry_jshort_arraycopy,
                                                                                    "jshort_arraycopy");
    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
                                                                                    "arrayof_jshort_disjoint_arraycopy");
    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
                                                                                    "arrayof_jshort_arraycopy");

    //*** jint
    // Aligned versions
    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
                                                                                "arrayof_jint_disjoint_arraycopy");
    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
                                                                                "arrayof_jint_arraycopy");
    // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
    // entry_jint_arraycopy always points to the unaligned version
    StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
                                                                                "jint_disjoint_arraycopy");
    StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
                                                                                &entry_jint_arraycopy,
                                                                                "jint_arraycopy");

    //*** jlong
    // It is always aligned
    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
                                                                                  "arrayof_jlong_disjoint_arraycopy");
    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
                                                                                  "arrayof_jlong_arraycopy");
    StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
    StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;

    //*** oops
    {
      // With compressed oops we need unaligned versions; notice that
      // we overwrite entry_oop_arraycopy.
      bool aligned = !UseCompressedOops;

      StubRoutines::_arrayof_oop_disjoint_arraycopy
        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
                                     /*dest_uninitialized*/false);
      StubRoutines::_arrayof_oop_arraycopy
        = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
                                     /*dest_uninitialized*/false);
      // Aligned versions without pre-barriers
      StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
                                     /*dest_uninitialized*/true);
      StubRoutines::_arrayof_oop_arraycopy_uninit
        = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
                                     /*dest_uninitialized*/true);
    }

    StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
    StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
    StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
    StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;

    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
                                                                        /*dest_uninitialized*/true);

    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
                                                              entry_jbyte_arraycopy,
                                                              entry_jshort_arraycopy,
                                                              entry_jint_arraycopy,
                                                              entry_jlong_arraycopy);

    StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
                                                               entry_jbyte_arraycopy,
                                                               entry_jshort_arraycopy,
                                                               entry_jint_arraycopy,
                                                               entry_oop_arraycopy,
                                                               entry_jlong_arraycopy,
                                                               entry_checkcast_arraycopy);

    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
  }

  void generate_math_stubs() { Unimplemented(); }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter();

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ aesenc_loadkeys(key, keylen);
    __ aesecb_encrypt(from, to, keylen);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES cryptographic extension support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ aesecb_decrypt(from, to, key, keylen);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   x0        - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES cryptographic extension support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen      = rscratch1;

    address start = __ pc();

      __ enter();

      __ movw(rscratch2, len_reg);

      __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

      __ ld1(v0, __ T16B, rvec);

      __ cmpw(keylen, 52);
      __ br(Assembler::CC, L_loadkeys_44);
      __ br(Assembler::EQ, L_loadkeys_52);

      __ ld1(v17, v18, __ T16B, __ post(key, 32));
      __ rev32(v17, __ T16B, v17);
      __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
      __ ld1(v19, v20, __ T16B, __ post(key, 32));
      __ rev32(v19, __ T16B, v19);
      __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
      __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
      __ rev32(v21, __ T16B, v21);
      __ rev32(v22, __ T16B, v22);
      __ rev32(v23, __ T16B, v23);
      __ rev32(v24, __ T16B, v24);
      __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
      __ rev32(v25, __ T16B, v25);
      __ rev32(v26, __ T16B, v26);
      __ rev32(v27, __ T16B, v27);
      __ rev32(v28, __ T16B, v28);
      __ ld1(v29, v30, v31, __ T16B, key);
      __ rev32(v29, __ T16B, v29);
      __ rev32(v30, __ T16B, v30);
      __ rev32(v31, __ T16B, v31);

    __ BIND(L_aes_loop);
      __ ld1(v1, __ T16B, __ post(from, 16));
      __ eor(v0, __ T16B, v0, v1);

      __ br(Assembler::CC, L_rounds_44);
      __ br(Assembler::EQ, L_rounds_52);

      __ aese(v0, v17); __ aesmc(v0, v0);
      __ aese(v0, v18); __ aesmc(v0, v0);
    __ BIND(L_rounds_52);
      __ aese(v0, v19); __ aesmc(v0, v0);
      __ aese(v0, v20); __ aesmc(v0, v0);
    __ BIND(L_rounds_44);
      __ aese(v0, v21); __ aesmc(v0, v0);
      __ aese(v0, v22); __ aesmc(v0, v0);
      __ aese(v0, v23); __ aesmc(v0, v0);
      __ aese(v0, v24); __ aesmc(v0, v0);
      __ aese(v0, v25); __ aesmc(v0, v0);
      __ aese(v0, v26); __ aesmc(v0, v0);
      __ aese(v0, v27); __ aesmc(v0, v0);
      __ aese(v0, v28); __ aesmc(v0, v0);
      __ aese(v0, v29); __ aesmc(v0, v0);
      __ aese(v0, v30);
      __ eor(v0, __ T16B, v0, v31);

      __ st1(v0, __ T16B, __ post(to, 16));

      __ subw(len_reg, len_reg, 16);
      __ cbnzw(len_reg, L_aes_loop);

      __ st1(v0, __ T16B, rvec);

      __ mov(r0, rscratch2);

      __ leave();
      __ ret(lr);

      return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //
  address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES cryptographic extension support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen      = rscratch1;

    address start = __ pc();

      __ enter();

      __ movw(rscratch2, len_reg);

      __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

      __ ld1(v2, __ T16B, rvec);

      __ ld1(v31, __ T16B, __ post(key, 16));
      __ rev32(v31, __ T16B, v31);

      __ cmpw(keylen, 52);
      __ br(Assembler::CC, L_loadkeys_44);
      __ br(Assembler::EQ, L_loadkeys_52);

      __ ld1(v17, v18, __ T16B, __ post(key, 32));
      __ rev32(v17, __ T16B, v17);
      __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
      __ ld1(v19, v20, __ T16B, __ post(key, 32));
      __ rev32(v19, __ T16B, v19);
      __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
      __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
      __ rev32(v21, __ T16B, v21);
      __ rev32(v22, __ T16B, v22);
      __ rev32(v23, __ T16B, v23);
      __ rev32(v24, __ T16B, v24);
      __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
      __ rev32(v25, __ T16B, v25);
      __ rev32(v26, __ T16B, v26);
      __ rev32(v27, __ T16B, v27);
      __ rev32(v28, __ T16B, v28);
      __ ld1(v29, v30, __ T16B, key);
      __ rev32(v29, __ T16B, v29);
      __ rev32(v30, __ T16B, v30);

    __ BIND(L_aes_loop);
      __ ld1(v0, __ T16B, __ post(from, 16));
      __ orr(v1, __ T16B, v0, v0);

      __ br(Assembler::CC, L_rounds_44);
      __ br(Assembler::EQ, L_rounds_52);

      __ aesd(v0, v17); __ aesimc(v0, v0);
      __ aesd(v0, v18); __ aesimc(v0, v0);
    __ BIND(L_rounds_52);
      __ aesd(v0, v19); __ aesimc(v0, v0);
      __ aesd(v0, v20); __ aesimc(v0, v0);
    __ BIND(L_rounds_44);
      __ aesd(v0, v21); __ aesimc(v0, v0);
      __ aesd(v0, v22); __ aesimc(v0, v0);
      __ aesd(v0, v23); __ aesimc(v0, v0);
      __ aesd(v0, v24); __ aesimc(v0, v0);
      __ aesd(v0, v25); __ aesimc(v0, v0);
      __ aesd(v0, v26); __ aesimc(v0, v0);
      __ aesd(v0, v27); __ aesimc(v0, v0);
      __ aesd(v0, v28); __ aesimc(v0, v0);
      __ aesd(v0, v29); __ aesimc(v0, v0);
      __ aesd(v0, v30);
      __ eor(v0, __ T16B, v0, v31);
      __ eor(v0, __ T16B, v0, v2);

      __ st1(v0, __ T16B, __ post(to, 16));
      __ orr(v2, __ T16B, v1, v1);

      __ subw(len_reg, len_reg, 16);
      __ cbnzw(len_reg, L_aes_loop);

      __ st1(v2, __ T16B, rvec);

      __ mov(r0, rscratch2);

      __ leave();
      __ ret(lr);

    return start;
  }

  // CTR AES crypt.
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - counter vector byte array address
  //   c_rarg4   - input length
  //   c_rarg5   - saved encryptedCounter start
  //   c_rarg6   - saved used length
  //
  // Output:
  //   r0       - input length
  //
  address generate_counterMode_AESCrypt() {
    const Register in = c_rarg0;
    const Register out = c_rarg1;
    const Register key = c_rarg2;
    const Register counter = c_rarg3;
    const Register saved_len = c_rarg4, len = r10;
    const Register saved_encrypted_ctr = c_rarg5;
    const Register used_ptr = c_rarg6, used = r12;

    const Register offset = r7;
    const Register keylen = r11;

    const unsigned char block_size = 16;
    const int bulk_width = 4;
    // NB: bulk_width can be 4 or 8. 8 gives slightly faster
    // performance with larger data sizes, but it also means that the
    // fast path isn't used until you have at least 8 blocks, and up
    // to 127 bytes of data will be executed on the slow path. For
    // that reason, and also so as not to blow away too much icache, 4
    // blocks seems like a sensible compromise.

    // Algorithm:
    //
    //    if (len == 0) {
    //        goto DONE;
    //    }
    //    int result = len;
    //    do {
    //        if (used >= blockSize) {
    //            if (len >= bulk_width * blockSize) {
    //                CTR_large_block();
    //                if (len == 0)
    //                    goto DONE;
    //            }
    //            for (;;) {
    //                16ByteVector v0 = counter;
    //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
    //                used = 0;
    //                if (len < blockSize)
    //                    break;    /* goto NEXT */
    //                16ByteVector v1 = load16Bytes(in, offset);
    //                v1 = v1 ^ encryptedCounter;
    //                store16Bytes(out, offset);
    //                used = blockSize;
    //                offset += blockSize;
    //                len -= blockSize;
    //                if (len == 0)
    //                    goto DONE;
    //            }
    //        }
    //      NEXT:
    //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
    //        len--;
    //    } while (len != 0);
    //  DONE:
    //    return result;
    //
    // CTR_large_block()
    //    Wide bulk encryption of whole blocks.

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
    const address start = __ pc();
    __ enter();

    Label DONE, CTR_large_block, large_block_return;
    __ ldrw(used, Address(used_ptr));
    __ cbzw(saved_len, DONE);

    __ mov(len, saved_len);
    __ mov(offset, 0);

    // Compute #rounds for AES based on the length of the key array
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ aesenc_loadkeys(key, keylen);

    {
      Label L_CTR_loop, NEXT;

      __ bind(L_CTR_loop);

      __ cmp(used, block_size);
      __ br(__ LO, NEXT);

      // Maybe we have a lot of data
      __ subsw(rscratch1, len, bulk_width * block_size);
      __ br(__ HS, CTR_large_block);
      __ BIND(large_block_return);
      __ cbzw(len, DONE);

      // Setup the counter
      __ movi(v4, __ T4S, 0);
      __ movi(v5, __ T4S, 1);
      __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }

      __ ld1(v0, __ T16B, counter); // Load the counter into v0
      __ rev32(v16, __ T16B, v0);
      __ addv(v16, __ T4S, v16, v4);
      __ rev32(v16, __ T16B, v16);
      __ st1(v16, __ T16B, counter); // Save the incremented counter back

      {
        // We have fewer than bulk_width blocks of data left. Encrypt
        // them one by one until there is less than a full block
        // remaining, being careful to save both the encrypted counter
        // and the counter.

        Label inner_loop;
        __ bind(inner_loop);
        // Counter to encrypt is in v0
        __ aesecb_encrypt(noreg, noreg, keylen);
        __ st1(v0, __ T16B, saved_encrypted_ctr);

        // Do we have a remaining full block?

        __ mov(used, 0);
        __ cmp(len, block_size);
        __ br(__ LO, NEXT);

        // Yes, we have a full block
        __ ldrq(v1, Address(in, offset));
        __ eor(v1, __ T16B, v1, v0);
        __ strq(v1, Address(out, offset));
        __ mov(used, block_size);
        __ add(offset, offset, block_size);

        __ subw(len, len, block_size);
        __ cbzw(len, DONE);

        // Increment the counter, store it back
        __ orr(v0, __ T16B, v16, v16);
        __ rev32(v16, __ T16B, v16);
        __ addv(v16, __ T4S, v16, v4);
        __ rev32(v16, __ T16B, v16);
        __ st1(v16, __ T16B, counter); // Save the incremented counter back

        __ b(inner_loop);
      }

      __ BIND(NEXT);

      // Encrypt a single byte, and loop.
      // We expect this to be a rare event.
      __ ldrb(rscratch1, Address(in, offset));
      __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
      __ eor(rscratch1, rscratch1, rscratch2);
      __ strb(rscratch1, Address(out, offset));
      __ add(offset, offset, 1);
      __ add(used, used, 1);
      __ subw(len, len,1);
      __ cbnzw(len, L_CTR_loop);
    }

    __ bind(DONE);
    __ strw(used, Address(used_ptr));
    __ mov(r0, saved_len);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    // Bulk encryption

    __ BIND (CTR_large_block);
    assert(bulk_width == 4 || bulk_width == 8, "must be");

    if (bulk_width == 8) {
      __ sub(sp, sp, 4 * 16);
      __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
    }
    __ sub(sp, sp, 4 * 16);
    __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
    RegSet saved_regs = (RegSet::of(in, out, offset)
                         + RegSet::of(saved_encrypted_ctr, used_ptr, len));
    __ push(saved_regs, sp);
    __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
    __ add(in, in, offset);
    __ add(out, out, offset);

    // Keys should already be loaded into the correct registers

    __ ld1(v0, __ T16B, counter); // v0 contains the first counter
    __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter

    // AES/CTR loop
    {
      Label L_CTR_loop;
      __ BIND(L_CTR_loop);

      // Setup the counters
      __ movi(v8, __ T4S, 0);
      __ movi(v9, __ T4S, 1);
      __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }

      for (int i = 0; i < bulk_width; i++) {
        FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
        __ rev32(v0_ofs, __ T16B, v16);
        __ addv(v16, __ T4S, v16, v8);
      }

      __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));

      // Encrypt the counters
      __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);

      if (bulk_width == 8) {
        __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
      }

      // XOR the encrypted counters with the inputs
      for (int i = 0; i < bulk_width; i++) {
        FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
        FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
        __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
      }

      // Write the encrypted data
      __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
      if (bulk_width == 8) {
        __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
      }

      __ subw(len, len, 16 * bulk_width);
      __ cbnzw(len, L_CTR_loop);
    }

    // Save the counter back where it goes
    __ rev32(v16, __ T16B, v16);
    __ st1(v16, __ T16B, counter);

    __ pop(saved_regs, sp);

    __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
    if (bulk_width == 8) {
      __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
    }

    __ andr(rscratch1, len, -16 * bulk_width);
    __ sub(len, len, rscratch1);
    __ add(offset, offset, rscratch1);
    __ mov(used, 16);
    __ strw(used, Address(used_ptr));
    __ b(large_block_return);

    return start;
  }

  // Vector AES Galois Counter Mode implementation. Parameters:
  //
  // in = c_rarg0
  // len = c_rarg1
  // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
  // out = c_rarg3
  // key = c_rarg4
  // state = c_rarg5 - GHASH.state
  // subkeyHtbl = c_rarg6 - powers of H
  // counter = c_rarg7 - 16 bytes of CTR
  // return - number of processed bytes
  address generate_galoisCounterMode_AESCrypt() {
    address ghash_polynomial = __ pc();
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
    address start = __ pc();
    __ enter();

    const Register in = c_rarg0;
    const Register len = c_rarg1;
    const Register ct = c_rarg2;
    const Register out = c_rarg3;
    // and updated with the incremented counter in the end

    const Register key = c_rarg4;
    const Register state = c_rarg5;

    const Register subkeyHtbl = c_rarg6;

    const Register counter = c_rarg7;

    const Register keylen = r10;
    // Save state before entering routine
    __ sub(sp, sp, 4 * 16);
    __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
    __ sub(sp, sp, 4 * 16);
    __ st1(v8, v9, v10, v11, __ T16B, Address(sp));

    // __ andr(len, len, -512);
    __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
    __ str(len, __ pre(sp, -2 * wordSize));

    Label DONE;
    __ cbz(len, DONE);

    // Compute #rounds for AES based on the length of the key array
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ aesenc_loadkeys(key, keylen);
    __ ld1(v0, __ T16B, counter); // v0 contains the first counter
    __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter

    // AES/CTR loop
    {
      Label L_CTR_loop;
      __ BIND(L_CTR_loop);

      // Setup the counters
      __ movi(v8, __ T4S, 0);
      __ movi(v9, __ T4S, 1);
      __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }

      assert(v0->encoding() < v8->encoding(), "");
      for (int i = v0->encoding(); i < v8->encoding(); i++) {
        FloatRegister f = as_FloatRegister(i);
        __ rev32(f, __ T16B, v16);
        __ addv(v16, __ T4S, v16, v8);
      }

      __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));

      // Encrypt the counters
      __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);

      __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));

      // XOR the encrypted counters with the inputs
      for (int i = 0; i < 8; i++) {
        FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
        FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
        __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
      }
      __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
      __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));

      __ subw(len, len, 16 * 8);
      __ cbnzw(len, L_CTR_loop);
    }

    __ rev32(v16, __ T16B, v16);
    __ st1(v16, __ T16B, counter);

    __ ldr(len, Address(sp));
    __ lsr(len, len, exact_log2(16));  // We want the count of blocks

    // GHASH/CTR loop
    __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
                                len, /*unrolls*/4);

#ifdef ASSERT
    { Label L;
      __ cmp(len, (unsigned char)0);
      __ br(Assembler::EQ, L);
      __ stop("stubGenerator: abort");
      __ bind(L);
  }
#endif

  __ bind(DONE);
    // Return the number of bytes processed
    __ ldr(r0, __ post(sp, 2 * wordSize));

    __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
    __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);
     return start;
  }

  // Utility routines for md5.
  // Clobbers r10 and r11.
  void md5_FF(Register buf, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ eorw(rscratch3, r3, r4);
    __ movw(rscratch2, t);
    __ andw(rscratch3, rscratch3, r2);
    __ addw(rscratch4, r1, rscratch2);
    __ ldrw(rscratch1, Address(buf, k*4));
    __ eorw(rscratch3, rscratch3, r4);
    __ addw(rscratch3, rscratch3, rscratch1);
    __ addw(rscratch3, rscratch3, rscratch4);
    __ rorw(rscratch2, rscratch3, 32 - s);
    __ addw(r1, rscratch2, r2);
  }

  void md5_GG(Register buf, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ eorw(rscratch2, r2, r3);
    __ ldrw(rscratch1, Address(buf, k*4));
    __ andw(rscratch3, rscratch2, r4);
    __ movw(rscratch2, t);
    __ eorw(rscratch3, rscratch3, r3);
    __ addw(rscratch4, r1, rscratch2);
    __ addw(rscratch3, rscratch3, rscratch1);
    __ addw(rscratch3, rscratch3, rscratch4);
    __ rorw(rscratch2, rscratch3, 32 - s);
    __ addw(r1, rscratch2, r2);
  }

  void md5_HH(Register buf, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ eorw(rscratch3, r3, r4);
    __ movw(rscratch2, t);
    __ addw(rscratch4, r1, rscratch2);
    __ ldrw(rscratch1, Address(buf, k*4));
    __ eorw(rscratch3, rscratch3, r2);
    __ addw(rscratch3, rscratch3, rscratch1);
    __ addw(rscratch3, rscratch3, rscratch4);
    __ rorw(rscratch2, rscratch3, 32 - s);
    __ addw(r1, rscratch2, r2);
  }

  void md5_II(Register buf, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ movw(rscratch3, t);
    __ ornw(rscratch2, r2, r4);
    __ addw(rscratch4, r1, rscratch3);
    __ ldrw(rscratch1, Address(buf, k*4));
    __ eorw(rscratch3, rscratch2, r3);
    __ addw(rscratch3, rscratch3, rscratch1);
    __ addw(rscratch3, rscratch3, rscratch4);
    __ rorw(rscratch2, rscratch3, 32 - s);
    __ addw(r1, rscratch2, r2);
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_md5_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf       = c_rarg0;
    Register state     = c_rarg1;
    Register ofs       = c_rarg2;
    Register limit     = c_rarg3;
    Register a         = r4;
    Register b         = r5;
    Register c         = r6;
    Register d         = r7;
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    Label md5_loop;
    __ BIND(md5_loop);

    // Save hash values for addition after rounds
    __ ldrw(a, Address(state,  0));
    __ ldrw(b, Address(state,  4));
    __ ldrw(c, Address(state,  8));
    __ ldrw(d, Address(state, 12));

    // Round 1
    md5_FF(buf, a, b, c, d,  0,  7, 0xd76aa478);
    md5_FF(buf, d, a, b, c,  1, 12, 0xe8c7b756);
    md5_FF(buf, c, d, a, b,  2, 17, 0x242070db);
    md5_FF(buf, b, c, d, a,  3, 22, 0xc1bdceee);
    md5_FF(buf, a, b, c, d,  4,  7, 0xf57c0faf);
    md5_FF(buf, d, a, b, c,  5, 12, 0x4787c62a);
    md5_FF(buf, c, d, a, b,  6, 17, 0xa8304613);
    md5_FF(buf, b, c, d, a,  7, 22, 0xfd469501);
    md5_FF(buf, a, b, c, d,  8,  7, 0x698098d8);
    md5_FF(buf, d, a, b, c,  9, 12, 0x8b44f7af);
    md5_FF(buf, c, d, a, b, 10, 17, 0xffff5bb1);
    md5_FF(buf, b, c, d, a, 11, 22, 0x895cd7be);
    md5_FF(buf, a, b, c, d, 12,  7, 0x6b901122);
    md5_FF(buf, d, a, b, c, 13, 12, 0xfd987193);
    md5_FF(buf, c, d, a, b, 14, 17, 0xa679438e);
    md5_FF(buf, b, c, d, a, 15, 22, 0x49b40821);

    // Round 2
    md5_GG(buf, a, b, c, d,  1,  5, 0xf61e2562);
    md5_GG(buf, d, a, b, c,  6,  9, 0xc040b340);
    md5_GG(buf, c, d, a, b, 11, 14, 0x265e5a51);
    md5_GG(buf, b, c, d, a,  0, 20, 0xe9b6c7aa);
    md5_GG(buf, a, b, c, d,  5,  5, 0xd62f105d);
    md5_GG(buf, d, a, b, c, 10,  9, 0x02441453);
    md5_GG(buf, c, d, a, b, 15, 14, 0xd8a1e681);
    md5_GG(buf, b, c, d, a,  4, 20, 0xe7d3fbc8);
    md5_GG(buf, a, b, c, d,  9,  5, 0x21e1cde6);
    md5_GG(buf, d, a, b, c, 14,  9, 0xc33707d6);
    md5_GG(buf, c, d, a, b,  3, 14, 0xf4d50d87);
    md5_GG(buf, b, c, d, a,  8, 20, 0x455a14ed);
    md5_GG(buf, a, b, c, d, 13,  5, 0xa9e3e905);
    md5_GG(buf, d, a, b, c,  2,  9, 0xfcefa3f8);
    md5_GG(buf, c, d, a, b,  7, 14, 0x676f02d9);
    md5_GG(buf, b, c, d, a, 12, 20, 0x8d2a4c8a);

    // Round 3
    md5_HH(buf, a, b, c, d,  5,  4, 0xfffa3942);
    md5_HH(buf, d, a, b, c,  8, 11, 0x8771f681);
    md5_HH(buf, c, d, a, b, 11, 16, 0x6d9d6122);
    md5_HH(buf, b, c, d, a, 14, 23, 0xfde5380c);
    md5_HH(buf, a, b, c, d,  1,  4, 0xa4beea44);
    md5_HH(buf, d, a, b, c,  4, 11, 0x4bdecfa9);
    md5_HH(buf, c, d, a, b,  7, 16, 0xf6bb4b60);
    md5_HH(buf, b, c, d, a, 10, 23, 0xbebfbc70);
    md5_HH(buf, a, b, c, d, 13,  4, 0x289b7ec6);
    md5_HH(buf, d, a, b, c,  0, 11, 0xeaa127fa);
    md5_HH(buf, c, d, a, b,  3, 16, 0xd4ef3085);
    md5_HH(buf, b, c, d, a,  6, 23, 0x04881d05);
    md5_HH(buf, a, b, c, d,  9,  4, 0xd9d4d039);
    md5_HH(buf, d, a, b, c, 12, 11, 0xe6db99e5);
    md5_HH(buf, c, d, a, b, 15, 16, 0x1fa27cf8);
    md5_HH(buf, b, c, d, a,  2, 23, 0xc4ac5665);

    // Round 4
    md5_II(buf, a, b, c, d,  0,  6, 0xf4292244);
    md5_II(buf, d, a, b, c,  7, 10, 0x432aff97);
    md5_II(buf, c, d, a, b, 14, 15, 0xab9423a7);
    md5_II(buf, b, c, d, a,  5, 21, 0xfc93a039);
    md5_II(buf, a, b, c, d, 12,  6, 0x655b59c3);
    md5_II(buf, d, a, b, c,  3, 10, 0x8f0ccc92);
    md5_II(buf, c, d, a, b, 10, 15, 0xffeff47d);
    md5_II(buf, b, c, d, a,  1, 21, 0x85845dd1);
    md5_II(buf, a, b, c, d,  8,  6, 0x6fa87e4f);
    md5_II(buf, d, a, b, c, 15, 10, 0xfe2ce6e0);
    md5_II(buf, c, d, a, b,  6, 15, 0xa3014314);
    md5_II(buf, b, c, d, a, 13, 21, 0x4e0811a1);
    md5_II(buf, a, b, c, d,  4,  6, 0xf7537e82);
    md5_II(buf, d, a, b, c, 11, 10, 0xbd3af235);
    md5_II(buf, c, d, a, b,  2, 15, 0x2ad7d2bb);
    md5_II(buf, b, c, d, a,  9, 21, 0xeb86d391);

    // write hash values back in the correct order
    __ ldrw(rscratch1, Address(state,  0));
    __ addw(rscratch1, rscratch1, a);
    __ strw(rscratch1, Address(state,  0));

    __ ldrw(rscratch2, Address(state,  4));
    __ addw(rscratch2, rscratch2, b);
    __ strw(rscratch2, Address(state,  4));

    __ ldrw(rscratch3, Address(state,  8));
    __ addw(rscratch3, rscratch3, c);
    __ strw(rscratch3, Address(state,  8));

    __ ldrw(rscratch4, Address(state, 12));
    __ addw(rscratch4, rscratch4, d);
    __ strw(rscratch4, Address(state, 12));

    if (multi_block) {
      __ add(buf, buf, 64);
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, md5_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_sha1_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    Label keys;
    Label sha1_loop;

    // load the keys into v0..v3
    __ adr(rscratch1, keys);
    __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load 5 words state into v6, v7
    __ ldrq(v6, Address(state, 0));
    __ ldrs(v7, Address(state, 16));


    __ BIND(sha1_loop);
    // load 64 bytes of data into v16..v19
    __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
    __ rev32(v16, __ T16B, v16);
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ rev32(v19, __ T16B, v19);

    // do the sha1
    __ addv(v4, __ T4S, v16, v0);
    __ orr(v20, __ T16B, v6, v6);

    FloatRegister d0 = v16;
    FloatRegister d1 = v17;
    FloatRegister d2 = v18;
    FloatRegister d3 = v19;

    for (int round = 0; round < 20; round++) {
      FloatRegister tmp1 = (round & 1) ? v4 : v5;
      FloatRegister tmp2 = (round & 1) ? v21 : v22;
      FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
      FloatRegister tmp4 = (round & 1) ? v5 : v4;
      FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));

      if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
      if (round < 19) __ addv(tmp1, __ T4S, d1, key);
      __ sha1h(tmp2, __ T4S, v20);
      if (round < 5)
        __ sha1c(v20, __ T4S, tmp3, tmp4);
      else if (round < 10 || round >= 15)
        __ sha1p(v20, __ T4S, tmp3, tmp4);
      else
        __ sha1m(v20, __ T4S, tmp3, tmp4);
      if (round < 16) __ sha1su1(d0, __ T4S, d3);

      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    __ addv(v7, __ T2S, v7, v21);
    __ addv(v6, __ T4S, v6, v20);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ strq(v6, Address(state, 0));
    __ strs(v7, Address(state, 16));

    __ ret(lr);

    __ bind(keys);
    __ emit_int32(0x5a827999);
    __ emit_int32(0x6ed9eba1);
    __ emit_int32(0x8f1bbcdc);
    __ emit_int32(0xca62c1d6);

    return start;
  }


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_sha256_implCompress(bool multi_block, const char *name) {
    static const uint32_t round_consts[64] = {
      0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
      0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
      0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
      0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
      0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
      0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
      0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
      0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
      0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
      0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
      0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
      0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
      0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
      0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
      0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
      0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
    };
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    Label sha1_loop;

    __ stpd(v8, v9, __ pre(sp, -32));
    __ stpd(v10, v11, Address(sp, 16));

// dga == v0
// dgb == v1
// dg0 == v2
// dg1 == v3
// dg2 == v4
// t0 == v6
// t1 == v7

    // load 16 keys to v16..v31
    __ lea(rscratch1, ExternalAddress((address)round_consts));
    __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
    __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
    __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
    __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);

    // load 8 words (256 bits) state
    __ ldpq(v0, v1, state);

    __ BIND(sha1_loop);
    // load 64 bytes of data into v8..v11
    __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
    __ rev32(v8, __ T16B, v8);
    __ rev32(v9, __ T16B, v9);
    __ rev32(v10, __ T16B, v10);
    __ rev32(v11, __ T16B, v11);

    __ addv(v6, __ T4S, v8, v16);
    __ orr(v2, __ T16B, v0, v0);
    __ orr(v3, __ T16B, v1, v1);

    FloatRegister d0 = v8;
    FloatRegister d1 = v9;
    FloatRegister d2 = v10;
    FloatRegister d3 = v11;


    for (int round = 0; round < 16; round++) {
      FloatRegister tmp1 = (round & 1) ? v6 : v7;
      FloatRegister tmp2 = (round & 1) ? v7 : v6;
      FloatRegister tmp3 = (round & 1) ? v2 : v4;
      FloatRegister tmp4 = (round & 1) ? v4 : v2;

      if (round < 12) __ sha256su0(d0, __ T4S, d1);
       __ orr(v4, __ T16B, v2, v2);
      if (round < 15)
        __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
      __ sha256h(v2, __ T4S, v3, tmp2);
      __ sha256h2(v3, __ T4S, v4, tmp2);
      if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);

      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    __ addv(v0, __ T4S, v0, v2);
    __ addv(v1, __ T4S, v1, v3);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 32));

    __ stpq(v0, v1, state);

    __ ret(lr);

    return start;
  }

  // Double rounds for sha512.
  void sha512_dround(int dr,
                     FloatRegister vi0, FloatRegister vi1,
                     FloatRegister vi2, FloatRegister vi3,
                     FloatRegister vi4, FloatRegister vrc0,
                     FloatRegister vrc1, FloatRegister vin0,
                     FloatRegister vin1, FloatRegister vin2,
                     FloatRegister vin3, FloatRegister vin4) {
      if (dr < 36) {
        __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
      }
      __ addv(v5, __ T2D, vrc0, vin0);
      __ ext(v6, __ T16B, vi2, vi3, 8);
      __ ext(v5, __ T16B, v5, v5, 8);
      __ ext(v7, __ T16B, vi1, vi2, 8);
      __ addv(vi3, __ T2D, vi3, v5);
      if (dr < 32) {
        __ ext(v5, __ T16B, vin3, vin4, 8);
        __ sha512su0(vin0, __ T2D, vin1);
      }
      __ sha512h(vi3, __ T2D, v6, v7);
      if (dr < 32) {
        __ sha512su1(vin0, __ T2D, vin2, v5);
      }
      __ addv(vi4, __ T2D, vi1, vi3);
      __ sha512h2(vi3, __ T2D, vi1, vi0);
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_sha512_implCompress(bool multi_block, const char *name) {
    static const uint64_t round_consts[80] = {
      0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
      0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
      0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
      0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
      0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
      0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
      0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
      0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
      0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
      0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
      0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
      0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
      0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
      0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
      0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
      0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
      0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
      0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
      0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
      0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
      0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
      0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
      0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
      0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
      0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
      0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
      0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    __ stpd(v8, v9, __ pre(sp, -64));
    __ stpd(v10, v11, Address(sp, 16));
    __ stpd(v12, v13, Address(sp, 32));
    __ stpd(v14, v15, Address(sp, 48));

    Label sha512_loop;

    // load state
    __ ld1(v8, v9, v10, v11, __ T2D, state);

    // load first 4 round constants
    __ lea(rscratch1, ExternalAddress((address)round_consts));
    __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));

    __ BIND(sha512_loop);
    // load 128B of data into v12..v19
    __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
    __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
    __ rev64(v12, __ T16B, v12);
    __ rev64(v13, __ T16B, v13);
    __ rev64(v14, __ T16B, v14);
    __ rev64(v15, __ T16B, v15);
    __ rev64(v16, __ T16B, v16);
    __ rev64(v17, __ T16B, v17);
    __ rev64(v18, __ T16B, v18);
    __ rev64(v19, __ T16B, v19);

    __ mov(rscratch2, rscratch1);

    __ mov(v0, __ T16B, v8);
    __ mov(v1, __ T16B, v9);
    __ mov(v2, __ T16B, v10);
    __ mov(v3, __ T16B, v11);

    sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
    sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
    sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
    sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
    sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
    sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
    sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
    sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
    sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
    sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
    sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
    sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
    sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
    sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
    sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
    sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
    sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
    sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
    sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
    sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
    sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
    sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
    sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
    sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
    sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
    sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
    sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
    sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);

    __ addv(v8, __ T2D, v8, v0);
    __ addv(v9, __ T2D, v9, v1);
    __ addv(v10, __ T2D, v10, v2);
    __ addv(v11, __ T2D, v11, v3);

    if (multi_block) {
      __ add(ofs, ofs, 128);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha512_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ st1(v8, v9, v10, v11, __ T2D, state);

    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - byte[]   SHA.state
  //   c_rarg2   - int     digest_length
  //   c_rarg3   - int     offset
  //   c_rarg4   - int     limit
  //
  address generate_sha3_implCompress(bool multi_block, const char *name) {
    static const uint64_t round_consts[24] = {
      0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
      0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
      0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
      0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
      0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
      0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
      0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
      0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf           = c_rarg0;
    Register state         = c_rarg1;
    Register digest_length = c_rarg2;
    Register ofs           = c_rarg3;
    Register limit         = c_rarg4;

    Label sha3_loop, rounds24_loop;
    Label sha3_512, sha3_384_or_224, sha3_256;

    __ stpd(v8, v9, __ pre(sp, -64));
    __ stpd(v10, v11, Address(sp, 16));
    __ stpd(v12, v13, Address(sp, 32));
    __ stpd(v14, v15, Address(sp, 48));

    // load state
    __ add(rscratch1, state, 32);
    __ ld1(v0, v1, v2,  v3,  __ T1D, state);
    __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
    __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
    __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
    __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
    __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
    __ ld1(v24, __ T1D, rscratch1);

    __ BIND(sha3_loop);

    // 24 keccak rounds
    __ movw(rscratch2, 24);

    // load round_constants base
    __ lea(rscratch1, ExternalAddress((address) round_consts));

    // load input
    __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
    __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
    __ eor(v0, __ T8B, v0, v25);
    __ eor(v1, __ T8B, v1, v26);
    __ eor(v2, __ T8B, v2, v27);
    __ eor(v3, __ T8B, v3, v28);
    __ eor(v4, __ T8B, v4, v29);
    __ eor(v5, __ T8B, v5, v30);
    __ eor(v6, __ T8B, v6, v31);

    // digest_length == 64, SHA3-512
    __ tbnz(digest_length, 6, sha3_512);

    __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
    __ ld1(v29, v30, __ T8B, __ post(buf, 16));
    __ eor(v7, __ T8B, v7, v25);
    __ eor(v8, __ T8B, v8, v26);
    __ eor(v9, __ T8B, v9, v27);
    __ eor(v10, __ T8B, v10, v28);
    __ eor(v11, __ T8B, v11, v29);
    __ eor(v12, __ T8B, v12, v30);

    // digest_length == 28, SHA3-224;  digest_length == 48, SHA3-384
    __ tbnz(digest_length, 4, sha3_384_or_224);

    // SHA3-256
    __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
    __ eor(v13, __ T8B, v13, v25);
    __ eor(v14, __ T8B, v14, v26);
    __ eor(v15, __ T8B, v15, v27);
    __ eor(v16, __ T8B, v16, v28);
    __ b(rounds24_loop);

    __ BIND(sha3_384_or_224);
    __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA-384

    // SHA3-224
    __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
    __ ld1(v29, __ T8B, __ post(buf, 8));
    __ eor(v13, __ T8B, v13, v25);
    __ eor(v14, __ T8B, v14, v26);
    __ eor(v15, __ T8B, v15, v27);
    __ eor(v16, __ T8B, v16, v28);
    __ eor(v17, __ T8B, v17, v29);
    __ b(rounds24_loop);

    __ BIND(sha3_512);
    __ ld1(v25, v26, __ T8B, __ post(buf, 16));
    __ eor(v7, __ T8B, v7, v25);
    __ eor(v8, __ T8B, v8, v26);

    __ BIND(rounds24_loop);
    __ subw(rscratch2, rscratch2, 1);

    __ eor3(v29, __ T16B, v4, v9, v14);
    __ eor3(v26, __ T16B, v1, v6, v11);
    __ eor3(v28, __ T16B, v3, v8, v13);
    __ eor3(v25, __ T16B, v0, v5, v10);
    __ eor3(v27, __ T16B, v2, v7, v12);
    __ eor3(v29, __ T16B, v29, v19, v24);
    __ eor3(v26, __ T16B, v26, v16, v21);
    __ eor3(v28, __ T16B, v28, v18, v23);
    __ eor3(v25, __ T16B, v25, v15, v20);
    __ eor3(v27, __ T16B, v27, v17, v22);

    __ rax1(v30, __ T2D, v29, v26);
    __ rax1(v26, __ T2D, v26, v28);
    __ rax1(v28, __ T2D, v28, v25);
    __ rax1(v25, __ T2D, v25, v27);
    __ rax1(v27, __ T2D, v27, v29);

    __ eor(v0, __ T16B, v0, v30);
    __ xar(v29, __ T2D, v1,  v25, (64 - 1));
    __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
    __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
    __ xar(v9,  __ T2D, v22, v26, (64 - 61));
    __ xar(v22, __ T2D, v14, v28, (64 - 39));
    __ xar(v14, __ T2D, v20, v30, (64 - 18));
    __ xar(v31, __ T2D, v2,  v26, (64 - 62));
    __ xar(v2,  __ T2D, v12, v26, (64 - 43));
    __ xar(v12, __ T2D, v13, v27, (64 - 25));
    __ xar(v13, __ T2D, v19, v28, (64 - 8));
    __ xar(v19, __ T2D, v23, v27, (64 - 56));
    __ xar(v23, __ T2D, v15, v30, (64 - 41));
    __ xar(v15, __ T2D, v4,  v28, (64 - 27));
    __ xar(v28, __ T2D, v24, v28, (64 - 14));
    __ xar(v24, __ T2D, v21, v25, (64 - 2));
    __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
    __ xar(v4,  __ T2D, v16, v25, (64 - 45));
    __ xar(v16, __ T2D, v5,  v30, (64 - 36));
    __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
    __ xar(v27, __ T2D, v18, v27, (64 - 21));
    __ xar(v3,  __ T2D, v17, v26, (64 - 15));
    __ xar(v25, __ T2D, v11, v25, (64 - 10));
    __ xar(v26, __ T2D, v7,  v26, (64 - 6));
    __ xar(v30, __ T2D, v10, v30, (64 - 3));

    __ bcax(v20, __ T16B, v31, v22, v8);
    __ bcax(v21, __ T16B, v8,  v23, v22);
    __ bcax(v22, __ T16B, v22, v24, v23);
    __ bcax(v23, __ T16B, v23, v31, v24);
    __ bcax(v24, __ T16B, v24, v8,  v31);

    __ ld1r(v31, __ T2D, __ post(rscratch1, 8));

    __ bcax(v17, __ T16B, v25, v19, v3);
    __ bcax(v18, __ T16B, v3,  v15, v19);
    __ bcax(v19, __ T16B, v19, v16, v15);
    __ bcax(v15, __ T16B, v15, v25, v16);
    __ bcax(v16, __ T16B, v16, v3,  v25);

    __ bcax(v10, __ T16B, v29, v12, v26);
    __ bcax(v11, __ T16B, v26, v13, v12);
    __ bcax(v12, __ T16B, v12, v14, v13);
    __ bcax(v13, __ T16B, v13, v29, v14);
    __ bcax(v14, __ T16B, v14, v26, v29);

    __ bcax(v7, __ T16B, v30, v9,  v4);
    __ bcax(v8, __ T16B, v4,  v5,  v9);
    __ bcax(v9, __ T16B, v9,  v6,  v5);
    __ bcax(v5, __ T16B, v5,  v30, v6);
    __ bcax(v6, __ T16B, v6,  v4,  v30);

    __ bcax(v3, __ T16B, v27, v0,  v28);
    __ bcax(v4, __ T16B, v28, v1,  v0);
    __ bcax(v0, __ T16B, v0,  v2,  v1);
    __ bcax(v1, __ T16B, v1,  v27, v2);
    __ bcax(v2, __ T16B, v2,  v28, v27);

    __ eor(v0, __ T16B, v0, v31);

    __ cbnzw(rscratch2, rounds24_loop);

    if (multi_block) {
      // block_size =  200 - 2 * digest_length, ofs += block_size
      __ add(ofs, ofs, 200);
      __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);

      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha3_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
    __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
    __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
    __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
    __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
    __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
    __ st1(v24, __ T1D, state);

    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ ret(lr);

    return start;
  }

  /**
   *  Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *
   * Output:
   *       rax   - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");

    address start = __ pc();

    const Register crc   = c_rarg0;  // crc
    const Register buf   = c_rarg1;  // source java byte array address
    const Register len   = c_rarg2;  // length
    const Register table0 = c_rarg3; // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3 = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32(crc, buf, len,
              table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /**
   *  Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *   c_rarg3   - int* table
   *
   * Output:
   *       r0   - int crc result
   */
  address generate_updateBytesCRC32C() {
    assert(UseCRC32CIntrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");

    address start = __ pc();

    const Register crc   = c_rarg0;  // crc
    const Register buf   = c_rarg1;  // source java byte array address
    const Register len   = c_rarg2;  // length
    const Register table0 = c_rarg3; // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3 = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32c(crc, buf, len,
              table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /***
   *  Arguments:
   *
   *  Inputs:
   *   c_rarg0   - int   adler
   *   c_rarg1   - byte* buff
   *   c_rarg2   - int   len
   *
   * Output:
   *   c_rarg0   - int adler result
   */
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
    address start = __ pc();

    Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;

    // Aliases
    Register adler  = c_rarg0;
    Register s1     = c_rarg0;
    Register s2     = c_rarg3;
    Register buff   = c_rarg1;
    Register len    = c_rarg2;
    Register nmax  = r4;
    Register base  = r5;
    Register count = r6;
    Register temp0 = rscratch1;
    Register temp1 = rscratch2;
    FloatRegister vbytes = v0;
    FloatRegister vs1acc = v1;
    FloatRegister vs2acc = v2;
    FloatRegister vtable = v3;

    // Max number of bytes we can process before having to take the mod
    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
    uint64_t BASE = 0xfff1;
    uint64_t NMAX = 0x15B0;

    __ mov(base, BASE);
    __ mov(nmax, NMAX);

    // Load accumulation coefficients for the upper 16 bits
    __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
    __ ld1(vtable, __ T16B, Address(temp0));

    // s1 is initialized to the lower 16 bits of adler
    // s2 is initialized to the upper 16 bits of adler
    __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
    __ uxth(s1, adler);          // s1 = (adler & 0xffff)

    // The pipelined loop needs at least 16 elements for 1 iteration
    // It does check this, but it is more effective to skip to the cleanup loop
    __ cmp(len, (u1)16);
    __ br(Assembler::HS, L_nmax);
    __ cbz(len, L_combine);

    __ bind(L_simple_by1_loop);
    __ ldrb(temp0, Address(__ post(buff, 1)));
    __ add(s1, s1, temp0);
    __ add(s2, s2, s1);
    __ subs(len, len, 1);
    __ br(Assembler::HI, L_simple_by1_loop);

    // s1 = s1 % BASE
    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(s2, temp1, s2, ext::uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    __ b(L_combine);

    __ bind(L_nmax);
    __ subs(len, len, nmax);
    __ sub(count, nmax, 16);
    __ br(Assembler::LO, L_by16);

    __ bind(L_nmax_loop);

    generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
                                      vbytes, vs1acc, vs2acc, vtable);

    __ subs(count, count, 16);
    __ br(Assembler::HS, L_nmax_loop);

    // s1 = s1 % BASE
    __ lsr(temp0, s1, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s1, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s1, temp0, 4);
    __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext:: uxth);

    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s2, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s2, temp0, 4);
    __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext:: uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    __ subs(len, len, nmax);
    __ sub(count, nmax, 16);
    __ br(Assembler::HS, L_nmax_loop);

    __ bind(L_by16);
    __ adds(len, len, count);
    __ br(Assembler::LO, L_by1);

    __ bind(L_by16_loop);

    generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
                                      vbytes, vs1acc, vs2acc, vtable);

    __ subs(len, len, 16);
    __ br(Assembler::HS, L_by16_loop);

    __ bind(L_by1);
    __ adds(len, len, 15);
    __ br(Assembler::LO, L_do_mod);

    __ bind(L_by1_loop);
    __ ldrb(temp0, Address(__ post(buff, 1)));
    __ add(s1, temp0, s1);
    __ add(s2, s2, s1);
    __ subs(len, len, 1);
    __ br(Assembler::HS, L_by1_loop);

    __ bind(L_do_mod);
    // s1 = s1 % BASE
    __ lsr(temp0, s1, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s1, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s1, temp0, 4);
    __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext:: uxth);

    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s2, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s2, temp0, 4);
    __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext:: uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    // Combine lower bits and higher bits
    __ bind(L_combine);
    __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)

    __ ret(lr);

    return start;
  }

  void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
          Register temp0, Register temp1, FloatRegister vbytes,
          FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
    // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
    // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
    // In non-vectorized code, we update s1 and s2 as:
    //   s1 <- s1 + b1
    //   s2 <- s2 + s1
    //   s1 <- s1 + b2
    //   s2 <- s2 + b1
    //   ...
    //   s1 <- s1 + b16
    //   s2 <- s2 + s1
    // Putting above assignments together, we have:
    //   s1_new = s1 + b1 + b2 + ... + b16
    //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
    //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
    //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
    __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));

    // s2 = s2 + s1 * 16
    __ add(s2, s2, s1, Assembler::LSL, 4);

    // vs1acc = b1 + b2 + b3 + ... + b16
    // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
    __ umullv(vs2acc, __ T8B, vtable, vbytes);
    __ umlalv(vs2acc, __ T16B, vtable, vbytes);
    __ uaddlv(vs1acc, __ T16B, vbytes);
    __ uaddlv(vs2acc, __ T8H, vs2acc);

    // s1 = s1 + vs1acc, s2 = s2 + vs2acc
    __ fmovd(temp0, vs1acc);
    __ fmovd(temp1, vs2acc);
    __ add(s1, s1, temp0);
    __ add(s2, s2, temp1);
  }

  /**
   *  Arguments:
   *
   *  Input:
   *    c_rarg0   - x address
   *    c_rarg1   - x length
   *    c_rarg2   - y address
   *    c_rarg3   - y length
   *    c_rarg4   - z address
   *    c_rarg5   - z length
   */
  address generate_multiplyToLen() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");

    address start = __ pc();
    const Register x     = r0;
    const Register xlen  = r1;
    const Register y     = r2;
    const Register ylen  = r3;
    const Register z     = r4;
    const Register zlen  = r5;

    const Register tmp1  = r10;
    const Register tmp2  = r11;
    const Register tmp3  = r12;
    const Register tmp4  = r13;
    const Register tmp5  = r14;
    const Register tmp6  = r15;
    const Register tmp7  = r16;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  address generate_squareToLen() {
    // squareToLen algorithm for sizes 1..127 described in java code works
    // faster than multiply_to_len on some CPUs and slower on others, but
    // multiply_to_len shows a bit better overall results
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "squareToLen");
    address start = __ pc();

    const Register x     = r0;
    const Register xlen  = r1;
    const Register z     = r2;
    const Register zlen  = r3;
    const Register y     = r4; // == x
    const Register ylen  = r5; // == xlen

    const Register tmp1  = r10;
    const Register tmp2  = r11;
    const Register tmp3  = r12;
    const Register tmp4  = r13;
    const Register tmp5  = r14;
    const Register tmp6  = r15;
    const Register tmp7  = r16;

    RegSet spilled_regs = RegSet::of(y, ylen);
    BLOCK_COMMENT("Entry:");
    __ enter();
    __ push(spilled_regs, sp);
    __ mov(y, x);
    __ mov(ylen, xlen);
    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ pop(spilled_regs, sp);
    __ leave();
    __ ret(lr);
    return start;
  }

  address generate_mulAdd() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "mulAdd");

    address start = __ pc();

    const Register out     = r0;
    const Register in      = r1;
    const Register offset  = r2;
    const Register len     = r3;
    const Register k       = r4;

    BLOCK_COMMENT("Entry:");
    __ enter();
    __ mul_add(out, in, offset, len, k);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Input:
  //   c_rarg0   - newArr address
  //   c_rarg1   - oldArr address
  //   c_rarg2   - newIdx
  //   c_rarg3   - shiftCount
  //   c_rarg4   - numIter
  //
  address generate_bigIntegerRightShift() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
    address start = __ pc();

    Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;

    Register newArr        = c_rarg0;
    Register oldArr        = c_rarg1;
    Register newIdx        = c_rarg2;
    Register shiftCount    = c_rarg3;
    Register numIter       = c_rarg4;
    Register idx           = numIter;

    Register newArrCur     = rscratch1;
    Register shiftRevCount = rscratch2;
    Register oldArrCur     = r13;
    Register oldArrNext    = r14;

    FloatRegister oldElem0        = v0;
    FloatRegister oldElem1        = v1;
    FloatRegister newElem         = v2;
    FloatRegister shiftVCount     = v3;
    FloatRegister shiftVRevCount  = v4;

    __ cbz(idx, Exit);

    __ add(newArr, newArr, newIdx, Assembler::LSL, 2);

    // left shift count
    __ movw(shiftRevCount, 32);
    __ subw(shiftRevCount, shiftRevCount, shiftCount);

    // numIter too small to allow a 4-words SIMD loop, rolling back
    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftThree);

    __ dup(shiftVCount,    __ T4S, shiftCount);
    __ dup(shiftVRevCount, __ T4S, shiftRevCount);
    __ negr(shiftVCount,   __ T4S, shiftVCount);

    __ BIND(ShiftSIMDLoop);

    // Calculate the load addresses
    __ sub(idx, idx, 4);
    __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
    __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
    __ add(oldArrCur,  oldArrNext, 4);

    // Load 4 words and process
    __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
    __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
    __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
    __ orr(newElem,   __ T16B, oldElem0, oldElem1);
    __ st1(newElem,   __ T4S,  Address(newArrCur));

    __ cmp(idx, (u1)4);
    __ br(Assembler::LT, ShiftTwoLoop);
    __ b(ShiftSIMDLoop);

    __ BIND(ShiftTwoLoop);
    __ cbz(idx, Exit);
    __ cmp(idx, (u1)1);
    __ br(Assembler::EQ, ShiftOne);

    // Calculate the load addresses
    __ sub(idx, idx, 2);
    __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
    __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
    __ add(oldArrCur,  oldArrNext, 4);

    // Load 2 words and process
    __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
    __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
    __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
    __ orr(newElem,   __ T8B, oldElem0, oldElem1);
    __ st1(newElem,   __ T2S, Address(newArrCur));
    __ b(ShiftTwoLoop);

    __ BIND(ShiftThree);
    __ tbz(idx, 1, ShiftOne);
    __ tbz(idx, 0, ShiftTwo);
    __ ldrw(r10,  Address(oldArr, 12));
    __ ldrw(r11,  Address(oldArr, 8));
    __ lsrvw(r10, r10, shiftCount);
    __ lslvw(r11, r11, shiftRevCount);
    __ orrw(r12,  r10, r11);
    __ strw(r12,  Address(newArr, 8));

    __ BIND(ShiftTwo);
    __ ldrw(r10,  Address(oldArr, 8));
    __ ldrw(r11,  Address(oldArr, 4));
    __ lsrvw(r10, r10, shiftCount);
    __ lslvw(r11, r11, shiftRevCount);
    __ orrw(r12,  r10, r11);
    __ strw(r12,  Address(newArr, 4));

    __ BIND(ShiftOne);
    __ ldrw(r10,  Address(oldArr, 4));
    __ ldrw(r11,  Address(oldArr));
    __ lsrvw(r10, r10, shiftCount);
    __ lslvw(r11, r11, shiftRevCount);
    __ orrw(r12,  r10, r11);
    __ strw(r12,  Address(newArr));

    __ BIND(Exit);
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Input:
  //   c_rarg0   - newArr address
  //   c_rarg1   - oldArr address
  //   c_rarg2   - newIdx
  //   c_rarg3   - shiftCount
  //   c_rarg4   - numIter
  //
  address generate_bigIntegerLeftShift() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
    address start = __ pc();

    Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;

    Register newArr        = c_rarg0;
    Register oldArr        = c_rarg1;
    Register newIdx        = c_rarg2;
    Register shiftCount    = c_rarg3;
    Register numIter       = c_rarg4;

    Register shiftRevCount = rscratch1;
    Register oldArrNext    = rscratch2;

    FloatRegister oldElem0        = v0;
    FloatRegister oldElem1        = v1;
    FloatRegister newElem         = v2;
    FloatRegister shiftVCount     = v3;
    FloatRegister shiftVRevCount  = v4;

    __ cbz(numIter, Exit);

    __ add(oldArrNext, oldArr, 4);
    __ add(newArr, newArr, newIdx, Assembler::LSL, 2);

    // right shift count
    __ movw(shiftRevCount, 32);
    __ subw(shiftRevCount, shiftRevCount, shiftCount);

    // numIter too small to allow a 4-words SIMD loop, rolling back
    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftThree);

    __ dup(shiftVCount,     __ T4S, shiftCount);
    __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
    __ negr(shiftVRevCount, __ T4S, shiftVRevCount);

    __ BIND(ShiftSIMDLoop);

    // load 4 words and process
    __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
    __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
    __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
    __ orr(newElem,   __ T16B, oldElem0, oldElem1);
    __ st1(newElem,   __ T4S,  __ post(newArr, 16));
    __ sub(numIter,   numIter, 4);

    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftTwoLoop);
    __ b(ShiftSIMDLoop);

    __ BIND(ShiftTwoLoop);
    __ cbz(numIter, Exit);
    __ cmp(numIter, (u1)1);
    __ br(Assembler::EQ, ShiftOne);

    // load 2 words and process
    __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
    __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
    __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
    __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
    __ st1(newElem,   __ T2S,  __ post(newArr, 8));
    __ sub(numIter,   numIter, 2);
    __ b(ShiftTwoLoop);

    __ BIND(ShiftThree);
    __ ldrw(r10,  __ post(oldArr, 4));
    __ ldrw(r11,  __ post(oldArrNext, 4));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12,  r10, r11);
    __ strw(r12,  __ post(newArr, 4));
    __ tbz(numIter, 1, Exit);
    __ tbz(numIter, 0, ShiftOne);

    __ BIND(ShiftTwo);
    __ ldrw(r10,  __ post(oldArr, 4));
    __ ldrw(r11,  __ post(oldArrNext, 4));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12,  r10, r11);
    __ strw(r12,  __ post(newArr, 4));

    __ BIND(ShiftOne);
    __ ldrw(r10,  Address(oldArr));
    __ ldrw(r11,  Address(oldArrNext));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12,  r10, r11);
    __ strw(r12,  Address(newArr));

    __ BIND(Exit);
    __ ret(lr);

    return start;
  }

  address generate_count_positives(address &count_positives_long) {
    const u1 large_loop_size = 64;
    const uint64_t UPPER_BIT_MASK=0x8080808080808080;
    int dcache_line = VM_Version::dcache_line_size();

    Register ary1 = r1, len = r2, result = r0;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", "count_positives");

    address entry = __ pc();

    __ enter();
    // precondition: a copy of len is already in result
    // __ mov(result, len);

  Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
        LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;

  __ cmp(len, (u1)15);
  __ br(Assembler::GT, LEN_OVER_15);
  // The only case when execution falls into this code is when pointer is near
  // the end of memory page and we have to avoid reading next page
  __ add(ary1, ary1, len);
  __ subs(len, len, 8);
  __ br(Assembler::GT, LEN_OVER_8);
  __ ldr(rscratch2, Address(ary1, -8));
  __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
  __ lsrv(rscratch2, rscratch2, rscratch1);
  __ tst(rscratch2, UPPER_BIT_MASK);
  __ csel(result, zr, result, Assembler::NE);
  __ leave();
  __ ret(lr);
  __ bind(LEN_OVER_8);
  __ ldp(rscratch1, rscratch2, Address(ary1, -16));
  __ sub(len, len, 8); // no data dep., then sub can be executed while loading
  __ tst(rscratch2, UPPER_BIT_MASK);
  __ br(Assembler::NE, RET_NO_POP);
  __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
  __ lsrv(rscratch1, rscratch1, rscratch2);
  __ tst(rscratch1, UPPER_BIT_MASK);
  __ bind(RET_NO_POP);
  __ csel(result, zr, result, Assembler::NE);
  __ leave();
  __ ret(lr);

  Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
  const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;

  count_positives_long = __ pc(); // 2nd entry point

  __ enter();

  __ bind(LEN_OVER_15);
    __ push(spilled_regs, sp);
    __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
    __ cbz(rscratch2, ALIGNED);
    __ ldp(tmp6, tmp1, Address(ary1));
    __ mov(tmp5, 16);
    __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
    __ add(ary1, ary1, rscratch1);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    __ sub(len, len, rscratch1);

  __ bind(ALIGNED);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);
    // Perform 16-byte load as early return in pre-loop to handle situation
    // when initially aligned large array has negative values at starting bytes,
    // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
    // slower. Cases with negative bytes further ahead won't be affected that
    // much. In fact, it'll be faster due to early loads, less instructions and
    // less branches in LARGE_LOOP.
    __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_16);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);

    if (SoftwarePrefetchHintDistance >= 0
        && SoftwarePrefetchHintDistance >= dcache_line) {
      // initial prefetch
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
    }
  __ bind(LARGE_LOOP);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
    }
    // Issue load instructions first, since it can save few CPU/MEM cycles, also
    // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp)
    // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3
    // instructions per cycle and have less branches, but this approach disables
    // early return, thus, all 64 bytes are loaded and checked every time.
    __ ldp(tmp2, tmp3, Address(ary1));
    __ ldp(tmp4, tmp5, Address(ary1, 16));
    __ ldp(rscratch1, rscratch2, Address(ary1, 32));
    __ ldp(tmp6, tmp1, Address(ary1, 48));
    __ add(ary1, ary1, large_loop_size);
    __ sub(len, len, large_loop_size);
    __ orr(tmp2, tmp2, tmp3);
    __ orr(tmp4, tmp4, tmp5);
    __ orr(rscratch1, rscratch1, rscratch2);
    __ orr(tmp6, tmp6, tmp1);
    __ orr(tmp2, tmp2, tmp4);
    __ orr(rscratch1, rscratch1, tmp6);
    __ orr(tmp2, tmp2, rscratch1);
    __ tst(tmp2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_LONG);
    __ cmp(len, large_loop_size);
    __ br(Assembler::GE, LARGE_LOOP);

  __ bind(CHECK_16); // small 16-byte load pre-loop
    __ cmp(len, (u1)16);
    __ br(Assembler::LT, POST_LOOP16);

  __ bind(LOOP16); // small 16-byte load loop
    __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp2, tmp2, tmp3);
    __ tst(tmp2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_16);
    __ cmp(len, (u1)16);
    __ br(Assembler::GE, LOOP16); // 16-byte load loop end

  __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
    __ cmp(len, (u1)8);
    __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
    __ ldr(tmp3, Address(__ post(ary1, 8)));
    __ tst(tmp3, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    __ sub(len, len, 8);

  __ bind(POST_LOOP16_LOAD_TAIL);
    __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
    __ ldr(tmp1, Address(ary1));
    __ mov(tmp2, 64);
    __ sub(tmp4, tmp2, len, __ LSL, 3);
    __ lslv(tmp1, tmp1, tmp4);
    __ tst(tmp1, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    // Fallthrough

  __ bind(RET_LEN);
    __ pop(spilled_regs, sp);
    __ leave();
    __ ret(lr);

    // difference result - len is the count of guaranteed to be
    // positive bytes

  __ bind(RET_ADJUST_LONG);
    __ add(len, len, (u1)(large_loop_size - 16));
  __ bind(RET_ADJUST_16);
    __ add(len, len, 16);
  __ bind(RET_ADJUST);
    __ pop(spilled_regs, sp);
    __ leave();
    __ sub(result, result, len);
    __ ret(lr);

    return entry;
  }

  void generate_large_array_equals_loop_nonsimd(int loopThreshold,
        bool usePrefetch, Label &NOT_EQUAL) {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
        tmp7 = r12, tmp8 = r13;
    Label LOOP;

    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    __ bind(LOOP);
    if (usePrefetch) {
      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
    }
    __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp1, tmp1, tmp3);
    __ cbnz(tmp1, NOT_EQUAL);
    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp5, tmp5, tmp6);
    __ eor(tmp7, tmp7, tmp8);
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp5, tmp5, tmp7);
    __ cbnz(tmp5, NOT_EQUAL);
    __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp1, tmp1, tmp3);
    __ cbnz(tmp1, NOT_EQUAL);
    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp5, tmp5, tmp6);
    __ sub(cnt1, cnt1, 8 * wordSize);
    __ eor(tmp7, tmp7, tmp8);
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operand.
    __ subs(tmp6, cnt1, loopThreshold);
    __ orr(tmp5, tmp5, tmp7);
    __ cbnz(tmp5, NOT_EQUAL);
    __ br(__ GE, LOOP);
    // post-loop
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ orr(tmp1, tmp1, tmp3);
    __ sub(cnt1, cnt1, 2 * wordSize);
    __ cbnz(tmp1, NOT_EQUAL);
  }

  void generate_large_array_equals_loop_simd(int loopThreshold,
        bool usePrefetch, Label &NOT_EQUAL) {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2;
    Label LOOP;

    __ bind(LOOP);
    if (usePrefetch) {
      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
    }
    __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
    __ sub(cnt1, cnt1, 8 * wordSize);
    __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
    __ subs(tmp1, cnt1, loopThreshold);
    __ eor(v0, __ T16B, v0, v4);
    __ eor(v1, __ T16B, v1, v5);
    __ eor(v2, __ T16B, v2, v6);
    __ eor(v3, __ T16B, v3, v7);
    __ orr(v0, __ T16B, v0, v1);
    __ orr(v1, __ T16B, v2, v3);
    __ orr(v0, __ T16B, v0, v1);
    __ umov(tmp1, v0, __ D, 0);
    __ umov(tmp2, v0, __ D, 1);
    __ orr(tmp1, tmp1, tmp2);
    __ cbnz(tmp1, NOT_EQUAL);
    __ br(__ GE, LOOP);
  }

  // a1 = r1 - array1 address
  // a2 = r2 - array2 address
  // result = r0 - return value. Already contains "false"
  // cnt1 = r10 - amount of elements left to check, reduced by wordSize
  // r3-r5 are reserved temporary registers
  address generate_large_array_equals() {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
        tmp7 = r12, tmp8 = r13;
    Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
        SMALL_LOOP, POST_LOOP;
    const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
    // calculate if at least 32 prefetched bytes are used
    int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
    int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
    RegSet spilled_regs = RegSet::range(tmp6, tmp8);
    assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
        tmp5, tmp6, tmp7, tmp8);

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", "large_array_equals");

    address entry = __ pc();
    __ enter();
    __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
    // also advance pointers to use post-increment instead of pre-increment
    __ add(a1, a1, wordSize);
    __ add(a2, a2, wordSize);
    if (AvoidUnalignedAccesses) {
      // both implementations (SIMD/nonSIMD) are using relatively large load
      // instructions (ld1/ldp), which has huge penalty (up to x2 exec time)
      // on some CPUs in case of address is not at least 16-byte aligned.
      // Arrays are 8-byte aligned currently, so, we can make additional 8-byte
      // load if needed at least for 1st address and make if 16-byte aligned.
      Label ALIGNED16;
      __ tbz(a1, 3, ALIGNED16);
      __ ldr(tmp1, Address(__ post(a1, wordSize)));
      __ ldr(tmp2, Address(__ post(a2, wordSize)));
      __ sub(cnt1, cnt1, wordSize);
      __ eor(tmp1, tmp1, tmp2);
      __ cbnz(tmp1, NOT_EQUAL_NO_POP);
      __ bind(ALIGNED16);
    }
    if (UseSIMDForArrayEquals) {
      if (SoftwarePrefetchHintDistance >= 0) {
        __ subs(tmp1, cnt1, prefetchLoopThreshold);
        __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
        generate_large_array_equals_loop_simd(prefetchLoopThreshold,
            /* prfm = */ true, NOT_EQUAL);
        __ subs(zr, cnt1, nonPrefetchLoopThreshold);
        __ br(__ LT, TAIL);
      }
      __ bind(NO_PREFETCH_LARGE_LOOP);
      generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
          /* prfm = */ false, NOT_EQUAL);
    } else {
      __ push(spilled_regs, sp);
      if (SoftwarePrefetchHintDistance >= 0) {
        __ subs(tmp1, cnt1, prefetchLoopThreshold);
        __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
        generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
            /* prfm = */ true, NOT_EQUAL);
        __ subs(zr, cnt1, nonPrefetchLoopThreshold);
        __ br(__ LT, TAIL);
      }
      __ bind(NO_PREFETCH_LARGE_LOOP);
      generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
          /* prfm = */ false, NOT_EQUAL);
    }
    __ bind(TAIL);
      __ cbz(cnt1, EQUAL);
      __ subs(cnt1, cnt1, wordSize);
      __ br(__ LE, POST_LOOP);
    __ bind(SMALL_LOOP);
      __ ldr(tmp1, Address(__ post(a1, wordSize)));
      __ ldr(tmp2, Address(__ post(a2, wordSize)));
      __ subs(cnt1, cnt1, wordSize);
      __ eor(tmp1, tmp1, tmp2);
      __ cbnz(tmp1, NOT_EQUAL);
      __ br(__ GT, SMALL_LOOP);
    __ bind(POST_LOOP);
      __ ldr(tmp1, Address(a1, cnt1));
      __ ldr(tmp2, Address(a2, cnt1));
      __ eor(tmp1, tmp1, tmp2);
      __ cbnz(tmp1, NOT_EQUAL);
    __ bind(EQUAL);
      __ mov(result, true);
    __ bind(NOT_EQUAL);
      if (!UseSIMDForArrayEquals) {
        __ pop(spilled_regs, sp);
      }
    __ bind(NOT_EQUAL_NO_POP);
    __ leave();
    __ ret(lr);
    return entry;
  }

  address generate_dsin_dcos(bool isCos) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
    address start = __ pc();
    __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
        (address)StubRoutines::aarch64::_two_over_pi,
        (address)StubRoutines::aarch64::_pio2,
        (address)StubRoutines::aarch64::_dsin_coef,
        (address)StubRoutines::aarch64::_dcos_coef);
    return start;
  }

  address generate_dlog() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "dlog");
    address entry = __ pc();
    FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
        vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
    Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
    __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
        tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
    return entry;
  }


  // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
  void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
      Label &DIFF2) {
    Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
    FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;

    __ ldrq(vtmp, Address(__ post(tmp2, 16)));
    __ ldr(tmpU, Address(__ post(cnt1, 8)));
    __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
    // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3

    __ fmovd(tmpL, vtmp3);
    __ eor(rscratch2, tmp3, tmpL);
    __ cbnz(rscratch2, DIFF2);

    __ ldr(tmp3, Address(__ post(cnt1, 8)));
    __ umov(tmpL, vtmp3, __ D, 1);
    __ eor(rscratch2, tmpU, tmpL);
    __ cbnz(rscratch2, DIFF1);

    __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
    __ ldr(tmpU, Address(__ post(cnt1, 8)));
    __ fmovd(tmpL, vtmp);
    __ eor(rscratch2, tmp3, tmpL);
    __ cbnz(rscratch2, DIFF2);

    __ ldr(tmp3, Address(__ post(cnt1, 8)));
    __ umov(tmpL, vtmp, __ D, 1);
    __ eor(rscratch2, tmpU, tmpL);
    __ cbnz(rscratch2, DIFF1);
  }

  // r0  = result
  // r1  = str1
  // r2  = cnt1
  // r3  = str2
  // r4  = cnt2
  // r10 = tmp1
  // r11 = tmp2
  address generate_compare_long_string_different_encoding(bool isLU) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", isLU
        ? "compare_long_string_different_encoding LU"
        : "compare_long_string_different_encoding UL");
    address entry = __ pc();
    Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
        DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
        LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
        tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
    FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
    RegSet spilled_regs = RegSet::of(tmp3, tmp4);

    int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);

    __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == amount of characters left to compare
    // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
    __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
    __ add(str1, str1, isLU ? wordSize/2 : wordSize);
    __ add(str2, str2, isLU ? wordSize : wordSize/2);
    __ fmovd(isLU ? tmp1 : tmp2, vtmp);
    __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
    __ eor(rscratch2, tmp1, tmp2);
    __ mov(rscratch1, tmp2);
    __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
    Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
             tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
    __ push(spilled_regs, sp);
    __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
    __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load

    __ ldr(tmp3, Address(__ post(cnt1, 8)));

    if (SoftwarePrefetchHintDistance >= 0) {
      __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
      __ br(__ LT, NO_PREFETCH);
      __ bind(LARGE_LOOP_PREFETCH);
        __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
        __ mov(tmp4, 2);
        __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
        __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
          compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
          __ subs(tmp4, tmp4, 1);
          __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
          __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
          __ mov(tmp4, 2);
        __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
          compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
          __ subs(tmp4, tmp4, 1);
          __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
          __ sub(cnt2, cnt2, 64);
          __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
          __ br(__ GE, LARGE_LOOP_PREFETCH);
    }
    __ cbz(cnt2, LOAD_LAST); // no characters left except last load
    __ bind(NO_PREFETCH);
    __ subs(cnt2, cnt2, 16);
    __ br(__ LT, TAIL);
    __ align(OptoLoopAlignment);
    __ bind(SMALL_LOOP); // smaller loop
      __ subs(cnt2, cnt2, 16);
      compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
      __ br(__ GE, SMALL_LOOP);
      __ cmn(cnt2, (u1)16);
      __ br(__ EQ, LOAD_LAST);
    __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
      __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
      __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
      __ ldr(tmp3, Address(cnt1, -8));
      compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
      __ b(LOAD_LAST);
    __ bind(DIFF2);
      __ mov(tmpU, tmp3);
    __ bind(DIFF1);
      __ pop(spilled_regs, sp);
      __ b(CALCULATE_DIFFERENCE);
    __ bind(LOAD_LAST);
      // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
      // No need to load it again
      __ mov(tmpU, tmp3);
      __ pop(spilled_regs, sp);

      // tmp2 points to the address of the last 4 Latin1 characters right now
      __ ldrs(vtmp, Address(tmp2));
      __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
      __ fmovd(tmpL, vtmp);

      __ eor(rscratch2, tmpU, tmpL);
      __ cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    __ bind(CALCULATE_DIFFERENCE);
      __ rev(rscratch2, rscratch2);
      __ clz(rscratch2, rscratch2);
      __ andr(rscratch2, rscratch2, -16);
      __ lsrv(tmp1, tmp1, rscratch2);
      __ uxthw(tmp1, tmp1);
      __ lsrv(rscratch1, rscratch1, rscratch2);
      __ uxthw(rscratch1, rscratch1);
      __ subw(result, tmp1, rscratch1);
    __ bind(DONE);
      __ ret(lr);
    return entry;
  }

  address generate_method_entry_barrier() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");

    Label deoptimize_label;

    address start = __ pc();

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();

    if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
      BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
      // We can get here despite the nmethod being good, if we have not
      // yet applied our cross modification fence (or data fence).
      Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_offset()) + 4);
      __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
      __ ldrw(rscratch2, rscratch2);
      __ strw(rscratch2, thread_epoch_addr);
      __ isb();
      __ membar(__ LoadLoad);
    }

    __ set_last_Java_frame(sp, rfp, lr, rscratch1);

    __ enter();
    __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr

    __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}

    __ push_call_clobbered_registers();

    __ mov(c_rarg0, rscratch2);
    __ call_VM_leaf
         (CAST_FROM_FN_PTR
          (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);

    __ reset_last_Java_frame(true);

    __ mov(rscratch1, r0);

    __ pop_call_clobbered_registers();

    __ cbnz(rscratch1, deoptimize_label);

    __ leave();
    __ ret(lr);

    __ BIND(deoptimize_label);

    __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
    __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));

    __ mov(sp, rscratch1);
    __ br(rscratch2);

    return start;
  }

  // r0  = result
  // r1  = str1
  // r2  = cnt1
  // r3  = str2
  // r4  = cnt2
  // r10 = tmp1
  // r11 = tmp2
  address generate_compare_long_string_same_encoding(bool isLL) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", isLL
        ? "compare_long_string_same_encoding LL"
        : "compare_long_string_same_encoding UU");
    address entry = __ pc();
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
        tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;

    Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;

    // exit from large loop when less than 64 bytes left to read or we're about
    // to prefetch memory behind array border
    int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);

    // before jumping to stub, pre-load 8 bytes already, so do comparison directly
    __ eor(rscratch2, tmp1, tmp2);
    __ cbnz(rscratch2, CAL_DIFFERENCE);

    __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
    // update pointers, because of previous read
    __ add(str1, str1, wordSize);
    __ add(str2, str2, wordSize);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ align(OptoLoopAlignment);
      __ bind(LARGE_LOOP_PREFETCH);
        __ prfm(Address(str1, SoftwarePrefetchHintDistance));
        __ prfm(Address(str2, SoftwarePrefetchHintDistance));

        for (int i = 0; i < 4; i++) {
          __ ldp(tmp1, tmp1h, Address(str1, i * 16));
          __ ldp(tmp2, tmp2h, Address(str2, i * 16));
          __ cmp(tmp1, tmp2);
          __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
          __ br(Assembler::NE, DIFF);
        }
        __ sub(cnt2, cnt2, isLL ? 64 : 32);
        __ add(str1, str1, 64);
        __ add(str2, str2, 64);
        __ subs(rscratch2, cnt2, largeLoopExitCondition);
        __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
        __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
    }

    __ subs(rscratch1, cnt2, isLL ? 16 : 8);
    __ br(Assembler::LE, LESS16);
    __ align(OptoLoopAlignment);
    __ bind(LOOP_COMPARE16);
      __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
      __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
      __ cmp(tmp1, tmp2);
      __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
      __ br(Assembler::NE, DIFF);
      __ sub(cnt2, cnt2, isLL ? 16 : 8);
      __ subs(rscratch2, cnt2, isLL ? 16 : 8);
      __ br(Assembler::LT, LESS16);

      __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
      __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
      __ cmp(tmp1, tmp2);
      __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
      __ br(Assembler::NE, DIFF);
      __ sub(cnt2, cnt2, isLL ? 16 : 8);
      __ subs(rscratch2, cnt2, isLL ? 16 : 8);
      __ br(Assembler::GE, LOOP_COMPARE16);
      __ cbz(cnt2, LENGTH_DIFF);

    __ bind(LESS16);
      // each 8 compare
      __ subs(cnt2, cnt2, isLL ? 8 : 4);
      __ br(Assembler::LE, LESS8);
      __ ldr(tmp1, Address(__ post(str1, 8)));
      __ ldr(tmp2, Address(__ post(str2, 8)));
      __ eor(rscratch2, tmp1, tmp2);
      __ cbnz(rscratch2, CAL_DIFFERENCE);
      __ sub(cnt2, cnt2, isLL ? 8 : 4);

    __ bind(LESS8); // directly load last 8 bytes
      if (!isLL) {
        __ add(cnt2, cnt2, cnt2);
      }
      __ ldr(tmp1, Address(str1, cnt2));
      __ ldr(tmp2, Address(str2, cnt2));
      __ eor(rscratch2, tmp1, tmp2);
      __ cbz(rscratch2, LENGTH_DIFF);
      __ b(CAL_DIFFERENCE);

    __ bind(DIFF);
      __ cmp(tmp1, tmp2);
      __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
      __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
      // reuse rscratch2 register for the result of eor instruction
      __ eor(rscratch2, tmp1, tmp2);

    __ bind(CAL_DIFFERENCE);
      __ rev(rscratch2, rscratch2);
      __ clz(rscratch2, rscratch2);
      __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
      __ lsrv(tmp1, tmp1, rscratch2);
      __ lsrv(tmp2, tmp2, rscratch2);
      if (isLL) {
        __ uxtbw(tmp1, tmp1);
        __ uxtbw(tmp2, tmp2);
      } else {
        __ uxthw(tmp1, tmp1);
        __ uxthw(tmp2, tmp2);
      }
      __ subw(result, tmp1, tmp2);

    __ bind(LENGTH_DIFF);
      __ ret(lr);
    return entry;
  }

  enum string_compare_mode {
    LL,
    LU,
    UL,
    UU,
  };

  // The following registers are declared in aarch64.ad
  // r0  = result
  // r1  = str1
  // r2  = cnt1
  // r3  = str2
  // r4  = cnt2
  // r10 = tmp1
  // r11 = tmp2
  // z0  = ztmp1
  // z1  = ztmp2
  // p0  = pgtmp1
  // p1  = pgtmp2
  address generate_compare_long_string_sve(string_compare_mode mode) {
    __ align(CodeEntryAlignment);
    address entry = __ pc();
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
             tmp1 = r10, tmp2 = r11;

    Label LOOP, DONE, MISMATCH;
    Register vec_len = tmp1;
    Register idx = tmp2;
    // The minimum of the string lengths has been stored in cnt2.
    Register cnt = cnt2;
    FloatRegister ztmp1 = z0, ztmp2 = z1;
    PRegister pgtmp1 = p0, pgtmp2 = p1;

#define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
    switch (mode) {                                                            \
      case LL:                                                                 \
        __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
        __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
        break;                                                                 \
      case LU:                                                                 \
        __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
        __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
        break;                                                                 \
      case UL:                                                                 \
        __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
        __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
        break;                                                                 \
      case UU:                                                                 \
        __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
        __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
        break;                                                                 \
      default:                                                                 \
        ShouldNotReachHere();                                                  \
    }

    const char* stubname;
    switch (mode) {
      case LL: stubname = "compare_long_string_same_encoding LL";      break;
      case LU: stubname = "compare_long_string_different_encoding LU"; break;
      case UL: stubname = "compare_long_string_different_encoding UL"; break;
      case UU: stubname = "compare_long_string_same_encoding UU";      break;
      default: ShouldNotReachHere();
    }

    StubCodeMark mark(this, "StubRoutines", stubname);

    __ mov(idx, 0);
    __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);

    if (mode == LL) {
      __ sve_cntb(vec_len);
    } else {
      __ sve_cnth(vec_len);
    }

    __ sub(rscratch1, cnt, vec_len);

    __ bind(LOOP);

      // main loop
      LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
      __ add(idx, idx, vec_len);
      // Compare strings.
      __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
      __ br(__ NE, MISMATCH);
      __ cmp(idx, rscratch1);
      __ br(__ LT, LOOP);

    // post loop, last iteration
    __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);

    LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
    __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
    __ br(__ EQ, DONE);

    __ bind(MISMATCH);

    // Crop the vector to find its location.
    __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
    // Extract the first different characters of each string.
    __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
    __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);

    // Compute the difference of the first different characters.
    __ sub(result, rscratch1, rscratch2);

    __ bind(DONE);
    __ ret(lr);
#undef LOAD_PAIR
    return entry;
  }

  void generate_compare_long_strings() {
    if (UseSVE == 0) {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_same_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_same_encoding(false);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_different_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_different_encoding(false);
    } else {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_sve(LL);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_sve(UU);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_sve(LU);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_sve(UL);
    }
  }

  // R0 = result
  // R1 = str2
  // R2 = cnt1
  // R3 = str1
  // R4 = cnt2
  // This generic linear code use few additional ideas, which makes it faster:
  // 1) we can safely keep at least 1st register of pattern(since length >= 8)
  // in order to skip initial loading(help in systems with 1 ld pipeline)
  // 2) we can use "fast" algorithm of finding single character to search for
  // first symbol with less branches(1 branch per each loaded register instead
  // of branch for each symbol), so, this is where constants like
  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from
  // 3) after loading and analyzing 1st register of source string, it can be
  // used to search for every 1st character entry, saving few loads in
  // comparison with "simplier-but-slower" implementation
  // 4) in order to avoid lots of push/pop operations, code below is heavily
  // re-using/re-initializing/compressing register values, which makes code
  // larger and a bit less readable, however, most of extra operations are
  // issued during loads or branches, so, penalty is minimal
  address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
    const char* stubName = str1_isL
        ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
        : "indexof_linear_uu";
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stubName);
    address entry = __ pc();

    int str1_chr_size = str1_isL ? 1 : 2;
    int str2_chr_size = str2_isL ? 1 : 2;
    int str1_chr_shift = str1_isL ? 0 : 1;
    int str2_chr_shift = str2_isL ? 0 : 1;
    bool isL = str1_isL && str2_isL;
   // parameters
    Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
    // temporary registers
    Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
    RegSet spilled_regs = RegSet::range(tmp1, tmp4);
    // redefinitions
    Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;

    __ push(spilled_regs, sp);
    Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
        L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
        L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
        L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
        L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
        L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read whole register from str1. It is safe, because length >=8 here
    __ ldr(ch1, Address(str1));
    // Read whole register from str2. It is safe, because length >=8 here
    __ ldr(ch2, Address(str2));
    __ sub(cnt2, cnt2, cnt1);
    __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
    if (str1_isL != str2_isL) {
      __ eor(v0, __ T16B, v0, v0);
    }
    __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
    __ mul(first, first, tmp1);
    // check if we have less than 1 register to check
    __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
    if (str1_isL != str2_isL) {
      __ fmovd(v1, ch1);
    }
    __ br(__ LE, L_SMALL);
    __ eor(ch2, first, ch2);
    if (str1_isL != str2_isL) {
      __ zip1(v1, __ T16B, v1, v0);
    }
    __ sub(tmp2, ch2, tmp1);
    __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
    __ bics(tmp2, tmp2, ch2);
    if (str1_isL != str2_isL) {
      __ fmovd(ch1, v1);
    }
    __ br(__ NE, L_HAS_ZERO);
    __ subs(cnt2, cnt2, wordSize/str2_chr_size);
    __ add(result, result, wordSize/str2_chr_size);
    __ add(str2, str2, wordSize);
    __ br(__ LT, L_POST_LOOP);
    __ BIND(L_LOOP);
      __ ldr(ch2, Address(str2));
      __ eor(ch2, first, ch2);
      __ sub(tmp2, ch2, tmp1);
      __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      __ bics(tmp2, tmp2, ch2);
      __ br(__ NE, L_HAS_ZERO);
    __ BIND(L_LOOP_PROCEED);
      __ subs(cnt2, cnt2, wordSize/str2_chr_size);
      __ add(str2, str2, wordSize);
      __ add(result, result, wordSize/str2_chr_size);
      __ br(__ GE, L_LOOP);
    __ BIND(L_POST_LOOP);
      __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
      __ br(__ LE, NOMATCH);
      __ ldr(ch2, Address(str2));
      __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
      __ eor(ch2, first, ch2);
      __ sub(tmp2, ch2, tmp1);
      __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      __ mov(tmp4, -1); // all bits set
      __ b(L_SMALL_PROCEED);
    __ align(OptoLoopAlignment);
    __ BIND(L_SMALL);
      __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
      __ eor(ch2, first, ch2);
      if (str1_isL != str2_isL) {
        __ zip1(v1, __ T16B, v1, v0);
      }
      __ sub(tmp2, ch2, tmp1);
      __ mov(tmp4, -1); // all bits set
      __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      if (str1_isL != str2_isL) {
        __ fmovd(ch1, v1); // move converted 4 symbols
      }
    __ BIND(L_SMALL_PROCEED);
      __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
      __ bic(tmp2, tmp2, ch2);
      __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
      __ rbit(tmp2, tmp2);
      __ br(__ EQ, NOMATCH);
    __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
      __ cmp(cnt1, u1(wordSize/str2_chr_size));
      __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
      if (str2_isL) { // LL
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
        __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
      } else {
        __ mov(ch2, 0xE); // all bits in byte set except last one
        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4);
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      }
      __ cmp(ch1, ch2);
      __ mov(tmp4, wordSize/str2_chr_size);
      __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
    __ BIND(L_SMALL_CMP_LOOP);
      str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
               : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
      str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
               : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
      __ add(tmp4, tmp4, 1);
      __ cmp(tmp4, cnt1);
      __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
      __ cmp(first, ch2);
      __ br(__ EQ, L_SMALL_CMP_LOOP);
    __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
      __ cbz(tmp2, NOMATCH); // no more matches. exit
      __ clz(tmp4, tmp2);
      __ add(result, result, 1); // advance index
      __ add(str2, str2, str2_chr_size); // advance pointer
      __ b(L_SMALL_HAS_ZERO_LOOP);
    __ align(OptoLoopAlignment);
    __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
      __ cmp(first, ch2);
      __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
      __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
      if (str2_isL) { // LL
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
        __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
      } else {
        __ mov(ch2, 0xE); // all bits in byte set except last one
        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4);
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      }
      __ cmp(ch1, ch2);
      __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
      __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_HAS_ZERO);
      __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
      // Now, perform compression of counters(cnt2 and cnt1) into one register.
      // It's fine because both counters are 32bit and are not changed in this
      // loop. Just restore it on exit. So, cnt1 can be re-used in this loop.
      __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
      __ sub(result, result, 1);
    __ BIND(L_HAS_ZERO_LOOP);
      __ mov(cnt1, wordSize/str2_chr_size);
      __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
      __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
      if (str2_isL) {
        __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4);
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ add(tmp4, tmp4, 1);
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ lsl(tmp2, tmp2, 1);
        __ mov(tmp4, wordSize/str2_chr_size);
      } else {
        __ mov(ch2, 0xE);
        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4);
        __ add(tmp4, tmp4, 1);
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
        __ lsl(tmp2, tmp2, 1);
        __ mov(tmp4, wordSize/str2_chr_size);
        __ sub(str2, str2, str2_chr_size);
      }
      __ cmp(ch1, ch2);
      __ mov(tmp4, wordSize/str2_chr_size);
      __ br(__ NE, L_CMP_LOOP_NOMATCH);
    __ BIND(L_CMP_LOOP);
      str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
               : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
      str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
               : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
      __ add(tmp4, tmp4, 1);
      __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
      __ br(__ GE, L_CMP_LOOP_LAST_CMP);
      __ cmp(cnt1, ch2);
      __ br(__ EQ, L_CMP_LOOP);
    __ BIND(L_CMP_LOOP_NOMATCH);
      // here we're not matched
      __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
      __ clz(tmp4, tmp2);
      __ add(str2, str2, str2_chr_size); // advance pointer
      __ b(L_HAS_ZERO_LOOP);
    __ align(OptoLoopAlignment);
    __ BIND(L_CMP_LOOP_LAST_CMP);
      __ cmp(cnt1, ch2);
      __ br(__ NE, L_CMP_LOOP_NOMATCH);
      __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_CMP_LOOP_LAST_CMP2);
      if (str2_isL) {
        __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4);
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ add(tmp4, tmp4, 1);
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ lsl(tmp2, tmp2, 1);
      } else {
        __ mov(ch2, 0xE);
        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4);
        __ add(tmp4, tmp4, 1);
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
        __ lsl(tmp2, tmp2, 1);
        __ sub(str2, str2, str2_chr_size);
      }
      __ cmp(ch1, ch2);
      __ br(__ NE, L_CMP_LOOP_NOMATCH);
      __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
      // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
      // so, result was increased at max by wordSize/str2_chr_size - 1, so,
      // respective high bit wasn't changed. L_LOOP_PROCEED will increase
      // result by analyzed characters value, so, we can just reset lower bits
      // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
      // 2) restore cnt1 and cnt2 values from "compressed" cnt2
      // 3) advance str2 value to represent next str2 octet. result & 7/3 is
      // index of last analyzed substring inside current octet. So, str2 in at
      // respective start address. We need to advance it to next octet
      __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
      __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
      __ bfm(result, zr, 0, 2 - str2_chr_shift);
      __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
      __ movw(cnt2, cnt2);
      __ b(L_LOOP_PROCEED);
    __ align(OptoLoopAlignment);
    __ BIND(NOMATCH);
      __ mov(result, -1);
    __ BIND(DONE);
      __ pop(spilled_regs, sp);
      __ ret(lr);
    return entry;
  }

  void generate_string_indexof_stubs() {
    StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
    StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
    StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
  }

  void inflate_and_store_2_fp_registers(bool generatePrfm,
      FloatRegister src1, FloatRegister src2) {
    Register dst = r1;
    __ zip1(v1, __ T16B, src1, v0);
    __ zip2(v2, __ T16B, src1, v0);
    if (generatePrfm) {
      __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
    }
    __ zip1(v3, __ T16B, src2, v0);
    __ zip2(v4, __ T16B, src2, v0);
    __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
  }

  // R0 = src
  // R1 = dst
  // R2 = len
  // R3 = len >> 3
  // V0 = 0
  // v1 = loaded 8 bytes
  address generate_large_byte_array_inflate() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
    address entry = __ pc();
    Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
    Register src = r0, dst = r1, len = r2, octetCounter = r3;
    const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;

    // do one more 8-byte read to have address 16-byte aligned in most cases
    // also use single store instruction
    __ ldrd(v2, __ post(src, 8));
    __ sub(octetCounter, octetCounter, 2);
    __ zip1(v1, __ T16B, v1, v0);
    __ zip1(v2, __ T16B, v2, v0);
    __ st1(v1, v2, __ T16B, __ post(dst, 32));
    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ subs(rscratch1, octetCounter, large_loop_threshold);
    __ br(__ LE, LOOP_START);
    __ b(LOOP_PRFM_START);
    __ bind(LOOP_PRFM);
      __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ bind(LOOP_PRFM_START);
      __ prfm(Address(src, SoftwarePrefetchHintDistance));
      __ sub(octetCounter, octetCounter, 8);
      __ subs(rscratch1, octetCounter, large_loop_threshold);
      inflate_and_store_2_fp_registers(true, v3, v4);
      inflate_and_store_2_fp_registers(true, v5, v6);
      __ br(__ GT, LOOP_PRFM);
      __ cmp(octetCounter, (u1)8);
      __ br(__ LT, DONE);
    __ bind(LOOP);
      __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
      __ bind(LOOP_START);
      __ sub(octetCounter, octetCounter, 8);
      __ cmp(octetCounter, (u1)8);
      inflate_and_store_2_fp_registers(false, v3, v4);
      inflate_and_store_2_fp_registers(false, v5, v6);
      __ br(__ GE, LOOP);
    __ bind(DONE);
      __ ret(lr);
    return entry;
  }

  /**
   *  Arguments:
   *
   *  Input:
   *  c_rarg0   - current state address
   *  c_rarg1   - H key address
   *  c_rarg2   - data address
   *  c_rarg3   - number of blocks
   *
   *  Output:
   *  Updated state at c_rarg0
   */
  address generate_ghash_processBlocks() {
    // Bafflingly, GCM uses little-endian for the byte order, but
    // big-endian for the bit order.  For example, the polynomial 1 is
    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
    //
    // So, we must either reverse the bytes in each word and do
    // everything big-endian or reverse the bits in each byte and do
    // it little-endian.  On AArch64 it's more idiomatic to reverse
    // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order through the
    // calculation, bit-reversing the inputs and outputs.

    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    __ align(wordSize * 2);
    address p = __ pc();
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    Register state   = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data    = c_rarg2;
    Register blocks  = c_rarg3;

    FloatRegister vzr = v30;
    __ eor(vzr, __ T16B, vzr, vzr); // zero register

    __ ldrq(v24, p);    // The field polynomial

    __ ldrq(v0, Address(state));
    __ ldrq(v1, Address(subkeyH));

    __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
    __ rbit(v0, __ T16B, v0);
    __ rev64(v1, __ T16B, v1);
    __ rbit(v1, __ T16B, v1);

    __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
    __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))

    {
      Label L_ghash_loop;
      __ bind(L_ghash_loop);

      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
                                                 // reversing each byte
      __ rbit(v2, __ T16B, v2);
      __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state

      // Multiply state in v2 by subkey in v1
      __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                        /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
                        /*temps*/v6, v3, /*reuse/clobber b*/v2);
      // Reduce v7:v5 by the field polynomial
      __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);

      __ sub(blocks, blocks, 1);
      __ cbnz(blocks, L_ghash_loop);
    }

    // The bit-reversed result is at this point in v0
    __ rev64(v0, __ T16B, v0);
    __ rbit(v0, __ T16B, v0);

    __ st1(v0, __ T16B, state);
    __ ret(lr);

    return start;
  }

  address generate_ghash_processBlocks_wide() {
    address small = generate_ghash_processBlocks();

    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
    __ align(wordSize * 2);
    address p = __ pc();
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    Register state   = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data    = c_rarg2;
    Register blocks  = c_rarg3;

    const int unroll = 4;

    __ cmp(blocks, (unsigned char)(unroll * 2));
    __ br(__ LT, small);

    if (unroll > 1) {
    // Save state before entering routine
      __ sub(sp, sp, 4 * 16);
      __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
      __ sub(sp, sp, 4 * 16);
      __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
    }

    __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);

    if (unroll > 1) {
      // And restore state
      __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
      __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
    }

    __ cmp(blocks, (unsigned char)0);
    __ br(__ GT, small);

    __ ret(lr);

    return start;
  }

  void generate_base64_encode_simdround(Register src, Register dst,
        FloatRegister codec, u8 size) {

    FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
    FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
    FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;

    Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;

    __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));

    __ ushr(ind0, arrangement, in0,  2);

    __ ushr(ind1, arrangement, in1,  2);
    __ shl(in0,   arrangement, in0,  6);
    __ orr(ind1,  arrangement, ind1, in0);
    __ ushr(ind1, arrangement, ind1, 2);

    __ ushr(ind2, arrangement, in2,  4);
    __ shl(in1,   arrangement, in1,  4);
    __ orr(ind2,  arrangement, in1,  ind2);
    __ ushr(ind2, arrangement, ind2, 2);

    __ shl(ind3,  arrangement, in2,  2);
    __ ushr(ind3, arrangement, ind3, 2);

    __ tbl(out0,  arrangement, codec,  4, ind0);
    __ tbl(out1,  arrangement, codec,  4, ind1);
    __ tbl(out2,  arrangement, codec,  4, ind2);
    __ tbl(out3,  arrangement, codec,  4, ind3);

    __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
  }

   /**
   *  Arguments:
   *
   *  Input:
   *  c_rarg0   - src_start
   *  c_rarg1   - src_offset
   *  c_rarg2   - src_length
   *  c_rarg3   - dest_start
   *  c_rarg4   - dest_offset
   *  c_rarg5   - isURL
   *
   */
  address generate_base64_encodeBlock() {

    static const char toBase64[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
    };

    static const char toBase64URL[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "encodeBlock");
    address start = __ pc();

    Register src   = c_rarg0;  // source array
    Register soff  = c_rarg1;  // source start offset
    Register send  = c_rarg2;  // source end offset
    Register dst   = c_rarg3;  // dest array
    Register doff  = c_rarg4;  // position for writing to dest array
    Register isURL = c_rarg5;  // Base64 or URL character set

    // c_rarg6 and c_rarg7 are free to use as temps
    Register codec  = c_rarg6;
    Register length = c_rarg7;

    Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;

    __ add(src, src, soff);
    __ add(dst, dst, doff);
    __ sub(length, send, soff);

    // load the codec base address
    __ lea(codec, ExternalAddress((address) toBase64));
    __ cbz(isURL, ProcessData);
    __ lea(codec, ExternalAddress((address) toBase64URL));

    __ BIND(ProcessData);

    // too short to formup a SIMD loop, roll back
    __ cmp(length, (u1)24);
    __ br(Assembler::LT, Process3B);

    __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));

    __ BIND(Process48B);
    __ cmp(length, (u1)48);
    __ br(Assembler::LT, Process24B);
    generate_base64_encode_simdround(src, dst, v0, 16);
    __ sub(length, length, 48);
    __ b(Process48B);

    __ BIND(Process24B);
    __ cmp(length, (u1)24);
    __ br(Assembler::LT, SIMDExit);
    generate_base64_encode_simdround(src, dst, v0, 8);
    __ sub(length, length, 24);

    __ BIND(SIMDExit);
    __ cbz(length, Exit);

    __ BIND(Process3B);
    //  3 src bytes, 24 bits
    __ ldrb(r10, __ post(src, 1));
    __ ldrb(r11, __ post(src, 1));
    __ ldrb(r12, __ post(src, 1));
    __ orrw(r11, r11, r10, Assembler::LSL, 8);
    __ orrw(r12, r12, r11, Assembler::LSL, 8);
    // codec index
    __ ubfmw(r15, r12, 18, 23);
    __ ubfmw(r14, r12, 12, 17);
    __ ubfmw(r13, r12, 6,  11);
    __ andw(r12,  r12, 63);
    // get the code based on the codec
    __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
    __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
    __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
    __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
    __ strb(r15, __ post(dst, 1));
    __ strb(r14, __ post(dst, 1));
    __ strb(r13, __ post(dst, 1));
    __ strb(r12, __ post(dst, 1));
    __ sub(length, length, 3);
    __ cbnz(length, Process3B);

    __ BIND(Exit);
    __ ret(lr);

    return start;
  }

  void generate_base64_decode_simdround(Register src, Register dst,
        FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {

    FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
    FloatRegister out0 = v20, out1 = v21, out2 = v22;

    FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
    FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;

    Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;

    Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;

    __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));

    // we need unsigned saturating subtract, to make sure all input values
    // in range [0, 63] will have 0U value in the higher half lookup
    __ uqsubv(decH0, __ T16B, in0, v27);
    __ uqsubv(decH1, __ T16B, in1, v27);
    __ uqsubv(decH2, __ T16B, in2, v27);
    __ uqsubv(decH3, __ T16B, in3, v27);

    // lower half lookup
    __ tbl(decL0, arrangement, codecL, 4, in0);
    __ tbl(decL1, arrangement, codecL, 4, in1);
    __ tbl(decL2, arrangement, codecL, 4, in2);
    __ tbl(decL3, arrangement, codecL, 4, in3);

    // higher half lookup
    __ tbx(decH0, arrangement, codecH, 4, decH0);
    __ tbx(decH1, arrangement, codecH, 4, decH1);
    __ tbx(decH2, arrangement, codecH, 4, decH2);
    __ tbx(decH3, arrangement, codecH, 4, decH3);

    // combine lower and higher
    __ orr(decL0, arrangement, decL0, decH0);
    __ orr(decL1, arrangement, decL1, decH1);
    __ orr(decL2, arrangement, decL2, decH2);
    __ orr(decL3, arrangement, decL3, decH3);

    // check illegal inputs, value larger than 63 (maximum of 6 bits)
    __ cmhi(decH0, arrangement, decL0, v27);
    __ cmhi(decH1, arrangement, decL1, v27);
    __ cmhi(decH2, arrangement, decL2, v27);
    __ cmhi(decH3, arrangement, decL3, v27);
    __ orr(in0, arrangement, decH0, decH1);
    __ orr(in1, arrangement, decH2, decH3);
    __ orr(in2, arrangement, in0,   in1);
    __ umaxv(in3, arrangement, in2);
    __ umov(rscratch2, in3, __ B, 0);

    // get the data to output
    __ shl(out0,  arrangement, decL0, 2);
    __ ushr(out1, arrangement, decL1, 4);
    __ orr(out0,  arrangement, out0,  out1);
    __ shl(out1,  arrangement, decL1, 4);
    __ ushr(out2, arrangement, decL2, 2);
    __ orr(out1,  arrangement, out1,  out2);
    __ shl(out2,  arrangement, decL2, 6);
    __ orr(out2,  arrangement, out2,  decL3);

    __ cbz(rscratch2, NoIllegalData);

    // handle illegal input
    __ umov(r10, in2, __ D, 0);
    if (size == 16) {
      __ cbnz(r10, ErrorInLowerHalf);

      // illegal input is in higher half, store the lower half now.
      __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));

      __ umov(r10, in2,  __ D, 1);
      __ umov(r11, out0, __ D, 1);
      __ umov(r12, out1, __ D, 1);
      __ umov(r13, out2, __ D, 1);
      __ b(StoreLegalData);

      __ BIND(ErrorInLowerHalf);
    }
    __ umov(r11, out0, __ D, 0);
    __ umov(r12, out1, __ D, 0);
    __ umov(r13, out2, __ D, 0);

    __ BIND(StoreLegalData);
    __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
    __ strb(r11, __ post(dst, 1));
    __ strb(r12, __ post(dst, 1));
    __ strb(r13, __ post(dst, 1));
    __ lsr(r10, r10, 8);
    __ lsr(r11, r11, 8);
    __ lsr(r12, r12, 8);
    __ lsr(r13, r13, 8);
    __ b(StoreLegalData);

    __ BIND(NoIllegalData);
    __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
  }


   /**
   *  Arguments:
   *
   *  Input:
   *  c_rarg0   - src_start
   *  c_rarg1   - src_offset
   *  c_rarg2   - src_length
   *  c_rarg3   - dest_start
   *  c_rarg4   - dest_offset
   *  c_rarg5   - isURL
   *  c_rarg6   - isMIME
   *
   */
  address generate_base64_decodeBlock() {

    // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
    // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
    // titled "Base64 decoding".

    // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64,
    // except the trailing character '=' is also treated illegal value in this intrinsic. That
    // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
    static const uint8_t fromBase64ForNoSIMD[256] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
       52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
       15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
      255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
       41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    };

    static const uint8_t fromBase64URLForNoSIMD[256] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
       52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
       15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
      255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
       41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    };

    // A legal value of base64 code is in range [0, 127].  We need two lookups
    // with tbl/tbx and combine them to get the decode data. The 1st table vector
    // lookup use tbl, out of range indices are set to 0 in destination. The 2nd
    // table vector lookup use tbx, out of range indices are unchanged in
    // destination. Input [64..126] is mapped to index [65, 127] in second lookup.
    // The value of index 64 is set to 0, so that we know that we already get the
    // decoded data with the 1st lookup.
    static const uint8_t fromBase64ForSIMD[128] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
       52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
        0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
       14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
      255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
       40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
    };

    static const uint8_t fromBase64URLForSIMD[128] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
       52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
        0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
       14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
       63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
       40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "decodeBlock");
    address start = __ pc();

    Register src    = c_rarg0;  // source array
    Register soff   = c_rarg1;  // source start offset
    Register send   = c_rarg2;  // source end offset
    Register dst    = c_rarg3;  // dest array
    Register doff   = c_rarg4;  // position for writing to dest array
    Register isURL  = c_rarg5;  // Base64 or URL character set
    Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation

    Register length = send;    // reuse send as length of source data to process

    Register simd_codec   = c_rarg6;
    Register nosimd_codec = c_rarg7;

    Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;

    __ enter();

    __ add(src, src, soff);
    __ add(dst, dst, doff);

    __ mov(doff, dst);

    __ sub(length, send, soff);
    __ bfm(length, zr, 0, 1);

    __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
    __ cbz(isURL, ProcessData);
    __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));

    __ BIND(ProcessData);
    __ mov(rscratch1, length);
    __ cmp(length, (u1)144); // 144 = 80 + 64
    __ br(Assembler::LT, Process4B);

    // In the MIME case, the line length cannot be more than 76
    // bytes (see RFC 2045). This is too short a block for SIMD
    // to be worthwhile, so we use non-SIMD here.
    __ movw(rscratch1, 79);

    __ BIND(Process4B);
    __ ldrw(r14, __ post(src, 4));
    __ ubfxw(r10, r14, 0,  8);
    __ ubfxw(r11, r14, 8,  8);
    __ ubfxw(r12, r14, 16, 8);
    __ ubfxw(r13, r14, 24, 8);
    // get the de-code
    __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
    __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
    __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
    __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
    // error detection, 255u indicates an illegal input
    __ orrw(r14, r10, r11);
    __ orrw(r15, r12, r13);
    __ orrw(r14, r14, r15);
    __ tbnz(r14, 7, Exit);
    // recover the data
    __ lslw(r14, r10, 10);
    __ bfiw(r14, r11, 4, 6);
    __ bfmw(r14, r12, 2, 5);
    __ rev16w(r14, r14);
    __ bfiw(r13, r12, 6, 2);
    __ strh(r14, __ post(dst, 2));
    __ strb(r13, __ post(dst, 1));
    // non-simd loop
    __ subsw(rscratch1, rscratch1, 4);
    __ br(Assembler::GT, Process4B);

    // if exiting from PreProcess80B, rscratch1 == -1;
    // otherwise, rscratch1 == 0.
    __ cbzw(rscratch1, Exit);
    __ sub(length, length, 80);

    __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
    __ cbz(isURL, SIMDEnter);
    __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));

    __ BIND(SIMDEnter);
    __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
    __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
    __ mov(rscratch1, 63);
    __ dup(v27, __ T16B, rscratch1);

    __ BIND(Process64B);
    __ cmp(length, (u1)64);
    __ br(Assembler::LT, Process32B);
    generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
    __ sub(length, length, 64);
    __ b(Process64B);

    __ BIND(Process32B);
    __ cmp(length, (u1)32);
    __ br(Assembler::LT, SIMDExit);
    generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
    __ sub(length, length, 32);
    __ b(Process32B);

    __ BIND(SIMDExit);
    __ cbz(length, Exit);
    __ movw(rscratch1, length);
    __ b(Process4B);

    __ BIND(Exit);
    __ sub(c_rarg0, dst, doff);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Support for spin waits.
  address generate_spin_wait() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "spin_wait");
    address start = __ pc();

    __ spin_wait();
    __ ret(lr);

    return start;
  }

#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

  // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
  //
  // If LSE is in use, generate LSE versions of all the stubs. The
  // non-LSE versions are in atomic_aarch64.S.

  // class AtomicStubMark records the entry point of a stub and the
  // stub pointer which will point to it. The stub pointer is set to
  // the entry point when ~AtomicStubMark() is called, which must be
  // after ICache::invalidate_range. This ensures safe publication of
  // the generated code.
  class AtomicStubMark {
    address _entry_point;
    aarch64_atomic_stub_t *_stub;
    MacroAssembler *_masm;
  public:
    AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
      _masm = masm;
      __ align(32);
      _entry_point = __ pc();
      _stub = stub;
    }
    ~AtomicStubMark() {
      *_stub = (aarch64_atomic_stub_t)_entry_point;
    }
  };

  // NB: For memory_order_conservative we need a trailing membar after
  // LSE atomic operations but not a leading membar.
  //
  // We don't need a leading membar because a clause in the Arm ARM
  // says:
  //
  //   Barrier-ordered-before
  //
  //   Barrier instructions order prior Memory effects before subsequent
  //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW 2 from the same
  //   Observer if and only if RW1 appears in program order before RW 2
  //   and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic
  //   instruction with both Acquire and Release semantics.
  //
  // All the atomic instructions {ldaddal, swapal, casal} have Acquire
  // and Release semantics, therefore we don't need a leading
  // barrier. However, there is no corresponding Barrier-ordered-after
  // relationship, therefore we need a trailing membar to prevent a
  // later store or load from being reordered with the store in an
  // atomic instruction.
  //
  // This was checked by using the herd7 consistency model simulator
  // (http://diy.inria.fr/) with this test case:
  //
  // AArch64 LseCas
  // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
  // P0 | P1;
  // LDR W4, [X2] | MOV W3, #0;
  // DMB LD       | MOV W4, #1;
  // LDR W3, [X1] | CASAL W3, W4, [X1];
  //              | DMB ISH;
  //              | STR W4, [X2];
  // exists
  // (0:X3=0 /\ 0:X4=1)
  //
  // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
  // with the store to x in P1. Without the DMB in P1 this may happen.
  //
  // At the time of writing we don't know of any AArch64 hardware that
  // reorders stores in this way, but the Reference Manual permits it.

  void gen_cas_entry(Assembler::operand_size size,
                     atomic_memory_order order) {
    Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
      exchange_val = c_rarg2;
    bool acquire, release;
    switch (order) {
      case memory_order_relaxed:
        acquire = false;
        release = false;
        break;
      case memory_order_release:
        acquire = false;
        release = true;
        break;
      default:
        acquire = true;
        release = true;
        break;
    }
    __ mov(prev, compare_val);
    __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
    if (order == memory_order_conservative) {
      __ membar(Assembler::StoreStore|Assembler::StoreLoad);
    }
    if (size == Assembler::xword) {
      __ mov(r0, prev);
    } else {
      __ movw(r0, prev);
    }
    __ ret(lr);
  }

  void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
    Register prev = r2, addr = c_rarg0, incr = c_rarg1;
    // If not relaxed, then default to conservative.  Relaxed is the only
    // case we use enough to be worth specializing.
    if (order == memory_order_relaxed) {
      __ ldadd(size, incr, prev, addr);
    } else {
      __ ldaddal(size, incr, prev, addr);
      __ membar(Assembler::StoreStore|Assembler::StoreLoad);
    }
    if (size == Assembler::xword) {
      __ mov(r0, prev);
    } else {
      __ movw(r0, prev);
    }
    __ ret(lr);
  }

  void gen_swpal_entry(Assembler::operand_size size) {
    Register prev = r2, addr = c_rarg0, incr = c_rarg1;
    __ swpal(size, incr, prev, addr);
    __ membar(Assembler::StoreStore|Assembler::StoreLoad);
    if (size == Assembler::xword) {
      __ mov(r0, prev);
    } else {
      __ movw(r0, prev);
    }
    __ ret(lr);
  }

  void generate_atomic_entry_points() {
    if (! UseLSE) {
      return;
    }

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "atomic entry points");
    address first_entry = __ pc();

    // ADD, memory_order_conservative
    AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
    gen_ldadd_entry(Assembler::word, memory_order_conservative);
    AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
    gen_ldadd_entry(Assembler::xword, memory_order_conservative);

    // ADD, memory_order_relaxed
    AtomicStubMark mark_fetch_add_4_relaxed
      (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
    gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
    AtomicStubMark mark_fetch_add_8_relaxed
      (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
    gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);

    // XCHG, memory_order_conservative
    AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
    gen_swpal_entry(Assembler::word);
    AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
    gen_swpal_entry(Assembler::xword);

    // CAS, memory_order_conservative
    AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
    gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
    AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_conservative);
    AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_conservative);

    // CAS, memory_order_relaxed
    AtomicStubMark mark_cmpxchg_1_relaxed
      (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
    gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
    AtomicStubMark mark_cmpxchg_4_relaxed
      (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
    AtomicStubMark mark_cmpxchg_8_relaxed
      (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);

    AtomicStubMark mark_cmpxchg_4_release
      (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_release);
    AtomicStubMark mark_cmpxchg_8_release
      (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_release);

    AtomicStubMark mark_cmpxchg_4_seq_cst
      (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
    AtomicStubMark mark_cmpxchg_8_seq_cst
      (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);

    ICache::invalidate_range(first_entry, __ pc() - first_entry);
  }
#endif // LINUX

  address generate_cont_thaw(Continuation::thaw_kind kind) {
    bool return_barrier = Continuation::is_thaw_return_barrier(kind);
    bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);

    address start = __ pc();

    if (return_barrier) {
      __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
      __ mov(sp, rscratch1);
    }
    assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");

    if (return_barrier) {
      // preserve possible return value from a method returning to the return barrier
      __ fmovd(rscratch1, v0);
      __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
    }

    __ movw(c_rarg1, (return_barrier ? 1 : 0));
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
    __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
      __ fmovd(v0, rscratch1);
    }
    assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");


    Label thaw_success;
    // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
    __ cbnz(rscratch2, thaw_success);
    __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
    __ br(rscratch1);
    __ bind(thaw_success);

    // make room for the thawed frames
    __ sub(rscratch1, sp, rscratch2);
    __ andr(rscratch1, rscratch1, -16); // align
    __ mov(sp, rscratch1);

    if (return_barrier) {
      // save original return value -- again
      __ fmovd(rscratch1, v0);
      __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
    }

    // If we want, we can templatize thaw by kind, and have three different entries
    __ movw(c_rarg1, (uint32_t)kind);

    __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
    __ mov(rscratch2, r0); // r0 is the sp of the yielding frame

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
      __ fmovd(v0, rscratch1);
    } else {
      __ mov(r0, zr); // return 0 (success) from doYield
    }

    // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down)
    __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
    __ mov(rfp, sp);

    if (return_barrier_exception) {
      __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
      __ verify_oop(r0);
      __ mov(r19, r0); // save return value contaning the exception oop in callee-saved R19

      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);

      // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
      // __ reinitialize_ptrue();

      // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc

      __ mov(r1, r0); // the exception handler
      __ mov(r0, r19); // restore return value contaning the exception oop
      __ verify_oop(r0);

      __ leave();
      __ mov(r3, lr);
      __ br(r1); // the exception handler
    } else {
      // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
      __ leave();
      __ ret(lr);
    }

    return start;
  }

  address generate_cont_thaw() {
    if (!Continuations::enabled()) return nullptr;

    StubCodeMark mark(this, "StubRoutines", "Cont thaw");
    address start = __ pc();
    generate_cont_thaw(Continuation::thaw_top);
    return start;
  }

  address generate_cont_returnBarrier() {
    if (!Continuations::enabled()) return nullptr;

    // TODO: will probably need multiple return barriers depending on return type
    StubCodeMark mark(this, "StubRoutines", "cont return barrier");
    address start = __ pc();

    generate_cont_thaw(Continuation::thaw_return_barrier);

    return start;
  }

  address generate_cont_returnBarrier_exception() {
    if (!Continuations::enabled()) return nullptr;

    StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
    address start = __ pc();

    generate_cont_thaw(Continuation::thaw_return_barrier_exception);

    return start;
  }

#if INCLUDE_JFR

  static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
    __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
    __ mov(c_rarg0, thread);
  }

  // The handle is dereferenced through a load barrier.
  static void jfr_epilogue(MacroAssembler* _masm) {
    __ reset_last_Java_frame(true);
    Label null_jobject;
    __ cbz(r0, null_jobject);
    DecoratorSet decorators = ACCESS_READ | IN_NATIVE;
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->load_at(_masm, decorators, T_OBJECT, r0, Address(r0, 0), rscratch1, rscratch2);
    __ bind(null_jobject);
  }

  // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
  // It returns a jobject handle to the event writer.
  // The handle is dereferenced and the return value is the event writer oop.
  static RuntimeStub* generate_jfr_write_checkpoint() {
    enum layout {
      rbp_off,
      rbpH_off,
      return_off,
      return_off2,
      framesize // inclusive of return address
    };

    int insts_size = 512;
    int locs_size = 64;
    CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size);
    OopMapSet* oop_maps = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);
    MacroAssembler* _masm = masm;

    address start = __ pc();
    __ enter();
    int frame_complete = __ pc() - start;
    address the_pc = __ pc();
    jfr_prologue(the_pc, _masm, rthread);
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
    jfr_epilogue(_masm);
    __ leave();
    __ ret(lr);

    OopMap* map = new OopMap(framesize, 1); // rfp
    oop_maps->add_gc_map(the_pc - start, map);

    RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
      RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete,
                                    (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                    oop_maps, false);
    return stub;
  }

#endif // INCLUDE_JFR

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame. Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
  // rather than BufferBlobs.  If the compiler needs all registers to
  // be preserved between the fault point and the exception handler
  // then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception. All other
  // implicit exceptions (e.g., NullPointerException or
  // AbstractMethodError on entry) are either at call sites or
  // otherwise assume that stack unwinding will be initiated, so
  // caller saved registers were assumed volatile in the compiler.

#undef __
#define __ masm->

  address generate_throw_exception(const char* name,
                                   address runtime_entry,
                                   Register arg1 = noreg,
                                   Register arg2 = noreg) {
    // Information about frame layout at time of blocking runtime call.
    // Note that we only have to preserve callee-saved registers since
    // the compilers are responsible for supplying a continuation point
    // if they expect all registers to be preserved.
    // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
    enum layout {
      rfp_off = 0,
      rfp_off2,
      return_off,
      return_off2,
      framesize // inclusive of return address
    };

    int insts_size = 512;
    int locs_size  = 64;

    CodeBuffer code(name, insts_size, locs_size);
    OopMapSet* oop_maps  = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);

    address start = __ pc();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of
    // thread-local storage and also sets up last_Java_sp slightly
    // differently than the real call_VM

    __ enter(); // Save FP and LR before call

    assert(is_even(framesize/2), "sp not 16-byte aligned");

    // lr and fp are already in place
    __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog

    int frame_complete = __ pc() - start;

    // Set up last_Java_sp and last_Java_fp
    address the_pc = __ pc();
    __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);

    // Call runtime
    if (arg1 != noreg) {
      assert(arg2 != c_rarg1, "clobbered");
      __ mov(c_rarg1, arg1);
    }
    if (arg2 != noreg) {
      __ mov(c_rarg2, arg2);
    }
    __ mov(c_rarg0, rthread);
    BLOCK_COMMENT("call runtime_entry");
    __ mov(rscratch1, runtime_entry);
    __ blr(rscratch1);

    // Generate oop map
    OopMap* map = new OopMap(framesize, 0);

    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(true);

    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    __ leave();

    // check for pending exceptions
#ifdef ASSERT
    Label L;
    __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
    __ cbnz(rscratch1, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

    // codeBlob framesize is in words (not VMRegImpl::slot_size)
    RuntimeStub* stub =
      RuntimeStub::new_runtime_stub(name,
                                    &code,
                                    frame_complete,
                                    (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                    oop_maps, false);
    return stub->entry_point();
  }

  class MontgomeryMultiplyGenerator : public MacroAssembler {

    Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
      Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;

    RegSet _toSave;
    bool _squaring;

  public:
    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
      : MacroAssembler(as->code()), _squaring(squaring) {

      // Register allocation

      RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
      Pa_base = *regs;       // Argument registers
      if (squaring)
        Pb_base = Pa_base;
      else
        Pb_base = *++regs;
      Pn_base = *++regs;
      Rlen= *++regs;
      inv = *++regs;
      Pm_base = *++regs;

                          // Working registers:
      Ra =  *++regs;        // The current digit of a, b, n, and m.
      Rb =  *++regs;
      Rm =  *++regs;
      Rn =  *++regs;

      Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
      Pb =  *++regs;
      Pm =  *++regs;
      Pn =  *++regs;

      t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumuator.
      t2 =  *++regs;

      Ri =  *++regs;        // Inner and outer loop indexes.
      Rj =  *++regs;

      Rhi_ab = *++regs;     // Product registers: low and high parts
      Rlo_ab = *++regs;     // of a*b and m*n.
      Rhi_mn = *++regs;
      Rlo_mn = *++regs;

      // r19 and up are callee-saved.
      _toSave = RegSet::range(r19, *regs) + Pm_base;
    }

  private:
    void save_regs() {
      push(_toSave, sp);
    }

    void restore_regs() {
      pop(_toSave, sp);
    }

    template <typename T>
    void unroll_2(Register count, T block) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)();
      bind(odd);
      (this->*block)();
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    template <typename T>
    void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)(d, s, tmp);
      bind(odd);
      (this->*block)(d, s, tmp);
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    void pre1(RegisterOrConstant i) {
      block_comment("pre1");
      // Pa = Pa_base;
      // Pb = Pb_base + i;
      // Pm = Pm_base;
      // Pn = Pn_base + i;
      // Ra = *Pa;
      // Rb = *Pb;
      // Rm = *Pm;
      // Rn = *Pn;
      ldr(Ra, Address(Pa_base));
      ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      ldr(Rm, Address(Pm_base));
      ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pa, Address(Pa_base));
      lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base));
      lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));

      // Zero the m*n result.
      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }

    // The core multiply-accumulate step of a Montgomery
    // multiplication.  The idea is to schedule operations as a
    // pipeline so that instructions with long latencies (loads and
    // multiplies) have time to complete before their results are
    // used.  This most benefits in-order implementations of the
    // architecture but out-of-order ones also benefit.
    void step() {
      block_comment("step");
      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
                                       // previous iteration.
      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    void post1() {
      block_comment("post1");

      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    void pre2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("pre2");
      // Pa = Pa_base + i-len;
      // Pb = Pb_base + len;
      // Pm = Pm_base + i-len;
      // Pn = Pn_base + len;

      if (i.is_register()) {
        sub(Rj, i.as_register(), len);
      } else {
        mov(Rj, i.as_constant());
        sub(Rj, Rj, len);
      }
      // Rj == i-len

      lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));

      // Ra = *++Pa;
      // Rb = *--Pb;
      // Rm = *++Pm;
      // Rn = *--Pn;
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));

      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }

    void post2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("post2");
      if (i.is_constant()) {
        mov(Rj, i.as_constant()-len.as_constant());
      } else {
        sub(Rj, i.as_register(), len);
      }

      adds(t0, t0, Rlo_mn); // The pending m*n, low part

      // As soon as we know the least significant digit of our result,
      // store it.
      // Pm_base[i-len] = t0;
      str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));

      // t0 = t1; t1 = t2; t2 = 0;
      adcs(t0, t1, Rhi_mn); // The pending m*n, high part
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    // A carry in t0 after Montgomery multiplication means that we
    // should subtract multiples of n from our result in m.  We'll
    // keep doing that until there is no carry.
    void normalize(RegisterOrConstant len) {
      block_comment("normalize");
      // while (t0)
      //   t0 = sub(Pm_base, Pn_base, t0, len);
      Label loop, post, again;
      Register cnt = t1, i = t2; // Re-use registers; we're done with them now
      cbz(t0, post); {
        bind(again); {
          mov(i, zr);
          mov(cnt, len);
          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
          subs(zr, zr, zr); // set carry flag, i.e. no borrow
          align(16);
          bind(loop); {
            sbcs(Rm, Rm, Rn);
            str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            add(i, i, 1);
            ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
            sub(cnt, cnt, 1);
          } cbnz(cnt, loop);
          sbc(t0, t0, zr);
        } cbnz(t0, again);
      } bind(post);
    }

    // Move memory at s to d, reversing words.
    //    Increments d to end of copied memory
    //    Destroys tmp1, tmp2
    //    Preserves len
    //    Leaves s pointing to the address which was in d at start
    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
      assert(tmp1->encoding() < r19->encoding(), "register corruption");
      assert(tmp2->encoding() < r19->encoding(), "register corruption");

      lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
      mov(tmp1, len);
      unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
      sub(s, d, len, ext::uxtw, LogBytesPerWord);
    }
    // where
    void reverse1(Register d, Register s, Register tmp) {
      ldr(tmp, pre(s, -wordSize));
      ror(tmp, tmp, 32);
      str(tmp, post(d, wordSize));
    }

    void step_squaring() {
      // An extra ACC
      step();
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    void last_squaring(RegisterOrConstant i) {
      Label dont;
      // if ((i & 1) == 0) {
      tbnz(i.as_register(), 0, dont); {
        // MACC(Ra, Rb, t0, t1, t2);
        // Ra = *++Pa;
        // Rb = *--Pb;
        umulh(Rhi_ab, Ra, Rb);
        mul(Rlo_ab, Ra, Rb);
        acc(Rhi_ab, Rlo_ab, t0, t1, t2);
      } bind(dont);
    }

    void extra_step_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n

      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
    }

    void post1_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    void acc(Register Rhi, Register Rlo,
             Register t0, Register t1, Register t2) {
      adds(t0, t0, Rlo);
      adcs(t1, t1, Rhi);
      adc(t2, t2, zr);
    }

  public:
    /**
     * Fast Montgomery multiplication.  The derivation of the
     * algorithm is in A Cryptographic Library for the Motorola
     * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
     *
     * Arguments:
     *
     * Inputs for multiplication:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements b
     *   c_rarg2   - int array elements n (the modulus)
     *   c_rarg3   - int length
     *   c_rarg4   - int inv
     *   c_rarg5   - int array elements m (the result)
     *
     * Inputs for squaring:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_multiply() {
      Label argh, nothing;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      cbzw(Rlen, nothing);

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        if (!_squaring)
          reverse(Ra, Pb_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        ldr(Rn, Address(Pn_base, 0));
        mul(Rlo_mn, Rn, inv);
        subs(zr, Rlo_mn, -1);
        Label ok;
        br(EQ, ok); {
          stop("broken inverse in Montgomery multiply");
        } bind(ok);
      }
#endif

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        cmpw(Ri, Rlen);
        br(Assembler::GE, end);

        bind(loop);
        pre1(Ri);

        block_comment("  for (j = i; j; j--) {"); {
          movw(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post1();
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen);
        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment("  for (j = len*2-i-1; j; j--) {"); {
          lslw(Rj, Rlen, 1);
          subw(Rj, Rj, Ri);
          subw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::LT, loop);
        bind(end);
      }
      block_comment("} // i");

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();  // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      bind(nothing);
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_multiply(julong Pa_base[], julong Pb_base[],
    //                     julong Pn_base[], julong Pm_base[],
    //                     julong inv, int len) {
    //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   julong *Pa, *Pb, *Pn, *Pm;
    //   julong Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pb_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = i;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
    //     MACC(Ra, Rb, t0, t1, t2);
    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pb_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = len*2-i-1;
    //     for (j = i-len+1; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }

    /**
     * Fast Montgomery squaring.  This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication so it should be up to
     * 25% faster.  However, its loop control is more complex and it
     * may actually run slower on some machines.
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_square() {
      Label argh;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        block_comment("for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();  // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_square(julong Pa_base[], julong Pn_base[],
    //                   julong Pm_base[], julong inv, int len) {
    //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   julong *Pa, *Pb, *Pn, *Pm;
    //   julong Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters =  (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
  };


  // Initialization
  void generate_initial() {
    // Generate initial stubs and initializes the entry points

    // entry points that exist in all platforms Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
      generate_throw_exception("delayed StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_delayed_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // set table address before stub generation which use it
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    // Disabled until JDK-8210858 is fixed
    // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
    //   StubRoutines::_dlog = generate_dlog();
    // }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }
  }

  void generate_phase1() {
    // Continuation stubs:
    StubRoutines::_cont_thaw          = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();

    JFR_ONLY(StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();)
    JFR_ONLY(StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();)
  }

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
    }
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    if (UseSVE == 0) {
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
    }

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    generate_compare_long_strings();

    generate_string_indexof_stubs();

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != NULL) {
      StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
    }
#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
#endif // COMPILER2

    if (UseBASE64Intrinsics) {
        StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
        StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
      StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
    }
    if (UseSHA3Intrinsics) {
      StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
      StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();

#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

    generate_atomic_entry_points();

#endif // LINUX

    StubRoutines::aarch64::set_completed();
  }

 public:
  StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) {
    if (phase == 0) {
      generate_initial();
    } else if (phase == 1) {
      generate_phase1(); // stubs that must be available for the interpreter
    } else {
      generate_all();
    }
  }
}; // end class declaration

#define UCM_TABLE_MAX_ENTRIES 8
void StubGenerator_generate(CodeBuffer* code, int phase) {
  if (UnsafeCopyMemory::_table == NULL) {
    UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
  }
  StubGenerator g(code, phase);
}


#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX