Merge

2026-05-02 01:35:32 +00:00 · 2012-12-21 01:39:34 -08:00 · 2012-12-21 01:39:34 -08:00 · 42c79d741b
commit 42c79d741b
parent ddb2b5d004 113e9ab39d
48 changed files with 1562 additions and 1231 deletions
--- a/hotspot/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp
@ -298,7 +298,7 @@ void PatchingStub::emit_code(LIR_Assembler* ce) {
    for (int i = 0; i < _bytes_to_copy; i++) {
      address ptr = (address)(_pc_start + i);
      int a_byte = (*ptr) & 0xFF;
-      __ a_byte (a_byte);
+      __ emit_int8 (a_byte);
    }
  }

@ -340,10 +340,10 @@ void PatchingStub::emit_code(LIR_Assembler* ce) {
  int being_initialized_entry_offset = __ offset() - being_initialized_entry + sizeof_patch_record;

  // Emit the patch record.  We need to emit a full word, so emit an extra empty byte
-  __ a_byte(0);
-  __ a_byte(being_initialized_entry_offset);
-  __ a_byte(bytes_to_skip);
-  __ a_byte(_bytes_to_copy);
+  __ emit_int8(0);
+  __ emit_int8(being_initialized_entry_offset);
+  __ emit_int8(bytes_to_skip);
+  __ emit_int8(_bytes_to_copy);
  address patch_info_pc = __ pc();
  assert(patch_info_pc - end_of_patch == bytes_to_skip, "incorrect patch info");

--- a/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp
@ -100,34 +100,6 @@ const char* Argument::name() const {
 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
 #endif

-
-void MacroAssembler::print_instruction(int inst) {
-  const char* s;
-  switch (inv_op(inst)) {
-  default:         s = "????"; break;
-  case call_op:    s = "call"; break;
-  case branch_op:
-    switch (inv_op2(inst)) {
-      case fb_op2:     s = "fb";   break;
-      case fbp_op2:    s = "fbp";  break;
-      case br_op2:     s = "br";   break;
-      case bp_op2:     s = "bp";   break;
-      case cb_op2:     s = "cb";   break;
-      case bpr_op2: {
-        if (is_cbcond(inst)) {
-          s = is_cxb(inst) ? "cxb" : "cwb";
-        } else {
-          s = "bpr";
-        }
-        break;
-      }
-      default:         s = "????"; break;
-    }
-  }
-  ::tty->print("%s", s);
-}
-
-
 // Patch instruction inst at offset inst_pos to refer to dest_pos
 // and return the resulting instruction.
 // We should have pcs, not offsets, but since all is relative, it will work out
--- a/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.hpp
+++ b/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.hpp
@ -603,7 +603,6 @@ class MacroAssembler : public Assembler {
  friend class Label;

 protected:
-  static void print_instruction(int inst);
  static int  patched_branch(int dest_pos, int inst, int inst_pos);
  static int  branch_destination(int inst, int pos);

@ -759,9 +758,6 @@ class MacroAssembler : public Assembler {
  // Required platform-specific helpers for Label::patch_instructions.
  // They _shadow_ the declarations in AbstractAssembler, which are undefined.
  void pd_patch_instruction(address branch, address target);
-#ifndef PRODUCT
-  static void pd_print_patched_instruction(address branch);
-#endif

  // sethi Macro handles optimizations and relocations
 private:
--- a/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.inline.hpp
+++ b/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.inline.hpp
@ -43,14 +43,6 @@ inline void MacroAssembler::pd_patch_instruction(address branch, address target)
  stub_inst = patched_branch(target - branch, stub_inst, 0);
 }

-#ifndef PRODUCT
-inline void MacroAssembler::pd_print_patched_instruction(address branch) {
-  jint stub_inst = *(jint*) branch;
-  print_instruction(stub_inst);
-  ::tty->print("%s", " (unresolved)");
-}
-#endif // PRODUCT
-
 // Use the right loads/stores for the platform
 inline void MacroAssembler::ld_ptr( Register s1, Register s2, Register d ) {
 #ifdef _LP64
--- a/hotspot/src/cpu/sparc/vm/sparc.ad
+++ b/hotspot/src/cpu/sparc/vm/sparc.ad
@ -10224,7 +10224,7 @@ instruct array_equals(o0RegP ary1, o1RegP ary2, g3RegI tmp1, notemp_iRegI result

 //---------- Zeros Count Instructions ------------------------------------------

-instruct countLeadingZerosI(iRegI dst, iRegI src, iRegI tmp, flagsReg cr) %{
+instruct countLeadingZerosI(iRegIsafe dst, iRegI src, iRegI tmp, flagsReg cr) %{
  predicate(UsePopCountInstruction);  // See Matcher::match_rule_supported
  match(Set dst (CountLeadingZerosI src));
  effect(TEMP dst, TEMP tmp, KILL cr);
@ -10321,7 +10321,7 @@ instruct countLeadingZerosL(iRegIsafe dst, iRegL src, iRegL tmp, flagsReg cr) %{
  ins_pipe(ialu_reg);
 %}

-instruct countTrailingZerosI(iRegI dst, iRegI src, flagsReg cr) %{
+instruct countTrailingZerosI(iRegIsafe dst, iRegI src, flagsReg cr) %{
  predicate(UsePopCountInstruction);  // See Matcher::match_rule_supported
  match(Set dst (CountTrailingZerosI src));
  effect(TEMP dst, KILL cr);
@ -10364,19 +10364,21 @@ instruct countTrailingZerosL(iRegIsafe dst, iRegL src, flagsReg cr) %{

 //---------- Population Count Instructions -------------------------------------

-instruct popCountI(iRegI dst, iRegI src) %{
+instruct popCountI(iRegIsafe dst, iRegI src) %{
  predicate(UsePopCountInstruction);
  match(Set dst (PopCountI src));

-  format %{ "POPC   $src, $dst" %}
+  format %{ "SRL    $src, G0, $dst\t! clear upper word for 64 bit POPC\n\t"
+            "POPC   $dst, $dst" %}
  ins_encode %{
-    __ popc($src$$Register, $dst$$Register);
+    __ srl($src$$Register, G0, $dst$$Register);  // as the format text says: clear the upper word first, because POPC looks at all 64 bits of its source
+    __ popc($dst$$Register, $dst$$Register);  // count the bits of the cleaned value in place; dst is both source and result
  %}
  ins_pipe(ialu_reg);
 %}

 // Note: Long.bitCount(long) returns an int.
-instruct popCountL(iRegI dst, iRegL src) %{
+instruct popCountL(iRegIsafe dst, iRegL src) %{
  predicate(UsePopCountInstruction);
  match(Set dst (PopCountL src));

--- a/hotspot/src/cpu/sparc/vm/templateInterpreter_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/templateInterpreter_sparc.cpp
@ -434,7 +434,7 @@ void TemplateInterpreterGenerator::generate_stack_overflow_check(Register Rframe

  // the frame is greater than one page in size, so check against
  // the bottom of the stack
-  __ cmp_and_brx_short(SP, Rscratch, Assembler::greater, Assembler::pt, after_frame_check);
+  __ cmp_and_brx_short(SP, Rscratch, Assembler::greaterUnsigned, Assembler::pt, after_frame_check);

  // the stack will overflow, throw an exception

--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp
--- a/hotspot/src/cpu/x86/vm/c1_CodeStubs_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/c1_CodeStubs_x86.cpp
@ -313,10 +313,10 @@ void PatchingStub::emit_code(LIR_Assembler* ce) {
 #endif
  } else {
    // make a copy the code which is going to be patched.
-    for ( int i = 0; i < _bytes_to_copy; i++) {
+    for (int i = 0; i < _bytes_to_copy; i++) {
      address ptr = (address)(_pc_start + i);
      int a_byte = (*ptr) & 0xFF;
-      __ a_byte (a_byte);
+      __ emit_int8(a_byte);
      *ptr = 0x90; // make the site look like a nop
    }
  }
@ -363,11 +363,11 @@ void PatchingStub::emit_code(LIR_Assembler* ce) {
  // emit the offsets needed to find the code to patch
  int being_initialized_entry_offset = __ pc() - being_initialized_entry + sizeof_patch_record;

-  __ a_byte(0xB8);
-  __ a_byte(0);
-  __ a_byte(being_initialized_entry_offset);
-  __ a_byte(bytes_to_skip);
-  __ a_byte(_bytes_to_copy);
+  __ emit_int8((unsigned char)0xB8);
+  __ emit_int8(0);
+  __ emit_int8(being_initialized_entry_offset);
+  __ emit_int8(bytes_to_skip);
+  __ emit_int8(_bytes_to_copy);
  address patch_info_pc = __ pc();
  assert(patch_info_pc - end_of_patch == bytes_to_skip, "incorrect patch info");

--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
@ -1023,7 +1023,7 @@ void MacroAssembler::lea(Address dst, AddressLiteral adr) {

 void MacroAssembler::leave() {
  // %%% is this really better? Why not on 32bit too?
-  emit_byte(0xC9); // LEAVE
+  emit_int8((unsigned char)0xC9); // LEAVE opcode byte; bytes with the top bit set (0xC9, 0x90, 0xB8, 0x80|cc) get this cast throughout the patch -- presumably to suit emit_int8's parameter type, confirm against its declaration
 }

 void MacroAssembler::lneg(Register hi, Register lo) {
@ -2112,11 +2112,11 @@ void MacroAssembler::fat_nop() {
  if (UseAddressNop) {
    addr_nop_5();
  } else {
-    emit_byte(0x26); // es:
-    emit_byte(0x2e); // cs:
-    emit_byte(0x64); // fs:
-    emit_byte(0x65); // gs:
-    emit_byte(0x90);
+    emit_int8(0x26); // es:
+    emit_int8(0x2e); // cs:
+    emit_int8(0x64); // fs:
+    emit_int8(0x65); // gs:
+    emit_int8((unsigned char)0x90);
  }
 }

@ -2534,12 +2534,12 @@ void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
    int offs = (intptr_t)dst.target() - ((intptr_t)pc());
    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
-      emit_byte(0x70 | cc);
-      emit_byte((offs - short_size) & 0xFF);
+      emit_int8(0x70 | cc);
+      emit_int8((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
-      emit_byte(0x0F);
-      emit_byte(0x80 | cc);
+      emit_int8(0x0F);
+      emit_int8((unsigned char)(0x80 | cc));
      emit_long(offs - long_size);
    }
  } else {
@ -3085,7 +3085,8 @@ void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {

 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
  // Used in sign-bit flipping with aligned address.
-  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
+  bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
+  assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::pshufb(dst, as_Address(src));
  } else {
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp
@ -126,25 +126,6 @@ class MacroAssembler: public Assembler {
    }
  }

-#ifndef PRODUCT
-  static void pd_print_patched_instruction(address branch) {
-    const char* s;
-    unsigned char op = branch[0];
-    if (op == 0xE8) {
-      s = "call";
-    } else if (op == 0xE9 || op == 0xEB) {
-      s = "jmp";
-    } else if ((op & 0xF0) == 0x70) {
-      s = "jcc";
-    } else if (op == 0x0F) {
-      s = "jcc";
-    } else {
-      s = "????";
-    }
-    tty->print("%s (unresolved)", s);
-  }
-#endif
-
  // The following 4 methods return the offset of the appropriate move instruction

  // Support for fast byte/short loading with zero extension (depending on particular CPU)
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp
@ -2174,13 +2174,13 @@ class StubGenerator: public StubCodeGenerator {
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");  // NOTE(review): only UseAES is checked now, yet the message still mentions misaligned SSE support
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
    Label L_doLast;
    address start = __ pc();

-    const Register from        = rsi;      // source array address
+    const Register from        = rdx;      // source array address; shares rdx with 'to' -- the stub reads the input block into xmm_result before it loads 'to', which is why the rsi push/pop could be dropped
    const Register to          = rdx;      // destination array address
    const Register key         = rcx;      // key array address
    const Register keylen      = rax;
@ -2189,47 +2189,74 @@ class StubGenerator: public StubCodeGenerator {
    const Address  key_param (rbp, 8+8);

    const XMMRegister xmm_result = xmm0;
-    const XMMRegister xmm_temp   = xmm1;
-    const XMMRegister xmm_key_shuf_mask = xmm2;
+    const XMMRegister xmm_key_shuf_mask = xmm1;  // mask passed to every load_key call below
+    const XMMRegister xmm_temp1  = xmm2;  // xmm_temp1..4 hold round keys, so several keys can be loaded before the aesenc chain that consumes them
+    const XMMRegister xmm_temp2  = xmm3;
+    const XMMRegister xmm_temp3  = xmm4;
+    const XMMRegister xmm_temp4  = xmm5;

-    __ enter(); // required for proper stackwalking of RuntimeStub frame
-    __ push(rsi);
-    __ movptr(from , from_param);
-    __ movptr(to   , to_param);
-    __ movptr(key  , key_param);
+    __ enter();   // required for proper stackwalking of RuntimeStub frame
+    __ movptr(from, from_param);  // 'from' and 'to' are both rdx in this stub; 'to' is loaded later, after the input has been read
+    __ movptr(key, key_param);

+    // keylen is the length, in ints, of the expanded key array: {11, 13, 15} * 4 = {44, 52, 60}
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
-    // keylen = # of 32-bit words, convert to 128-bit words
-    __ shrl(keylen, 2);
-    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
+    __ movptr(to, to_param);  // overwrites rdx: safe only because the input block is already in xmm_result

    // For encryption, the java expanded key ordering is just what we need

-    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
-    __ pxor(xmm_result, xmm_temp);
-    for (int offset = 0x10; offset <= 0x90; offset += 0x10) {
-      aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
-    }
-    load_key  (xmm_temp, key, 0xa0, xmm_key_shuf_mask);
-    __ cmpl(keylen, 0);
-    __ jcc(Assembler::equal, L_doLast);
-    __ aesenc(xmm_result, xmm_temp);                   // only in 192 and 256 bit keys
-    aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
-    load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
-    __ subl(keylen, 2);
-    __ jcc(Assembler::equal, L_doLast);
-    __ aesenc(xmm_result, xmm_temp);                   // only in 256 bit keys
-    aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
-    load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+    load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);  // the key at offset 0x00 is xor-ed into the input
+    __ pxor(xmm_result, xmm_temp1);
+
+    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);  // load keys 0x10-0x40 as a group, then apply them in order
+    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
+
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenc(xmm_result, xmm_temp2);
+    __ aesenc(xmm_result, xmm_temp3);
+    __ aesenc(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);  // same pattern for keys 0x50-0x80
+    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
+
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenc(xmm_result, xmm_temp2);
+    __ aesenc(xmm_result, xmm_temp3);
+    __ aesenc(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);  // from here on, xmm_temp1/xmm_temp2 always hold the next two keys
+    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
+
+    __ cmpl(keylen, 44);  // 44-int key: 0x90/0xa0 are the last two keys, applied at L_doLast
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesenc(xmm_result, xmm_temp1);  // longer keys: 0x90/0xa0 are ordinary rounds
+    __ aesenc(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
+
+    __ cmpl(keylen, 52);  // 52-int key: 0xb0/0xc0 are the last two keys
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenc(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);  // remaining case (60 ints) falls through with 0xd0/0xe0 as the last two keys
+    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
-    __ aesenclast(xmm_result, xmm_temp);
+    __ aesenc(xmm_result, xmm_temp1);  // xmm_temp1/xmm_temp2 hold the last two keys for whichever key length got here
+    __ aesenclast(xmm_result, xmm_temp2);  // the final key is applied with aesenclast rather than aesenc
    __ movdqu(Address(to, 0), xmm_result);        // store the result
    __ xorptr(rax, rax); // return 0
-    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

@ -2245,13 +2272,13 @@ class StubGenerator: public StubCodeGenerator {
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_decryptBlock() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");  // NOTE(review): only UseAES is checked now, yet the message still mentions misaligned SSE support
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;
    address start = __ pc();

-    const Register from        = rsi;      // source array address
+    const Register from        = rdx;      // source array address; shares rdx with 'to' -- the stub reads the input block into xmm_result before it loads 'to', which is why the rsi push/pop could be dropped
    const Register to          = rdx;      // destination array address
    const Register key         = rcx;      // key array address
    const Register keylen      = rax;
@ -2260,51 +2287,76 @@ class StubGenerator: public StubCodeGenerator {
    const Address  key_param (rbp, 8+8);

    const XMMRegister xmm_result = xmm0;
-    const XMMRegister xmm_temp   = xmm1;
-    const XMMRegister xmm_key_shuf_mask = xmm2;
+    const XMMRegister xmm_key_shuf_mask = xmm1;  // mask passed to every load_key call below
+    const XMMRegister xmm_temp1  = xmm2;  // xmm_temp1..4 hold round keys, so several keys can be loaded before the aesdec chain that consumes them
+    const XMMRegister xmm_temp2  = xmm3;
+    const XMMRegister xmm_temp3  = xmm4;
+    const XMMRegister xmm_temp4  = xmm5;

    __ enter(); // required for proper stackwalking of RuntimeStub frame
-    __ push(rsi);
-    __ movptr(from , from_param);
-    __ movptr(to   , to_param);
-    __ movptr(key  , key_param);
+    __ movptr(from, from_param);  // 'from' and 'to' are both rdx in this stub; 'to' is loaded later, after the input has been read
+    __ movptr(key, key_param);

+    // keylen is the length, in ints, of the expanded key array: {11, 13, 15} * 4 = {44, 52, 60}
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
-    // keylen = # of 32-bit words, convert to 128-bit words
-    __ shrl(keylen, 2);
-    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));
+    __ movptr(to, to_param);  // overwrites rdx: safe only because the input block is already in xmm_result

    // for decryption java expanded key ordering is rotated one position from what we want
    // so we start from 0x10 here and hit 0x00 last
    // we don't know if the key is aligned, hence not using load-execute form
-    load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
-    __ pxor  (xmm_result, xmm_temp);
-    for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {
-      aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
-    }
-    __ cmpl(keylen, 0);
-    __ jcc(Assembler::equal, L_doLast);
-    // only in 192 and 256 bit keys
-    aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
-    aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
-    __ subl(keylen, 2);
-    __ jcc(Assembler::equal, L_doLast);
-    // only in 256 bit keys
-    aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
-    aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);  // load keys 0x10-0x40 as a group; 0x10 is xor-ed in, the other three go through aesdec
+    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
+
+    __ pxor  (xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+    __ aesdec(xmm_result, xmm_temp3);
+    __ aesdec(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);  // same pattern for keys 0x50-0x80
+    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
+
+    __ aesdec(xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+    __ aesdec(xmm_result, xmm_temp3);
+    __ aesdec(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);  // from here on, xmm_temp1/xmm_temp2 always hold the next two keys
+    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);  // key 0x00 is loaded once, before the branches, and kept in xmm_temp3 for aesdeclast
+
+    __ cmpl(keylen, 44);  // 44-int key: 0x90/0xa0 are the last two aesdec keys, applied at L_doLast
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesdec(xmm_result, xmm_temp1);  // longer keys: 0x90/0xa0 are ordinary rounds
+    __ aesdec(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
+
+    __ cmpl(keylen, 52);  // 52-int key: 0xb0/0xc0 are the last two aesdec keys
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesdec(xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);  // remaining case (60 ints) falls through with 0xd0/0xe0 as the last two aesdec keys
+    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
+    __ aesdec(xmm_result, xmm_temp1);  // xmm_temp1/xmm_temp2 hold the last two aesdec keys for whichever key length got here
+    __ aesdec(xmm_result, xmm_temp2);
+
    // for decryption the aesdeclast operation is always on key+0x00
-    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
-    __ aesdeclast(xmm_result, xmm_temp);
-
+    __ aesdeclast(xmm_result, xmm_temp3);  // xmm_temp3 still holds key 0x00; nothing has written it since that load
    __ movdqu(Address(to, 0), xmm_result);  // store the result
-
    __ xorptr(rax, rax); // return 0
-    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

@ -2340,7 +2392,7 @@ class StubGenerator: public StubCodeGenerator {
  //   c_rarg4   - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
    address start = __ pc();
@ -2393,7 +2445,7 @@ class StubGenerator: public StubCodeGenerator {
    __ jcc(Assembler::notEqual, L_key_192_256);

    // 128 bit code follows here
-    __ movptr(pos, 0);
+    __ movl(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_128);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
@ -2423,15 +2475,15 @@ class StubGenerator: public StubCodeGenerator {
    __ leave();                                  // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

-  __ BIND(L_key_192_256);
-  // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+    __ BIND(L_key_192_256);
+    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be changed to use more xmm registers)
-    __ movptr(pos, 0);
-  __ align(OptoLoopAlignment);
-  __ BIND(L_loopTop_192);
+    __ movl(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_loopTop_192);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector

@ -2452,11 +2504,11 @@ class StubGenerator: public StubCodeGenerator {
    __ jcc(Assembler::notEqual, L_loopTop_192);
    __ jmp(L_exit);

-  __ BIND(L_key_256);
+    __ BIND(L_key_256);
    // 256-bit code follows here (could be changed to use more xmm registers)
-    __ movptr(pos, 0);
-  __ align(OptoLoopAlignment);
-  __ BIND(L_loopTop_256);
+    __ movl(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_loopTop_256);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector

@ -2495,7 +2547,7 @@ class StubGenerator: public StubCodeGenerator {
  //

  address generate_cipherBlockChaining_decryptAESCrypt() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
    address start = __ pc();
@ -2556,9 +2608,9 @@ class StubGenerator: public StubCodeGenerator {


    // 128-bit code follows here, parallelized
-    __ movptr(pos, 0);
-  __ align(OptoLoopAlignment);
-  __ BIND(L_singleBlock_loopTop_128);
+    __ movl(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_singleBlock_loopTop_128);
    __ cmpptr(len_reg, 0);           // any blocks left??
    __ jcc(Assembler::equal, L_exit);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
@ -2597,7 +2649,7 @@ class StubGenerator: public StubCodeGenerator {
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be optimized to use parallelism)
-    __ movptr(pos, 0);
+    __ movl(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlock_loopTop_192);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
@ -2622,7 +2674,7 @@ class StubGenerator: public StubCodeGenerator {

    __ BIND(L_key_256);
    // 256-bit code follows here (could be optimized to use parallelism)
-    __ movptr(pos, 0);
+    __ movl(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlock_loopTop_256);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
@ -2953,21 +2953,6 @@ class StubGenerator: public StubCodeGenerator {
    }
  }

-  // aesenc using specified key+offset
-  // can optionally specify that the shuffle mask is already in an xmmregister
-  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
-    load_key(xmmtmp, key, offset, xmm_shuf_mask);
-    __ aesenc(xmmdst, xmmtmp);
-  }
-
-  // aesdec using specified key+offset
-  // can optionally specify that the shuffle mask is already in an xmmregister
-  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
-    load_key(xmmtmp, key, offset, xmm_shuf_mask);
-    __ aesdec(xmmdst, xmmtmp);
-  }
-
-
  // Arguments:
  //
  // Inputs:
@ -2976,7 +2961,7 @@ class StubGenerator: public StubCodeGenerator {
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
    Label L_doLast;
@ -2988,15 +2973,17 @@ class StubGenerator: public StubCodeGenerator {
    const Register keylen      = rax;

    const XMMRegister xmm_result = xmm0;
-    const XMMRegister xmm_temp   = xmm1;
-    const XMMRegister xmm_key_shuf_mask = xmm2;
+    const XMMRegister xmm_key_shuf_mask = xmm1;
+    // On win64 xmm6-xmm15 must be preserved so don't use them.
+    const XMMRegister xmm_temp1  = xmm2;
+    const XMMRegister xmm_temp2  = xmm3;
+    const XMMRegister xmm_temp3  = xmm4;
+    const XMMRegister xmm_temp4  = xmm5;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

+    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
-    // keylen = # of 32-bit words, convert to 128-bit words
-    __ shrl(keylen, 2);
-    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
@ -3004,25 +2991,53 @@ class StubGenerator: public StubCodeGenerator {
    // For encryption, the java expanded key ordering is just what we need
    // we don't know if the key is aligned, hence not using load-execute form

-    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
-    __ pxor(xmm_result, xmm_temp);
-    for (int offset = 0x10; offset <= 0x90; offset += 0x10) {
-      aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
-    }
-    load_key  (xmm_temp, key, 0xa0, xmm_key_shuf_mask);
-    __ cmpl(keylen, 0);
-    __ jcc(Assembler::equal, L_doLast);
-    __ aesenc(xmm_result, xmm_temp);                   // only in 192 and 256 bit keys
-    aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
-    load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
-    __ subl(keylen, 2);
-    __ jcc(Assembler::equal, L_doLast);
-    __ aesenc(xmm_result, xmm_temp);                   // only in 256 bit keys
-    aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
-    load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+    load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
+    __ pxor(xmm_result, xmm_temp1);
+
+    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
+
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenc(xmm_result, xmm_temp2);
+    __ aesenc(xmm_result, xmm_temp3);
+    __ aesenc(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
+
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenc(xmm_result, xmm_temp2);
+    __ aesenc(xmm_result, xmm_temp3);
+    __ aesenc(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
+
+    __ cmpl(keylen, 44);
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenc(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
+
+    __ cmpl(keylen, 52);
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenc(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
-    __ aesenclast(xmm_result, xmm_temp);
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenclast(xmm_result, xmm_temp2);
    __ movdqu(Address(to, 0), xmm_result);        // store the result
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
@ -3040,7 +3055,7 @@ class StubGenerator: public StubCodeGenerator {
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_decryptBlock() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;
@ -3052,15 +3067,17 @@ class StubGenerator: public StubCodeGenerator {
    const Register keylen      = rax;

    const XMMRegister xmm_result = xmm0;
-    const XMMRegister xmm_temp   = xmm1;
-    const XMMRegister xmm_key_shuf_mask = xmm2;
+    const XMMRegister xmm_key_shuf_mask = xmm1;
+    // On win64 xmm6-xmm15 must be preserved so don't use them.
+    const XMMRegister xmm_temp1  = xmm2;
+    const XMMRegister xmm_temp2  = xmm3;
+    const XMMRegister xmm_temp3  = xmm4;
+    const XMMRegister xmm_temp4  = xmm5;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

+    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
-    // keylen = # of 32-bit words, convert to 128-bit words
-    __ shrl(keylen, 2);
-    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));
@ -3068,29 +3085,55 @@ class StubGenerator: public StubCodeGenerator {
    // for decryption java expanded key ordering is rotated one position from what we want
    // so we start from 0x10 here and hit 0x00 last
    // we don't know if the key is aligned, hence not using load-execute form
-    load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
-    __ pxor  (xmm_result, xmm_temp);
-    for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {
-      aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
-    }
-    __ cmpl(keylen, 0);
-    __ jcc(Assembler::equal, L_doLast);
-    // only in 192 and 256 bit keys
-    aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
-    aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
-    __ subl(keylen, 2);
-    __ jcc(Assembler::equal, L_doLast);
-    // only in 256 bit keys
-    aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
-    aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
+
+    __ pxor  (xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+    __ aesdec(xmm_result, xmm_temp3);
+    __ aesdec(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
+
+    __ aesdec(xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+    __ aesdec(xmm_result, xmm_temp3);
+    __ aesdec(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
+
+    __ cmpl(keylen, 44);
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesdec(xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
+
+    __ cmpl(keylen, 52);
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesdec(xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
+    __ aesdec(xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+
    // for decryption the aesdeclast operation is always on key+0x00
-    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
-    __ aesdeclast(xmm_result, xmm_temp);
-
+    __ aesdeclast(xmm_result, xmm_temp3);
    __ movdqu(Address(to, 0), xmm_result);  // store the result
-
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
@ -3109,7 +3152,7 @@ class StubGenerator: public StubCodeGenerator {
  //   c_rarg4   - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
    address start = __ pc();
@ -3133,16 +3176,19 @@ class StubGenerator: public StubCodeGenerator {
    const XMMRegister xmm_temp   = xmm1;
    // keys 0-10 preloaded into xmm2-xmm12
    const int XMM_REG_NUM_KEY_FIRST = 2;
-    const int XMM_REG_NUM_KEY_LAST  = 12;
+    const int XMM_REG_NUM_KEY_LAST  = 15;
    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
-    const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
+    const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
+    const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
+    const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
+    const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

 #ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
-    // save the xmm registers which must be preserved 6-12
+    // save the xmm registers which must be preserved 6-15
    __ subptr(rsp, -rsp_after_call_off * wordSize);
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
@ -3151,12 +3197,11 @@ class StubGenerator: public StubCodeGenerator {

    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
-    // load up xmm regs 2 thru 12 with key 0x00 - 0xa0
-    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+    // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
+    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }
-
    __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
@ -3167,16 +3212,15 @@ class StubGenerator: public StubCodeGenerator {
    // 128 bit code follows here
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
+
    __ BIND(L_loopTop_128);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
-
    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    __ aesenclast(xmm_result, xmm_key10);
-
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
@ -3198,24 +3242,23 @@ class StubGenerator: public StubCodeGenerator {

    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+    load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
+    load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be changed to use more xmm registers)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
+
    __ BIND(L_loopTop_192);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
-
    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
-    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
-    load_key(xmm_temp, key, 0xc0);
-    __ aesenclast(xmm_result, xmm_temp);
-
+    __ aesenclast(xmm_result, xmm_key12);
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
@ -3225,22 +3268,19 @@ class StubGenerator: public StubCodeGenerator {

    __ BIND(L_key_256);
    // 256-bit code follows here (could be changed to use more xmm registers)
+    load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
+
    __ BIND(L_loopTop_256);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
-
    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
-    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
-    aes_enc_key(xmm_result, xmm_temp, key, 0xc0);
-    aes_enc_key(xmm_result, xmm_temp, key, 0xd0);
    load_key(xmm_temp, key, 0xe0);
    __ aesenclast(xmm_result, xmm_temp);
-
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
@ -3267,7 +3307,7 @@ class StubGenerator: public StubCodeGenerator {
  //

  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
    address start = __ pc();
@ -3288,12 +3328,10 @@ class StubGenerator: public StubCodeGenerator {
 #endif
    const Register pos         = rax;

-    // xmm register assignments for the loops below
-    const XMMRegister xmm_result = xmm0;
    // keys preloaded into xmm5-xmm15: key+0x10..key+0xa0 in xmm5-xmm14, key+0x00 in xmm15
    const int XMM_REG_NUM_KEY_FIRST = 5;
    const int XMM_REG_NUM_KEY_LAST  = 15;
-    const XMMRegister xmm_key_first   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
+    const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);

    __ enter(); // required for proper stackwalking of RuntimeStub frame
@ -3312,13 +3350,14 @@ class StubGenerator: public StubCodeGenerator {
    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
-    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
-      if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00;
+    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }
+    load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);

    const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
+
    // registers holding the four results in the parallelized loop
    const XMMRegister xmm_result0 = xmm0;
    const XMMRegister xmm_result1 = xmm2;
@ -3376,8 +3415,12 @@ class StubGenerator: public StubCodeGenerator {
    __ jmp(L_multiBlock_loopTop_128);

    // registers used in the non-parallelized loops
+    // xmm register assignments for the loops below
+    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_prev_block_cipher_save = xmm2;
-    const XMMRegister xmm_temp   = xmm3;
+    const XMMRegister xmm_key11 = xmm3;
+    const XMMRegister xmm_key12 = xmm4;
+    const XMMRegister xmm_temp  = xmm4;

    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlock_loopTop_128);
@ -3415,12 +3458,15 @@ class StubGenerator: public StubCodeGenerator {

    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+    load_key(xmm_key11, key, 0xb0);
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be optimized to use parallelism)
+    load_key(xmm_key12, key, 0xc0);     // 192-bit key goes up to c0
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
+
    __ BIND(L_singleBlock_loopTop_192);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
@ -3428,14 +3474,13 @@ class StubGenerator: public StubCodeGenerator {
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
-    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);     // 192-bit key goes up to c0
-    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
+    __ aesdec(xmm_result, xmm_key11);
+    __ aesdec(xmm_result, xmm_key12);
    __ aesdeclast(xmm_result, xmm_key_last);                    // xmm15 always came from key+0
    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
-    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
    // no need to store r to memory until we exit
-    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block
-
+    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
@ -3445,23 +3490,26 @@ class StubGenerator: public StubCodeGenerator {
    // 256-bit code follows here (could be optimized to use parallelism)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
+
    __ BIND(L_singleBlock_loopTop_256);
-    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
+    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
-    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);     // 256-bit key goes up to e0
-    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
-    aes_dec_key(xmm_result, xmm_temp, key, 0xd0);
-    aes_dec_key(xmm_result, xmm_temp, key, 0xe0);
-    __ aesdeclast(xmm_result, xmm_key_last);             // xmm15 came from key+0
+    __ aesdec(xmm_result, xmm_key11);
+    load_key(xmm_temp, key, 0xc0);
+    __ aesdec(xmm_result, xmm_temp);
+    load_key(xmm_temp, key, 0xd0);
+    __ aesdec(xmm_result, xmm_temp);
+    load_key(xmm_temp, key, 0xe0);     // 256-bit key goes up to e0
+    __ aesdec(xmm_result, xmm_temp);
+    __ aesdeclast(xmm_result, xmm_key_last);          // xmm15 came from key+0
    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
-    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
    // no need to store r to memory until we exit
-    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block
-
+    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp
@ -489,8 +489,8 @@ void VM_Version::get_processor_features() {
  }

  // The AES intrinsic stubs require AES instruction support (of course)
-  // but also require AVX and sse3 modes for instructions it use.
-  if (UseAES && (UseAVX > 0) && (UseSSE > 2)) {
+  // but also require SSE3 mode for the instructions they use.
+  if (UseAES && (UseSSE > 2)) {
    if (FLAG_IS_DEFAULT(UseAESIntrinsics)) {
      UseAESIntrinsics = true;
    }
--- a/hotspot/src/cpu/zero/vm/assembler_zero.cpp
+++ b/hotspot/src/cpu/zero/vm/assembler_zero.cpp
@ -56,15 +56,9 @@ void Assembler::pd_patch_instruction(address branch, address target) {
  ShouldNotCallThis();
 }

-#ifndef PRODUCT
-void Assembler::pd_print_patched_instruction(address branch) {
-  ShouldNotCallThis();
-}
-#endif // PRODUCT
-
 void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0)
-    emit_byte(AbstractAssembler::code_fill_byte());
+    emit_int8(AbstractAssembler::code_fill_byte());
 }

 void MacroAssembler::bang_stack_with_offset(int offset) {
@ -72,8 +66,7 @@ void MacroAssembler::bang_stack_with_offset(int offset) {
 }

 void MacroAssembler::advance(int bytes) {
-  _code_pos += bytes;
-  sync();
+  code_section()->set_end(code_section()->end() + bytes);
 }

 RegisterOrConstant MacroAssembler::delayed_value_impl(
--- a/hotspot/src/cpu/zero/vm/assembler_zero.hpp
+++ b/hotspot/src/cpu/zero/vm/assembler_zero.hpp
@ -37,9 +37,6 @@ class Assembler : public AbstractAssembler {

 public:
  void pd_patch_instruction(address branch, address target);
-#ifndef PRODUCT
-  static void pd_print_patched_instruction(address branch);
-#endif // PRODUCT
 };

 class MacroAssembler : public Assembler {
--- a/hotspot/src/os_cpu/solaris_x86/vm/assembler_solaris_x86.cpp
+++ b/hotspot/src/os_cpu/solaris_x86/vm/assembler_solaris_x86.cpp
@ -116,7 +116,7 @@ void MacroAssembler::get_thread(Register thread) {
  ThreadLocalStorage::pd_tlsAccessMode tlsMode = ThreadLocalStorage::pd_getTlsAccessMode ();
  if (tlsMode == ThreadLocalStorage::pd_tlsAccessIndirect) {            // T1
     // Use thread as a temporary: mov r, gs:[0]; mov r, [r+tlsOffset]
-     emit_byte (segment);
+     emit_int8 (segment);
     // ExternalAddress doesn't work because it can't take NULL
     AddressLiteral null(0, relocInfo::none);
     movptr (thread, null);
@ -125,7 +125,7 @@ void MacroAssembler::get_thread(Register thread) {
  } else
  if (tlsMode == ThreadLocalStorage::pd_tlsAccessDirect) {              // T2
     // mov r, gs:[tlsOffset]
-     emit_byte (segment);
+     emit_int8 (segment);
     AddressLiteral tls_off((address)ThreadLocalStorage::pd_getTlsOffset(), relocInfo::none);
     movptr (thread, tls_off);
     return ;
--- a/hotspot/src/os_cpu/windows_x86/vm/assembler_windows_x86.cpp
+++ b/hotspot/src/os_cpu/windows_x86/vm/assembler_windows_x86.cpp
@ -30,7 +30,7 @@


 void MacroAssembler::int3() {
-  emit_byte(0xCC);
+  emit_int8((unsigned char)0xCC);
 }

 #ifndef _LP64
--- a/hotspot/src/share/vm/asm/assembler.cpp
+++ b/hotspot/src/share/vm/asm/assembler.cpp
@ -109,37 +109,6 @@ void AbstractAssembler::flush() {
  ICache::invalidate_range(addr_at(0), offset());
 }

-
-void AbstractAssembler::a_byte(int x) {
-  emit_byte(x);
-}
-
-
-void AbstractAssembler::a_long(jint x) {
-  emit_long(x);
-}
-
-// Labels refer to positions in the (to be) generated code.  There are bound
-// and unbound
-//
-// Bound labels refer to known positions in the already generated code.
-// offset() is the position the label refers to.
-//
-// Unbound labels refer to unknown positions in the code to be generated; it
-// may contain a list of unresolved displacements that refer to it
-#ifndef PRODUCT
-void AbstractAssembler::print(Label& L) {
-  if (L.is_bound()) {
-    tty->print_cr("bound label to %d|%d", L.loc_pos(), L.loc_sect());
-  } else if (L.is_unbound()) {
-    L.print_instructions((MacroAssembler*)this);
-  } else {
-    tty->print_cr("label in inconsistent state (loc = %d)", L.loc());
-  }
-}
-#endif // PRODUCT
-
-
 void AbstractAssembler::bind(Label& L) {
  if (L.is_bound()) {
    // Assembler can bind a label more than once to the same place.
@ -342,28 +311,3 @@ bool MacroAssembler::needs_explicit_null_check(intptr_t offset) {
 #endif
  return offset < 0 || os::vm_page_size() <= offset;
 }
-
-#ifndef PRODUCT
-void Label::print_instructions(MacroAssembler* masm) const {
-  CodeBuffer* cb = masm->code();
-  for (int i = 0; i < _patch_index; ++i) {
-    int branch_loc;
-    if (i >= PatchCacheSize) {
-      branch_loc = _patch_overflow->at(i - PatchCacheSize);
-    } else {
-      branch_loc = _patches[i];
-    }
-    int branch_pos  = CodeBuffer::locator_pos(branch_loc);
-    int branch_sect = CodeBuffer::locator_sect(branch_loc);
-    address branch = cb->locator_address(branch_loc);
-    tty->print_cr("unbound label");
-    tty->print("@ %d|%d ", branch_pos, branch_sect);
-    if (branch_sect == CodeBuffer::SECT_CONSTS) {
-      tty->print_cr(PTR_FORMAT, *(address*)branch);
-      continue;
-    }
-    masm->pd_print_patched_instruction(branch);
-    tty->cr();
-  }
-}
-#endif // ndef PRODUCT
--- a/hotspot/src/share/vm/asm/assembler.hpp
+++ b/hotspot/src/share/vm/asm/assembler.hpp
@ -216,17 +216,6 @@ class AbstractAssembler : public ResourceObj  {
  bool isByte(int x) const             { return 0 <= x && x < 0x100; }
  bool isShiftCount(int x) const       { return 0 <= x && x < 32; }

-  void emit_int8(   int8_t  x) { code_section()->emit_int8(   x); }
-  void emit_int16(  int16_t x) { code_section()->emit_int16(  x); }
-  void emit_int32(  int32_t x) { code_section()->emit_int32(  x); }
-  void emit_int64(  int64_t x) { code_section()->emit_int64(  x); }
-
-  void emit_float(  jfloat  x) { code_section()->emit_float(  x); }
-  void emit_double( jdouble x) { code_section()->emit_double( x); }
-  void emit_address(address x) { code_section()->emit_address(x); }
-
-  void emit_byte(int x)  { emit_int8 (x); }  // deprecated
-  void emit_word(int x)  { emit_int16(x); }  // deprecated
  void emit_long(jint x) { emit_int32(x); }  // deprecated

  // Instruction boundaries (required when emitting relocatable values).
@ -277,9 +266,6 @@ class AbstractAssembler : public ResourceObj  {
  };
 #endif

-  // Label functions
-  void print(Label& L);
-
 public:

  // Creation
@ -288,6 +274,15 @@ class AbstractAssembler : public ResourceObj  {
  // ensure buf contains all code (call this before using/copying the code)
  void flush();

+  void emit_int8(   int8_t  x) { code_section()->emit_int8(   x); }
+  void emit_int16(  int16_t x) { code_section()->emit_int16(  x); }
+  void emit_int32(  int32_t x) { code_section()->emit_int32(  x); }
+  void emit_int64(  int64_t x) { code_section()->emit_int64(  x); }
+
+  void emit_float(  jfloat  x) { code_section()->emit_float(  x); }
+  void emit_double( jdouble x) { code_section()->emit_double( x); }
+  void emit_address(address x) { code_section()->emit_address(x); }
+
  // min and max values for signed immediate ranges
  static int min_simm(int nbits) { return -(intptr_t(1) << (nbits - 1))    ; }
  static int max_simm(int nbits) { return  (intptr_t(1) << (nbits - 1)) - 1; }
@ -327,8 +322,6 @@ class AbstractAssembler : public ResourceObj  {
  void    clear_inst_mark()       {        code_section()->clear_mark(); }

  // Constants in code
-  void a_byte(int x);
-  void a_long(jint x);
  void relocate(RelocationHolder const& rspec, int format = 0) {
    assert(!pd_check_instruction_mark()
        || inst_mark() == NULL || inst_mark() == code_section()->end(),
@ -441,15 +434,6 @@ class AbstractAssembler : public ResourceObj  {
   */
  void pd_patch_instruction(address branch, address target);

-#ifndef PRODUCT
-  /**
-   * Platform-dependent method of printing an instruction that needs to be
-   * patched.
-   *
-   * @param branch the instruction to be patched in the buffer.
-   */
-  static void pd_print_patched_instruction(address branch);
-#endif // PRODUCT
 };

 #ifdef TARGET_ARCH_x86
--- a/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp
+++ b/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp
@ -3442,6 +3442,11 @@ bool GraphBuilder::try_inline_intrinsics(ciMethod* callee) {
      preserves_state = true;
      break;

+    case vmIntrinsics::_loadFence :
+    case vmIntrinsics::_storeFence:
+    case vmIntrinsics::_fullFence :
+      break;
+
    default                       : return false; // do not inline
  }
  // create intrinsic node
--- a/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp
+++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp
@ -2977,6 +2977,16 @@ void LIRGenerator::do_Intrinsic(Intrinsic* x) {
    do_CompareAndSwap(x, longType);
    break;

+  case vmIntrinsics::_loadFence :
+    if (os::is_MP()) __ membar_acquire();
+    break;
+  case vmIntrinsics::_storeFence:
+    if (os::is_MP()) __ membar_release();
+    break;
+  case vmIntrinsics::_fullFence :
+    if (os::is_MP()) __ membar();
+    break;
+
  case vmIntrinsics::_Reference_get:
    do_Reference_get(x);
    break;
--- a/hotspot/src/share/vm/ci/ciField.cpp
+++ b/hotspot/src/share/vm/ci/ciField.cpp
@ -366,10 +366,12 @@ bool ciField::will_link(ciInstanceKlass* accessing_klass,
 // ------------------------------------------------------------------
 // ciField::print
 void ciField::print() {
-  tty->print("<ciField ");
+  tty->print("<ciField name=");
  _holder->print_name();
  tty->print(".");
  _name->print_symbol();
+  tty->print(" signature=");
+  _signature->print_symbol();
  tty->print(" offset=%d type=", _offset);
  if (_type != NULL) _type->print_name();
  else               tty->print("(reference)");
--- a/hotspot/src/share/vm/classfile/classLoaderData.cpp
+++ b/hotspot/src/share/vm/classfile/classLoaderData.cpp
@ -169,16 +169,18 @@ void ClassLoaderData::add_dependency(Handle dependency, TRAPS) {
    ok = (objArrayOop)ok->obj_at(1);
  }

+  // Must handle over GC points
+  assert (last != NULL, "dependencies should be initialized");
+  objArrayHandle last_handle(THREAD, last);
+
  // Create a new dependency node with fields for (class_loader or mirror, next)
  objArrayOop deps = oopFactory::new_objectArray(2, CHECK);
  deps->obj_at_put(0, dependency());

-  // Must handle over more GC points
+  // Must handle over GC points
  objArrayHandle new_dependency(THREAD, deps);

  // Add the dependency under lock
-  assert (last != NULL, "dependencies should be initialized");
-  objArrayHandle last_handle(THREAD, last);
  locked_add_dependency(last_handle, new_dependency);
 }

--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp
+++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp
@ -756,6 +756,15 @@
  do_intrinsic(_unpark,                   sun_misc_Unsafe,        unpark_name, unpark_signature,                 F_RN)  \
   do_name(     unpark_name,                                     "unpark")                                              \
   do_alias(    unpark_signature,                               /*(LObject;)V*/ object_void_signature)                  \
+  do_intrinsic(_loadFence,                sun_misc_Unsafe,        loadFence_name, loadFence_signature,           F_RN)  \
+   do_name(     loadFence_name,                                  "loadFence")                                           \
+   do_alias(    loadFence_signature,                              void_method_signature)                                \
+  do_intrinsic(_storeFence,               sun_misc_Unsafe,        storeFence_name, storeFence_signature,         F_RN)  \
+   do_name(     storeFence_name,                                 "storeFence")                                          \
+   do_alias(    storeFence_signature,                             void_method_signature)                                \
+  do_intrinsic(_fullFence,                sun_misc_Unsafe,        fullFence_name, fullFence_signature,           F_RN)  \
+   do_name(     fullFence_name,                                  "fullFence")                                           \
+   do_alias(    fullFence_signature,                              void_method_signature)                                \
                                                                                                                        \
  /* unsafe memory references (there are a lot of them...) */                                                           \
  do_signature(getObject_signature,       "(Ljava/lang/Object;J)Ljava/lang/Object;")                                    \
@ -897,12 +906,14 @@
  do_intrinsic(_getAndAddLong,            sun_misc_Unsafe,        getAndAddLong_name, getAndAddLong_signature, F_R)     \
   do_name(     getAndAddLong_name,                               "getAndAddLong")                                      \
   do_signature(getAndAddLong_signature,                          "(Ljava/lang/Object;JJ)J" )                           \
-  do_intrinsic(_getAndSetInt,             sun_misc_Unsafe,        getAndSet_name, getAndSetInt_signature, F_R)          \
-   do_name(     getAndSet_name,                                   "getAndSet")                                          \
+  do_intrinsic(_getAndSetInt,             sun_misc_Unsafe,        getAndSetInt_name, getAndSetInt_signature, F_R)       \
+   do_name(     getAndSetInt_name,                                "getAndSetInt")                                       \
   do_alias(    getAndSetInt_signature,                         /*"(Ljava/lang/Object;JI)I"*/ getAndAddInt_signature)   \
-  do_intrinsic(_getAndSetLong,            sun_misc_Unsafe,        getAndSet_name, getAndSetLong_signature, F_R)         \
+  do_intrinsic(_getAndSetLong,            sun_misc_Unsafe,        getAndSetLong_name, getAndSetLong_signature, F_R)     \
+   do_name(     getAndSetLong_name,                               "getAndSetLong")                                      \
   do_alias(    getAndSetLong_signature,                        /*"(Ljava/lang/Object;JJ)J"*/ getAndAddLong_signature)  \
-  do_intrinsic(_getAndSetObject,          sun_misc_Unsafe,        getAndSet_name, getAndSetObject_signature,  F_R)      \
+  do_intrinsic(_getAndSetObject,          sun_misc_Unsafe,        getAndSetObject_name, getAndSetObject_signature,  F_R)\
+   do_name(     getAndSetObject_name,                             "getAndSetObject")                                    \
   do_signature(getAndSetObject_signature,                        "(Ljava/lang/Object;JLjava/lang/Object;)Ljava/lang/Object;" ) \
                                                                                                                        \
  /* prefetch_signature is shared by all prefetch variants */                                                           \
--- a/hotspot/src/share/vm/compiler/compilerOracle.cpp
+++ b/hotspot/src/share/vm/compiler/compilerOracle.cpp
@ -538,6 +538,7 @@ void CompilerOracle::parse_from_line(char* line) {

  if (match != NULL) {
    if (!_quiet) {
+      ResourceMark rm;
      tty->print("CompilerOracle: %s ", command_names[command]);
      match->print();
    }
--- a/hotspot/src/share/vm/opto/addnode.cpp
+++ b/hotspot/src/share/vm/opto/addnode.cpp
@ -189,6 +189,11 @@ Node *AddNode::Ideal(PhaseGVN *phase, bool can_reshape) {
      set_req(1, addx);
      set_req(2, a22);
      progress = this;
+      PhaseIterGVN *igvn = phase->is_IterGVN();
+      if (add2->outcnt() == 0 && igvn) {
+        // add2 is now disconnected (outcnt() == 0); push it on the IGVN worklist.
+        igvn->_worklist.push(add2);
+      }
    }
  }

@ -624,6 +629,11 @@ Node *AddPNode::Ideal(PhaseGVN *phase, bool can_reshape) {
    if( t22->singleton() && (t22 != Type::TOP) ) {  // Right input is an add of a constant?
      set_req(Address, phase->transform(new (phase->C) AddPNode(in(Base),in(Address),add->in(1))));
      set_req(Offset, add->in(2));
+      PhaseIterGVN *igvn = phase->is_IterGVN();
+      if (add->outcnt() == 0 && igvn) {
+        // add is now disconnected (outcnt() == 0); push it on the IGVN worklist.
+        igvn->_worklist.push((Node*)add);
+      }
      return this;              // Made progress
    }
  }
--- a/hotspot/src/share/vm/opto/bytecodeInfo.cpp
+++ b/hotspot/src/share/vm/opto/bytecodeInfo.cpp
@ -403,7 +403,7 @@ const char* InlineTree::check_can_parse(ciMethod* callee) {
 //------------------------------print_inlining---------------------------------
 // Really, the failure_msg can be a success message also.
 void InlineTree::print_inlining(ciMethod* callee_method, int caller_bci, const char* failure_msg) const {
-  CompileTask::print_inlining(callee_method, inline_level(), caller_bci, failure_msg ? failure_msg : "inline");
+  C->print_inlining(callee_method, inline_level(), caller_bci, failure_msg ? failure_msg : "inline");
  if (callee_method == NULL)  tty->print(" callee not monotonic or profiled");
  if (Verbose && callee_method) {
    const InlineTree *top = this;
--- a/hotspot/src/share/vm/opto/callGenerator.cpp
+++ b/hotspot/src/share/vm/opto/callGenerator.cpp
@ -274,6 +274,9 @@ class LateInlineCallGenerator : public DirectCallGenerator {
  virtual void do_late_inline();

  virtual JVMState* generate(JVMState* jvms) {
+    Compile *C = Compile::current();
+    C->print_inlining_skip(this);
+
    // Record that this call site should be revisited once the main
    // parse is finished.
    Compile::current()->add_late_inline(this);
@ -284,7 +287,6 @@ class LateInlineCallGenerator : public DirectCallGenerator {
    // as is done for allocations and macro expansion.
    return DirectCallGenerator::generate(jvms);
  }
-
 };


@ -307,7 +309,9 @@ void LateInlineCallGenerator::do_late_inline() {

  // Make sure the state is a MergeMem for parsing.
  if (!map->in(TypeFunc::Memory)->is_MergeMem()) {
-    map->set_req(TypeFunc::Memory, MergeMemNode::make(C, map->in(TypeFunc::Memory)));
+    Node* mem = MergeMemNode::make(C, map->in(TypeFunc::Memory));
+    C->initial_gvn()->set_type_bottom(mem);
+    map->set_req(TypeFunc::Memory, mem);
  }

  // Make enough space for the expression stack and transfer the incoming arguments
@ -320,6 +324,8 @@ void LateInlineCallGenerator::do_late_inline() {
    }
  }

+  C->print_inlining_insert(this);
+
  CompileLog* log = C->log();
  if (log != NULL) {
    log->head("late_inline method='%d'", log->identify(method()));
@ -608,7 +614,7 @@ CallGenerator* CallGenerator::for_method_handle_inline(JVMState* jvms, ciMethod*
        if (cg != NULL && cg->is_inline())
          return cg;
      } else {
-        if (PrintInlining)  CompileTask::print_inlining(callee, jvms->depth() - 1, jvms->bci(), "receiver not constant");
+        if (PrintInlining)  C->print_inlining(callee, jvms->depth() - 1, jvms->bci(), "receiver not constant");
      }
    }
    break;
--- a/hotspot/src/share/vm/opto/callGenerator.hpp
+++ b/hotspot/src/share/vm/opto/callGenerator.hpp
@ -147,9 +147,9 @@ class CallGenerator : public ResourceObj {
                                                CallGenerator* cg);
  virtual Node* generate_predicate(JVMState* jvms) { return NULL; };

-  static void print_inlining(ciMethod* callee, int inline_level, int bci, const char* msg) {
+  static void print_inlining(Compile* C, ciMethod* callee, int inline_level, int bci, const char* msg) {  // Reports msg for callee through C->print_inlining(); does nothing unless PrintInlining is set.
    if (PrintInlining)
-      CompileTask::print_inlining(callee, inline_level, bci, msg);
+      C->print_inlining(callee, inline_level, bci, msg);  // appended to the compilation's print-inlining stream
  }
 };

--- a/hotspot/src/share/vm/opto/callnode.cpp
+++ b/hotspot/src/share/vm/opto/callnode.cpp
@ -751,7 +751,7 @@ void CallNode::extract_projections(CallProjections* projs, bool separate_io_proj
        projs->fallthrough_ioproj = pn;
      for (DUIterator j = pn->outs(); pn->has_out(j); j++) {
        Node* e = pn->out(j);
-        if (e->Opcode() == Op_CreateEx && e->in(0)->is_CatchProj()) {
+        if (e->Opcode() == Op_CreateEx && e->in(0)->is_CatchProj() && e->outcnt() > 0) {
          assert(projs->exobj == NULL, "only one");
          projs->exobj = e;
        }
--- a/hotspot/src/share/vm/opto/cfgnode.cpp
+++ b/hotspot/src/share/vm/opto/cfgnode.cpp
@ -1566,6 +1566,10 @@ Node *PhiNode::Ideal(PhaseGVN *phase, bool can_reshape) {
    Node* n = in(j);            // Get the input
    if (rc == NULL || phase->type(rc) == Type::TOP) {
      if (n != top) {           // Not already top?
+        PhaseIterGVN *igvn = phase->is_IterGVN();
+        if (can_reshape && igvn != NULL) {
+          igvn->_worklist.push(r);
+        }
        set_req(j, top);        // Nuke it down
        progress = this;        // Record progress
      }
--- a/hotspot/src/share/vm/opto/compile.cpp
+++ b/hotspot/src/share/vm/opto/compile.cpp
@ -610,7 +610,9 @@ Compile::Compile( ciEnv* ci_env, C2Compiler* compiler, ciMethod* target, int osr
                  _trace_opto_output(TraceOptoOutput || method()->has_option("TraceOptoOutput")),
                  _printer(IdealGraphPrinter::printer()),
 #endif
-                  _congraph(NULL) {
+                  _congraph(NULL),
+                  _print_inlining_list(NULL),
+                  _print_inlining(0) {
  C = this;

  CompileWrapper cw(this);
@ -666,6 +668,9 @@ Compile::Compile( ciEnv* ci_env, C2Compiler* compiler, ciMethod* target, int osr
  PhaseGVN gvn(node_arena(), estimated_size);
  set_initial_gvn(&gvn);

+  if (PrintInlining) {
+    _print_inlining_list = new (comp_arena())GrowableArray<PrintInliningBuffer>(comp_arena(), 1, 1, PrintInliningBuffer());
+  }
  { // Scope for timing the parser
    TracePhase t3("parse", &_t_parser, true);

@ -754,6 +759,7 @@ Compile::Compile( ciEnv* ci_env, C2Compiler* compiler, ciMethod* target, int osr
      }
    }
    assert(_late_inlines.length() == 0, "should have been processed");
+    dump_inlining();

    print_method("Before RemoveUseless", 3);

@ -899,7 +905,9 @@ Compile::Compile( ciEnv* ci_env,
 #endif
    _dead_node_list(comp_arena()),
    _dead_node_count(0),
-    _congraph(NULL) {
+    _congraph(NULL),
+    _print_inlining_list(NULL),
+    _print_inlining(0) {
  C = this;

 #ifndef PRODUCT
@ -3351,3 +3359,11 @@ void Compile::ConstantTable::fill_jump_table(CodeBuffer& cb, MachConstantNode* n
    cb.consts()->relocate((address) constant_addr, relocInfo::internal_word_type);
  }
 }
+
+void Compile::dump_inlining() {  // Write every buffered PrintInlining piece to tty, in list order; no-op unless PrintInlining is set.
+  if (PrintInlining) {
+    for (int i = 0; i < _print_inlining_list->length(); i++) {
+      tty->print("%s", _print_inlining_list->at(i).ss()->as_string());  // buffered text is data, never a format string
+    }
+  }
+}
--- a/hotspot/src/share/vm/opto/compile.hpp
+++ b/hotspot/src/share/vm/opto/compile.hpp
@ -30,6 +30,7 @@
 #include "code/debugInfoRec.hpp"
 #include "code/exceptionHandlerTable.hpp"
 #include "compiler/compilerOracle.hpp"
+#include "compiler/compileBroker.hpp"
 #include "libadt/dict.hpp"
 #include "libadt/port.hpp"
 #include "libadt/vectset.hpp"
@ -369,6 +370,61 @@ class Compile : public Phase {
  GrowableArray<CallGenerator*> _late_inlines;  // List of CallGenerators to be revisited after
                                                // main parsing has finished.

+  // Inlining may not happen in parse order which would make
+  // PrintInlining output confusing. Keep track of PrintInlining
+  // pieces in order.
+  class PrintInliningBuffer : public ResourceObj {
+   private:
+    CallGenerator* _cg;  // generator tagged on this piece by print_inlining_skip(); NULL otherwise
+    stringStream*  _ss;  // text collected for this piece
+
+   public:
+    PrintInliningBuffer()
+      : _cg(NULL), _ss(new stringStream()) {}
+
+    CallGenerator* cg() const { return _cg; }
+    void set_cg(CallGenerator* cg) { _cg = cg; }
+    stringStream* ss() const { return _ss; }
+  };
+
+  GrowableArray<PrintInliningBuffer>* _print_inlining_list;
+  int _print_inlining;
+
+ public:
+
+  outputStream* print_inlining_stream() const {  // Stream of the current buffered piece; tty when no buffer list was allocated (e.g. PrintIntrinsics without PrintInlining).
+    return (_print_inlining_list != NULL) ? (outputStream*)_print_inlining_list->at(_print_inlining).ss() : tty;
+  }
+
+  void print_inlining_skip(CallGenerator* cg) {  // Tag the current piece with cg, then continue output in a new piece placed after it.
+    if (!PrintInlining)  return;
+    _print_inlining_list->at(_print_inlining).set_cg(cg);
+    ++_print_inlining;
+    // Open an empty piece at the new current index.
+    _print_inlining_list->insert_before(_print_inlining, PrintInliningBuffer());
+  }
+
+  void print_inlining_insert(CallGenerator* cg) {
+    // Locate the piece tagged with cg, open an empty piece right after it,
+    // make that new piece the current one and clear the tag.
+    if (!PrintInlining)  return;
+    for (int slot = 0; slot < _print_inlining_list->length(); slot++) {
+      if (_print_inlining_list->at(slot).cg() != cg)  continue;
+      _print_inlining = slot + 1;
+      _print_inlining_list->insert_before(_print_inlining, PrintInliningBuffer());
+      _print_inlining_list->at(slot).set_cg(NULL);
+      return;
+    }
+    ShouldNotReachHere();  // no piece carries this cg
+  }
+
+  void print_inlining(ciMethod* method, int inline_level, int bci, const char* msg = NULL) {  // Format one inlining message and append it to the current print-inlining stream.
+    stringStream ss;
+    CompileTask::print_inlining(&ss, method, inline_level, bci, msg);
+    print_inlining_stream()->print("%s", ss.as_string());  // already-formatted text must not be re-read as a format string
+  }
+
+ private:
  // Matching, CFG layout, allocation, code generation
  PhaseCFG*             _cfg;                   // Results of CFG finding
  bool                  _select_24_bit_instr;   // We selected an instruction with a 24-bit result
@ -591,7 +647,7 @@ class Compile : public Phase {
  void         reset_dead_node_list()      { _dead_node_list.Reset();
                                             _dead_node_count = 0;
                                           }
-  uint          live_nodes()               {
+  uint          live_nodes() const         {
    int  val = _unique - _dead_node_count;
    assert (val >= 0, err_msg_res("number of tracked dead nodes %d more than created nodes %d", _unique, _dead_node_count));
            return (uint) val;
@ -702,7 +758,7 @@ class Compile : public Phase {

  void              identify_useful_nodes(Unique_Node_List &useful);
  void              update_dead_node_list(Unique_Node_List &useful);
-  void              remove_useless_nodes  (Unique_Node_List &useful);
+  void              remove_useless_nodes (Unique_Node_List &useful);

  WarmCallInfo*     warm_calls() const          { return _warm_calls; }
  void          set_warm_calls(WarmCallInfo* l) { _warm_calls = l; }
@ -711,6 +767,8 @@ class Compile : public Phase {
  // Record this CallGenerator for inlining at the end of parsing.
  void              add_late_inline(CallGenerator* cg) { _late_inlines.push(cg); }

+  void dump_inlining();
+
  // Matching, CFG layout, allocation, code generation
  PhaseCFG*         cfg()                       { return _cfg; }
  bool              select_24_bit_instr() const { return _select_24_bit_instr; }
--- a/hotspot/src/share/vm/opto/doCall.cpp
+++ b/hotspot/src/share/vm/opto/doCall.cpp
@ -40,19 +40,24 @@
 #include "prims/nativeLookup.hpp"
 #include "runtime/sharedRuntime.hpp"

-void trace_type_profile(ciMethod *method, int depth, int bci, ciMethod *prof_method, ciKlass *prof_klass, int site_count, int receiver_count) {
+void trace_type_profile(Compile* C, ciMethod *method, int depth, int bci, ciMethod *prof_method, ciKlass *prof_klass, int site_count, int receiver_count) {  // Print the receiver klass and its receiver_count/site_count for a profiled call site.
  if (TraceTypeProfile || PrintInlining NOT_PRODUCT(|| PrintOptoInlining)) {
+    outputStream* out = tty;  // default sink; switched to C's print-inlining stream when PrintInlining is set
    if (!PrintInlining) {
      if (NOT_PRODUCT(!PrintOpto &&) !PrintCompilation) {
        method->print_short_name();
        tty->cr();
      }
      CompileTask::print_inlining(prof_method, depth, bci);
+    } else {
+      out = C->print_inlining_stream();
    }
-    CompileTask::print_inline_indent(depth);
-    tty->print(" \\-> TypeProfile (%d/%d counts) = ", receiver_count, site_count);
-    prof_klass->name()->print_symbol();
-    tty->cr();
+    CompileTask::print_inline_indent(depth, out);
+    out->print(" \\-> TypeProfile (%d/%d counts) = ", receiver_count, site_count);
+    stringStream ss;
+    prof_klass->name()->print_symbol_on(&ss);
+    out->print("%s", ss.as_string());  // the klass name is data, never a format string
+    out->cr();
  }
 }

@ -233,13 +238,13 @@ CallGenerator* Compile::call_generator(ciMethod* callee, int vtable_index, bool
          }
          if (miss_cg != NULL) {
            if (next_hit_cg != NULL) {
-              trace_type_profile(jvms->method(), jvms->depth() - 1, jvms->bci(), next_receiver_method, profile.receiver(1), site_count, profile.receiver_count(1));
+              trace_type_profile(C, jvms->method(), jvms->depth() - 1, jvms->bci(), next_receiver_method, profile.receiver(1), site_count, profile.receiver_count(1));
              // We don't need to record dependency on a receiver here and below.
              // Whenever we inline, the dependency is added by Parse::Parse().
              miss_cg = CallGenerator::for_predicted_call(profile.receiver(1), miss_cg, next_hit_cg, PROB_MAX);
            }
            if (miss_cg != NULL) {
-              trace_type_profile(jvms->method(), jvms->depth() - 1, jvms->bci(), receiver_method, profile.receiver(0), site_count, receiver_count);
+              trace_type_profile(C, jvms->method(), jvms->depth() - 1, jvms->bci(), receiver_method, profile.receiver(0), site_count, receiver_count);
              CallGenerator* cg = CallGenerator::for_predicted_call(profile.receiver(0), miss_cg, hit_cg, profile.receiver_prob(0));
              if (cg != NULL)  return cg;
            }
--- a/hotspot/src/share/vm/opto/graphKit.cpp
+++ b/hotspot/src/share/vm/opto/graphKit.cpp
@ -1771,11 +1771,21 @@ void GraphKit::replace_call(CallNode* call, Node* result) {
  CallProjections callprojs;
  call->extract_projections(&callprojs, true);

-  // Replace all the old call edges with the edges from the inlining result
-  C->gvn_replace_by(callprojs.fallthrough_catchproj, final_state->in(TypeFunc::Control));
-  C->gvn_replace_by(callprojs.fallthrough_memproj,   final_state->in(TypeFunc::Memory));
-  C->gvn_replace_by(callprojs.fallthrough_ioproj,    final_state->in(TypeFunc::I_O));
+  Node* init_mem = call->in(TypeFunc::Memory);
  Node* final_mem = final_state->in(TypeFunc::Memory);
+  Node* final_ctl = final_state->in(TypeFunc::Control);
+  Node* final_io = final_state->in(TypeFunc::I_O);
+
+  // Replace all the old call edges with the edges from the inlining result
+  if (callprojs.fallthrough_catchproj != NULL) {
+    C->gvn_replace_by(callprojs.fallthrough_catchproj, final_ctl);
+  }
+  if (callprojs.fallthrough_memproj != NULL) {
+    C->gvn_replace_by(callprojs.fallthrough_memproj,   final_mem);
+  }
+  if (callprojs.fallthrough_ioproj != NULL) {
+    C->gvn_replace_by(callprojs.fallthrough_ioproj,    final_io);
+  }

  // Replace the result with the new result if it exists and is used
  if (callprojs.resproj != NULL && result != NULL) {
@ -2980,7 +2990,7 @@ Node* GraphKit::set_output_for_allocation(AllocateNode* alloc,
  set_control( _gvn.transform(new (C) ProjNode(allocx, TypeFunc::Control) ) );
  // create memory projection for i_o
  set_memory ( _gvn.transform( new (C) ProjNode(allocx, TypeFunc::Memory, true) ), rawidx );
-  make_slow_call_ex(allocx, env()->OutOfMemoryError_klass(), true);
+  make_slow_call_ex(allocx, env()->Throwable_klass(), true);

  // create a memory projection as for the normal control path
  Node* malloc = _gvn.transform(new (C) ProjNode(allocx, TypeFunc::Memory));
--- a/hotspot/src/share/vm/opto/library_call.cpp
+++ b/hotspot/src/share/vm/opto/library_call.cpp
@ -282,6 +282,7 @@ class LibraryCallKit : public GraphKit {
  typedef enum { LS_xadd, LS_xchg, LS_cmpxchg } LoadStoreKind;
  bool inline_unsafe_load_store(BasicType type,  LoadStoreKind kind);
  bool inline_unsafe_ordered_store(BasicType type);
+  bool inline_unsafe_fence(vmIntrinsics::ID id);
  bool inline_fp_conversions(vmIntrinsics::ID id);
  bool inline_number_methods(vmIntrinsics::ID id);
  bool inline_reference_get();
@ -334,6 +335,9 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) {
    case vmIntrinsics::_getAndSetInt:
    case vmIntrinsics::_getAndSetLong:
    case vmIntrinsics::_getAndSetObject:
+    case vmIntrinsics::_loadFence:
+    case vmIntrinsics::_storeFence:
+    case vmIntrinsics::_fullFence:
      break;  // InlineNatives does not control String.compareTo
    case vmIntrinsics::_Reference_get:
      break;  // InlineNatives does not control Reference.get
@ -536,7 +540,7 @@ JVMState* LibraryIntrinsic::generate(JVMState* jvms) {
  // Try to inline the intrinsic.
  if (kit.try_to_inline()) {
    if (PrintIntrinsics || PrintInlining NOT_PRODUCT( || PrintOptoInlining) ) {
-      CompileTask::print_inlining(callee, jvms->depth() - 1, bci, is_virtual() ? "(intrinsic, virtual)" : "(intrinsic)");
+      C->print_inlining(callee, jvms->depth() - 1, bci, is_virtual() ? "(intrinsic, virtual)" : "(intrinsic)");
    }
    C->gather_intrinsic_statistics(intrinsic_id(), is_virtual(), Compile::_intrinsic_worked);
    if (C->log()) {
@ -555,7 +559,7 @@ JVMState* LibraryIntrinsic::generate(JVMState* jvms) {
    if (jvms->has_method()) {
      // Not a root compile.
      const char* msg = is_virtual() ? "failed to inline (intrinsic, virtual)" : "failed to inline (intrinsic)";
-      CompileTask::print_inlining(callee, jvms->depth() - 1, bci, msg);
+      C->print_inlining(callee, jvms->depth() - 1, bci, msg);
    } else {
      // Root compile
      tty->print("Did not generate intrinsic %s%s at bci:%d in",
@ -585,7 +589,7 @@ Node* LibraryIntrinsic::generate_predicate(JVMState* jvms) {
  Node* slow_ctl = kit.try_to_predicate();
  if (!kit.failing()) {
    if (PrintIntrinsics || PrintInlining NOT_PRODUCT( || PrintOptoInlining) ) {
-      CompileTask::print_inlining(callee, jvms->depth() - 1, bci, is_virtual() ? "(intrinsic, virtual)" : "(intrinsic)");
+      C->print_inlining(callee, jvms->depth() - 1, bci, is_virtual() ? "(intrinsic, virtual)" : "(intrinsic)");
    }
    C->gather_intrinsic_statistics(intrinsic_id(), is_virtual(), Compile::_intrinsic_worked);
    if (C->log()) {
@ -602,12 +606,12 @@ Node* LibraryIntrinsic::generate_predicate(JVMState* jvms) {
    if (jvms->has_method()) {
      // Not a root compile.
      const char* msg = "failed to generate predicate for intrinsic";
-      CompileTask::print_inlining(kit.callee(), jvms->depth() - 1, bci, msg);
+      C->print_inlining(kit.callee(), jvms->depth() - 1, bci, msg);
    } else {
      // Root compile
-      tty->print("Did not generate predicate for intrinsic %s%s at bci:%d in",
-               vmIntrinsics::name_at(intrinsic_id()),
-               (is_virtual() ? " (virtual)" : ""), bci);
+      C->print_inlining_stream()->print("Did not generate predicate for intrinsic %s%s at bci:%d in",
+                                        vmIntrinsics::name_at(intrinsic_id()),
+                                        (is_virtual() ? " (virtual)" : ""), bci);
    }
  }
  C->gather_intrinsic_statistics(intrinsic_id(), is_virtual(), Compile::_intrinsic_failed);
@ -732,6 +736,10 @@ bool LibraryCallKit::try_to_inline() {
  case vmIntrinsics::_getAndSetLong:            return inline_unsafe_load_store(T_LONG,   LS_xchg);
  case vmIntrinsics::_getAndSetObject:          return inline_unsafe_load_store(T_OBJECT, LS_xchg);

+  case vmIntrinsics::_loadFence:
+  case vmIntrinsics::_storeFence:
+  case vmIntrinsics::_fullFence:                return inline_unsafe_fence(intrinsic_id());
+
  case vmIntrinsics::_currentThread:            return inline_native_currentThread();
  case vmIntrinsics::_isInterrupted:            return inline_native_isInterrupted();

@ -2840,6 +2848,26 @@ bool LibraryCallKit::inline_unsafe_ordered_store(BasicType type) {
  return true;
 }

+bool LibraryCallKit::inline_unsafe_fence(vmIntrinsics::ID id) {
+  // Intrinsic for the loadFence / storeFence / fullFence ids.
+  // Whatever the flavour, a CPU-order barrier comes first so previous
+  // ld/st cannot move down; then the acquire, release or volatile
+  // barrier matching the id is issued.
+  insert_mem_bar(Op_MemBarCPUOrder);
+  if (id == vmIntrinsics::_loadFence) {
+    insert_mem_bar(Op_MemBarAcquire);
+  } else if (id == vmIntrinsics::_storeFence) {
+    insert_mem_bar(Op_MemBarRelease);
+  } else if (id == vmIntrinsics::_fullFence) {
+    insert_mem_bar(Op_MemBarVolatile);
+  } else {
+    // Any other id is reported as unexpected.
+    fatal_unexpected_iid(id);
+    return false;
+  }
+  return true;
+}
+
 //----------------------------inline_unsafe_allocate---------------------------
 // public native Object sun.mics.Unsafe.allocateInstance(Class<?> cls);
 bool LibraryCallKit::inline_unsafe_allocate() {
@ -2952,14 +2980,23 @@ bool LibraryCallKit::inline_native_isInterrupted() {

  // We only go to the fast case code if we pass two guards.
  // Paths which do not pass are accumulated in the slow_region.
+
+  enum {
+    no_int_result_path   = 1, // t == Thread.current() && !TLS._osthread._interrupted
+    no_clear_result_path = 2, // t == Thread.current() &&  TLS._osthread._interrupted && !clear_int
+    slow_result_path     = 3, // slow path: t.isInterrupted(clear_int)
+    PATH_LIMIT
+  };
+
+  // Ensure that it's not possible to move the load of TLS._osthread._interrupted flag
+  // out of the function.
+  insert_mem_bar(Op_MemBarCPUOrder);
+
+  RegionNode* result_rgn = new (C) RegionNode(PATH_LIMIT);
+  PhiNode*    result_val = new (C) PhiNode(result_rgn, TypeInt::BOOL);
+
  RegionNode* slow_region = new (C) RegionNode(1);
  record_for_igvn(slow_region);
-  RegionNode* result_rgn = new (C) RegionNode(1+3); // fast1, fast2, slow
-  PhiNode*    result_val = new (C) PhiNode(result_rgn, TypeInt::BOOL);
-  enum { no_int_result_path   = 1,
-         no_clear_result_path = 2,
-         slow_result_path     = 3
-  };

  // (a) Receiving thread must be the current thread.
  Node* rec_thr = argument(0);
@ -2968,14 +3005,13 @@ bool LibraryCallKit::inline_native_isInterrupted() {
  Node* cmp_thr = _gvn.transform( new (C) CmpPNode(cur_thr, rec_thr) );
  Node* bol_thr = _gvn.transform( new (C) BoolNode(cmp_thr, BoolTest::ne) );

-  bool known_current_thread = (_gvn.type(bol_thr) == TypeInt::ZERO);
-  if (!known_current_thread)
-    generate_slow_guard(bol_thr, slow_region);
+  generate_slow_guard(bol_thr, slow_region);

  // (b) Interrupt bit on TLS must be false.
  Node* p = basic_plus_adr(top()/*!oop*/, tls_ptr, in_bytes(JavaThread::osthread_offset()));
  Node* osthread = make_load(NULL, p, TypeRawPtr::NOTNULL, T_ADDRESS);
  p = basic_plus_adr(top()/*!oop*/, osthread, in_bytes(OSThread::interrupted_offset()));
+
  // Set the control input on the field _interrupted read to prevent it floating up.
  Node* int_bit = make_load(control(), p, TypeInt::BOOL, T_INT);
  Node* cmp_bit = _gvn.transform( new (C) CmpINode(int_bit, intcon(0)) );
@ -3020,22 +3056,20 @@ bool LibraryCallKit::inline_native_isInterrupted() {
    Node* slow_val = set_results_for_java_call(slow_call);
    // this->control() comes from set_results_for_java_call

-    // If we know that the result of the slow call will be true, tell the optimizer!
-    if (known_current_thread)  slow_val = intcon(1);
-
    Node* fast_io  = slow_call->in(TypeFunc::I_O);
    Node* fast_mem = slow_call->in(TypeFunc::Memory);
+
    // These two phis are pre-filled with copies of of the fast IO and Memory
-    Node* io_phi   = PhiNode::make(result_rgn, fast_io,  Type::ABIO);
-    Node* mem_phi  = PhiNode::make(result_rgn, fast_mem, Type::MEMORY, TypePtr::BOTTOM);
+    PhiNode* result_mem  = PhiNode::make(result_rgn, fast_mem, Type::MEMORY, TypePtr::BOTTOM);
+    PhiNode* result_io   = PhiNode::make(result_rgn, fast_io,  Type::ABIO);

    result_rgn->init_req(slow_result_path, control());
-    io_phi    ->init_req(slow_result_path, i_o());
-    mem_phi   ->init_req(slow_result_path, reset_memory());
+    result_io ->init_req(slow_result_path, i_o());
+    result_mem->init_req(slow_result_path, reset_memory());
    result_val->init_req(slow_result_path, slow_val);

-    set_all_memory( _gvn.transform(mem_phi) );
-    set_i_o(        _gvn.transform(io_phi) );
+    set_all_memory(_gvn.transform(result_mem));
+    set_i_o(       _gvn.transform(result_io));
  }

  C->set_has_split_ifs(true); // Has chance for split-if optimization
@ -3319,7 +3353,7 @@ bool LibraryCallKit::inline_native_subtype_check() {
    Node* arg = args[which_arg];
    arg = null_check(arg);
    if (stopped())  break;
-    args[which_arg] = _gvn.transform(arg);
+    args[which_arg] = arg;

    Node* p = basic_plus_adr(arg, class_klass_offset);
    Node* kls = LoadKlassNode::make(_gvn, immutable_memory(), p, adr_type, kls_type);
--- a/hotspot/src/share/vm/opto/parse3.cpp
+++ b/hotspot/src/share/vm/opto/parse3.cpp
@ -509,6 +509,7 @@ void Parse::do_multianewarray() {
                          makecon(TypeKlassPtr::make(array_klass)),
                          dims);
  }
+  make_slow_call_ex(c, env()->Throwable_klass(), false);

  Node* res = _gvn.transform(new (C) ProjNode(c, TypeFunc::Parms));

--- a/hotspot/src/share/vm/opto/runtime.cpp
+++ b/hotspot/src/share/vm/opto/runtime.cpp
@ -989,7 +989,7 @@ JRT_ENTRY_NO_ASYNC(address, OptoRuntime::handle_exception_C_helper(JavaThread* t
      // since we're notifying the VM on every catch.
      // Force deoptimization and the rest of the lookup
      // will be fine.
-      deoptimize_caller_frame(thread, true);
+      deoptimize_caller_frame(thread);
    }

    // Check the stack guard pages.  If enabled, look for handler in this frame;
@ -1143,19 +1143,24 @@ const TypeFunc *OptoRuntime::rethrow_Type() {


 void OptoRuntime::deoptimize_caller_frame(JavaThread *thread, bool doit) {
-  // Deoptimize frame
-  if (doit) {
-    // Called from within the owner thread, so no need for safepoint
-    RegisterMap reg_map(thread);
-    frame stub_frame = thread->last_frame();
-    assert(stub_frame.is_runtime_frame() || exception_blob()->contains(stub_frame.pc()), "sanity check");
-    frame caller_frame = stub_frame.sender(&reg_map);
-
-    // Deoptimize the caller frame.
-    Deoptimization::deoptimize_frame(thread, caller_frame.id());
+  // Deoptimize the caller before continuing, as the compiled exception handler table
+  // may not be valid. Nothing happens when doit is false or StressCompiledExceptionHandlers is set.
+  if (!StressCompiledExceptionHandlers && doit) {
+    deoptimize_caller_frame(thread);
  }
 }

+void OptoRuntime::deoptimize_caller_frame(JavaThread *thread) {
+  // Unconditionally deoptimize the frame that called the current stub.
+  // Called from within the owner thread, so no need for safepoint.
+  RegisterMap map(thread);
+  frame runtime_stub = thread->last_frame();
+  assert(runtime_stub.is_runtime_frame() || exception_blob()->contains(runtime_stub.pc()), "sanity check");
+
+  // The stub's sender is the caller frame to deoptimize.
+  Deoptimization::deoptimize_frame(thread, runtime_stub.sender(&map).id());
+}
+

 bool OptoRuntime::is_deoptimized_caller_frame(JavaThread *thread) {
  // Called from within the owner thread, so no need for safepoint
--- a/hotspot/src/share/vm/opto/runtime.hpp
+++ b/hotspot/src/share/vm/opto/runtime.hpp
@ -174,6 +174,7 @@ private:
  static address handle_exception_C       (JavaThread* thread);
  static address handle_exception_C_helper(JavaThread* thread, nmethod*& nm);
  static address rethrow_C                (oopDesc* exception, JavaThread *thread, address return_pc );
+  static void deoptimize_caller_frame     (JavaThread *thread);
  static void deoptimize_caller_frame     (JavaThread *thread, bool doit);
  static bool is_deoptimized_caller_frame (JavaThread *thread);

--- a/hotspot/src/share/vm/opto/stringopts.cpp
+++ b/hotspot/src/share/vm/opto/stringopts.cpp
@ -744,7 +744,9 @@ bool StringConcat::validate_control_flow() {
      ctrl_path.push(cn);
      ctrl_path.push(cn->proj_out(0));
      ctrl_path.push(cn->proj_out(0)->unique_out());
-      ctrl_path.push(cn->proj_out(0)->unique_out()->as_Catch()->proj_out(0));
+      if (cn->proj_out(0)->unique_out()->as_Catch()->proj_out(0) != NULL) {
+        ctrl_path.push(cn->proj_out(0)->unique_out()->as_Catch()->proj_out(0));
+      }
    } else {
      ShouldNotReachHere();
    }
@ -762,6 +764,12 @@ bool StringConcat::validate_control_flow() {
    } else if (ptr->is_IfTrue()) {
      IfNode* iff = ptr->in(0)->as_If();
      BoolNode* b = iff->in(1)->isa_Bool();
+
+      if (b == NULL) {
+        fail = true;
+        break;
+      }
+
      Node* cmp = b->in(1);
      Node* v1 = cmp->in(1);
      Node* v2 = cmp->in(2);
@ -1408,71 +1416,76 @@ void PhaseStringOpts::replace_string_concat(StringConcat* sc) {
                      Deoptimization::Action_make_not_entrant);
  }

-  // length now contains the number of characters needed for the
-  // char[] so create a new AllocateArray for the char[]
-  Node* char_array = NULL;
-  {
-    PreserveReexecuteState preexecs(&kit);
-    // The original jvms is for an allocation of either a String or
-    // StringBuffer so no stack adjustment is necessary for proper
-    // reexecution.  If we deoptimize in the slow path the bytecode
-    // will be reexecuted and the char[] allocation will be thrown away.
-    kit.jvms()->set_should_reexecute(true);
-    char_array = kit.new_array(__ makecon(TypeKlassPtr::make(ciTypeArrayKlass::make(T_CHAR))),
-                               length, 1);
-  }
+  Node* result;
+  if (!kit.stopped()) {

-  // Mark the allocation so that zeroing is skipped since the code
-  // below will overwrite the entire array
-  AllocateArrayNode* char_alloc = AllocateArrayNode::Ideal_array_allocation(char_array, _gvn);
-  char_alloc->maybe_set_complete(_gvn);
-
-  // Now copy the string representations into the final char[]
-  Node* start = __ intcon(0);
-  for (int argi = 0; argi < sc->num_arguments(); argi++) {
-    Node* arg = sc->argument(argi);
-    switch (sc->mode(argi)) {
-      case StringConcat::IntMode: {
-        Node* end = __ AddI(start, string_sizes->in(argi));
-        // getChars words backwards so pass the ending point as well as the start
-        int_getChars(kit, arg, char_array, start, end);
-        start = end;
-        break;
-      }
-      case StringConcat::StringNullCheckMode:
-      case StringConcat::StringMode: {
-        start = copy_string(kit, arg, char_array, start);
-        break;
-      }
-      case StringConcat::CharMode: {
-        __ store_to_memory(kit.control(), kit.array_element_address(char_array, start, T_CHAR),
-                           arg, T_CHAR, char_adr_idx);
-        start = __ AddI(start, __ intcon(1));
-        break;
-      }
-      default:
-        ShouldNotReachHere();
+    // length now contains the number of characters needed for the
+    // char[] so create a new AllocateArray for the char[]
+    Node* char_array = NULL;
+    {
+      PreserveReexecuteState preexecs(&kit);
+      // The original jvms is for an allocation of either a String or
+      // StringBuffer so no stack adjustment is necessary for proper
+      // reexecution.  If we deoptimize in the slow path the bytecode
+      // will be reexecuted and the char[] allocation will be thrown away.
+      kit.jvms()->set_should_reexecute(true);
+      char_array = kit.new_array(__ makecon(TypeKlassPtr::make(ciTypeArrayKlass::make(T_CHAR))),
+                                 length, 1);
    }
-  }

-  // If we're not reusing an existing String allocation then allocate one here.
-  Node* result = sc->string_alloc();
-  if (result == NULL) {
-    PreserveReexecuteState preexecs(&kit);
-    // The original jvms is for an allocation of either a String or
-    // StringBuffer so no stack adjustment is necessary for proper
-    // reexecution.
-    kit.jvms()->set_should_reexecute(true);
-    result = kit.new_instance(__ makecon(TypeKlassPtr::make(C->env()->String_klass())));
-  }
+    // Mark the allocation so that zeroing is skipped since the code
+    // below will overwrite the entire array
+    AllocateArrayNode* char_alloc = AllocateArrayNode::Ideal_array_allocation(char_array, _gvn);
+    char_alloc->maybe_set_complete(_gvn);

-  // Intialize the string
-  if (java_lang_String::has_offset_field()) {
-    kit.store_String_offset(kit.control(), result, __ intcon(0));
-    kit.store_String_length(kit.control(), result, length);
-  }
-  kit.store_String_value(kit.control(), result, char_array);
+    // Now copy the string representations into the final char[]
+    Node* start = __ intcon(0);
+    for (int argi = 0; argi < sc->num_arguments(); argi++) {
+      Node* arg = sc->argument(argi);
+      switch (sc->mode(argi)) {
+        case StringConcat::IntMode: {
+          Node* end = __ AddI(start, string_sizes->in(argi));
+          // getChars works backwards so pass the ending point as well as the start
+          int_getChars(kit, arg, char_array, start, end);
+          start = end;
+          break;
+        }
+        case StringConcat::StringNullCheckMode:
+        case StringConcat::StringMode: {
+          start = copy_string(kit, arg, char_array, start);
+          break;
+        }
+        case StringConcat::CharMode: {
+          __ store_to_memory(kit.control(), kit.array_element_address(char_array, start, T_CHAR),
+                             arg, T_CHAR, char_adr_idx);
+          start = __ AddI(start, __ intcon(1));
+          break;
+        }
+        default:
+          ShouldNotReachHere();
+      }
+    }

+    // If we're not reusing an existing String allocation then allocate one here.
+    result = sc->string_alloc();
+    if (result == NULL) {
+      PreserveReexecuteState preexecs(&kit);
+      // The original jvms is for an allocation of either a String or
+      // StringBuffer so no stack adjustment is necessary for proper
+      // reexecution.
+      kit.jvms()->set_should_reexecute(true);
+      result = kit.new_instance(__ makecon(TypeKlassPtr::make(C->env()->String_klass())));
+    }
+
+    // Initialize the string
+    if (java_lang_String::has_offset_field()) {
+      kit.store_String_offset(kit.control(), result, __ intcon(0));
+      kit.store_String_length(kit.control(), result, length);
+    }
+    kit.store_String_value(kit.control(), result, char_array);
+  } else {
+    result = C->top();
+  }
  // hook up the outgoing control and result
  kit.replace_call(sc->end(), result);

--- a/hotspot/src/share/vm/prims/methodHandles.cpp
+++ b/hotspot/src/share/vm/prims/methodHandles.cpp
@ -1168,8 +1168,8 @@ JVM_ENTRY(void, MHN_setCallSiteTargetNormal(JNIEnv* env, jobject igcls, jobject
    // Walk all nmethods depending on this call site.
    MutexLocker mu(Compile_lock, thread);
    Universe::flush_dependents_on(call_site, target);
+    java_lang_invoke_CallSite::set_target(call_site(), target());
  }
-  java_lang_invoke_CallSite::set_target(call_site(), target());
 }
 JVM_END

@ -1180,8 +1180,8 @@ JVM_ENTRY(void, MHN_setCallSiteTargetVolatile(JNIEnv* env, jobject igcls, jobjec
    // Walk all nmethods depending on this call site.
    MutexLocker mu(Compile_lock, thread);
    Universe::flush_dependents_on(call_site, target);
+    java_lang_invoke_CallSite::set_target_volatile(call_site(), target());
  }
-  java_lang_invoke_CallSite::set_target_volatile(call_site(), target());
 }
 JVM_END

--- a/hotspot/src/share/vm/prims/unsafe.cpp
+++ b/hotspot/src/share/vm/prims/unsafe.cpp
@ -468,6 +468,21 @@ UNSAFE_ENTRY(void, Unsafe_SetOrderedLong(JNIEnv *env, jobject unsafe, jobject ob
 #endif
 UNSAFE_END

+UNSAFE_ENTRY(void, Unsafe_LoadFence(JNIEnv *env, jobject unsafe))  // Native entry bound to the Java method "loadFence" ()V in the methods[] table.
+  UnsafeWrapper("Unsafe_LoadFence");
+  OrderAccess::acquire();  // The load fence is implemented as an acquire barrier. NOTE(review): exact ordering guarantees come from OrderAccess - confirm there.
+UNSAFE_END
+
+UNSAFE_ENTRY(void, Unsafe_StoreFence(JNIEnv *env, jobject unsafe))  // Native entry bound to the Java method "storeFence" ()V in the methods[] table.
+  UnsafeWrapper("Unsafe_StoreFence");
+  OrderAccess::release();  // The store fence is implemented as a release barrier. NOTE(review): exact ordering guarantees come from OrderAccess - confirm there.
+UNSAFE_END
+
+UNSAFE_ENTRY(void, Unsafe_FullFence(JNIEnv *env, jobject unsafe))  // Native entry bound to the Java method "fullFence" ()V in the methods[] table.
+  UnsafeWrapper("Unsafe_FullFence");
+  OrderAccess::fence();  // The full fence delegates to OrderAccess::fence().
+UNSAFE_END
+
 ////// Data in the C heap.

 // Note:  These do not throw NullPointerException for bad pointers.
@ -1550,6 +1565,9 @@ static JNINativeMethod methods[] = {
    {CC"putOrderedObject",   CC"("OBJ"J"OBJ")V",         FN_PTR(Unsafe_SetOrderedObject)},
    {CC"putOrderedInt",      CC"("OBJ"JI)V",             FN_PTR(Unsafe_SetOrderedInt)},
    {CC"putOrderedLong",     CC"("OBJ"JJ)V",             FN_PTR(Unsafe_SetOrderedLong)},
+    {CC"loadFence",          CC"()V",                    FN_PTR(Unsafe_LoadFence)},
+    {CC"storeFence",         CC"()V",                    FN_PTR(Unsafe_StoreFence)},
+    {CC"fullFence",          CC"()V",                    FN_PTR(Unsafe_FullFence)},
    {CC"park",               CC"(ZJ)V",                  FN_PTR(Unsafe_Park)},
    {CC"unpark",             CC"("OBJ")V",               FN_PTR(Unsafe_Unpark)}

--- a/hotspot/src/share/vm/runtime/globals.hpp
+++ b/hotspot/src/share/vm/runtime/globals.hpp
@ -922,6 +922,9 @@ class CommandLineFlags {
  develop(bool, PrintExceptionHandlers, false,                              \
          "Print exception handler tables for all nmethods when generated") \
                                                                            \
+  develop(bool, StressCompiledExceptionHandlers, false,                     \
+         "Exercise compiled exception handlers")                            \
+                                                                            \
  develop(bool, InterceptOSException, false,                                \
          "Starts debugger when an implicit OS (e.g., NULL) "               \
          "exception happens")                                              \
--- a/hotspot/src/share/vm/runtime/thread.cpp
+++ b/hotspot/src/share/vm/runtime/thread.cpp
@ -2190,7 +2190,7 @@ void JavaThread::send_thread_stop(oop java_throwable)  {
          // BiasedLocking needs an updated RegisterMap for the revoke monitors pass
          RegisterMap reg_map(this, UseBiasedLocking);
          frame compiled_frame = f.sender(&reg_map);
-          if (compiled_frame.can_be_deoptimized()) {
+          if (!StressCompiledExceptionHandlers && compiled_frame.can_be_deoptimized()) {
            Deoptimization::deoptimize(this, compiled_frame, &reg_map);
          }
        }
--- a/hotspot/test/compiler/7184394/TestAESBase.java
+++ b/hotspot/test/compiler/7184394/TestAESBase.java
@ -54,7 +54,6 @@ abstract public class TestAESBase {
  String paddingStr = "PKCS5Padding";
  AlgorithmParameters algParams;
  SecretKey key;
-  int ivLen;

  static int numThreads = 0;
  int  threadId;
@ -68,7 +67,7 @@ abstract public class TestAESBase {

  public void prepare() {
    try {
-    System.out.println("\nmsgSize=" + msgSize + ", key size=" + keySize + ", reInit=" + !noReinit + ", checkOutput=" + checkOutput);
+    System.out.println("\nalgorithm=" + algorithm + ", mode=" + mode + ", msgSize=" + msgSize + ", keySize=" + keySize + ", noReinit=" + noReinit + ", checkOutput=" + checkOutput);

      int keyLenBytes = (keySize == 0 ? 16 : keySize/8);
      byte keyBytes[] = new byte[keyLenBytes];
@ -90,10 +89,14 @@ abstract public class TestAESBase {
      cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
      dCipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");

-      ivLen = (algorithm.equals("AES") ? 16 : algorithm.equals("DES") ? 8 : 0);
-      IvParameterSpec initVector = new IvParameterSpec(new byte[ivLen]);
-
-      cipher.init(Cipher.ENCRYPT_MODE, key, initVector);
+      if (mode.equals("CBC")) {
+        int ivLen = (algorithm.equals("AES") ? 16 : algorithm.equals("DES") ? 8 : 0);
+        IvParameterSpec initVector = new IvParameterSpec(new byte[ivLen]);
+        cipher.init(Cipher.ENCRYPT_MODE, key, initVector);
+      } else {
+        algParams = cipher.getParameters();
+        cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
+      }
      algParams = cipher.getParameters();
      dCipher.init(Cipher.DECRYPT_MODE, key, algParams);
      if (threadId == 0) {
--- a/hotspot/test/compiler/7184394/TestAESMain.java
+++ b/hotspot/test/compiler/7184394/TestAESMain.java
@ -27,7 +27,8 @@
 * @bug 7184394
 * @summary add intrinsics to use AES instructions
 *
- * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB TestAESMain
 *
 * @author Tom Deneau
 */
--- a/hotspot/test/compiler/8004741/Test8004741.java
+++ b/hotspot/test/compiler/8004741/Test8004741.java
@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ * @test Test8004741.java
+ * @bug 8004741
+ * @summary Missing compiled exception handle table entry for multidimensional array allocation
+ * @run main/othervm -Xmx64m -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:+StressCompiledExceptionHandlers Test8004741
+ *
+ */
+
+import java.util.*;
+
+public class Test8004741 extends Thread {  // Regression test: a worker thread allocating 2-D int arrays is stopped, and the test passes if the resulting throwable is caught.
+
+  static int[][] test(int a, int b) throws Exception {  // Allocates an a-by-b int array; flags and rethrows any Error, rethrows any Exception.
+    int[][] ar = null;
+    try {
+      ar = new int[a][b];
+    } catch (Error e) {
+      System.out.println("test got Error");
+      passed = true;  // Any Error caught around the allocation (ThreadDeath is an Error) counts as success.
+      throw(e);
+    } catch (Exception e) {
+      System.out.println("test got Exception");
+      throw(e);
+    }
+    return ar;
+  }
+
+  static boolean passed = false;  // Set by test() or run() when the expected throwable is caught; read by main() after join().
+
+  public void run() {  // Worker body: allocate in an endless loop until a throwable terminates it.
+      System.out.println("test started");
+      try {
+        while(true) {
+          test(2,20000);
+        }
+      } catch (ThreadDeath e) {
+        System.out.println("test got ThreadDeath");
+        passed = true;  // Also counts as success when ThreadDeath surfaces here instead of inside test().
+      } catch (Error e) {
+        e.printStackTrace();
+        System.out.println("test got Error");
+      } catch (Exception e) {
+        e.printStackTrace();
+        System.out.println("test got Exception");
+      }
+  }
+
+  public static void main(String[] args) throws Exception {  // Warms up test(), runs the worker for one second, stops it, then reports PASSED/FAILED.
+    for (int n = 0; n < 11000; n++) {  // Warm-up calls; presumably enough to get test() compiled before the worker starts - confirm against the compile threshold.
+      test(2, 20);
+    }
+
+    // Start the worker, let it run for a second, then stop() it and check that the throwable was caught.
+    Test8004741 t = new Test8004741();
+
+    passed = false;
+    t.start();
+    Thread.sleep(1000);
+    t.stop();  // Thread.stop() raises ThreadDeath in the worker thread.
+
+    Thread.sleep(5000);  // NOTE(review): extra grace period; the join() below already waits for the worker to finish.
+    t.join();
+    if (passed) {
+      System.out.println("PASSED");
+    } else {
+      System.out.println("FAILED");
+      System.exit(97);  // Non-zero exit status reports the failure to the test harness.
+    }
+  }
+
+};
--- a/hotspot/test/compiler/8005033/Test8005033.java
+++ b/hotspot/test/compiler/8005033/Test8005033.java
@ -0,0 +1,50 @@
+/*
+ * Copyright 2012 SAP AG.  All Rights Reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @bug 8005033
+ * @summary On sparcv9, C2's intrinsic for Integer.bitCount(OV) returns wrong result if OV is the result of an operation with int overflow.
+ * @run main/othervm -Xcomp -XX:CompileOnly=Test8005033::testBitCount Test8005033
+ * @author Richard Reingruber richard DOT reingruber AT sap DOT com
+ */
+
+public class Test8005033 {  // Checks that Integer.bitCount(MINUS_ONE + 1) returns 0.
+    public static int MINUS_ONE = -1;  // Non-final static field; presumably kept non-constant so the addition is not folded at compile time - confirm.
+
+    public static void main(String[] args) {  // Runs the check once and throws InternalError on a mismatch.
+        System.out.println("EXECUTING test.");
+        Integer.bitCount(1);   // load class
+        int expectedBitCount = 0;  // MINUS_ONE + 1 is 0, which has no bits set.
+        int calculatedBitCount = testBitCount();
+        if (expectedBitCount != calculatedBitCount) {
+            throw new InternalError("got " + calculatedBitCount + " but expected " + expectedBitCount);
+        }
+        System.out.println("SUCCESSFULLY passed test.");
+    }
+
+    // testBitCount will be compiled using the Integer.bitCount() intrinsic if possible
+    private static int testBitCount() {
+        return Integer.bitCount(MINUS_ONE+1);   // -1 + 1 => int overflow
+    }
+}