Vladimir Kozlov 2012-09-24 14:46:06 -07:00
commit 913a550c76
76 changed files with 1945 additions and 876 deletions

View File

@ -725,24 +725,6 @@ void MacroAssembler::jump(const AddressLiteral& addrlit, Register temp, int offs
}
// Convert to C varargs format
void MacroAssembler::set_varargs( Argument inArg, Register d ) {
// spill register-resident args to their memory slots
// (SPARC calling convention requires callers to have already preallocated these)
// Note that the inArg might in fact be an outgoing argument,
// if a leaf routine or stub does some tricky argument shuffling.
// This routine must work even though one of the saved arguments
// is in the d register (e.g., set_varargs(Argument(0, false), O0)).
for (Argument savePtr = inArg;
savePtr.is_register();
savePtr = savePtr.successor()) {
st_ptr(savePtr.as_register(), savePtr.address_in_frame());
}
// return the address of the first memory slot
Address a = inArg.address_in_frame();
add(a.base(), a.disp(), d);
}
// Conditional breakpoint (for assertion checks in assembly code)
void MacroAssembler::breakpoint_trap(Condition c, CC cc) {
trap(c, cc, G0, ST_RESERVED_FOR_USER_0);
@ -2943,6 +2925,20 @@ void MacroAssembler::lookup_interface_method(Register recv_klass,
assert(itable_index.is_constant() || itable_index.as_register() == method_result,
"caller must use same register for non-constant itable index as for method");
Label L_no_such_interface_restore;
bool did_save = false;
if (scan_temp == noreg || sethi_temp == noreg) {
Register recv_2 = recv_klass->is_global() ? recv_klass : L0;
Register intf_2 = intf_klass->is_global() ? intf_klass : L1;
assert(method_result->is_global(), "must be able to return value");
scan_temp = L2;
sethi_temp = L3;
save_frame_and_mov(0, recv_klass, recv_2, intf_klass, intf_2);
recv_klass = recv_2;
intf_klass = intf_2;
did_save = true;
}
// Compute start of first itableOffsetEntry (which is at the end of the vtable)
int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
int scan_step = itableOffsetEntry::size() * wordSize;
@ -2981,7 +2977,7 @@ void MacroAssembler::lookup_interface_method(Register recv_klass,
// result = (klass + scan->offset() + itable_index);
// }
// }
Label search, found_method;
Label L_search, L_found_method;
for (int peel = 1; peel >= 0; peel--) {
// %%%% Could load both offset and interface in one ldx, if they were
@ -2991,23 +2987,23 @@ void MacroAssembler::lookup_interface_method(Register recv_klass,
// Check that this entry is non-null. A null entry means that
// the receiver class doesn't implement the interface, and wasn't the
// same as when the caller was compiled.
bpr(Assembler::rc_z, false, Assembler::pn, method_result, L_no_such_interface);
bpr(Assembler::rc_z, false, Assembler::pn, method_result, did_save ? L_no_such_interface_restore : L_no_such_interface);
delayed()->cmp(method_result, intf_klass);
if (peel) {
brx(Assembler::equal, false, Assembler::pt, found_method);
brx(Assembler::equal, false, Assembler::pt, L_found_method);
} else {
brx(Assembler::notEqual, false, Assembler::pn, search);
brx(Assembler::notEqual, false, Assembler::pn, L_search);
// (invert the test to fall through to found_method...)
}
delayed()->add(scan_temp, scan_step, scan_temp);
if (!peel) break;
bind(search);
bind(L_search);
}
bind(found_method);
bind(L_found_method);
// Got a hit.
int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
@ -3015,6 +3011,18 @@ void MacroAssembler::lookup_interface_method(Register recv_klass,
ito_offset -= scan_step;
lduw(scan_temp, ito_offset, scan_temp);
ld_ptr(recv_klass, scan_temp, method_result);
if (did_save) {
Label L_done;
ba(L_done);
delayed()->restore();
bind(L_no_such_interface_restore);
ba(L_no_such_interface);
delayed()->restore();
bind(L_done);
}
}
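The frame-saving path above is taken only when the caller has no scratch registers to spare. A minimal sketch of such a call site, with register choices and the label name assumed for illustration rather than taken from this change:

Label L_no_such_interface;
// Passing noreg for both temps makes lookup_interface_method save a frame,
// borrow L0..L3 internally, and restore on both the hit and miss paths.
__ lookup_interface_method(G3, G4,               // receiver klass, interface klass (globals survive the save)
                           G5_method, G5_method, // itable index and method result must share a register
                           noreg, noreg,         // no scan/sethi temps available
                           L_no_such_interface);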

View File

@ -2428,9 +2428,6 @@ public:
static void test();
#endif
// convert an incoming arglist to varargs format; put the pointer in d
void set_varargs( Argument a, Register d );
int total_frame_size_in_bytes(int extraWords);
// used when extraWords known statically

View File

@ -347,7 +347,11 @@ inline void Assembler::sub(Register s1, RegisterOrConstant s2, Register d, int o
inline void Assembler::swap( Register s1, Register s2, Register d) { v9_dep(); emit_long( op(ldst_op) | rd(d) | op3(swap_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::swap( Register s1, int simm13a, Register d) { v9_dep(); emit_data( op(ldst_op) | rd(d) | op3(swap_op3) | rs1(s1) | immed(true) | simm(simm13a, 13)); }
inline void Assembler::swap( Address& a, Register d, int offset ) { relocate(a.rspec(offset)); swap( a.base(), a.disp() + offset, d ); }
inline void Assembler::swap( Address& a, Register d, int offset ) {
relocate(a.rspec(offset));
if (a.has_index()) { assert(offset == 0, ""); swap( a.base(), a.index(), d ); }
else { swap( a.base(), a.disp() + offset, d ); }
}
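// A hedged usage sketch (not part of this change; registers are illustrative):
//   __ swap(Address(G3, G4), O0);  // has_index(): emits the register-register form swap(G3, G4, O0)
//   __ swap(Address(G3, 8),  O0);  // displacement form: emits swap(G3, 8 + offset, O0)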
// Use the right loads/stores for the platform

View File

@ -1315,7 +1315,13 @@ void LIR_Assembler::const2reg(LIR_Opr src, LIR_Opr dest, LIR_PatchCode patch_cod
Address LIR_Assembler::as_Address(LIR_Address* addr) {
Register reg = addr->base()->as_register();
return Address(reg, addr->disp());
LIR_Opr index = addr->index();
if (index->is_illegal()) {
return Address(reg, addr->disp());
} else {
assert (addr->disp() == 0, "unsupported address mode");
return Address(reg, index->as_pointer_register());
}
}
@ -3438,7 +3444,28 @@ void LIR_Assembler::peephole(LIR_List* lir) {
}
}
void LIR_Assembler::atomic_op(LIR_Code code, LIR_Opr src, LIR_Opr data, LIR_Opr dest, LIR_Opr tmp) {
LIR_Address* addr = src->as_address_ptr();
assert(data == dest, "swap uses only 2 operands");
assert (code == lir_xchg, "no xadd on sparc");
if (data->type() == T_INT) {
__ swap(as_Address(addr), data->as_register());
} else if (data->is_oop()) {
Register obj = data->as_register();
Register narrow = tmp->as_register();
#ifdef _LP64
assert(UseCompressedOops, "swap is 32bit only");
__ encode_heap_oop(obj, narrow);
__ swap(as_Address(addr), narrow);
__ decode_heap_oop(narrow, obj);
#else
__ swap(as_Address(addr), obj);
#endif
} else {
ShouldNotReachHere();
}
}
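// An inferred note, not part of this change: SPARC's swap is a 32-bit exchange,
// so on LP64 an oop can only be swapped when compressed oops are in use; the
// narrow oop is encoded into tmp, swapped, and decoded back into the register.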
#undef __

View File

@ -1204,3 +1204,58 @@ void LIRGenerator::get_Object_unsafe(LIR_Opr dst, LIR_Opr src, LIR_Opr offset,
__ load(addr, dst);
}
}
void LIRGenerator::do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x) {
BasicType type = x->basic_type();
LIRItem src(x->object(), this);
LIRItem off(x->offset(), this);
LIRItem value(x->value(), this);
src.load_item();
value.load_item();
off.load_nonconstant();
LIR_Opr dst = rlock_result(x, type);
LIR_Opr data = value.result();
bool is_obj = (type == T_ARRAY || type == T_OBJECT);
LIR_Opr offset = off.result();
if (data != dst) {
__ move(data, dst);
data = dst;
}
assert (!x->is_add() && (type == T_INT || (is_obj LP64_ONLY(&& UseCompressedOops))), "unexpected type");
LIR_Address* addr;
if (offset->is_constant()) {
#ifdef _LP64
jlong l = offset->as_jlong();
assert((jlong)((jint)l) == l, "offset too large for constant");
jint c = (jint)l;
#else
jint c = offset->as_jint();
#endif
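// A worked example of the assert above (illustration only): for
// l = 0x100000008, (jint)l is 8 and widening it back gives 8 != l, so the
// offset would not fit a 32-bit displacement; Unsafe field offsets are small
// enough in practice that the narrowing to jint is safe.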
addr = new LIR_Address(src.result(), c, type);
} else {
addr = new LIR_Address(src.result(), offset, type);
}
LIR_Opr tmp = LIR_OprFact::illegalOpr;
LIR_Opr ptr = LIR_OprFact::illegalOpr;
if (is_obj) {
// Do the pre-write barrier, if any.
// barriers on sparc don't work with a base + index address
tmp = FrameMap::G3_opr;
ptr = new_pointer_register();
__ add(src.result(), off.result(), ptr);
pre_barrier(ptr, LIR_OprFact::illegalOpr /* pre_val */,
true /* do_load */, false /* patch */, NULL);
}
__ xchg(LIR_OprFact::address(addr), data, dst, tmp);
if (is_obj) {
// Seems to be a precise address
post_barrier(ptr, data);
}
}

View File

@ -121,6 +121,7 @@ void MethodHandles::verify_ref_kind(MacroAssembler* _masm, int ref_kind, Registe
void MethodHandles::jump_from_method_handle(MacroAssembler* _masm, Register method, Register target, Register temp,
bool for_compiler_entry) {
assert(method == G5_method, "interpreter calling convention");
assert_different_registers(method, target, temp);
if (!for_compiler_entry && JvmtiExport::can_post_interpreter_events()) {
Label run_compiled_code;
@ -153,19 +154,19 @@ void MethodHandles::jump_to_lambda_form(MacroAssembler* _masm,
BLOCK_COMMENT("jump_to_lambda_form {");
// This is the initial entry point of a lazy method handle.
// After type checking, it picks up the invoker from the LambdaForm.
assert_different_registers(recv, method_temp, temp2, temp3);
assert_different_registers(recv, method_temp, temp2); // temp3 is only passed on
assert(method_temp == G5_method, "required register for loading method");
//NOT_PRODUCT({ FlagSetting fs(TraceMethodHandles, true); trace_method_handle(_masm, "LZMH"); });
// Load the invoker, as MH -> MH.form -> LF.vmentry
__ verify_oop(recv);
__ load_heap_oop(Address(recv, NONZERO(java_lang_invoke_MethodHandle::form_offset_in_bytes())), method_temp);
__ verify_oop(method_temp);
__ load_heap_oop(Address(method_temp, NONZERO(java_lang_invoke_LambdaForm::vmentry_offset_in_bytes())), method_temp);
__ verify_oop(method_temp);
// the following assumes that a Method* is normally compressed in the vmtarget field:
__ ld_ptr(Address(method_temp, NONZERO(java_lang_invoke_MemberName::vmtarget_offset_in_bytes())), method_temp);
if (VerifyMethodHandles && !for_compiler_entry) {
// make sure recv is already on stack
@ -303,25 +304,25 @@ void MethodHandles::generate_method_handle_dispatch(MacroAssembler* _masm,
Register member_reg,
bool for_compiler_entry) {
assert(is_signature_polymorphic(iid), "expected invoke iid");
// temps used in this code are not used in *either* compiled or interpreted calling sequences
Register temp1 = (for_compiler_entry ? G1_scratch : O1);
Register temp2 = (for_compiler_entry ? G4_scratch : O4);
Register temp3 = G3_scratch;
Register temp4 = (for_compiler_entry ? noreg : O2);
Register temp2 = (for_compiler_entry ? G3_scratch : O2);
Register temp3 = (for_compiler_entry ? G4_scratch : O3);
Register temp4 = (for_compiler_entry ? noreg : O4);
if (for_compiler_entry) {
assert(receiver_reg == (iid == vmIntrinsics::_linkToStatic ? noreg : O0), "only valid assignment");
assert_different_registers(temp1, O0, O1, O2, O3, O4, O5);
assert_different_registers(temp2, O0, O1, O2, O3, O4, O5);
assert_different_registers(temp3, O0, O1, O2, O3, O4, O5);
assert_different_registers(temp4, O0, O1, O2, O3, O4, O5);
} else {
assert_different_registers(temp1, temp2, temp3, temp4, O5_savedSP); // don't trash lastSP
}
if (receiver_reg != noreg) assert_different_registers(temp1, temp2, temp3, temp4, receiver_reg);
if (member_reg != noreg) assert_different_registers(temp1, temp2, temp3, temp4, member_reg);
if (!for_compiler_entry) assert_different_registers(temp1, temp2, temp3, temp4, O5_savedSP); // don't trash lastSP
if (iid == vmIntrinsics::_invokeBasic) {
// indirect through MH.form.vmentry.vmtarget
jump_to_lambda_form(_masm, receiver_reg, G5_method, temp2, temp3, for_compiler_entry);
jump_to_lambda_form(_masm, receiver_reg, G5_method, temp1, temp2, for_compiler_entry);
} else {
// The method is a member invoker used by direct method handles.
@ -378,24 +379,22 @@ void MethodHandles::generate_method_handle_dispatch(MacroAssembler* _masm,
// member_reg - MemberName that was the trailing argument
// temp1_recv_klass - klass of stacked receiver, if needed
// O5_savedSP - interpreter linkage (if interpreted)
// O0..O7,G1,G4 - compiler arguments (if compiled)
// O0..O5 - compiler arguments (if compiled)
bool method_is_live = false;
Label L_incompatible_class_change_error;
switch (iid) {
case vmIntrinsics::_linkToSpecial:
if (VerifyMethodHandles) {
verify_ref_kind(_masm, JVM_REF_invokeSpecial, member_reg, temp3);
verify_ref_kind(_masm, JVM_REF_invokeSpecial, member_reg, temp2);
}
__ ld_ptr(member_vmtarget, G5_method);
method_is_live = true;
break;
case vmIntrinsics::_linkToStatic:
if (VerifyMethodHandles) {
verify_ref_kind(_masm, JVM_REF_invokeStatic, member_reg, temp3);
verify_ref_kind(_masm, JVM_REF_invokeStatic, member_reg, temp2);
}
__ ld_ptr(member_vmtarget, G5_method);
method_is_live = true;
break;
case vmIntrinsics::_linkToVirtual:
@ -404,7 +403,7 @@ void MethodHandles::generate_method_handle_dispatch(MacroAssembler* _masm,
// minus the CP setup and profiling:
if (VerifyMethodHandles) {
verify_ref_kind(_masm, JVM_REF_invokeVirtual, member_reg, temp3);
verify_ref_kind(_masm, JVM_REF_invokeVirtual, member_reg, temp2);
}
// pick out the vtable index from the MemberName, and then we can discard it:
@ -423,7 +422,6 @@ void MethodHandles::generate_method_handle_dispatch(MacroAssembler* _masm,
// get target Method* & entry point
__ lookup_virtual_method(temp1_recv_klass, temp2_index, G5_method);
method_is_live = true;
break;
}
@ -432,13 +430,13 @@ void MethodHandles::generate_method_handle_dispatch(MacroAssembler* _masm,
// same as TemplateTable::invokeinterface
// (minus the CP setup and profiling, with different argument motion)
if (VerifyMethodHandles) {
verify_ref_kind(_masm, JVM_REF_invokeInterface, member_reg, temp3);
verify_ref_kind(_masm, JVM_REF_invokeInterface, member_reg, temp2);
}
Register temp3_intf = temp3;
__ load_heap_oop(member_clazz, temp3_intf);
load_klass_from_Class(_masm, temp3_intf, temp2, temp4);
__ verify_klass_ptr(temp3_intf);
Register temp2_intf = temp2;
__ load_heap_oop(member_clazz, temp2_intf);
load_klass_from_Class(_masm, temp2_intf, temp3, temp4);
__ verify_klass_ptr(temp2_intf);
Register G5_index = G5_method;
__ ld_ptr(member_vmindex, G5_index);
@ -450,37 +448,34 @@ void MethodHandles::generate_method_handle_dispatch(MacroAssembler* _masm,
}
// given intf, index, and recv klass, dispatch to the implementation method
Label L_no_such_interface;
Register no_sethi_temp = noreg;
__ lookup_interface_method(temp1_recv_klass, temp3_intf,
__ lookup_interface_method(temp1_recv_klass, temp2_intf,
// note: next two args must be the same:
G5_index, G5_method,
temp2, no_sethi_temp,
L_no_such_interface);
__ verify_method_ptr(G5_method);
jump_from_method_handle(_masm, G5_method, temp2, temp3, for_compiler_entry);
__ bind(L_no_such_interface);
AddressLiteral icce(StubRoutines::throw_IncompatibleClassChangeError_entry());
__ jump_to(icce, temp3);
__ delayed()->nop();
temp3, temp4,
L_incompatible_class_change_error);
break;
}
default:
fatal(err_msg("unexpected intrinsic %d: %s", iid, vmIntrinsics::name_at(iid)));
fatal(err_msg_res("unexpected intrinsic %d: %s", iid, vmIntrinsics::name_at(iid)));
break;
}
if (method_is_live) {
// live at this point: G5_method, O5_savedSP (if interpreted)
// Live at this point:
// G5_method
// O5_savedSP (if interpreted)
// After figuring out which concrete method to call, jump into it.
// Note that this works in the interpreter with no data motion.
// But the compiled version will require that rcx_recv be shifted out.
__ verify_method_ptr(G5_method);
jump_from_method_handle(_masm, G5_method, temp1, temp3, for_compiler_entry);
// After figuring out which concrete method to call, jump into it.
// Note that this works in the interpreter with no data motion.
// But the compiled version will require that rcx_recv be shifted out.
__ verify_method_ptr(G5_method);
jump_from_method_handle(_masm, G5_method, temp1, temp2, for_compiler_entry);
if (iid == vmIntrinsics::_linkToInterface) {
__ BIND(L_incompatible_class_change_error);
AddressLiteral icce(StubRoutines::throw_IncompatibleClassChangeError_entry());
__ jump_to(icce, temp1);
__ delayed()->nop();
}
}
}

View File

@ -313,6 +313,14 @@ void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
}
// Is vector's size (in bytes) bigger than a size saved by default?
// 8 bytes FP registers are saved by default on SPARC.
bool SharedRuntime::is_wide_vector(int size) {
// Note, MaxVectorSize == 8 on SPARC.
assert(size <= 8, err_msg_res("%d bytes vectors are not supported", size));
return size > 8;
}
// The java_calling_convention describes stack locations as ideal slots on
// a frame with no abi restrictions. Since we must observe abi restrictions
// (like the placement of the register window) the slots must be biased by
@ -364,9 +372,9 @@ static VMRegPair reg64_to_VMRegPair(Register r) {
// ---------------------------------------------------------------------------
// The compiled Java calling convention. The Java convention always passes
// 64-bit values in adjacent aligned locations (either registers or stack),
// floats in float registers and doubles in aligned float pairs. Values are
// packed in the registers. There is no backing varargs store for values in
// registers. In the 32-bit build, longs are passed in G1 and G4 (cannot be
// floats in float registers and doubles in aligned float pairs. There is
// no backing varargs store for values in registers.
// In the 32-bit build, longs are passed on the stack (cannot be
// passed in I's, because longs in I's get their heads chopped off at
// interrupt).
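// A worked sketch of this convention (64-bit build, incoming side; derived from
// the allocation loop below, not part of this change). For the signature
// (int, float, double, long):
//   int    -> I0      first of the six int/oop registers I0-I5
//   float  -> F0      first of the eight single-precision registers F0-F7
//   double -> F2:F3   float register number rounded up to an even pair
//   long   -> I1      on LP64 longs compete with the int/oop registers
// Arguments that overflow the registers land in stack slots, with longs and
// doubles rounded up to an even (aligned) slot.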
int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
@ -375,76 +383,13 @@ int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
int is_outgoing) {
assert(F31->as_VMReg()->is_reg(), "overlapping stack/register numbers");
// Convention is to pack the first 6 int/oop args into the first 6 registers
// (I0-I5), extras spill to the stack. Then pack the first 8 float args
// into F0-F7, extras spill to the stack. Then pad all register sets to
// align. Then put longs and doubles into the same registers as they fit,
// else spill to the stack.
const int int_reg_max = SPARC_ARGS_IN_REGS_NUM;
const int flt_reg_max = 8;
//
// Where 32-bit 1-reg longs start being passed
// In tiered we must pass on stack because c1 can't use a "pair" in a single reg.
// So make it look like we've filled all the G regs that c2 wants to use.
Register g_reg = TieredCompilation ? noreg : G1;
// Count int/oop and float args. See how many stack slots we'll need and
// where the longs & doubles will go.
int int_reg_cnt = 0;
int flt_reg_cnt = 0;
// int stk_reg_pairs = frame::register_save_words*(wordSize>>2);
// int stk_reg_pairs = SharedRuntime::out_preserve_stack_slots();
int stk_reg_pairs = 0;
for (int i = 0; i < total_args_passed; i++) {
switch (sig_bt[i]) {
case T_LONG: // LP64, longs compete with int args
assert(sig_bt[i+1] == T_VOID, "");
#ifdef _LP64
if (int_reg_cnt < int_reg_max) int_reg_cnt++;
#endif
break;
case T_OBJECT:
case T_ARRAY:
case T_ADDRESS: // Used, e.g., in slow-path locking for the lock's stack address
if (int_reg_cnt < int_reg_max) int_reg_cnt++;
#ifndef _LP64
else stk_reg_pairs++;
#endif
break;
case T_INT:
case T_SHORT:
case T_CHAR:
case T_BYTE:
case T_BOOLEAN:
if (int_reg_cnt < int_reg_max) int_reg_cnt++;
else stk_reg_pairs++;
break;
case T_FLOAT:
if (flt_reg_cnt < flt_reg_max) flt_reg_cnt++;
else stk_reg_pairs++;
break;
case T_DOUBLE:
assert(sig_bt[i+1] == T_VOID, "");
break;
case T_VOID:
break;
default:
ShouldNotReachHere();
}
}
// This is where the longs/doubles start on the stack.
stk_reg_pairs = (stk_reg_pairs+1) & ~1; // Round
int flt_reg_pairs = (flt_reg_cnt+1) & ~1;
// int stk_reg = frame::register_save_words*(wordSize>>2);
// int stk_reg = SharedRuntime::out_preserve_stack_slots();
int stk_reg = 0;
int int_reg = 0;
int flt_reg = 0;
int slot = 0;
// Now do the signature layout
for (int i = 0; i < total_args_passed; i++) {
switch (sig_bt[i]) {
case T_INT:
@ -461,11 +406,14 @@ int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
Register r = is_outgoing ? as_oRegister(int_reg++) : as_iRegister(int_reg++);
regs[i].set1(r->as_VMReg());
} else {
regs[i].set1(VMRegImpl::stack2reg(stk_reg++));
regs[i].set1(VMRegImpl::stack2reg(slot++));
}
break;
#ifdef _LP64
case T_LONG:
assert(sig_bt[i+1] == T_VOID, "expecting VOID in other half");
// fall-through
case T_OBJECT:
case T_ARRAY:
case T_ADDRESS: // Used, e.g., in slow-path locking for the lock's stack address
@ -473,78 +421,57 @@ int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
Register r = is_outgoing ? as_oRegister(int_reg++) : as_iRegister(int_reg++);
regs[i].set2(r->as_VMReg());
} else {
regs[i].set2(VMRegImpl::stack2reg(stk_reg_pairs));
stk_reg_pairs += 2;
slot = round_to(slot, 2); // align
regs[i].set2(VMRegImpl::stack2reg(slot));
slot += 2;
}
break;
#endif // _LP64
#else
case T_LONG:
assert(sig_bt[i+1] == T_VOID, "expecting VOID in other half");
#ifdef _LP64
if (int_reg < int_reg_max) {
Register r = is_outgoing ? as_oRegister(int_reg++) : as_iRegister(int_reg++);
regs[i].set2(r->as_VMReg());
} else {
regs[i].set2(VMRegImpl::stack2reg(stk_reg_pairs));
stk_reg_pairs += 2;
}
#else
#ifdef COMPILER2
// For 32-bit build, can't pass longs in O-regs because they become
// I-regs and get trashed. Use G-regs instead. G1 and G4 are almost
// spare and available. This convention isn't used by the Sparc ABI or
// anywhere else. If we're tiered then we don't use G-regs because c1
// can't deal with them as a "pair". (Tiered makes this code think g's are filled)
// G0: zero
// G1: 1st Long arg
// G2: global allocated to TLS
// G3: used in inline cache check
// G4: 2nd Long arg
// G5: used in inline cache check
// G6: used by OS
// G7: used by OS
if (g_reg == G1) {
regs[i].set2(G1->as_VMReg()); // This long arg in G1
g_reg = G4; // Where the next arg goes
} else if (g_reg == G4) {
regs[i].set2(G4->as_VMReg()); // The 2nd long arg in G4
g_reg = noreg; // No more longs in registers
} else {
regs[i].set2(VMRegImpl::stack2reg(stk_reg_pairs));
stk_reg_pairs += 2;
}
#else // COMPILER2
regs[i].set2(VMRegImpl::stack2reg(stk_reg_pairs));
stk_reg_pairs += 2;
#endif // COMPILER2
#endif // _LP64
// On 32-bit SPARC put longs always on the stack to keep the pressure off
// integer argument registers. They should be used for oops.
slot = round_to(slot, 2); // align
regs[i].set2(VMRegImpl::stack2reg(slot));
slot += 2;
#endif
break;
case T_FLOAT:
if (flt_reg < flt_reg_max) regs[i].set1(as_FloatRegister(flt_reg++)->as_VMReg());
else regs[i].set1(VMRegImpl::stack2reg(stk_reg++));
break;
case T_DOUBLE:
assert(sig_bt[i+1] == T_VOID, "expecting half");
if (flt_reg_pairs + 1 < flt_reg_max) {
regs[i].set2(as_FloatRegister(flt_reg_pairs)->as_VMReg());
flt_reg_pairs += 2;
if (flt_reg < flt_reg_max) {
FloatRegister r = as_FloatRegister(flt_reg++);
regs[i].set1(r->as_VMReg());
} else {
regs[i].set2(VMRegImpl::stack2reg(stk_reg_pairs));
stk_reg_pairs += 2;
regs[i].set1(VMRegImpl::stack2reg(slot++));
}
break;
case T_VOID: regs[i].set_bad(); break; // Halves of longs & doubles
case T_DOUBLE:
assert(sig_bt[i+1] == T_VOID, "expecting half");
if (round_to(flt_reg, 2) + 1 < flt_reg_max) {
flt_reg = round_to(flt_reg, 2); // align
FloatRegister r = as_FloatRegister(flt_reg);
regs[i].set2(r->as_VMReg());
flt_reg += 2;
} else {
slot = round_to(slot, 2); // align
regs[i].set2(VMRegImpl::stack2reg(slot));
slot += 2;
}
break;
case T_VOID:
regs[i].set_bad(); // Halves of longs & doubles
break;
default:
ShouldNotReachHere();
fatal(err_msg_res("unknown basic type %d", sig_bt[i]));
break;
}
}
// return the amount of stack space these arguments will need.
return stk_reg_pairs;
return slot;
}
// Helper class mostly to avoid passing masm everywhere, and handle
@ -601,8 +528,7 @@ void AdapterGenerator::patch_callers_callsite() {
Label L;
__ ld_ptr(G5_method, in_bytes(Method::code_offset()), G3_scratch);
__ br_null(G3_scratch, false, Assembler::pt, L);
// Schedule the branch target address early.
__ delayed()->ld_ptr(G5_method, in_bytes(Method::interpreter_entry_offset()), G3_scratch);
__ delayed()->nop();
// Call into the VM to patch the caller, then jump to compiled callee
__ save_frame(4); // Args in compiled layout; do not blow them
@ -645,7 +571,6 @@ void AdapterGenerator::patch_callers_callsite() {
__ ldx(FP, -8 + STACK_BIAS, G1);
__ ldx(FP, -16 + STACK_BIAS, G4);
__ mov(L5, G5_method);
__ ld_ptr(G5_method, in_bytes(Method::interpreter_entry_offset()), G3_scratch);
#endif /* _LP64 */
__ restore(); // Restore args
@ -726,7 +651,7 @@ void AdapterGenerator::gen_c2i_adapter(
int comp_args_on_stack, // VMRegStackSlots
const BasicType *sig_bt,
const VMRegPair *regs,
Label& skip_fixup) {
Label& L_skip_fixup) {
// Before we get into the guts of the C2I adapter, see if we should be here
// at all. We've come from compiled code and are attempting to jump to the
@ -747,7 +672,7 @@ void AdapterGenerator::gen_c2i_adapter(
patch_callers_callsite();
__ bind(skip_fixup);
__ bind(L_skip_fixup);
// Since all args are passed on the stack, total_args_passed*wordSize is the
// space we need. Add in varargs area needed by the interpreter. Round up
@ -757,46 +682,18 @@ void AdapterGenerator::gen_c2i_adapter(
(frame::varargs_offset - frame::register_save_words)*wordSize;
const int extraspace = round_to(arg_size + varargs_area, 2*wordSize);
int bias = STACK_BIAS;
const int bias = STACK_BIAS;
const int interp_arg_offset = frame::varargs_offset*wordSize +
(total_args_passed-1)*Interpreter::stackElementSize;
Register base = SP;
const Register base = SP;
#ifdef _LP64
// In the 64bit build because of wider slots and STACKBIAS we can run
// out of bits in the displacement to do loads and stores. Use g3 as
// temporary displacement.
if (!Assembler::is_simm13(extraspace)) {
__ set(extraspace, G3_scratch);
__ sub(SP, G3_scratch, SP);
} else {
__ sub(SP, extraspace, SP);
}
// Make some extra space on the stack.
__ sub(SP, __ ensure_simm13_or_reg(extraspace, G3_scratch), SP);
set_Rdisp(G3_scratch);
#else
__ sub(SP, extraspace, SP);
#endif // _LP64
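// ensure_simm13_or_reg, sketched from its use here (an assumption, not the
// verbatim implementation): if the operand is already a register, or is a
// constant that fits a signed 13-bit immediate, it is returned unchanged;
// otherwise the constant is materialized into the supplied temp with set()
// and that register is returned, so the sub() above works either way.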
// First write G1 (if used) to where ever it must go
for (int i=0; i<total_args_passed; i++) {
const int st_off = interp_arg_offset - (i*Interpreter::stackElementSize) + bias;
VMReg r_1 = regs[i].first();
VMReg r_2 = regs[i].second();
if (r_1 == G1_scratch->as_VMReg()) {
if (sig_bt[i] == T_OBJECT || sig_bt[i] == T_ARRAY) {
store_c2i_object(G1_scratch, base, st_off);
} else if (sig_bt[i] == T_LONG) {
assert(!TieredCompilation, "should not use register args for longs");
store_c2i_long(G1_scratch, base, st_off, false);
} else {
store_c2i_int(G1_scratch, base, st_off);
}
}
}
// Now write the args into the outgoing interpreter space
for (int i=0; i<total_args_passed; i++) {
// Write the args into the outgoing interpreter space.
for (int i = 0; i < total_args_passed; i++) {
const int st_off = interp_arg_offset - (i*Interpreter::stackElementSize) + bias;
VMReg r_1 = regs[i].first();
VMReg r_2 = regs[i].second();
@ -804,23 +701,9 @@ void AdapterGenerator::gen_c2i_adapter(
assert(!r_2->is_valid(), "");
continue;
}
// Skip G1 if found as we did it first in order to free it up
if (r_1 == G1_scratch->as_VMReg()) {
continue;
}
#ifdef ASSERT
bool G1_forced = false;
#endif // ASSERT
if (r_1->is_stack()) { // Pretend stack targets are loaded into G1
#ifdef _LP64
Register ld_off = Rdisp;
__ set(reg2offset(r_1) + extraspace + bias, ld_off);
#else
int ld_off = reg2offset(r_1) + extraspace + bias;
#endif // _LP64
#ifdef ASSERT
G1_forced = true;
#endif // ASSERT
RegisterOrConstant ld_off = reg2offset(r_1) + extraspace + bias;
ld_off = __ ensure_simm13_or_reg(ld_off, Rdisp);
r_1 = G1_scratch->as_VMReg();// as part of the load/store shuffle
if (!r_2->is_valid()) __ ld (base, ld_off, G1_scratch);
else __ ldx(base, ld_off, G1_scratch);
@ -831,11 +714,6 @@ void AdapterGenerator::gen_c2i_adapter(
if (sig_bt[i] == T_OBJECT || sig_bt[i] == T_ARRAY) {
store_c2i_object(r, base, st_off);
} else if (sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
#ifndef _LP64
if (TieredCompilation) {
assert(G1_forced || sig_bt[i] != T_LONG, "should not use register args for longs");
}
#endif // _LP64
store_c2i_long(r, base, st_off, r_2->is_stack());
} else {
store_c2i_int(r, base, st_off);
@ -851,19 +729,12 @@ void AdapterGenerator::gen_c2i_adapter(
}
}
#ifdef _LP64
// Need to reload G3_scratch, used for temporary displacements.
// Load the interpreter entry point.
__ ld_ptr(G5_method, in_bytes(Method::interpreter_entry_offset()), G3_scratch);
// Pass O5_savedSP as an argument to the interpreter.
// The interpreter will restore SP to this value before returning.
__ set(extraspace, G1);
__ add(SP, G1, O5_savedSP);
#else
// Pass O5_savedSP as an argument to the interpreter.
// The interpreter will restore SP to this value before returning.
__ add(SP, extraspace, O5_savedSP);
#endif // _LP64
__ add(SP, __ ensure_simm13_or_reg(extraspace, G1), O5_savedSP);
__ mov((frame::varargs_offset)*wordSize -
1*Interpreter::stackElementSize+bias+BytesPerWord, G1);
@ -971,7 +842,6 @@ void AdapterGenerator::gen_i2c_adapter(
// Outputs:
// G2_thread - TLS
// G1, G4 - Outgoing long args in 32-bit build
// O0-O5 - Outgoing args in compiled layout
// O6 - Adjusted or restored SP
// O7 - Valid return address
@ -1016,10 +886,10 @@ void AdapterGenerator::gen_i2c_adapter(
// +--------------+ <--- start of outgoing args
// | pad, align | |
// +--------------+ |
// | ints, floats | |---Outgoing stack args, packed low.
// +--------------+ | First few args in registers.
// : doubles : |
// | longs | |
// | ints, longs, | |
// | floats, | |---Outgoing stack args.
// : doubles : | First few args in registers.
// | | |
// +--------------+ <--- SP' + 16*wordsize
// | |
// : window :
@ -1033,7 +903,6 @@ void AdapterGenerator::gen_i2c_adapter(
// Cut-out for having no stack args. Since up to 6 args are passed
// in registers, we will commonly have no stack args.
if (comp_args_on_stack > 0) {
// Convert VMReg stack slots to words.
int comp_words_on_stack = round_to(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
// Round up to minimum stack alignment, in wordSize
@ -1044,13 +913,9 @@ void AdapterGenerator::gen_i2c_adapter(
__ sub(SP, (comp_words_on_stack)*wordSize, SP);
}
// Will jump to the compiled code just as if compiled code was doing it.
// Pre-load the register-jump target early, to schedule it better.
__ ld_ptr(G5_method, in_bytes(Method::from_compiled_offset()), G3);
// Now generate the shuffle code. Pick up all register args and move the
// rest through G1_scratch.
for (int i=0; i<total_args_passed; i++) {
for (int i = 0; i < total_args_passed; i++) {
if (sig_bt[i] == T_VOID) {
// Longs and doubles are passed in native word order, but misaligned
// in the 32-bit build.
@ -1088,14 +953,13 @@ void AdapterGenerator::gen_i2c_adapter(
next_arg_slot(ld_off) : arg_slot(ld_off);
__ ldx(Gargs, slot, r);
#else
// Need to load a 64-bit value into G1/G4, but G1/G4 is being used in the
// stack shuffle. Load the first 2 longs into G1/G4 later.
fatal("longs should be on stack");
#endif
}
} else {
assert(r_1->is_FloatRegister(), "");
if (!r_2->is_valid()) {
__ ldf(FloatRegisterImpl::S, Gargs, arg_slot(ld_off), r_1->as_FloatRegister());
} else {
#ifdef _LP64
// In V9, doubles are given 2 64-bit slots in the interpreter, but the
@ -1104,11 +968,11 @@ void AdapterGenerator::gen_i2c_adapter(
// spare float register.
RegisterOrConstant slot = (sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) ?
next_arg_slot(ld_off) : arg_slot(ld_off);
__ ldf(FloatRegisterImpl::D, Gargs, slot, r_1->as_FloatRegister());
#else
// Need to marshal 64-bit value from misaligned Lesp loads
__ ldf(FloatRegisterImpl::S, Gargs, next_arg_slot(ld_off), r_1->as_FloatRegister());
__ ldf(FloatRegisterImpl::S, Gargs, arg_slot(ld_off), r_2->as_FloatRegister());
#endif
}
}
@ -1124,76 +988,35 @@ void AdapterGenerator::gen_i2c_adapter(
else __ stf(FloatRegisterImpl::D, r_1->as_FloatRegister(), SP, slot);
}
}
bool made_space = false;
#ifndef _LP64
// May need to pick up a few long args in G1/G4
bool g4_crushed = false;
bool g3_crushed = false;
for (int i=0; i<total_args_passed; i++) {
if (regs[i].first()->is_Register() && regs[i].second()->is_valid()) {
// Load in argument order going down
int ld_off = (total_args_passed-i)*Interpreter::stackElementSize;
// Need to marshal 64-bit value from misaligned Lesp loads
Register r = regs[i].first()->as_Register()->after_restore();
if (r == G1 || r == G4) {
assert(!g4_crushed, "ordering problem");
if (r == G4){
g4_crushed = true;
__ lduw(Gargs, arg_slot(ld_off) , G3_scratch); // Load lo bits
__ ld (Gargs, next_arg_slot(ld_off), r); // Load hi bits
} else {
// better schedule this way
__ ld (Gargs, next_arg_slot(ld_off), r); // Load hi bits
__ lduw(Gargs, arg_slot(ld_off) , G3_scratch); // Load lo bits
}
g3_crushed = true;
__ sllx(r, 32, r);
__ or3(G3_scratch, r, r);
} else {
assert(r->is_out(), "longs passed in two O registers");
__ ld (Gargs, arg_slot(ld_off) , r->successor()); // Load lo bits
__ ld (Gargs, next_arg_slot(ld_off), r); // Load hi bits
}
}
}
#endif
// Jump to the compiled code just as if compiled code was doing it.
//
#ifndef _LP64
if (g3_crushed) {
// Rats load was wasted, at least it is in cache...
__ ld_ptr(G5_method, Method::from_compiled_offset(), G3);
}
#endif /* _LP64 */
__ ld_ptr(G5_method, in_bytes(Method::from_compiled_offset()), G3);
// 6243940 We might end up in handle_wrong_method if
// the callee is deoptimized as we race thru here. If that
// happens we don't want to take a safepoint because the
// caller frame will look interpreted and arguments are now
// "compiled" so it is much better to make this transition
// invisible to the stack walking code. Unfortunately if
// we try and find the callee by normal means a safepoint
// is possible. So we stash the desired callee in the thread
// and the vm will find there should this case occur.
Address callee_target_addr(G2_thread, JavaThread::callee_target_offset());
__ st_ptr(G5_method, callee_target_addr);
if (StressNonEntrant) {
// Open a big window for deopt failure
__ save_frame(0);
__ mov(G0, L0);
Label loop;
__ bind(loop);
__ sub(L0, 1, L0);
__ br_null_short(L0, Assembler::pt, loop);
__ restore();
}
__ jmpl(G3, 0, G0);
__ delayed()->nop();
}
// ---------------------------------------------------------------
@ -1221,28 +1044,17 @@ AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm
// compiled code, which relies solely on SP and not FP, get sick).
address c2i_unverified_entry = __ pc();
Label skip_fixup;
Label L_skip_fixup;
{
#if !defined(_LP64) && defined(COMPILER2)
Register R_temp = L0; // another scratch register
#else
Register R_temp = G1; // another scratch register
#endif
Register R_temp = G1; // another scratch register
AddressLiteral ic_miss(SharedRuntime::get_ic_miss_stub());
__ verify_oop(O0);
__ load_klass(O0, G3_scratch);
#if !defined(_LP64) && defined(COMPILER2)
__ save(SP, -frame::register_save_words*wordSize, SP);
__ ld_ptr(G5_method, CompiledICHolder::holder_klass_offset(), R_temp);
__ cmp(G3_scratch, R_temp);
__ restore();
#else
__ ld_ptr(G5_method, CompiledICHolder::holder_klass_offset(), R_temp);
__ cmp(G3_scratch, R_temp);
#endif
Label ok, ok2;
__ brx(Assembler::equal, false, Assembler::pt, ok);
@ -1256,8 +1068,8 @@ AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm
// the call site corrected.
__ ld_ptr(G5_method, in_bytes(Method::code_offset()), G3_scratch);
__ bind(ok2);
__ br_null(G3_scratch, false, Assembler::pt, skip_fixup);
__ delayed()->ld_ptr(G5_method, in_bytes(Method::interpreter_entry_offset()), G3_scratch);
__ br_null(G3_scratch, false, Assembler::pt, L_skip_fixup);
__ delayed()->nop();
__ jump_to(ic_miss, G3_scratch);
__ delayed()->nop();
@ -1265,7 +1077,7 @@ AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm
address c2i_entry = __ pc();
agen.gen_c2i_adapter(total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
agen.gen_c2i_adapter(total_args_passed, comp_args_on_stack, sig_bt, regs, L_skip_fixup);
__ flush();
return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry);
@ -1985,12 +1797,12 @@ static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType
}
static void verify_oop_args(MacroAssembler* masm,
int total_args_passed,
methodHandle method,
const BasicType* sig_bt,
const VMRegPair* regs) {
Register temp_reg = G5_method; // not part of any compiled calling seq
if (VerifyOops) {
for (int i = 0; i < total_args_passed; i++) {
for (int i = 0; i < method->size_of_parameters(); i++) {
if (sig_bt[i] == T_OBJECT ||
sig_bt[i] == T_ARRAY) {
VMReg r = regs[i].first();
@ -2009,35 +1821,32 @@ static void verify_oop_args(MacroAssembler* masm,
}
static void gen_special_dispatch(MacroAssembler* masm,
int total_args_passed,
int comp_args_on_stack,
vmIntrinsics::ID special_dispatch,
methodHandle method,
const BasicType* sig_bt,
const VMRegPair* regs) {
verify_oop_args(masm, total_args_passed, sig_bt, regs);
verify_oop_args(masm, method, sig_bt, regs);
vmIntrinsics::ID iid = method->intrinsic_id();
// Now write the args into the outgoing interpreter space
bool has_receiver = false;
Register receiver_reg = noreg;
int member_arg_pos = -1;
Register member_reg = noreg;
int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(special_dispatch);
int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
if (ref_kind != 0) {
member_arg_pos = total_args_passed - 1; // trailing MemberName argument
member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument
member_reg = G5_method; // known to be free at this point
has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
} else if (special_dispatch == vmIntrinsics::_invokeBasic) {
} else if (iid == vmIntrinsics::_invokeBasic) {
has_receiver = true;
} else {
fatal(err_msg("special_dispatch=%d", special_dispatch));
fatal(err_msg_res("unexpected intrinsic id %d", iid));
}
if (member_reg != noreg) {
// Load the member_arg into register, if necessary.
assert(member_arg_pos >= 0 && member_arg_pos < total_args_passed, "oob");
assert(sig_bt[member_arg_pos] == T_OBJECT, "dispatch argument must be an object");
SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
VMReg r = regs[member_arg_pos].first();
assert(r->is_valid(), "bad member arg");
if (r->is_stack()) {
RegisterOrConstant ld_off = reg2offset(r) + STACK_BIAS;
ld_off = __ ensure_simm13_or_reg(ld_off, member_reg);
@ -2050,7 +1859,7 @@ static void gen_special_dispatch(MacroAssembler* masm,
if (has_receiver) {
// Make sure the receiver is loaded into a register.
assert(total_args_passed > 0, "oob");
assert(method->size_of_parameters() > 0, "oob");
assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
VMReg r = regs[0].first();
assert(r->is_valid(), "bad receiver arg");
@ -2058,7 +1867,7 @@ static void gen_special_dispatch(MacroAssembler* masm,
// Porting note: This assumes that compiled calling conventions always
// pass the receiver oop in a register. If this is not true on some
// platform, pick a temp and load the receiver from stack.
assert(false, "receiver always in a register");
fatal("receiver always in a register");
receiver_reg = G3_scratch; // known to be free at this point
RegisterOrConstant ld_off = reg2offset(r) + STACK_BIAS;
ld_off = __ ensure_simm13_or_reg(ld_off, member_reg);
@ -2070,7 +1879,7 @@ static void gen_special_dispatch(MacroAssembler* masm,
}
// Figure out which address we are really jumping to:
MethodHandles::generate_method_handle_dispatch(masm, special_dispatch,
MethodHandles::generate_method_handle_dispatch(masm, iid,
receiver_reg, member_reg, /*for_compiler_entry:*/ true);
}
@ -2103,11 +1912,9 @@ static void gen_special_dispatch(MacroAssembler* masm,
// transition back to thread_in_Java
// return to caller
//
nmethod *SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
methodHandle method,
int compile_id,
int total_in_args,
int comp_args_on_stack, // in VMRegStackSlots
BasicType* in_sig_bt,
VMRegPair* in_regs,
BasicType ret_type) {
@ -2116,9 +1923,7 @@ nmethod *SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
intptr_t start = (intptr_t)__ pc();
int vep_offset = ((intptr_t)__ pc()) - start;
gen_special_dispatch(masm,
total_in_args,
comp_args_on_stack,
method->intrinsic_id(),
method,
in_sig_bt,
in_regs);
int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
@ -2220,6 +2025,7 @@ nmethod *SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
// we convert the java signature to a C signature by inserting
// the hidden arguments as arg[0] and possibly arg[1] (static method)
const int total_in_args = method->size_of_parameters();
int total_c_args = total_in_args;
int total_save_slots = 6 * VMRegImpl::slots_per_word;
if (!is_critical_native) {
@ -3936,7 +3742,7 @@ void SharedRuntime::generate_uncommon_trap_blob() {
// the 64-bit %o's, then do a save, then fixup the caller's SP (our FP).
// Tricky, tricky, tricky...
SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause_return) {
SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
// allocate space for the code
@ -3954,6 +3760,7 @@ SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause
int start = __ offset();
bool cause_return = (poll_type == POLL_AT_RETURN);
// If this causes a return before the processing, then do a "restore"
if (cause_return) {
__ restore();

View File

@ -1838,6 +1838,12 @@ const bool Matcher::match_rule_supported(int opcode) {
case Op_PopCountL:
if (!UsePopCountInstruction)
return false;
case Op_CompareAndSwapL:
#ifdef _LP64
case Op_CompareAndSwapP:
#endif
if (!VM_Version::supports_cx8())
return false;
break;
}
@ -7199,6 +7205,7 @@ instruct storeLConditional( iRegP mem_ptr, iRegL oldval, g3RegL newval, flagsReg
// No flag versions for CompareAndSwap{P,I,L} because matcher can't match them
instruct compareAndSwapL_bool(iRegP mem_ptr, iRegL oldval, iRegL newval, iRegI res, o7RegI tmp1, flagsReg ccr ) %{
predicate(VM_Version::supports_cx8());
match(Set res (CompareAndSwapL mem_ptr (Binary oldval newval)));
effect( USE mem_ptr, KILL ccr, KILL tmp1);
format %{
@ -7230,6 +7237,9 @@ instruct compareAndSwapI_bool(iRegP mem_ptr, iRegI oldval, iRegI newval, iRegI r
%}
instruct compareAndSwapP_bool(iRegP mem_ptr, iRegP oldval, iRegP newval, iRegI res, o7RegI tmp1, flagsReg ccr ) %{
#ifdef _LP64
predicate(VM_Version::supports_cx8());
#endif
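// Inferred note, not from this change: a pointer CAS is an 8-byte operation
// only on LP64, which is why the cx8 predicate is required only there.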
match(Set res (CompareAndSwapP mem_ptr (Binary oldval newval)));
effect( USE mem_ptr, KILL ccr, KILL tmp1);
format %{
@ -7264,6 +7274,38 @@ instruct compareAndSwapN_bool(iRegP mem_ptr, iRegN oldval, iRegN newval, iRegI r
ins_pipe( long_memory_op );
%}
instruct xchgI( memory mem, iRegI newval) %{
match(Set newval (GetAndSetI mem newval));
format %{ "SWAP [$mem],$newval" %}
size(4);
ins_encode %{
__ swap($mem$$Address, $newval$$Register);
%}
ins_pipe( long_memory_op );
%}
#ifndef _LP64
instruct xchgP( memory mem, iRegP newval) %{
match(Set newval (GetAndSetP mem newval));
format %{ "SWAP [$mem],$newval" %}
size(4);
ins_encode %{
__ swap($mem$$Address, $newval$$Register);
%}
ins_pipe( long_memory_op );
%}
#endif
instruct xchgN( memory mem, iRegN newval) %{
match(Set newval (GetAndSetN mem newval));
format %{ "SWAP [$mem],$newval" %}
size(4);
ins_encode %{
__ swap($mem$$Address, $newval$$Register);
%}
ins_pipe( long_memory_op );
%}
//---------------------
// Subtraction Instructions
// Register Subtraction

View File

@ -96,6 +96,7 @@ void VM_Version::initialize() {
UseSSE = 0; // Only on x86 and x64
_supports_cx8 = has_v9();
_supports_atomic_getset4 = true; // swap instruction
if (is_niagara()) {
// Indirect branch is the same cost as direct
@ -338,7 +339,11 @@ void VM_Version::revert() {
unsigned int VM_Version::calc_parallel_worker_threads() {
unsigned int result;
if (is_niagara_plus()) {
if (is_M_series()) {
// for now, use same gc thread calculation for M-series as for niagara-plus
// in future, we may want to tweak parameters for nof_parallel_worker_threads
result = nof_parallel_worker_threads(5, 16, 8);
} else if (is_niagara_plus()) {
result = nof_parallel_worker_threads(5, 16, 8);
} else {
result = nof_parallel_worker_threads(5, 8, 8);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -124,6 +124,8 @@ public:
// Returns true if the platform is in the niagara line (T series)
// and newer than the niagara1.
static bool is_niagara_plus() { return is_T_family(_features) && !is_T1_model(_features); }
static bool is_M_series() { return is_M_family(_features); }
static bool is_T4() { return is_T_family(_features) && has_cbcond(); }
// Fujitsu SPARC64

View File

@ -3496,6 +3496,33 @@ void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src)
emit_byte(0x01);
}
void Assembler::vinsertf128h(XMMRegister dst, Address src) {
assert(VM_Version::supports_avx(), "");
InstructionMark im(this);
bool vector256 = true;
assert(dst != xnoreg, "sanity");
int dst_enc = dst->encoding();
// swap src<->dst for encoding
vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
emit_byte(0x18);
emit_operand(dst, src);
// 0x01 - insert into upper 128 bits
emit_byte(0x01);
}
void Assembler::vextractf128h(Address dst, XMMRegister src) {
assert(VM_Version::supports_avx(), "");
InstructionMark im(this);
bool vector256 = true;
assert(src != xnoreg, "sanity");
int src_enc = src->encoding();
vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
emit_byte(0x19);
emit_operand(src, dst);
// 0x01 - extract from upper 128 bits
emit_byte(0x01);
}
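// In AT&T syntax the bytes emitted above correspond roughly to
//   vextractf128 $1, %ymm<src>, <dst>
// storing the upper 128 bits of the YMM register without touching the low half.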
void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
assert(VM_Version::supports_avx2(), "");
bool vector256 = true;
@ -3507,6 +3534,33 @@ void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src)
emit_byte(0x01);
}
void Assembler::vinserti128h(XMMRegister dst, Address src) {
assert(VM_Version::supports_avx2(), "");
InstructionMark im(this);
bool vector256 = true;
assert(dst != xnoreg, "sanity");
int dst_enc = dst->encoding();
// swap src<->dst for encoding
vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
emit_byte(0x38);
emit_operand(dst, src);
// 0x01 - insert into upper 128 bits
emit_byte(0x01);
}
void Assembler::vextracti128h(Address dst, XMMRegister src) {
assert(VM_Version::supports_avx2(), "");
InstructionMark im(this);
bool vector256 = true;
assert(src != xnoreg, "sanity");
int src_enc = src->encoding();
vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
emit_byte(0x39);
emit_operand(src, dst);
// 0x01 - extract from upper 128 bits
emit_byte(0x01);
}
void Assembler::vzeroupper() {
assert(VM_Version::supports_avx(), "");
(void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE);
@ -8907,11 +8961,9 @@ void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int
pusha();
// if we are coming from c1, xmm registers may be live
if (UseSSE >= 1) {
subptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
}
int off = 0;
if (UseSSE == 1) {
subptr(rsp, sizeof(jdouble)*8);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
@ -8921,23 +8973,50 @@ void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int
movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
} else if (UseSSE >= 2) {
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm0);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm1);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm2);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm3);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm4);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm5);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm6);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm7);
#ifdef COMPILER2
if (MaxVectorSize > 16) {
assert(UseAVX > 0, "256bit vectors are supported only with AVX");
// Save upper half of YMM registers
subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
vextractf128h(Address(rsp, 0),xmm0);
vextractf128h(Address(rsp, 16),xmm1);
vextractf128h(Address(rsp, 32),xmm2);
vextractf128h(Address(rsp, 48),xmm3);
vextractf128h(Address(rsp, 64),xmm4);
vextractf128h(Address(rsp, 80),xmm5);
vextractf128h(Address(rsp, 96),xmm6);
vextractf128h(Address(rsp,112),xmm7);
#ifdef _LP64
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm8);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm9);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm10);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm11);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm12);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm13);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm14);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm15);
vextractf128h(Address(rsp,128),xmm8);
vextractf128h(Address(rsp,144),xmm9);
vextractf128h(Address(rsp,160),xmm10);
vextractf128h(Address(rsp,176),xmm11);
vextractf128h(Address(rsp,192),xmm12);
vextractf128h(Address(rsp,208),xmm13);
vextractf128h(Address(rsp,224),xmm14);
vextractf128h(Address(rsp,240),xmm15);
#endif
}
#endif
// Save whole 128bit (16 bytes) XMM registers
subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
movdqu(Address(rsp,off++*16),xmm0);
movdqu(Address(rsp,off++*16),xmm1);
movdqu(Address(rsp,off++*16),xmm2);
movdqu(Address(rsp,off++*16),xmm3);
movdqu(Address(rsp,off++*16),xmm4);
movdqu(Address(rsp,off++*16),xmm5);
movdqu(Address(rsp,off++*16),xmm6);
movdqu(Address(rsp,off++*16),xmm7);
#ifdef _LP64
movdqu(Address(rsp,off++*16),xmm8);
movdqu(Address(rsp,off++*16),xmm9);
movdqu(Address(rsp,off++*16),xmm10);
movdqu(Address(rsp,off++*16),xmm11);
movdqu(Address(rsp,off++*16),xmm12);
movdqu(Address(rsp,off++*16),xmm13);
movdqu(Address(rsp,off++*16),xmm14);
movdqu(Address(rsp,off++*16),xmm15);
#endif
}
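// Footprint sketch derived from the code above: on LP64 the YMM-upper block and
// the full 128-bit XMM block each reserve 16 * 16 = 256 bytes of stack
// (16 * 8 = 128 bytes each on 32-bit, where only xmm0-xmm7 exist).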
@ -9015,28 +9094,52 @@ void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int
movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
addptr(rsp, sizeof(jdouble)*8);
} else if (UseSSE >= 2) {
movdbl(xmm0, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm1, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm2, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm3, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm4, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm5, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm6, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm7, Address(rsp,off++*sizeof(jdouble)));
// Restore whole 128bit (16 bytes) XMM registers
movdqu(xmm0, Address(rsp,off++*16));
movdqu(xmm1, Address(rsp,off++*16));
movdqu(xmm2, Address(rsp,off++*16));
movdqu(xmm3, Address(rsp,off++*16));
movdqu(xmm4, Address(rsp,off++*16));
movdqu(xmm5, Address(rsp,off++*16));
movdqu(xmm6, Address(rsp,off++*16));
movdqu(xmm7, Address(rsp,off++*16));
#ifdef _LP64
movdbl(xmm8, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm9, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm10, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm11, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm12, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm13, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm14, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm15, Address(rsp,off++*sizeof(jdouble)));
movdqu(xmm8, Address(rsp,off++*16));
movdqu(xmm9, Address(rsp,off++*16));
movdqu(xmm10, Address(rsp,off++*16));
movdqu(xmm11, Address(rsp,off++*16));
movdqu(xmm12, Address(rsp,off++*16));
movdqu(xmm13, Address(rsp,off++*16));
movdqu(xmm14, Address(rsp,off++*16));
movdqu(xmm15, Address(rsp,off++*16));
#endif
addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
#ifdef COMPILER2
if (MaxVectorSize > 16) {
// Restore upper half of YMM registers.
vinsertf128h(xmm0, Address(rsp, 0));
vinsertf128h(xmm1, Address(rsp, 16));
vinsertf128h(xmm2, Address(rsp, 32));
vinsertf128h(xmm3, Address(rsp, 48));
vinsertf128h(xmm4, Address(rsp, 64));
vinsertf128h(xmm5, Address(rsp, 80));
vinsertf128h(xmm6, Address(rsp, 96));
vinsertf128h(xmm7, Address(rsp,112));
#ifdef _LP64
vinsertf128h(xmm8, Address(rsp,128));
vinsertf128h(xmm9, Address(rsp,144));
vinsertf128h(xmm10, Address(rsp,160));
vinsertf128h(xmm11, Address(rsp,176));
vinsertf128h(xmm12, Address(rsp,192));
vinsertf128h(xmm13, Address(rsp,208));
vinsertf128h(xmm14, Address(rsp,224));
vinsertf128h(xmm15, Address(rsp,240));
#endif
addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
}
#endif
}
if (UseSSE >= 1) {
addptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
}
popa();
}

View File

@ -1743,6 +1743,12 @@ private:
void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
// Load/store high 128bit of YMM registers which does not destroy other half.
void vinsertf128h(XMMRegister dst, Address src);
void vinserti128h(XMMRegister dst, Address src);
void vextractf128h(Address dst, XMMRegister src);
void vextracti128h(Address dst, XMMRegister src);
// AVX instruction which is used to clear upper 128 bits of YMM registers and
// to avoid transaction penalty between AVX and SSE states. There is no
// penalty if legacy SSE instructions are encoded using VEX prefix because

View File

@ -3794,5 +3794,49 @@ void LIR_Assembler::peephole(LIR_List*) {
// do nothing for now
}
void LIR_Assembler::atomic_op(LIR_Code code, LIR_Opr src, LIR_Opr data, LIR_Opr dest, LIR_Opr tmp) {
assert(data == dest, "xchg/xadd uses only 2 operands");
if (data->type() == T_INT) {
if (code == lir_xadd) {
if (os::is_MP()) {
__ lock();
}
__ xaddl(as_Address(src->as_address_ptr()), data->as_register());
} else {
__ xchgl(data->as_register(), as_Address(src->as_address_ptr()));
}
} else if (data->is_oop()) {
assert (code == lir_xchg, "xadd for oops");
Register obj = data->as_register();
#ifdef _LP64
if (UseCompressedOops) {
__ encode_heap_oop(obj);
__ xchgl(obj, as_Address(src->as_address_ptr()));
__ decode_heap_oop(obj);
} else {
__ xchgptr(obj, as_Address(src->as_address_ptr()));
}
#else
__ xchgl(obj, as_Address(src->as_address_ptr()));
#endif
} else if (data->type() == T_LONG) {
#ifdef _LP64
assert(data->as_register_lo() == data->as_register_hi(), "should be a single register");
if (code == lir_xadd) {
if (os::is_MP()) {
__ lock();
}
__ xaddq(as_Address(src->as_address_ptr()), data->as_register_lo());
} else {
__ xchgq(data->as_register_lo(), as_Address(src->as_address_ptr()));
}
#else
ShouldNotReachHere();
#endif
} else {
ShouldNotReachHere();
}
}
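// Rough correspondence, assuming the C1 intrinsic wiring introduced by this
// change (the Unsafe method names are an assumption, not from this diff):
//   getAndAddInt/Long        -> lir_xadd -> lock; xaddl / xaddq
//   getAndSetInt/Long/Object -> lir_xchg -> xchgl / xchgq / xchgptr
// No explicit lock prefix is needed for the xchg forms: xchg with a memory
// operand asserts the bus lock implicitly.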
#undef __

View File

@ -753,9 +753,24 @@ void LIRGenerator::do_CompareAndSwap(Intrinsic* x, ValueType* type) {
LIR_Opr addr = new_pointer_register();
LIR_Address* a;
if(offset.result()->is_constant()) {
#ifdef _LP64
jlong c = offset.result()->as_jlong();
if ((jlong)((jint)c) == c) {
a = new LIR_Address(obj.result(),
(jint)c,
as_BasicType(type));
} else {
LIR_Opr tmp = new_register(T_LONG);
__ move(offset.result(), tmp);
a = new LIR_Address(obj.result(),
tmp,
as_BasicType(type));
}
#else
a = new LIR_Address(obj.result(),
NOT_LP64(offset.result()->as_constant_ptr()->as_jint()) LP64_ONLY((int)offset.result()->as_constant_ptr()->as_jlong()),
offset.result()->as_jint(),
as_BasicType(type));
#endif
} else {
a = new LIR_Address(obj.result(),
offset.result(),
@ -1345,3 +1360,57 @@ void LIRGenerator::put_Object_unsafe(LIR_Opr src, LIR_Opr offset, LIR_Opr data,
}
}
}
void LIRGenerator::do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x) {
BasicType type = x->basic_type();
LIRItem src(x->object(), this);
LIRItem off(x->offset(), this);
LIRItem value(x->value(), this);
src.load_item();
value.load_item();
off.load_nonconstant();
LIR_Opr dst = rlock_result(x, type);
LIR_Opr data = value.result();
bool is_obj = (type == T_ARRAY || type == T_OBJECT);
LIR_Opr offset = off.result();
assert (type == T_INT || (!x->is_add() && is_obj) LP64_ONLY( || type == T_LONG ), "unexpected type");
LIR_Address* addr;
if (offset->is_constant()) {
#ifdef _LP64
jlong c = offset->as_jlong();
if ((jlong)((jint)c) == c) {
addr = new LIR_Address(src.result(), (jint)c, type);
} else {
LIR_Opr tmp = new_register(T_LONG);
__ move(offset, tmp);
addr = new LIR_Address(src.result(), tmp, type);
}
#else
addr = new LIR_Address(src.result(), offset->as_jint(), type);
#endif
} else {
addr = new LIR_Address(src.result(), offset, type);
}
if (data != dst) {
__ move(data, dst);
data = dst;
}
if (x->is_add()) {
__ xadd(LIR_OprFact::address(addr), data, dst, LIR_OprFact::illegalOpr);
} else {
if (is_obj) {
// Do the pre-write barrier, if any.
pre_barrier(LIR_OprFact::address(addr), LIR_OprFact::illegalOpr /* pre_val */,
true /* do_load */, false /* patch */, NULL);
}
__ xchg(LIR_OprFact::address(addr), data, dst, LIR_OprFact::illegalOpr);
if (is_obj) {
// Seems to be a precise address
post_barrier(LIR_OprFact::address(addr), data);
}
}
}

View File

@ -327,10 +327,11 @@ void MethodHandles::generate_method_handle_dispatch(MacroAssembler* _masm,
assert_different_registers(temp3, rcx, rdx);
}
#endif
else {
assert_different_registers(temp1, temp2, temp3, saved_last_sp_register()); // don't trash lastSP
}
assert_different_registers(temp1, temp2, temp3, receiver_reg);
assert_different_registers(temp1, temp2, temp3, member_reg);
if (!for_compiler_entry)
assert_different_registers(temp1, temp2, temp3, saved_last_sp_register()); // don't trash lastSP
if (iid == vmIntrinsics::_invokeBasic) {
// indirect through MH.form.vmentry.vmtarget
@ -392,14 +393,13 @@ void MethodHandles::generate_method_handle_dispatch(MacroAssembler* _masm,
// rsi/r13 - interpreter linkage (if interpreted)
// rcx, rdx, rsi, rdi, r8, r8 - compiler arguments (if compiled)
bool method_is_live = false;
Label L_incompatible_class_change_error;
switch (iid) {
case vmIntrinsics::_linkToSpecial:
if (VerifyMethodHandles) {
verify_ref_kind(_masm, JVM_REF_invokeSpecial, member_reg, temp3);
}
__ movptr(rbx_method, member_vmtarget);
method_is_live = true;
break;
case vmIntrinsics::_linkToStatic:
@ -407,7 +407,6 @@ void MethodHandles::generate_method_handle_dispatch(MacroAssembler* _masm,
verify_ref_kind(_masm, JVM_REF_invokeStatic, member_reg, temp3);
}
__ movptr(rbx_method, member_vmtarget);
method_is_live = true;
break;
case vmIntrinsics::_linkToVirtual:
@ -436,7 +435,6 @@ void MethodHandles::generate_method_handle_dispatch(MacroAssembler* _masm,
// get target Method* & entry point
__ lookup_virtual_method(temp1_recv_klass, temp2_index, rbx_method);
method_is_live = true;
break;
}
@ -464,35 +462,32 @@ void MethodHandles::generate_method_handle_dispatch(MacroAssembler* _masm,
}
// given intf, index, and recv klass, dispatch to the implementation method
Label L_no_such_interface;
__ lookup_interface_method(temp1_recv_klass, temp3_intf,
// note: next two args must be the same:
rbx_index, rbx_method,
temp2,
L_no_such_interface);
__ verify_method_ptr(rbx_method);
jump_from_method_handle(_masm, rbx_method, temp2, for_compiler_entry);
__ hlt();
__ bind(L_no_such_interface);
__ jump(RuntimeAddress(StubRoutines::throw_IncompatibleClassChangeError_entry()));
L_incompatible_class_change_error);
break;
}
default:
fatal(err_msg("unexpected intrinsic %d: %s", iid, vmIntrinsics::name_at(iid)));
fatal(err_msg_res("unexpected intrinsic %d: %s", iid, vmIntrinsics::name_at(iid)));
break;
}
if (method_is_live) {
// live at this point: rbx_method, rsi/r13 (if interpreted)
// Live at this point:
// rbx_method
// rsi/r13 (if interpreted)
// After figuring out which concrete method to call, jump into it.
// Note that this works in the interpreter with no data motion.
// But the compiled version will require that rcx_recv be shifted out.
__ verify_method_ptr(rbx_method);
jump_from_method_handle(_masm, rbx_method, temp1, for_compiler_entry);
// After figuring out which concrete method to call, jump into it.
// Note that this works in the interpreter with no data motion.
// But the compiled version will require that rcx_recv be shifted out.
__ verify_method_ptr(rbx_method);
jump_from_method_handle(_masm, rbx_method, temp1, for_compiler_entry);
if (iid == vmIntrinsics::_linkToInterface) {
__ bind(L_incompatible_class_change_error);
__ jump(RuntimeAddress(StubRoutines::throw_IncompatibleClassChangeError_entry()));
}
}
}

View File

@ -46,11 +46,11 @@
const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
class RegisterSaver {
enum { FPU_regs_live = 8 /*for the FPU stack*/+8/*eight more for XMM registers*/ };
// Capture info about frame layout
#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
enum layout {
fpu_state_off = 0,
fpu_state_end = fpu_state_off+FPUStateSizeInWords-1,
fpu_state_end = fpu_state_off+FPUStateSizeInWords,
st0_off, st0H_off,
st1_off, st1H_off,
st2_off, st2H_off,
@ -59,16 +59,16 @@ class RegisterSaver {
st5_off, st5H_off,
st6_off, st6H_off,
st7_off, st7H_off,
xmm0_off, xmm0H_off,
xmm1_off, xmm1H_off,
xmm2_off, xmm2H_off,
xmm3_off, xmm3H_off,
xmm4_off, xmm4H_off,
xmm5_off, xmm5H_off,
xmm6_off, xmm6H_off,
xmm7_off, xmm7H_off,
flags_off,
xmm_off,
DEF_XMM_OFFS(0),
DEF_XMM_OFFS(1),
DEF_XMM_OFFS(2),
DEF_XMM_OFFS(3),
DEF_XMM_OFFS(4),
DEF_XMM_OFFS(5),
DEF_XMM_OFFS(6),
DEF_XMM_OFFS(7),
flags_off = xmm7_off + 16/BytesPerInt + 1, // 16-byte stack alignment fill word
rdi_off,
rsi_off,
ignore_off, // extra copy of rbp,
@ -83,13 +83,13 @@ class RegisterSaver {
rbp_off,
return_off, // slot for return address
reg_save_size };
enum { FPU_regs_live = flags_off - fpu_state_end };
public:
static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words,
int* total_frame_words, bool verify_fpu = true);
static void restore_live_registers(MacroAssembler* masm);
int* total_frame_words, bool verify_fpu = true, bool save_vectors = false);
static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
static int rax_offset() { return rax_off; }
static int rbx_offset() { return rbx_off; }
@ -113,9 +113,20 @@ class RegisterSaver {
};
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words,
int* total_frame_words, bool verify_fpu) {
int frame_size_in_bytes = (reg_save_size + additional_frame_words) * wordSize;
int* total_frame_words, bool verify_fpu, bool save_vectors) {
int vect_words = 0;
#ifdef COMPILER2
if (save_vectors) {
assert(UseAVX > 0, "256bit vectors are supported only with AVX");
assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
// Save upper half of YMM registers
vect_words = 8 * 16 / wordSize;
additional_frame_words += vect_words;
}
#else
assert(!save_vectors, "vectors are generated only by C2");
#endif
int frame_size_in_bytes = (reg_save_size + additional_frame_words) * wordSize;
int frame_words = frame_size_in_bytes / wordSize;
*total_frame_words = frame_words;
@ -129,7 +140,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
__ enter();
__ pusha();
__ pushf();
__ subptr(rsp,FPU_regs_live*sizeof(jdouble)); // Push FPU registers space
__ subptr(rsp,FPU_regs_live*wordSize); // Push FPU registers space
__ push_FPU_state(); // Save FPU state & init
if (verify_fpu) {
@ -183,14 +194,28 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
__ movflt(Address(rsp,xmm6_off*wordSize),xmm6);
__ movflt(Address(rsp,xmm7_off*wordSize),xmm7);
} else if( UseSSE >= 2 ) {
__ movdbl(Address(rsp,xmm0_off*wordSize),xmm0);
__ movdbl(Address(rsp,xmm1_off*wordSize),xmm1);
__ movdbl(Address(rsp,xmm2_off*wordSize),xmm2);
__ movdbl(Address(rsp,xmm3_off*wordSize),xmm3);
__ movdbl(Address(rsp,xmm4_off*wordSize),xmm4);
__ movdbl(Address(rsp,xmm5_off*wordSize),xmm5);
__ movdbl(Address(rsp,xmm6_off*wordSize),xmm6);
__ movdbl(Address(rsp,xmm7_off*wordSize),xmm7);
// Save whole 128-bit (16 bytes) XMM registers
__ movdqu(Address(rsp,xmm0_off*wordSize),xmm0);
__ movdqu(Address(rsp,xmm1_off*wordSize),xmm1);
__ movdqu(Address(rsp,xmm2_off*wordSize),xmm2);
__ movdqu(Address(rsp,xmm3_off*wordSize),xmm3);
__ movdqu(Address(rsp,xmm4_off*wordSize),xmm4);
__ movdqu(Address(rsp,xmm5_off*wordSize),xmm5);
__ movdqu(Address(rsp,xmm6_off*wordSize),xmm6);
__ movdqu(Address(rsp,xmm7_off*wordSize),xmm7);
}
if (vect_words > 0) {
assert(vect_words*wordSize == 128, "");
__ subptr(rsp, 128); // Save upper half of YMM registers
__ vextractf128h(Address(rsp, 0),xmm0);
__ vextractf128h(Address(rsp, 16),xmm1);
__ vextractf128h(Address(rsp, 32),xmm2);
__ vextractf128h(Address(rsp, 48),xmm3);
__ vextractf128h(Address(rsp, 64),xmm4);
__ vextractf128h(Address(rsp, 80),xmm5);
__ vextractf128h(Address(rsp, 96),xmm6);
__ vextractf128h(Address(rsp,112),xmm7);
}
// Set an oopmap for the call site. This oopmap will map all
@ -253,10 +278,20 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
}
void RegisterSaver::restore_live_registers(MacroAssembler* masm) {
void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
// Recover XMM & FPU state
if( UseSSE == 1 ) {
int additional_frame_bytes = 0;
#ifdef COMPILER2
if (restore_vectors) {
assert(UseAVX > 0, "256bit vectors are supported only with AVX");
assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
additional_frame_bytes = 128;
}
#else
assert(!restore_vectors, "vectors are generated only by C2");
#endif
if (UseSSE == 1) {
assert(additional_frame_bytes == 0, "");
__ movflt(xmm0,Address(rsp,xmm0_off*wordSize));
__ movflt(xmm1,Address(rsp,xmm1_off*wordSize));
__ movflt(xmm2,Address(rsp,xmm2_off*wordSize));
@ -265,18 +300,33 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm) {
__ movflt(xmm5,Address(rsp,xmm5_off*wordSize));
__ movflt(xmm6,Address(rsp,xmm6_off*wordSize));
__ movflt(xmm7,Address(rsp,xmm7_off*wordSize));
} else if( UseSSE >= 2 ) {
__ movdbl(xmm0,Address(rsp,xmm0_off*wordSize));
__ movdbl(xmm1,Address(rsp,xmm1_off*wordSize));
__ movdbl(xmm2,Address(rsp,xmm2_off*wordSize));
__ movdbl(xmm3,Address(rsp,xmm3_off*wordSize));
__ movdbl(xmm4,Address(rsp,xmm4_off*wordSize));
__ movdbl(xmm5,Address(rsp,xmm5_off*wordSize));
__ movdbl(xmm6,Address(rsp,xmm6_off*wordSize));
__ movdbl(xmm7,Address(rsp,xmm7_off*wordSize));
} else if (UseSSE >= 2) {
#define STACK_ADDRESS(x) Address(rsp,(x)*wordSize + additional_frame_bytes)
__ movdqu(xmm0,STACK_ADDRESS(xmm0_off));
__ movdqu(xmm1,STACK_ADDRESS(xmm1_off));
__ movdqu(xmm2,STACK_ADDRESS(xmm2_off));
__ movdqu(xmm3,STACK_ADDRESS(xmm3_off));
__ movdqu(xmm4,STACK_ADDRESS(xmm4_off));
__ movdqu(xmm5,STACK_ADDRESS(xmm5_off));
__ movdqu(xmm6,STACK_ADDRESS(xmm6_off));
__ movdqu(xmm7,STACK_ADDRESS(xmm7_off));
#undef STACK_ADDRESS
}
if (restore_vectors) {
// Restore upper half of YMM registers.
assert(additional_frame_bytes == 128, "");
__ vinsertf128h(xmm0, Address(rsp, 0));
__ vinsertf128h(xmm1, Address(rsp, 16));
__ vinsertf128h(xmm2, Address(rsp, 32));
__ vinsertf128h(xmm3, Address(rsp, 48));
__ vinsertf128h(xmm4, Address(rsp, 64));
__ vinsertf128h(xmm5, Address(rsp, 80));
__ vinsertf128h(xmm6, Address(rsp, 96));
__ vinsertf128h(xmm7, Address(rsp,112));
__ addptr(rsp, additional_frame_bytes);
}
__ pop_FPU_state();
__ addptr(rsp, FPU_regs_live*sizeof(jdouble)); // Pop FPU registers
__ addptr(rsp, FPU_regs_live*wordSize); // Pop FPU registers
__ popf();
__ popa();
@ -308,6 +358,13 @@ void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
__ addptr(rsp, return_off * wordSize);
}
// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using SSE2 movdqu instructions.
// Note, MaxVectorSize == 0 with UseSSE < 2 and vectors are not generated.
bool SharedRuntime::is_wide_vector(int size) {
return size > 16;
}
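A sketch of how this predicate is meant to be consulted (hypothetical caller, not part of this hunk — the real call sites are in the shared runtime and the C2 output pass; this assumes the poll kinds are SharedRuntime enum members and that a POLL_AT_LOOP kind exists alongside the two used by generate_handler_blob() later in this file):

  bool at_return = /* poll sits at a method return */ true;
  int poll_type = at_return ? SharedRuntime::POLL_AT_RETURN
                            : (SharedRuntime::is_wide_vector(MaxVectorSize)
                                 ? SharedRuntime::POLL_AT_VECTOR_LOOP
                                 : SharedRuntime::POLL_AT_LOOP);
  // generate_handler_blob(call_ptr, poll_type) then turns the poll kind into the
  // save_vectors flag that RegisterSaver::save_live_registers() receives.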
// The java_calling_convention describes stack locations as ideal slots on
// a frame with no abi restrictions. Since we must observe abi restrictions
// (like the placement of the register window) the slots must be biased by
@ -1346,12 +1403,12 @@ static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType
}
static void verify_oop_args(MacroAssembler* masm,
int total_args_passed,
methodHandle method,
const BasicType* sig_bt,
const VMRegPair* regs) {
Register temp_reg = rbx; // not part of any compiled calling seq
if (VerifyOops) {
for (int i = 0; i < total_args_passed; i++) {
for (int i = 0; i < method->size_of_parameters(); i++) {
if (sig_bt[i] == T_OBJECT ||
sig_bt[i] == T_ARRAY) {
VMReg r = regs[i].first();
@ -1368,35 +1425,32 @@ static void verify_oop_args(MacroAssembler* masm,
}
static void gen_special_dispatch(MacroAssembler* masm,
int total_args_passed,
int comp_args_on_stack,
vmIntrinsics::ID special_dispatch,
methodHandle method,
const BasicType* sig_bt,
const VMRegPair* regs) {
verify_oop_args(masm, total_args_passed, sig_bt, regs);
verify_oop_args(masm, method, sig_bt, regs);
vmIntrinsics::ID iid = method->intrinsic_id();
// Now write the args into the outgoing interpreter space
bool has_receiver = false;
Register receiver_reg = noreg;
int member_arg_pos = -1;
Register member_reg = noreg;
int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(special_dispatch);
int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
if (ref_kind != 0) {
member_arg_pos = total_args_passed - 1; // trailing MemberName argument
member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument
member_reg = rbx; // known to be free at this point
has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
} else if (special_dispatch == vmIntrinsics::_invokeBasic) {
} else if (iid == vmIntrinsics::_invokeBasic) {
has_receiver = true;
} else {
guarantee(false, err_msg("special_dispatch=%d", special_dispatch));
fatal(err_msg_res("unexpected intrinsic id %d", iid));
}
if (member_reg != noreg) {
// Load the member_arg into register, if necessary.
assert(member_arg_pos >= 0 && member_arg_pos < total_args_passed, "oob");
assert(sig_bt[member_arg_pos] == T_OBJECT, "dispatch argument must be an object");
SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
VMReg r = regs[member_arg_pos].first();
assert(r->is_valid(), "bad member arg");
if (r->is_stack()) {
__ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
} else {
@ -1407,7 +1461,7 @@ static void gen_special_dispatch(MacroAssembler* masm,
if (has_receiver) {
// Make sure the receiver is loaded into a register.
assert(total_args_passed > 0, "oob");
assert(method->size_of_parameters() > 0, "oob");
assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
VMReg r = regs[0].first();
assert(r->is_valid(), "bad receiver arg");
@ -1415,7 +1469,7 @@ static void gen_special_dispatch(MacroAssembler* masm,
// Porting note: This assumes that compiled calling conventions always
// pass the receiver oop in a register. If this is not true on some
// platform, pick a temp and load the receiver from stack.
assert(false, "receiver always in a register");
fatal("receiver always in a register");
receiver_reg = rcx; // known to be free at this point
__ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
} else {
@ -1425,7 +1479,7 @@ static void gen_special_dispatch(MacroAssembler* masm,
}
// Figure out which address we are really jumping to:
MethodHandles::generate_method_handle_dispatch(masm, special_dispatch,
MethodHandles::generate_method_handle_dispatch(masm, iid,
receiver_reg, member_reg, /*for_compiler_entry:*/ true);
}
@ -1461,8 +1515,6 @@ static void gen_special_dispatch(MacroAssembler* masm,
nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
methodHandle method,
int compile_id,
int total_in_args,
int comp_args_on_stack,
BasicType* in_sig_bt,
VMRegPair* in_regs,
BasicType ret_type) {
@ -1471,9 +1523,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
intptr_t start = (intptr_t)__ pc();
int vep_offset = ((intptr_t)__ pc()) - start;
gen_special_dispatch(masm,
total_in_args,
comp_args_on_stack,
method->intrinsic_id(),
method,
in_sig_bt,
in_regs);
int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
@ -1506,6 +1556,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
// we convert the java signature to a C signature by inserting
// the hidden arguments as arg[0] and possibly arg[1] (static method)
const int total_in_args = method->size_of_parameters();
int total_c_args = total_in_args;
if (!is_critical_native) {
total_c_args += 1;
@ -2738,7 +2789,6 @@ uint SharedRuntime::out_preserve_stack_slots() {
return 0;
}
//------------------------------generate_deopt_blob----------------------------
void SharedRuntime::generate_deopt_blob() {
// allocate space for the code
@ -3276,7 +3326,7 @@ void SharedRuntime::generate_uncommon_trap_blob() {
// setup oopmap, and calls safepoint code to stop the compiled code for
// a safepoint.
//
SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause_return) {
SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
// Account for thread arg in our frame
const int additional_words = 1;
@ -3296,17 +3346,18 @@ SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause
const Register java_thread = rdi; // callee-saved for VC++
address start = __ pc();
address call_pc = NULL;
bool cause_return = (poll_type == POLL_AT_RETURN);
bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
// If cause_return is true we are at a poll_return and there is
// the return address on the stack to the caller on the nmethod
// that is at the safepoint. We can leave this return on the stack and
// effectively complete the return and safepoint in the caller.
// Otherwise we push space for a return address that the safepoint
// handler will install later to make the stack walking sensible.
if( !cause_return )
__ push(rbx); // Make room for return address (or push it again)
if (!cause_return)
__ push(rbx); // Make room for return address (or push it again)
map = RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words, false);
map = RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words, false, save_vectors);
// The following is basically a call_VM. However, we need the precise
// address of the call in order to generate an oopmap. Hence, we do all the
@ -3318,7 +3369,7 @@ SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause
__ set_last_Java_frame(java_thread, noreg, noreg, NULL);
// if this was not a poll_return then we need to correct the return address now.
if( !cause_return ) {
if (!cause_return) {
__ movptr(rax, Address(java_thread, JavaThread::saved_exception_pc_offset()));
__ movptr(Address(rbp, wordSize), rax);
}
@ -3346,15 +3397,14 @@ SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause
__ jcc(Assembler::equal, noException);
// Exception pending
RegisterSaver::restore_live_registers(masm);
RegisterSaver::restore_live_registers(masm, save_vectors);
__ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
__ bind(noException);
// Normal exit, register restoring and exit
RegisterSaver::restore_live_registers(masm);
RegisterSaver::restore_live_registers(masm, save_vectors);
__ ret(0);

View File

@ -116,8 +116,8 @@ class RegisterSaver {
};
public:
static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words);
static void restore_live_registers(MacroAssembler* masm);
static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors = false);
static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
// Offsets into the register save area
// Used by deoptimization when it is managing result register
@ -134,7 +134,19 @@ class RegisterSaver {
static void restore_result_registers(MacroAssembler* masm);
};
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words) {
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
int vect_words = 0;
#ifdef COMPILER2
if (save_vectors) {
assert(UseAVX > 0, "256bit vectors are supported only with AVX");
assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
// Save upper half of YMM registers
vect_words = 16 * 16 / wordSize;
additional_frame_words += vect_words;
}
#else
assert(!save_vectors, "vectors are generated only by C2");
#endif
// Always make the frame size 16-byte aligned
int frame_size_in_bytes = round_to(additional_frame_words*wordSize +
@ -155,6 +167,27 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
__ enter(); // rsp becomes 16-byte aligned here
__ push_CPU_state(); // Push a multiple of 16 bytes
if (vect_words > 0) {
assert(vect_words*wordSize == 256, "");
__ subptr(rsp, 256); // Save upper half of YMM registers
__ vextractf128h(Address(rsp, 0),xmm0);
__ vextractf128h(Address(rsp, 16),xmm1);
__ vextractf128h(Address(rsp, 32),xmm2);
__ vextractf128h(Address(rsp, 48),xmm3);
__ vextractf128h(Address(rsp, 64),xmm4);
__ vextractf128h(Address(rsp, 80),xmm5);
__ vextractf128h(Address(rsp, 96),xmm6);
__ vextractf128h(Address(rsp,112),xmm7);
__ vextractf128h(Address(rsp,128),xmm8);
__ vextractf128h(Address(rsp,144),xmm9);
__ vextractf128h(Address(rsp,160),xmm10);
__ vextractf128h(Address(rsp,176),xmm11);
__ vextractf128h(Address(rsp,192),xmm12);
__ vextractf128h(Address(rsp,208),xmm13);
__ vextractf128h(Address(rsp,224),xmm14);
__ vextractf128h(Address(rsp,240),xmm15);
}
if (frame::arg_reg_save_area_bytes != 0) {
// Allocate argument register save area
__ subptr(rsp, frame::arg_reg_save_area_bytes);
@ -167,112 +200,111 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
OopMapSet *oop_maps = new OopMapSet();
OopMap* map = new OopMap(frame_size_in_slots, 0);
map->set_callee_saved(VMRegImpl::stack2reg( rax_off + additional_frame_slots), rax->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg( rcx_off + additional_frame_slots), rcx->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg( rdx_off + additional_frame_slots), rdx->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg( rbx_off + additional_frame_slots), rbx->as_VMReg());
#define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_slots)
map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
// rbp location is known implicitly by the frame sender code, needs no oopmap
// and the location where rbp was saved by is ignored
map->set_callee_saved(VMRegImpl::stack2reg( rsi_off + additional_frame_slots), rsi->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg( rdi_off + additional_frame_slots), rdi->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg( r8_off + additional_frame_slots), r8->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg( r9_off + additional_frame_slots), r9->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg( r10_off + additional_frame_slots), r10->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg( r11_off + additional_frame_slots), r11->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg( r12_off + additional_frame_slots), r12->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg( r13_off + additional_frame_slots), r13->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg( r14_off + additional_frame_slots), r14->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg( r15_off + additional_frame_slots), r15->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg(xmm0_off + additional_frame_slots), xmm0->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg(xmm1_off + additional_frame_slots), xmm1->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg(xmm2_off + additional_frame_slots), xmm2->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg(xmm3_off + additional_frame_slots), xmm3->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg(xmm4_off + additional_frame_slots), xmm4->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg(xmm5_off + additional_frame_slots), xmm5->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg(xmm6_off + additional_frame_slots), xmm6->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg(xmm7_off + additional_frame_slots), xmm7->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg(xmm8_off + additional_frame_slots), xmm8->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg(xmm9_off + additional_frame_slots), xmm9->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg(xmm10_off + additional_frame_slots), xmm10->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg(xmm11_off + additional_frame_slots), xmm11->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg(xmm12_off + additional_frame_slots), xmm12->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg(xmm13_off + additional_frame_slots), xmm13->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg(xmm14_off + additional_frame_slots), xmm14->as_VMReg());
map->set_callee_saved(VMRegImpl::stack2reg(xmm15_off + additional_frame_slots), xmm15->as_VMReg());
map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
map->set_callee_saved(STACK_OFFSET( r8_off ), r8->as_VMReg());
map->set_callee_saved(STACK_OFFSET( r9_off ), r9->as_VMReg());
map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm0_off ), xmm0->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm1_off ), xmm1->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm2_off ), xmm2->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm3_off ), xmm3->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm4_off ), xmm4->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm5_off ), xmm5->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm6_off ), xmm6->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm7_off ), xmm7->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm8_off ), xmm8->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm9_off ), xmm9->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm10_off), xmm10->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm11_off), xmm11->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm12_off), xmm12->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm13_off), xmm13->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm14_off), xmm14->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm15_off), xmm15->as_VMReg());
// %%% These should all be a waste but we'll keep things as they were for now
if (true) {
map->set_callee_saved(VMRegImpl::stack2reg( raxH_off + additional_frame_slots),
rax->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg( rcxH_off + additional_frame_slots),
rcx->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg( rdxH_off + additional_frame_slots),
rdx->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg( rbxH_off + additional_frame_slots),
rbx->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
// rbp location is known implicitly by the frame sender code, needs no oopmap
map->set_callee_saved(VMRegImpl::stack2reg( rsiH_off + additional_frame_slots),
rsi->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg( rdiH_off + additional_frame_slots),
rdi->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg( r8H_off + additional_frame_slots),
r8->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg( r9H_off + additional_frame_slots),
r9->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg( r10H_off + additional_frame_slots),
r10->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg( r11H_off + additional_frame_slots),
r11->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg( r12H_off + additional_frame_slots),
r12->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg( r13H_off + additional_frame_slots),
r13->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg( r14H_off + additional_frame_slots),
r14->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg( r15H_off + additional_frame_slots),
r15->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg(xmm0H_off + additional_frame_slots),
xmm0->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg(xmm1H_off + additional_frame_slots),
xmm1->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg(xmm2H_off + additional_frame_slots),
xmm2->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg(xmm3H_off + additional_frame_slots),
xmm3->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg(xmm4H_off + additional_frame_slots),
xmm4->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg(xmm5H_off + additional_frame_slots),
xmm5->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg(xmm6H_off + additional_frame_slots),
xmm6->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg(xmm7H_off + additional_frame_slots),
xmm7->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg(xmm8H_off + additional_frame_slots),
xmm8->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg(xmm9H_off + additional_frame_slots),
xmm9->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg(xmm10H_off + additional_frame_slots),
xmm10->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg(xmm11H_off + additional_frame_slots),
xmm11->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg(xmm12H_off + additional_frame_slots),
xmm12->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg(xmm13H_off + additional_frame_slots),
xmm13->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg(xmm14H_off + additional_frame_slots),
xmm14->as_VMReg()->next());
map->set_callee_saved(VMRegImpl::stack2reg(xmm15H_off + additional_frame_slots),
xmm15->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( r8H_off ), r8->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( r9H_off ), r9->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm0H_off ), xmm0->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm1H_off ), xmm1->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm2H_off ), xmm2->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm3H_off ), xmm3->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm4H_off ), xmm4->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm5H_off ), xmm5->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm6H_off ), xmm6->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm7H_off ), xmm7->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm8H_off ), xmm8->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm9H_off ), xmm9->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm10H_off), xmm10->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm11H_off), xmm11->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm12H_off), xmm12->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm13H_off), xmm13->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm14H_off), xmm14->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm15H_off), xmm15->as_VMReg()->next());
}
return map;
}
void RegisterSaver::restore_live_registers(MacroAssembler* masm) {
void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
if (frame::arg_reg_save_area_bytes != 0) {
// Pop arg register save area
__ addptr(rsp, frame::arg_reg_save_area_bytes);
}
#ifdef COMPILER2
if (restore_vectors) {
// Restore upper half of YMM registers.
assert(UseAVX > 0, "256bit vectors are supported only with AVX");
assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
__ vinsertf128h(xmm0, Address(rsp, 0));
__ vinsertf128h(xmm1, Address(rsp, 16));
__ vinsertf128h(xmm2, Address(rsp, 32));
__ vinsertf128h(xmm3, Address(rsp, 48));
__ vinsertf128h(xmm4, Address(rsp, 64));
__ vinsertf128h(xmm5, Address(rsp, 80));
__ vinsertf128h(xmm6, Address(rsp, 96));
__ vinsertf128h(xmm7, Address(rsp,112));
__ vinsertf128h(xmm8, Address(rsp,128));
__ vinsertf128h(xmm9, Address(rsp,144));
__ vinsertf128h(xmm10, Address(rsp,160));
__ vinsertf128h(xmm11, Address(rsp,176));
__ vinsertf128h(xmm12, Address(rsp,192));
__ vinsertf128h(xmm13, Address(rsp,208));
__ vinsertf128h(xmm14, Address(rsp,224));
__ vinsertf128h(xmm15, Address(rsp,240));
__ addptr(rsp, 256);
}
#else
assert(!restore_vectors, "vectors are generated only by C2");
#endif
// Recover CPU state
__ pop_CPU_state();
// Get the rbp described implicitly by the calling convention (no oopMap)
@ -297,6 +329,12 @@ void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
__ addptr(rsp, return_offset_in_bytes());
}
// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
bool SharedRuntime::is_wide_vector(int size) {
return size > 16;
}
// The java_calling_convention describes stack locations as ideal slots on
// a frame with no abi restrictions. Since we must observe abi restrictions
// (like the placement of the register window) the slots must be biased by
@ -1593,12 +1631,12 @@ class ComputeMoveOrder: public StackObj {
};
static void verify_oop_args(MacroAssembler* masm,
int total_args_passed,
methodHandle method,
const BasicType* sig_bt,
const VMRegPair* regs) {
Register temp_reg = rbx; // not part of any compiled calling seq
if (VerifyOops) {
for (int i = 0; i < total_args_passed; i++) {
for (int i = 0; i < method->size_of_parameters(); i++) {
if (sig_bt[i] == T_OBJECT ||
sig_bt[i] == T_ARRAY) {
VMReg r = regs[i].first();
@ -1615,35 +1653,32 @@ static void verify_oop_args(MacroAssembler* masm,
}
static void gen_special_dispatch(MacroAssembler* masm,
int total_args_passed,
int comp_args_on_stack,
vmIntrinsics::ID special_dispatch,
methodHandle method,
const BasicType* sig_bt,
const VMRegPair* regs) {
verify_oop_args(masm, total_args_passed, sig_bt, regs);
verify_oop_args(masm, method, sig_bt, regs);
vmIntrinsics::ID iid = method->intrinsic_id();
// Now write the args into the outgoing interpreter space
bool has_receiver = false;
Register receiver_reg = noreg;
int member_arg_pos = -1;
Register member_reg = noreg;
int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(special_dispatch);
int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
if (ref_kind != 0) {
member_arg_pos = total_args_passed - 1; // trailing MemberName argument
member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument
member_reg = rbx; // known to be free at this point
has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
} else if (special_dispatch == vmIntrinsics::_invokeBasic) {
} else if (iid == vmIntrinsics::_invokeBasic) {
has_receiver = true;
} else {
guarantee(false, err_msg("special_dispatch=%d", special_dispatch));
fatal(err_msg_res("unexpected intrinsic id %d", iid));
}
if (member_reg != noreg) {
// Load the member_arg into register, if necessary.
assert(member_arg_pos >= 0 && member_arg_pos < total_args_passed, "oob");
assert(sig_bt[member_arg_pos] == T_OBJECT, "dispatch argument must be an object");
SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
VMReg r = regs[member_arg_pos].first();
assert(r->is_valid(), "bad member arg");
if (r->is_stack()) {
__ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
} else {
@ -1654,7 +1689,7 @@ static void gen_special_dispatch(MacroAssembler* masm,
if (has_receiver) {
// Make sure the receiver is loaded into a register.
assert(total_args_passed > 0, "oob");
assert(method->size_of_parameters() > 0, "oob");
assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
VMReg r = regs[0].first();
assert(r->is_valid(), "bad receiver arg");
@ -1662,7 +1697,7 @@ static void gen_special_dispatch(MacroAssembler* masm,
// Porting note: This assumes that compiled calling conventions always
// pass the receiver oop in a register. If this is not true on some
// platform, pick a temp and load the receiver from stack.
assert(false, "receiver always in a register");
fatal("receiver always in a register");
receiver_reg = j_rarg0; // known to be free at this point
__ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
} else {
@ -1672,7 +1707,7 @@ static void gen_special_dispatch(MacroAssembler* masm,
}
// Figure out which address we are really jumping to:
MethodHandles::generate_method_handle_dispatch(masm, special_dispatch,
MethodHandles::generate_method_handle_dispatch(masm, iid,
receiver_reg, member_reg, /*for_compiler_entry:*/ true);
}
@ -1708,8 +1743,6 @@ static void gen_special_dispatch(MacroAssembler* masm,
nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
methodHandle method,
int compile_id,
int total_in_args,
int comp_args_on_stack,
BasicType* in_sig_bt,
VMRegPair* in_regs,
BasicType ret_type) {
@ -1718,9 +1751,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
intptr_t start = (intptr_t)__ pc();
int vep_offset = ((intptr_t)__ pc()) - start;
gen_special_dispatch(masm,
total_in_args,
comp_args_on_stack,
method->intrinsic_id(),
method,
in_sig_bt,
in_regs);
int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
@ -1754,6 +1785,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
// we convert the java signature to a C signature by inserting
// the hidden arguments as arg[0] and possibly arg[1] (static method)
const int total_in_args = method->size_of_parameters();
int total_c_args = total_in_args;
if (!is_critical_native) {
total_c_args += 1;
@ -3241,7 +3273,6 @@ uint SharedRuntime::out_preserve_stack_slots() {
return 0;
}
//------------------------------generate_deopt_blob----------------------------
void SharedRuntime::generate_deopt_blob() {
// Allocate space for the code
@ -3746,7 +3777,7 @@ void SharedRuntime::generate_uncommon_trap_blob() {
// Generate a special Compile2Runtime blob that saves all registers,
// and setup oopmap.
//
SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause_return) {
SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
assert(StubRoutines::forward_exception_entry() != NULL,
"must be generated before");
@ -3761,6 +3792,8 @@ SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause
address start = __ pc();
address call_pc = NULL;
int frame_size_in_words;
bool cause_return = (poll_type == POLL_AT_RETURN);
bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
// Make room for return address (or push it again)
if (!cause_return) {
@ -3768,7 +3801,7 @@ SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause
}
// Save registers, fpu state, and flags
map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
// The following is basically a call_VM. However, we need the precise
// address of the call in order to generate an oopmap. Hence, we do all the
@ -3805,7 +3838,7 @@ SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause
// Exception pending
RegisterSaver::restore_live_registers(masm);
RegisterSaver::restore_live_registers(masm, save_vectors);
__ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
@ -3813,7 +3846,7 @@ SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, bool cause
__ bind(noException);
// Normal exit, restore registers and exit.
RegisterSaver::restore_live_registers(masm);
RegisterSaver::restore_live_registers(masm, save_vectors);
__ ret(0);

View File

@ -363,6 +363,11 @@ void VM_Version::get_processor_features() {
}
_supports_cx8 = supports_cmpxchg8();
// xchg and xadd instructions
_supports_atomic_getset4 = true;
_supports_atomic_getadd4 = true;
LP64_ONLY(_supports_atomic_getset8 = true);
LP64_ONLY(_supports_atomic_getadd8 = true);
#ifdef _LP64
// OS should support SSE for x64 and hardware should support at least SSE2.
@ -562,10 +567,10 @@ void VM_Version::get_processor_features() {
AllocatePrefetchInstr = 3;
}
// On family 15h processors use XMM and UnalignedLoadStores for Array Copy
if( supports_sse2() && FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) {
if (supports_sse2() && FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
UseXMMForArrayCopy = true;
}
if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) {
if (supports_sse2() && FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
UseUnalignedLoadStores = true;
}
}
@ -612,16 +617,16 @@ void VM_Version::get_processor_features() {
MaxLoopPad = 11;
}
#endif // COMPILER2
if( FLAG_IS_DEFAULT(UseXMMForArrayCopy) ) {
if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus
}
if( supports_sse4_2() && supports_ht() ) { // Newest Intel cpus
if( FLAG_IS_DEFAULT(UseUnalignedLoadStores) && UseXMMForArrayCopy ) {
if (supports_sse4_2() && supports_ht()) { // Newest Intel cpus
if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
}
}
if( supports_sse4_2() && UseSSE >= 4 ) {
if( FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
if (supports_sse4_2() && UseSSE >= 4) {
if (FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
UseSSE42Intrinsics = true;
}
}
@ -638,6 +643,13 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UsePopCountInstruction, false);
}
#ifdef COMPILER2
if (FLAG_IS_DEFAULT(AlignVector)) {
// Modern processors allow misaligned memory operations for vectors.
AlignVector = !UseUnalignedLoadStores;
}
#endif // COMPILER2
assert(0 <= ReadPrefetchInstr && ReadPrefetchInstr <= 3, "invalid value");
assert(0 <= AllocatePrefetchInstr && AllocatePrefetchInstr <= 3, "invalid value");

View File

@ -498,10 +498,18 @@ const bool Matcher::match_rule_supported(int opcode) {
case Op_PopCountL:
if (!UsePopCountInstruction)
return false;
break;
case Op_MulVI:
if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
return false;
break;
case Op_CompareAndSwapL:
#ifdef _LP64
case Op_CompareAndSwapP:
#endif
if (!VM_Version::supports_cx8())
return false;
break;
}
return true; // Per default match rules are supported.

View File

@ -7762,6 +7762,7 @@ instruct storeLConditional( memory mem, eADXRegL oldval, eBCXRegL newval, eFlags
// No flag versions for CompareAndSwap{P,I,L} because matcher can't match them
instruct compareAndSwapL( rRegI res, eSIRegP mem_ptr, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr ) %{
predicate(VM_Version::supports_cx8());
match(Set res (CompareAndSwapL mem_ptr (Binary oldval newval)));
effect(KILL cr, KILL oldval);
format %{ "CMPXCHG8 [$mem_ptr],$newval\t# If EDX:EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
@ -7798,6 +7799,47 @@ instruct compareAndSwapI( rRegI res, pRegP mem_ptr, eAXRegI oldval, eCXRegI newv
ins_pipe( pipe_cmpxchg );
%}
instruct xaddI_no_res( memory mem, Universe dummy, immI add, eFlagsReg cr) %{
predicate(n->as_LoadStore()->result_not_used());
match(Set dummy (GetAndAddI mem add));
effect(KILL cr);
format %{ "ADDL [$mem],$add" %}
ins_encode %{
if (os::is_MP()) { __ lock(); }
__ addl($mem$$Address, $add$$constant);
%}
ins_pipe( pipe_cmpxchg );
%}
instruct xaddI( memory mem, rRegI newval, eFlagsReg cr) %{
match(Set newval (GetAndAddI mem newval));
effect(KILL cr);
format %{ "XADDL [$mem],$newval" %}
ins_encode %{
if (os::is_MP()) { __ lock(); }
__ xaddl($mem$$Address, $newval$$Register);
%}
ins_pipe( pipe_cmpxchg );
%}
instruct xchgI( memory mem, rRegI newval) %{
match(Set newval (GetAndSetI mem newval));
format %{ "XCHGL $newval,[$mem]" %}
ins_encode %{
__ xchgl($newval$$Register, $mem$$Address);
%}
ins_pipe( pipe_cmpxchg );
%}
instruct xchgP( memory mem, pRegP newval) %{
match(Set newval (GetAndSetP mem newval));
format %{ "XCHGL $newval,[$mem]" %}
ins_encode %{
__ xchgl($newval$$Register, $mem$$Address);
%}
ins_pipe( pipe_cmpxchg );
%}
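For reference, these matcher rules emit the same two-instruction idioms that LIR_Assembler::atomic_op() uses earlier in this change. A minimal sketch of the generated sequences (hypothetical MacroAssembler context with made-up operands):

  if (os::is_MP()) { __ lock(); }       // XADD needs a LOCK prefix for atomicity on MP
  __ xaddl(Address(rsp, 0), rax);       // fetch-and-add; the old value comes back in rax

  __ xchgl(rax, Address(rsp, 0));       // XCHG with a memory operand is implicitly locked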
//----------Subtraction Instructions-------------------------------------------
// Integer Subtraction Instructions
instruct subI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{

View File

@ -7242,6 +7242,7 @@ instruct compareAndSwapP(rRegI res,
rax_RegP oldval, rRegP newval,
rFlagsReg cr)
%{
predicate(VM_Version::supports_cx8());
match(Set res (CompareAndSwapP mem_ptr (Binary oldval newval)));
effect(KILL cr, KILL oldval);
@ -7265,6 +7266,7 @@ instruct compareAndSwapL(rRegI res,
rax_RegL oldval, rRegL newval,
rFlagsReg cr)
%{
predicate(VM_Version::supports_cx8());
match(Set res (CompareAndSwapL mem_ptr (Binary oldval newval)));
effect(KILL cr, KILL oldval);
@ -7329,6 +7331,88 @@ instruct compareAndSwapN(rRegI res,
ins_pipe( pipe_cmpxchg );
%}
instruct xaddI_no_res( memory mem, Universe dummy, immI add, rFlagsReg cr) %{
predicate(n->as_LoadStore()->result_not_used());
match(Set dummy (GetAndAddI mem add));
effect(KILL cr);
format %{ "ADDL [$mem],$add" %}
ins_encode %{
if (os::is_MP()) { __ lock(); }
__ addl($mem$$Address, $add$$constant);
%}
ins_pipe( pipe_cmpxchg );
%}
instruct xaddI( memory mem, rRegI newval, rFlagsReg cr) %{
match(Set newval (GetAndAddI mem newval));
effect(KILL cr);
format %{ "XADDL [$mem],$newval" %}
ins_encode %{
if (os::is_MP()) { __ lock(); }
__ xaddl($mem$$Address, $newval$$Register);
%}
ins_pipe( pipe_cmpxchg );
%}
instruct xaddL_no_res( memory mem, Universe dummy, immL add, rFlagsReg cr) %{
predicate(n->as_LoadStore()->result_not_used());
match(Set dummy (GetAndAddL mem add));
effect(KILL cr);
format %{ "ADDQ [$mem],$add" %}
ins_encode %{
if (os::is_MP()) { __ lock(); }
__ addq($mem$$Address, $add$$constant);
%}
ins_pipe( pipe_cmpxchg );
%}
instruct xaddL( memory mem, rRegL newval, rFlagsReg cr) %{
match(Set newval (GetAndAddL mem newval));
effect(KILL cr);
format %{ "XADDQ [$mem],$newval" %}
ins_encode %{
if (os::is_MP()) { __ lock(); }
__ xaddq($mem$$Address, $newval$$Register);
%}
ins_pipe( pipe_cmpxchg );
%}
instruct xchgI( memory mem, rRegI newval) %{
match(Set newval (GetAndSetI mem newval));
format %{ "XCHGL $newval,[$mem]" %}
ins_encode %{
__ xchgl($newval$$Register, $mem$$Address);
%}
ins_pipe( pipe_cmpxchg );
%}
instruct xchgL( memory mem, rRegL newval) %{
match(Set newval (GetAndSetL mem newval));
format %{ "XCHGL $newval,[$mem]" %}
ins_encode %{
__ xchgq($newval$$Register, $mem$$Address);
%}
ins_pipe( pipe_cmpxchg );
%}
instruct xchgP( memory mem, rRegP newval) %{
match(Set newval (GetAndSetP mem newval));
format %{ "XCHGQ $newval,[$mem]" %}
ins_encode %{
__ xchgq($newval$$Register, $mem$$Address);
%}
ins_pipe( pipe_cmpxchg );
%}
instruct xchgN( memory mem, rRegN newval) %{
match(Set newval (GetAndSetN mem newval));
format %{ "XCHGL $newval,$mem]" %}
ins_encode %{
__ xchgl($newval$$Register, $mem$$Address);
%}
ins_pipe( pipe_cmpxchg );
%}
//----------Subtraction Instructions-------------------------------------------
// Integer Subtraction Instructions

View File

@ -751,6 +751,7 @@ bool InstructForm::captures_bottom_type(FormDict &globals) const {
!strcmp(_matrule->_rChild->_opType,"DecodeN") ||
!strcmp(_matrule->_rChild->_opType,"EncodeP") ||
!strcmp(_matrule->_rChild->_opType,"LoadN") ||
!strcmp(_matrule->_rChild->_opType,"GetAndSetN") ||
!strcmp(_matrule->_rChild->_opType,"LoadNKlass") ||
!strcmp(_matrule->_rChild->_opType,"CreateEx") || // type of exception
!strcmp(_matrule->_rChild->_opType,"CheckCastPP")) ) return true;
@ -3399,7 +3400,9 @@ int MatchNode::needs_ideal_memory_edge(FormDict &globals) const {
"StorePConditional", "StoreIConditional", "StoreLConditional",
"CompareAndSwapI", "CompareAndSwapL", "CompareAndSwapP", "CompareAndSwapN",
"StoreCM",
"ClearArray"
"ClearArray",
"GetAndAddI", "GetAndSetI", "GetAndSetP",
"GetAndAddL", "GetAndSetL", "GetAndSetN",
};
int cnt = sizeof(needs_ideal_memory_list)/sizeof(char*);
if( strcmp(_opType,"PrefetchRead")==0 ||

View File

@ -1026,25 +1026,30 @@ class CodeComment: public CHeapObj<mtCode> {
}
return a;
}
// Convenience for add_comment.
CodeComment* find_last(intptr_t offset) {
CodeComment* a = find(offset);
if (a != NULL) {
while ((a->_next != NULL) && (a->_next->_offset == offset)) {
a = a->_next;
}
}
return a;
}
};
void CodeComments::add_comment(intptr_t offset, const char * comment) {
CodeComment* c = new CodeComment(offset, comment);
CodeComment* insert = NULL;
if (_comments != NULL) {
CodeComment* c = _comments->find(offset);
insert = c;
while (c && c->offset() == offset) {
insert = c;
c = c->next();
}
}
if (insert) {
// insert after comments with same offset
c->set_next(insert->next());
insert->set_next(c);
CodeComment* c = new CodeComment(offset, comment);
CodeComment* inspos = (_comments == NULL) ? NULL : _comments->find_last(offset);
if (inspos) {
// insert after already existing comments with same offset
c->set_next(inspos->next());
inspos->set_next(c);
} else {
// no comments with such offset, yet. Insert before anything else.
c->set_next(_comments);
_comments = c;
}
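With find_last() in place, comments recorded at the same code offset keep their insertion order instead of being threaded in front of earlier ones. A small usage sketch (hypothetical; assumes the usual HotSpot tty output stream):

  CodeComments comments;
  comments.add_comment(0x20, "spill for GC map");      // first comment at offset 0x20
  comments.add_comment(0x20, "restore callee-saved");  // second comment, same offset
  comments.print_block_comment(tty, 0x20);             // prints both, in insertion order
  comments.free();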
@ -1052,12 +1057,11 @@ void CodeComments::add_comment(intptr_t offset, const char * comment) {
void CodeComments::assign(CodeComments& other) {
assert(_comments == NULL, "don't overwrite old value");
_comments = other._comments;
}
void CodeComments::print_block_comment(outputStream* stream, intptr_t offset) {
void CodeComments::print_block_comment(outputStream* stream, intptr_t offset) const {
if (_comments != NULL) {
CodeComment* c = _comments->find(offset);
while (c && c->offset() == offset) {
@ -1085,6 +1089,7 @@ void CodeComments::free() {
void CodeBuffer::decode() {
ttyLocker ttyl;
Disassembler::decode(decode_begin(), insts_end());
_decode_begin = insts_end();
}
@ -1096,6 +1101,7 @@ void CodeBuffer::skip_decode() {
void CodeBuffer::decode_all() {
ttyLocker ttyl;
for (int n = 0; n < (int)SECT_LIMIT; n++) {
// dump contents of each section
CodeSection* cs = code_section(n);

View File

@ -253,7 +253,7 @@ public:
}
void add_comment(intptr_t offset, const char * comment) PRODUCT_RETURN;
void print_block_comment(outputStream* stream, intptr_t offset) PRODUCT_RETURN;
void print_block_comment(outputStream* stream, intptr_t offset) const PRODUCT_RETURN;
void assign(CodeComments& other) PRODUCT_RETURN;
void free() PRODUCT_RETURN;
};

View File

@ -103,8 +103,8 @@ inline void assert_different_registers(
) {
assert(
a != b,
err_msg("registers must be different: a=%d, b=%d",
a, b)
err_msg_res("registers must be different: a=%d, b=%d",
a, b)
);
}
@ -117,8 +117,8 @@ inline void assert_different_registers(
assert(
a != b && a != c
&& b != c,
err_msg("registers must be different: a=%d, b=%d, c=%d",
a, b, c)
err_msg_res("registers must be different: a=%d, b=%d, c=%d",
a, b, c)
);
}
@ -133,8 +133,8 @@ inline void assert_different_registers(
a != b && a != c && a != d
&& b != c && b != d
&& c != d,
err_msg("registers must be different: a=%d, b=%d, c=%d, d=%d",
a, b, c, d)
err_msg_res("registers must be different: a=%d, b=%d, c=%d, d=%d",
a, b, c, d)
);
}
@ -151,8 +151,8 @@ inline void assert_different_registers(
&& b != c && b != d && b != e
&& c != d && c != e
&& d != e,
err_msg("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d",
a, b, c, d, e)
err_msg_res("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d",
a, b, c, d, e)
);
}
@ -171,8 +171,8 @@ inline void assert_different_registers(
&& c != d && c != e && c != f
&& d != e && d != f
&& e != f,
err_msg("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d, f=%d",
a, b, c, d, e, f)
err_msg_res("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d, f=%d",
a, b, c, d, e, f)
);
}
@ -193,8 +193,8 @@ inline void assert_different_registers(
&& d != e && d != f && d != g
&& e != f && e != g
&& f != g,
err_msg("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d, f=%d, g=%d",
a, b, c, d, e, f, g)
err_msg_res("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d, f=%d, g=%d",
a, b, c, d, e, f, g)
);
}
@ -217,8 +217,8 @@ inline void assert_different_registers(
&& e != f && e != g && e != h
&& f != g && f != h
&& g != h,
err_msg("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d, f=%d, g=%d, h=%d",
a, b, c, d, e, f, g, h)
err_msg_res("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d, f=%d, g=%d, h=%d",
a, b, c, d, e, f, g, h)
);
}
@ -243,8 +243,8 @@ inline void assert_different_registers(
&& f != g && f != h && f != i
&& g != h && g != i
&& h != i,
err_msg("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d, f=%d, g=%d, h=%d, i=%d",
a, b, c, d, e, f, g, h, i)
err_msg_res("registers must be different: a=%d, b=%d, c=%d, d=%d, e=%d, f=%d, g=%d, h=%d, i=%d",
a, b, c, d, e, f, g, h, i)
);
}

View File

@ -931,6 +931,7 @@ void Canonicalizer::do_UnsafeGetRaw(UnsafeGetRaw* x) { if (OptimizeUnsafes) do_U
void Canonicalizer::do_UnsafePutRaw(UnsafePutRaw* x) { if (OptimizeUnsafes) do_UnsafeRawOp(x); }
void Canonicalizer::do_UnsafeGetObject(UnsafeGetObject* x) {}
void Canonicalizer::do_UnsafePutObject(UnsafePutObject* x) {}
void Canonicalizer::do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x) {}
void Canonicalizer::do_UnsafePrefetchRead (UnsafePrefetchRead* x) {}
void Canonicalizer::do_UnsafePrefetchWrite(UnsafePrefetchWrite* x) {}
void Canonicalizer::do_ProfileCall(ProfileCall* x) {}

View File

@ -100,6 +100,7 @@ class Canonicalizer: InstructionVisitor {
virtual void do_UnsafePutRaw (UnsafePutRaw* x);
virtual void do_UnsafeGetObject(UnsafeGetObject* x);
virtual void do_UnsafePutObject(UnsafePutObject* x);
virtual void do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x);
virtual void do_UnsafePrefetchRead (UnsafePrefetchRead* x);
virtual void do_UnsafePrefetchWrite(UnsafePrefetchWrite* x);
virtual void do_ProfileCall (ProfileCall* x);

View File

@ -346,7 +346,8 @@ void Compilation::install_code(int frame_size) {
implicit_exception_table(),
compiler(),
_env->comp_level(),
has_unsafe_access()
has_unsafe_access(),
SharedRuntime::is_wide_vector(max_vector_size())
);
}

View File

@ -127,6 +127,7 @@ class Compilation: public StackObj {
bool has_exception_handlers() const { return _has_exception_handlers; }
bool has_fpu_code() const { return _has_fpu_code; }
bool has_unsafe_access() const { return _has_unsafe_access; }
int max_vector_size() const { return 0; }
ciMethod* method() const { return _method; }
int osr_bci() const { return _osr_bci; }
bool is_osr_compile() const { return osr_bci() >= 0; }

View File

@ -3383,6 +3383,41 @@ bool GraphBuilder::try_inline_intrinsics(ciMethod* callee) {
append_unsafe_CAS(callee);
return true;
case vmIntrinsics::_getAndAddInt:
if (!VM_Version::supports_atomic_getadd4()) {
return false;
}
return append_unsafe_get_and_set_obj(callee, true);
case vmIntrinsics::_getAndAddLong:
if (!VM_Version::supports_atomic_getadd8()) {
return false;
}
return append_unsafe_get_and_set_obj(callee, true);
case vmIntrinsics::_getAndSetInt:
if (!VM_Version::supports_atomic_getset4()) {
return false;
}
return append_unsafe_get_and_set_obj(callee, false);
case vmIntrinsics::_getAndSetLong:
if (!VM_Version::supports_atomic_getset8()) {
return false;
}
return append_unsafe_get_and_set_obj(callee, false);
case vmIntrinsics::_getAndSetObject:
#ifdef _LP64
if (!UseCompressedOops && !VM_Version::supports_atomic_getset8()) {
return false;
}
if (UseCompressedOops && !VM_Version::supports_atomic_getset4()) {
return false;
}
#else
if (!VM_Version::supports_atomic_getset4()) {
return false;
}
#endif
return append_unsafe_get_and_set_obj(callee, false);
case vmIntrinsics::_Reference_get:
// Use the intrinsic version of Reference.get() so that the value in
// the referent field can be registered by the G1 pre-barrier code.
@ -4106,6 +4141,22 @@ void GraphBuilder::print_inlining(ciMethod* callee, const char* msg, bool succes
}
}
bool GraphBuilder::append_unsafe_get_and_set_obj(ciMethod* callee, bool is_add) {
if (InlineUnsafeOps) {
Values* args = state()->pop_arguments(callee->arg_size());
BasicType t = callee->return_type()->basic_type();
null_check(args->at(0));
Instruction* offset = args->at(2);
#ifndef _LP64
offset = append(new Convert(Bytecodes::_l2i, offset, as_ValueType(T_INT)));
#endif
Instruction* op = append(new UnsafeGetAndSetObject(t, args->at(1), offset, args->at(3), is_add));
compilation()->set_has_unsafe_access(true);
kill_all();
push(op->type(), op);
}
return InlineUnsafeOps;
}
#ifndef PRODUCT
void GraphBuilder::print_stats() {

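For reference, a minimal Java caller of the path intrinsified above; this is a sketch only: the Holder class, field name and reflective theUnsafe lookup are illustrative assumptions, and the getAndAddInt call assumes the JDK 8 sun.misc.Unsafe API.

import java.lang.reflect.Field;
import sun.misc.Unsafe;

class GetAndAddDemo {
    static class Holder { volatile int value; }

    static final Unsafe U;
    static final long VALUE_OFFSET;
    static {
        try {
            Field f = Unsafe.class.getDeclaredField("theUnsafe"); // illustrative access, not for production use
            f.setAccessible(true);
            U = (Unsafe) f.get(null);
            VALUE_OFFSET = U.objectFieldOffset(Holder.class.getDeclaredField("value"));
        } catch (Exception e) {
            throw new Error(e);
        }
    }

    // When InlineUnsafeOps is on and the platform reports supports_atomic_getadd4(),
    // C1 turns this call into an UnsafeGetAndSetObject instruction with is_add = true
    // instead of leaving it as a runtime call; the previous field value is returned.
    static int bump(Holder h) {
        return U.getAndAddInt(h, VALUE_OFFSET, 1);
    }
}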
View File

@ -367,6 +367,7 @@ class GraphBuilder VALUE_OBJ_CLASS_SPEC {
bool append_unsafe_put_raw(ciMethod* callee, BasicType t);
bool append_unsafe_prefetch(ciMethod* callee, bool is_store, bool is_static);
void append_unsafe_CAS(ciMethod* callee);
bool append_unsafe_get_and_set_obj(ciMethod* callee, bool is_add);
void print_inlining(ciMethod* callee, const char* msg, bool success = true);

View File

@ -102,6 +102,7 @@ class UnsafePutRaw;
class UnsafeObjectOp;
class UnsafeGetObject;
class UnsafePutObject;
class UnsafeGetAndSetObject;
class UnsafePrefetch;
class UnsafePrefetchRead;
class UnsafePrefetchWrite;
@ -202,6 +203,7 @@ class InstructionVisitor: public StackObj {
virtual void do_UnsafePutRaw (UnsafePutRaw* x) = 0;
virtual void do_UnsafeGetObject(UnsafeGetObject* x) = 0;
virtual void do_UnsafePutObject(UnsafePutObject* x) = 0;
virtual void do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x) = 0;
virtual void do_UnsafePrefetchRead (UnsafePrefetchRead* x) = 0;
virtual void do_UnsafePrefetchWrite(UnsafePrefetchWrite* x) = 0;
virtual void do_ProfileCall (ProfileCall* x) = 0;
@ -2273,6 +2275,27 @@ LEAF(UnsafePutObject, UnsafeObjectOp)
f->visit(&_value); }
};
LEAF(UnsafeGetAndSetObject, UnsafeObjectOp)
private:
Value _value; // Value to be stored
bool _is_add;
public:
UnsafeGetAndSetObject(BasicType basic_type, Value object, Value offset, Value value, bool is_add)
: UnsafeObjectOp(basic_type, object, offset, false, false)
, _value(value)
, _is_add(is_add)
{
ASSERT_VALUES
}
// accessors
bool is_add() const { return _is_add; }
Value value() { return _value; }
// generic
virtual void input_values_do(ValueVisitor* f) { UnsafeObjectOp::input_values_do(f);
f->visit(&_value); }
};
BASE(UnsafePrefetch, UnsafeObjectOp)
public:

View File

@ -831,6 +831,12 @@ void InstructionPrinter::do_UnsafePutObject(UnsafePutObject* x) {
output()->put(')');
}
void InstructionPrinter::do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x) {
print_unsafe_object_op(x, x->is_add()?"UnsafeGetAndSetObject (add)":"UnsafeGetAndSetObject");
output()->print(", value ");
print_value(x->value());
output()->put(')');
}
void InstructionPrinter::do_UnsafePrefetchRead(UnsafePrefetchRead* x) {
print_unsafe_object_op(x, "UnsafePrefetchRead");

View File

@ -128,6 +128,7 @@ class InstructionPrinter: public InstructionVisitor {
virtual void do_UnsafePutRaw (UnsafePutRaw* x);
virtual void do_UnsafeGetObject(UnsafeGetObject* x);
virtual void do_UnsafePutObject(UnsafePutObject* x);
virtual void do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x);
virtual void do_UnsafePrefetchRead (UnsafePrefetchRead* x);
virtual void do_UnsafePrefetchWrite(UnsafePrefetchWrite* x);
virtual void do_ProfileCall (ProfileCall* x);

View File

@ -264,6 +264,7 @@ void LIR_Op2::verify() const {
#ifdef ASSERT
switch (code()) {
case lir_cmove:
case lir_xchg:
break;
default:
@ -630,6 +631,8 @@ void LIR_OpVisitState::visit(LIR_Op* op) {
case lir_shl:
case lir_shr:
case lir_ushr:
case lir_xadd:
case lir_xchg:
{
assert(op->as_Op2() != NULL, "must be");
LIR_Op2* op2 = (LIR_Op2*)op;
@ -641,6 +644,13 @@ void LIR_OpVisitState::visit(LIR_Op* op) {
if (op2->_opr2->is_valid()) do_input(op2->_opr2);
if (op2->_tmp1->is_valid()) do_temp(op2->_tmp1);
if (op2->_result->is_valid()) do_output(op2->_result);
if (op->code() == lir_xchg || op->code() == lir_xadd) {
// on ARM and PPC, the return value is loaded first so it could
// destroy the inputs. On other platforms that implement these
// (x86, sparc), the extra constraints are harmless.
if (op2->_opr1->is_valid()) do_temp(op2->_opr1);
if (op2->_opr2->is_valid()) do_temp(op2->_opr2);
}
break;
}
@ -1733,6 +1743,8 @@ const char * LIR_Op::name() const {
case lir_shr: s = "shift_right"; break;
case lir_ushr: s = "ushift_right"; break;
case lir_alloc_array: s = "alloc_array"; break;
case lir_xadd: s = "xadd"; break;
case lir_xchg: s = "xchg"; break;
// LIR_Op3
case lir_idiv: s = "idiv"; break;
case lir_irem: s = "irem"; break;

View File

@ -963,6 +963,8 @@ enum LIR_Code {
, lir_alloc_array
, lir_throw
, lir_compare_to
, lir_xadd
, lir_xchg
, end_op2
, begin_op3
, lir_idiv
@ -2191,6 +2193,9 @@ class LIR_List: public CompilationResourceObj {
void profile_call(ciMethod* method, int bci, ciMethod* callee, LIR_Opr mdo, LIR_Opr recv, LIR_Opr t1, ciKlass* cha_klass) {
append(new LIR_OpProfileCall(lir_profile_call, method, bci, callee, mdo, recv, t1, cha_klass));
}
void xadd(LIR_Opr src, LIR_Opr add, LIR_Opr res, LIR_Opr tmp) { append(new LIR_Op2(lir_xadd, src, add, res, tmp)); }
void xchg(LIR_Opr src, LIR_Opr set, LIR_Opr res, LIR_Opr tmp) { append(new LIR_Op2(lir_xchg, src, set, res, tmp)); }
};
void print_LIR(BlockList* blocks);
@ -2287,16 +2292,21 @@ class LIR_OpVisitState: public StackObj {
LIR_Address* address = opr->as_address_ptr();
if (address != NULL) {
// special handling for addresses: add base and index register of the address
// both are always input operands!
// both are always input operands or temp if we want to extend
// their liveness!
if (mode == outputMode) {
mode = inputMode;
}
assert (mode == inputMode || mode == tempMode, "input or temp only for addresses");
if (address->_base->is_valid()) {
assert(address->_base->is_register(), "must be");
assert(_oprs_len[inputMode] < maxNumberOfOperands, "array overflow");
_oprs_new[inputMode][_oprs_len[inputMode]++] = &address->_base;
assert(_oprs_len[mode] < maxNumberOfOperands, "array overflow");
_oprs_new[mode][_oprs_len[mode]++] = &address->_base;
}
if (address->_index->is_valid()) {
assert(address->_index->is_register(), "must be");
assert(_oprs_len[inputMode] < maxNumberOfOperands, "array overflow");
_oprs_new[inputMode][_oprs_len[inputMode]++] = &address->_index;
assert(_oprs_len[mode] < maxNumberOfOperands, "array overflow");
_oprs_new[mode][_oprs_len[mode]++] = &address->_index;
}
} else {

View File

@ -773,6 +773,11 @@ void LIR_Assembler::emit_op2(LIR_Op2* op) {
throw_op(op->in_opr1(), op->in_opr2(), op->info());
break;
case lir_xadd:
case lir_xchg:
atomic_op(op->code(), op->in_opr1(), op->in_opr2(), op->result_opr(), op->tmp1_opr());
break;
default:
Unimplemented();
break;

View File

@ -252,6 +252,8 @@ class LIR_Assembler: public CompilationResourceObj {
void verify_oop_map(CodeEmitInfo* info);
void atomic_op(LIR_Code code, LIR_Opr src, LIR_Opr data, LIR_Opr dest, LIR_Opr tmp);
#ifdef TARGET_ARCH_x86
# include "c1_LIRAssembler_x86.hpp"
#endif

View File

@ -527,6 +527,7 @@ class LIRGenerator: public InstructionVisitor, public BlockClosure {
virtual void do_UnsafePutRaw (UnsafePutRaw* x);
virtual void do_UnsafeGetObject(UnsafeGetObject* x);
virtual void do_UnsafePutObject(UnsafePutObject* x);
virtual void do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x);
virtual void do_UnsafePrefetchRead (UnsafePrefetchRead* x);
virtual void do_UnsafePrefetchWrite(UnsafePrefetchWrite* x);
virtual void do_ProfileCall (ProfileCall* x);

View File

@ -505,6 +505,7 @@ public:
void do_UnsafePutRaw (UnsafePutRaw* x);
void do_UnsafeGetObject(UnsafeGetObject* x);
void do_UnsafePutObject(UnsafePutObject* x);
void do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x);
void do_UnsafePrefetchRead (UnsafePrefetchRead* x);
void do_UnsafePrefetchWrite(UnsafePrefetchWrite* x);
void do_ProfileCall (ProfileCall* x);
@ -676,6 +677,7 @@ void NullCheckVisitor::do_UnsafeGetRaw (UnsafeGetRaw* x) {}
void NullCheckVisitor::do_UnsafePutRaw (UnsafePutRaw* x) {}
void NullCheckVisitor::do_UnsafeGetObject(UnsafeGetObject* x) {}
void NullCheckVisitor::do_UnsafePutObject(UnsafePutObject* x) {}
void NullCheckVisitor::do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x) {}
void NullCheckVisitor::do_UnsafePrefetchRead (UnsafePrefetchRead* x) {}
void NullCheckVisitor::do_UnsafePrefetchWrite(UnsafePrefetchWrite* x) {}
void NullCheckVisitor::do_ProfileCall (ProfileCall* x) { nce()->clear_last_explicit_null_check(); }

View File

@ -157,6 +157,7 @@ class ValueNumberingVisitor: public InstructionVisitor {
void do_Invoke (Invoke* x) { kill_memory(); }
void do_UnsafePutRaw (UnsafePutRaw* x) { kill_memory(); }
void do_UnsafePutObject(UnsafePutObject* x) { kill_memory(); }
void do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x) { kill_memory(); }
void do_Intrinsic (Intrinsic* x) { if (!x->preserves_state()) kill_memory(); }
void do_Phi (Phi* x) { /* nothing to do */ }

View File

@ -921,7 +921,8 @@ void ciEnv::register_method(ciMethod* target,
ImplicitExceptionTable* inc_table,
AbstractCompiler* compiler,
int comp_level,
bool has_unsafe_access) {
bool has_unsafe_access,
bool has_wide_vectors) {
VM_ENTRY_MARK;
nmethod* nm = NULL;
{
@ -1016,6 +1017,7 @@ void ciEnv::register_method(ciMethod* target,
}
} else {
nm->set_has_unsafe_access(has_unsafe_access);
nm->set_has_wide_vectors(has_wide_vectors);
// Record successful registration.
// (Put nm into the task handle *before* publishing to the Java heap.)

View File

@ -362,7 +362,8 @@ public:
ImplicitExceptionTable* inc_table,
AbstractCompiler* compiler,
int comp_level,
bool has_unsafe_access);
bool has_unsafe_access,
bool has_wide_vectors);
// Access to certain well known ciObjects.

View File

@ -873,6 +873,20 @@
do_name( putOrderedInt_name, "putOrderedInt") \
do_alias( putOrderedInt_signature, /*(Ljava/lang/Object;JI)V*/ putInt_signature) \
\
do_intrinsic(_getAndAddInt, sun_misc_Unsafe, getAndAddInt_name, getAndAddInt_signature, F_R) \
do_name( getAndAddInt_name, "getAndAddInt") \
do_signature(getAndAddInt_signature, "(Ljava/lang/Object;JI)I" ) \
do_intrinsic(_getAndAddLong, sun_misc_Unsafe, getAndAddLong_name, getAndAddLong_signature, F_R) \
do_name( getAndAddLong_name, "getAndAddLong") \
do_signature(getAndAddLong_signature, "(Ljava/lang/Object;JJ)J" ) \
do_intrinsic(_getAndSetInt, sun_misc_Unsafe, getAndSet_name, getAndSetInt_signature, F_R) \
do_name( getAndSet_name, "getAndSet") \
do_alias( getAndSetInt_signature, /*"(Ljava/lang/Object;JI)I"*/ getAndAddInt_signature) \
do_intrinsic(_getAndSetLong, sun_misc_Unsafe, getAndSet_name, getAndSetLong_signature, F_R) \
do_alias( getAndSetLong_signature, /*"(Ljava/lang/Object;JJ)J"*/ getAndAddLong_signature) \
do_intrinsic(_getAndSetObject, sun_misc_Unsafe, getAndSet_name, getAndSetObject_signature, F_R) \
do_signature(getAndSetObject_signature, "(Ljava/lang/Object;JLjava/lang/Object;)Ljava/lang/Object;" ) \
\
/* prefetch_signature is shared by all prefetch variants */ \
do_signature( prefetch_signature, "(Ljava/lang/Object;J)V") \
\

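For context on the contracts declared above, this is roughly the pure-Java CAS loop that the _getAndAddInt intrinsic replaces; a sketch based on the JDK 8 sources and written from memory, so treat the exact shape as an assumption (getIntVolatile and compareAndSwapInt are existing Unsafe primitives).

public final int getAndAddInt(Object o, long offset, int delta) {
    int v;
    do {
        // re-read until the CAS succeeds
        v = getIntVolatile(o, offset);
    } while (!compareAndSwapInt(o, offset, v, v + delta));
    return v; // old value, matching the (Ljava/lang/Object;JI)I signature declared above
}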
View File

@ -162,8 +162,10 @@ void CodeBlob::trace_new_stub(CodeBlob* stub, const char* name1, const char* nam
assert(strlen(name1) + strlen(name2) < sizeof(stub_id), "");
jio_snprintf(stub_id, sizeof(stub_id), "%s%s", name1, name2);
if (PrintStubCode) {
ttyLocker ttyl;
tty->print_cr("Decoding %s " INTPTR_FORMAT, stub_id, (intptr_t) stub);
Disassembler::decode(stub->code_begin(), stub->code_end());
tty->cr();
}
Forte::register_stub(stub_id, stub->code_begin(), stub->code_end());
@ -548,6 +550,7 @@ void RuntimeStub::verify() {
}
void RuntimeStub::print_on(outputStream* st) const {
ttyLocker ttyl;
CodeBlob::print_on(st);
st->print("Runtime Stub (" INTPTR_FORMAT "): ", this);
st->print_cr(name());
@ -563,6 +566,7 @@ void SingletonBlob::verify() {
}
void SingletonBlob::print_on(outputStream* st) const {
ttyLocker ttyl;
CodeBlob::print_on(st);
st->print_cr(name());
Disassembler::decode((CodeBlob*)this, st);

View File

@ -184,7 +184,7 @@ class CodeBlob VALUE_OBJ_CLASS_SPEC {
static void trace_new_stub(CodeBlob* blob, const char* name1, const char* name2 = "");
// Print the comment associated with offset on stream, if there is one
virtual void print_block_comment(outputStream* stream, address block_begin) {
virtual void print_block_comment(outputStream* stream, address block_begin) const {
intptr_t offset = (intptr_t)(block_begin - code_begin());
_comments.print_block_comment(stream, offset);
}

View File

@ -25,6 +25,7 @@
#ifndef SHARE_VM_CODE_ICBUFFER_HPP
#define SHARE_VM_CODE_ICBUFFER_HPP
#include "asm/codeBuffer.hpp"
#include "code/stubs.hpp"
#include "interpreter/bytecodes.hpp"
#include "memory/allocation.hpp"
@ -48,7 +49,8 @@ class ICStub: public Stub {
protected:
friend class ICStubInterface;
// This will be called only by ICStubInterface
void initialize(int size) { _size = size; _ic_site = NULL; }
void initialize(int size,
CodeComments comments) { _size = size; _ic_site = NULL; }
void finalize(); // called when a method is removed
// General info

View File

@ -463,6 +463,7 @@ void nmethod::init_defaults() {
_has_unsafe_access = 0;
_has_method_handle_invokes = 0;
_lazy_critical_native = 0;
_has_wide_vectors = 0;
_marked_for_deoptimization = 0;
_lock_count = 0;
_stack_traversal_mark = 0;
@ -700,7 +701,9 @@ nmethod::nmethod(
// then print the requested information
if (PrintNativeNMethods) {
print_code();
oop_maps->print();
if (oop_maps != NULL) {
oop_maps->print();
}
}
if (PrintRelocations) {
print_relocations();
@ -2669,7 +2672,7 @@ ScopeDesc* nmethod::scope_desc_in(address begin, address end) {
return NULL;
}
void nmethod::print_nmethod_labels(outputStream* stream, address block_begin) {
void nmethod::print_nmethod_labels(outputStream* stream, address block_begin) const {
if (block_begin == entry_point()) stream->print_cr("[Entry Point]");
if (block_begin == verified_entry_point()) stream->print_cr("[Verified Entry Point]");
if (block_begin == exception_begin()) stream->print_cr("[Exception Handler]");

View File

@ -177,6 +177,7 @@ class nmethod : public CodeBlob {
unsigned int _has_unsafe_access:1; // May fault due to unsafe access.
unsigned int _has_method_handle_invokes:1; // Has this method MethodHandle invokes?
unsigned int _lazy_critical_native:1; // Lazy JNI critical native
unsigned int _has_wide_vectors:1; // Preserve wide vectors at safepoints
// Protected by Patching_lock
unsigned char _state; // {alive, not_entrant, zombie, unloaded}
@ -442,6 +443,9 @@ class nmethod : public CodeBlob {
bool is_lazy_critical_native() const { return _lazy_critical_native; }
void set_lazy_critical_native(bool z) { _lazy_critical_native = z; }
bool has_wide_vectors() const { return _has_wide_vectors; }
void set_has_wide_vectors(bool z) { _has_wide_vectors = z; }
int comp_level() const { return _comp_level; }
// Support for oops in scopes and relocs:
@ -649,11 +653,11 @@ public:
void log_state_change() const;
// Prints block-level comments, including nmethod specific block labels:
virtual void print_block_comment(outputStream* stream, address block_begin) {
virtual void print_block_comment(outputStream* stream, address block_begin) const {
print_nmethod_labels(stream, block_begin);
CodeBlob::print_block_comment(stream, block_begin);
}
void print_nmethod_labels(outputStream* stream, address block_begin);
void print_nmethod_labels(outputStream* stream, address block_begin) const;
// Prints a comment for one native instruction (reloc info, pc desc)
void print_code_comment_on(outputStream* st, int column, address begin, address end);

View File

@ -101,7 +101,8 @@ Stub* StubQueue::stub_containing(address pc) const {
Stub* StubQueue::request_committed(int code_size) {
Stub* s = request(code_size);
if (s != NULL) commit(code_size);
CodeComments comments;
if (s != NULL) commit(code_size, comments);
return s;
}
@ -118,7 +119,8 @@ Stub* StubQueue::request(int requested_code_size) {
assert(_buffer_limit == _buffer_size, "buffer must be fully usable");
if (_queue_end + requested_size <= _buffer_size) {
// code fits in at the end => nothing to do
stub_initialize(s, requested_size);
CodeComments comments;
stub_initialize(s, requested_size, comments);
return s;
} else {
// stub doesn't fit in at the queue end
@ -135,7 +137,8 @@ Stub* StubQueue::request(int requested_code_size) {
// Queue: |XXX|.......|XXXXXXX|.......|
// ^0 ^end ^begin ^limit ^size
s = current_stub();
stub_initialize(s, requested_size);
CodeComments comments;
stub_initialize(s, requested_size, comments);
return s;
}
// Not enough space left
@ -144,12 +147,12 @@ Stub* StubQueue::request(int requested_code_size) {
}
void StubQueue::commit(int committed_code_size) {
void StubQueue::commit(int committed_code_size, CodeComments& comments) {
assert(committed_code_size > 0, "committed_code_size must be > 0");
int committed_size = round_to(stub_code_size_to_size(committed_code_size), CodeEntryAlignment);
Stub* s = current_stub();
assert(committed_size <= stub_size(s), "committed size must not exceed requested size");
stub_initialize(s, committed_size);
stub_initialize(s, committed_size, comments);
_queue_end += committed_size;
_number_of_stubs++;
if (_mutex != NULL) _mutex->unlock();

View File

@ -25,6 +25,7 @@
#ifndef SHARE_VM_CODE_STUBS_HPP
#define SHARE_VM_CODE_STUBS_HPP
#include "asm/codeBuffer.hpp"
#include "memory/allocation.hpp"
#ifdef TARGET_OS_FAMILY_linux
# include "os_linux.inline.hpp"
@ -71,7 +72,8 @@
class Stub VALUE_OBJ_CLASS_SPEC {
public:
// Initialization/finalization
void initialize(int size) { ShouldNotCallThis(); } // called to initialize/specify the stub's size
void initialize(int size,
CodeComments& comments) { ShouldNotCallThis(); } // called to initialize/specify the stub's size
void finalize() { ShouldNotCallThis(); } // called before the stub is deallocated
// General info/converters
@ -104,7 +106,8 @@ class Stub VALUE_OBJ_CLASS_SPEC {
class StubInterface: public CHeapObj<mtCode> {
public:
// Initialization/finalization
virtual void initialize(Stub* self, int size) = 0; // called after creation (called twice if allocated via (request, commit))
virtual void initialize(Stub* self, int size,
CodeComments& comments) = 0; // called after creation (called twice if allocated via (request, commit))
virtual void finalize(Stub* self) = 0; // called before deallocation
// General info/converters
@ -132,7 +135,8 @@ class StubInterface: public CHeapObj<mtCode> {
\
public: \
/* Initialization/finalization */ \
virtual void initialize(Stub* self, int size) { cast(self)->initialize(size); } \
virtual void initialize(Stub* self, int size, \
CodeComments& comments) { cast(self)->initialize(size, comments); } \
virtual void finalize(Stub* self) { cast(self)->finalize(); } \
\
/* General info */ \
@ -171,7 +175,8 @@ class StubQueue: public CHeapObj<mtCode> {
Stub* current_stub() const { return stub_at(_queue_end); }
// Stub functionality accessed via interface
void stub_initialize(Stub* s, int size) { assert(size % CodeEntryAlignment == 0, "size not aligned"); _stub_interface->initialize(s, size); }
void stub_initialize(Stub* s, int size,
CodeComments& comments) { assert(size % CodeEntryAlignment == 0, "size not aligned"); _stub_interface->initialize(s, size, comments); }
void stub_finalize(Stub* s) { _stub_interface->finalize(s); }
int stub_size(Stub* s) const { return _stub_interface->size(s); }
bool stub_contains(Stub* s, address pc) const { return _stub_interface->code_begin(s) <= pc && pc < _stub_interface->code_end(s); }
@ -200,7 +205,8 @@ class StubQueue: public CHeapObj<mtCode> {
// Stub allocation (atomic transactions)
Stub* request_committed(int code_size); // request a stub that provides exactly code_size space for code
Stub* request(int requested_code_size); // request a stub with a (maximum) code space - locks the queue
void commit (int committed_code_size); // commit the previously requested stub - unlocks the queue
void commit (int committed_code_size,
CodeComments& comments); // commit the previously requested stub - unlocks the queue
// Stub deallocation
void remove_first(); // remove the first stub in the queue

View File

@ -148,6 +148,7 @@ class decode_env {
private:
nmethod* _nm;
CodeBlob* _code;
CodeComments _comments;
outputStream* _output;
address _start, _end;
@ -187,7 +188,7 @@ class decode_env {
void print_address(address value);
public:
decode_env(CodeBlob* code, outputStream* output);
decode_env(CodeBlob* code, outputStream* output, CodeComments c = CodeComments());
address decode_instructions(address start, address end);
@ -229,12 +230,13 @@ class decode_env {
const char* options() { return _option_buf; }
};
decode_env::decode_env(CodeBlob* code, outputStream* output) {
decode_env::decode_env(CodeBlob* code, outputStream* output, CodeComments c) {
memset(this, 0, sizeof(*this));
_output = output ? output : tty;
_code = code;
if (code != NULL && code->is_nmethod())
_nm = (nmethod*) code;
_comments.assign(c);
// by default, output pc but not bytes:
_print_pc = true;
@ -356,6 +358,7 @@ void decode_env::print_insn_labels() {
if (cb != NULL) {
cb->print_block_comment(st, p);
}
_comments.print_block_comment(st, (intptr_t)(p - _start));
if (_print_pc) {
st->print(" " PTR_FORMAT ": ", p);
}
@ -467,10 +470,9 @@ void Disassembler::decode(CodeBlob* cb, outputStream* st) {
env.decode_instructions(cb->code_begin(), cb->code_end());
}
void Disassembler::decode(address start, address end, outputStream* st) {
void Disassembler::decode(address start, address end, outputStream* st, CodeComments c) {
if (!load_library()) return;
decode_env env(CodeCache::find_blob_unsafe(start), st);
decode_env env(CodeCache::find_blob_unsafe(start), st, c);
env.decode_instructions(start, end);
}

View File

@ -25,6 +25,7 @@
#ifndef SHARE_VM_COMPILER_DISASSEMBLER_HPP
#define SHARE_VM_COMPILER_DISASSEMBLER_HPP
#include "asm/codeBuffer.hpp"
#include "runtime/globals.hpp"
#ifdef TARGET_OS_FAMILY_linux
# include "os_linux.inline.hpp"
@ -87,7 +88,7 @@ class Disassembler {
}
static void decode(CodeBlob *cb, outputStream* st = NULL);
static void decode(nmethod* nm, outputStream* st = NULL);
static void decode(address begin, address end, outputStream* st = NULL);
static void decode(address begin, address end, outputStream* st = NULL, CodeComments c = CodeComments());
};
#endif // SHARE_VM_COMPILER_DISASSEMBLER_HPP

View File

@ -60,6 +60,8 @@ void InterpreterCodelet::verify() {
void InterpreterCodelet::print_on(outputStream* st) const {
ttyLocker ttyl;
if (PrintInterpreter) {
st->cr();
st->print_cr("----------------------------------------------------------------------");
@ -72,7 +74,7 @@ void InterpreterCodelet::print_on(outputStream* st) const {
if (PrintInterpreter) {
st->cr();
Disassembler::decode(code_begin(), code_end(), st);
Disassembler::decode(code_begin(), code_end(), st, DEBUG_ONLY(_comments) NOT_DEBUG(CodeComments()));
}
}

View File

@ -48,10 +48,12 @@ class InterpreterCodelet: public Stub {
int _size; // the size in bytes
const char* _description; // a description of the codelet, for debugging & printing
Bytecodes::Code _bytecode; // associated bytecode if any
DEBUG_ONLY(CodeComments _comments;) // Comments for annotating assembler output.
public:
// Initialization/finalization
void initialize(int size) { _size = size; }
void initialize(int size,
CodeComments& comments) { _size = size; DEBUG_ONLY(_comments.assign(comments);) }
void finalize() { ShouldNotCallThis(); }
// General info/converters
@ -129,7 +131,7 @@ class CodeletMark: ResourceMark {
// commit Codelet
AbstractInterpreter::code()->commit((*_masm)->code()->pure_insts_size());
AbstractInterpreter::code()->commit((*_masm)->code()->pure_insts_size(), (*_masm)->code()->comments());
// make sure nobody can use _masm outside a CodeletMark lifespan
*_masm = NULL;
}

View File

@ -251,8 +251,12 @@ void Method::mask_for(int bci, InterpreterOopMap* mask) {
int Method::bci_from(address bcp) const {
#ifdef ASSERT
{ ResourceMark rm;
assert(is_native() && bcp == code_base() || contains(bcp) || is_error_reported(),
err_msg("bcp doesn't belong to this method: bcp: " INTPTR_FORMAT ", method: %s", bcp, name_and_sig_as_C_string()));
}
#endif
return bcp - code_base();
}

View File

@ -85,7 +85,7 @@
"Max vector size in bytes, " \
"actual size could be less depending on elements type") \
\
product(bool, AlignVector, false, \
product(bool, AlignVector, true, \
"Perform vector store/load alignment in loop") \
\
product(intx, NumberOfLoopInstrToAlign, 4, \
@ -535,7 +535,7 @@
notproduct(bool, TraceSpilling, false, \
"Trace spilling") \
\
notproduct(bool, TraceTypeProfile, false, \
diagnostic(bool, TraceTypeProfile, false, \
"Trace type profile") \
\
develop(bool, PoisonOSREntry, true, \

View File

@ -83,6 +83,12 @@ macro(CompareAndSwapI)
macro(CompareAndSwapL)
macro(CompareAndSwapP)
macro(CompareAndSwapN)
macro(GetAndAddI)
macro(GetAndAddL)
macro(GetAndSetI)
macro(GetAndSetL)
macro(GetAndSetP)
macro(GetAndSetN)
macro(Con)
macro(ConN)
macro(ConD)

View File

@ -825,7 +825,8 @@ Compile::Compile( ciEnv* ci_env, C2Compiler* compiler, ciMethod* target, int osr
&_handler_table, &_inc_table,
compiler,
env()->comp_level(),
has_unsafe_access()
has_unsafe_access(),
SharedRuntime::is_wide_vector(max_vector_size())
);
}
}
@ -963,6 +964,7 @@ void Compile::Init(int aliaslevel) {
_trap_can_recompile = false; // no traps emitted yet
_major_progress = true; // start out assuming good things will happen
set_has_unsafe_access(false);
set_max_vector_size(0);
Copy::zero_to_bytes(_trap_hist, sizeof(_trap_hist));
set_decompile_count(0);
@ -2274,6 +2276,12 @@ static void final_graph_reshaping_impl( Node *n, Final_Reshape_Counts &frc ) {
case Op_CompareAndSwapL:
case Op_CompareAndSwapP:
case Op_CompareAndSwapN:
case Op_GetAndAddI:
case Op_GetAndAddL:
case Op_GetAndSetI:
case Op_GetAndSetL:
case Op_GetAndSetP:
case Op_GetAndSetN:
case Op_StoreP:
case Op_StoreN:
case Op_LoadB:

View File

@ -279,6 +279,7 @@ class Compile : public Phase {
bool _has_split_ifs; // True if the method _may_ have some split-if
bool _has_unsafe_access; // True if the method _may_ produce faults in unsafe loads or stores.
bool _has_stringbuilder; // True StringBuffers or StringBuilders are allocated
int _max_vector_size; // Maximum size of generated vectors
uint _trap_hist[trapHistLength]; // Cumulative traps
bool _trap_can_recompile; // Have we emitted a recompiling trap?
uint _decompile_count; // Cumulative decompilation counts.
@ -443,6 +444,8 @@ class Compile : public Phase {
void set_has_unsafe_access(bool z) { _has_unsafe_access = z; }
bool has_stringbuilder() const { return _has_stringbuilder; }
void set_has_stringbuilder(bool z) { _has_stringbuilder = z; }
int max_vector_size() const { return _max_vector_size; }
void set_max_vector_size(int s) { _max_vector_size = s; }
void set_trap_count(uint r, uint c) { assert(r < trapHistLength, "oob"); _trap_hist[r] = c; }
uint trap_count(uint r) const { assert(r < trapHistLength, "oob"); return _trap_hist[r]; }
bool trap_can_recompile() const { return _trap_can_recompile; }

View File

@ -480,7 +480,9 @@ static bool can_cause_alias(Node *n, PhaseTransform *phase) {
opc == Op_CheckCastPP ||
opc == Op_StorePConditional ||
opc == Op_CompareAndSwapP ||
opc == Op_CompareAndSwapN;
opc == Op_CompareAndSwapN ||
opc == Op_GetAndSetP ||
opc == Op_GetAndSetN;
}
return possible_alias;
}

View File

@ -40,11 +40,10 @@
#include "prims/nativeLookup.hpp"
#include "runtime/sharedRuntime.hpp"
#ifndef PRODUCT
void trace_type_profile(ciMethod *method, int depth, int bci, ciMethod *prof_method, ciKlass *prof_klass, int site_count, int receiver_count) {
if (TraceTypeProfile || PrintInlining || PrintOptoInlining) {
if (TraceTypeProfile || PrintInlining NOT_PRODUCT(|| PrintOptoInlining)) {
if (!PrintInlining) {
if (!PrintOpto && !PrintCompilation) {
if (NOT_PRODUCT(!PrintOpto &&) !PrintCompilation) {
method->print_short_name();
tty->cr();
}
@ -56,7 +55,6 @@ void trace_type_profile(ciMethod *method, int depth, int bci, ciMethod *prof_met
tty->cr();
}
}
#endif
CallGenerator* Compile::call_generator(ciMethod* callee, int vtable_index, bool call_is_virtual,
JVMState* jvms, bool allow_inline,
@ -225,13 +223,13 @@ CallGenerator* Compile::call_generator(ciMethod* callee, int vtable_index, bool
}
if (miss_cg != NULL) {
if (next_hit_cg != NULL) {
NOT_PRODUCT(trace_type_profile(jvms->method(), jvms->depth() - 1, jvms->bci(), next_receiver_method, profile.receiver(1), site_count, profile.receiver_count(1)));
trace_type_profile(jvms->method(), jvms->depth() - 1, jvms->bci(), next_receiver_method, profile.receiver(1), site_count, profile.receiver_count(1));
// We don't need to record dependency on a receiver here and below.
// Whenever we inline, the dependency is added by Parse::Parse().
miss_cg = CallGenerator::for_predicted_call(profile.receiver(1), miss_cg, next_hit_cg, PROB_MAX);
}
if (miss_cg != NULL) {
NOT_PRODUCT(trace_type_profile(jvms->method(), jvms->depth() - 1, jvms->bci(), receiver_method, profile.receiver(0), site_count, receiver_count));
trace_type_profile(jvms->method(), jvms->depth() - 1, jvms->bci(), receiver_method, profile.receiver(0), site_count, receiver_count);
CallGenerator* cg = CallGenerator::for_predicted_call(profile.receiver(0), miss_cg, hit_cg, profile.receiver_prob(0));
if (cg != NULL) return cg;
}

View File

@ -282,6 +282,26 @@ bool ConnectionGraph::compute_escape() {
return has_non_escaping_obj;
}
// Utility function for nodes that load an object
void ConnectionGraph::add_objload_to_connection_graph(Node *n, Unique_Node_List *delayed_worklist) {
// Using isa_ptr() instead of isa_oopptr() for LoadP and Phi because
// ThreadLocal has RawPtr type.
const Type* t = _igvn->type(n);
if (t->make_ptr() != NULL) {
Node* adr = n->in(MemNode::Address);
#ifdef ASSERT
if (!adr->is_AddP()) {
assert(_igvn->type(adr)->isa_rawptr(), "sanity");
} else {
assert((ptnode_adr(adr->_idx) == NULL ||
ptnode_adr(adr->_idx)->as_Field()->is_oop()), "sanity");
}
#endif
add_local_var_and_edge(n, PointsToNode::NoEscape,
adr, delayed_worklist);
}
}
// Populate Connection Graph with PointsTo nodes and create simple
// connection graph edges.
void ConnectionGraph::add_node_to_connection_graph(Node *n, Unique_Node_List *delayed_worklist) {
@ -387,22 +407,7 @@ void ConnectionGraph::add_node_to_connection_graph(Node *n, Unique_Node_List *de
case Op_LoadP:
case Op_LoadN:
case Op_LoadPLocked: {
// Using isa_ptr() instead of isa_oopptr() for LoadP and Phi because
// ThreadLocal has RawPrt type.
const Type* t = igvn->type(n);
if (t->make_ptr() != NULL) {
Node* adr = n->in(MemNode::Address);
#ifdef ASSERT
if (!adr->is_AddP()) {
assert(igvn->type(adr)->isa_rawptr(), "sanity");
} else {
assert((ptnode_adr(adr->_idx) == NULL ||
ptnode_adr(adr->_idx)->as_Field()->is_oop()), "sanity");
}
#endif
add_local_var_and_edge(n, PointsToNode::NoEscape,
adr, delayed_worklist);
}
add_objload_to_connection_graph(n, delayed_worklist);
break;
}
case Op_Parm: {
@ -417,7 +422,7 @@ void ConnectionGraph::add_node_to_connection_graph(Node *n, Unique_Node_List *de
}
case Op_Phi: {
// Using isa_ptr() instead of isa_oopptr() for LoadP and Phi because
// ThreadLocal has RawPrt type.
// ThreadLocal has RawPtr type.
const Type* t = n->as_Phi()->type();
if (t->make_ptr() != NULL) {
add_local_var(n, PointsToNode::NoEscape);
@ -446,6 +451,11 @@ void ConnectionGraph::add_node_to_connection_graph(Node *n, Unique_Node_List *de
}
break;
}
case Op_GetAndSetP:
case Op_GetAndSetN: {
add_objload_to_connection_graph(n, delayed_worklist);
// fallthrough
}
case Op_StoreP:
case Op_StoreN:
case Op_StorePConditional:
@ -585,7 +595,7 @@ void ConnectionGraph::add_final_edges(Node *n) {
case Op_LoadN:
case Op_LoadPLocked: {
// Using isa_ptr() instead of isa_oopptr() for LoadP and Phi because
// ThreadLocal has RawPrt type.
// ThreadLocal has RawPtr type.
const Type* t = _igvn->type(n);
if (t->make_ptr() != NULL) {
Node* adr = n->in(MemNode::Address);
@ -596,7 +606,7 @@ void ConnectionGraph::add_final_edges(Node *n) {
}
case Op_Phi: {
// Using isa_ptr() instead of isa_oopptr() for LoadP and Phi because
// ThreadLocal has RawPrt type.
// ThreadLocal has RawPtr type.
const Type* t = n->as_Phi()->type();
if (t->make_ptr() != NULL) {
for (uint i = 1; i < n->req(); i++) {
@ -638,8 +648,16 @@ void ConnectionGraph::add_final_edges(Node *n) {
case Op_StoreN:
case Op_StorePConditional:
case Op_CompareAndSwapP:
case Op_CompareAndSwapN: {
case Op_CompareAndSwapN:
case Op_GetAndSetP:
case Op_GetAndSetN: {
Node* adr = n->in(MemNode::Address);
if (opcode == Op_GetAndSetP || opcode == Op_GetAndSetN) {
const Type* t = _igvn->type(n);
if (t->make_ptr() != NULL) {
add_local_var_and_edge(n, PointsToNode::NoEscape, adr, NULL);
}
}
const Type *adr_type = _igvn->type(adr);
adr_type = adr_type->make_ptr();
if (adr_type->isa_oopptr() ||

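A short illustration of why GetAndSetP/N shows up in the connection graph as both a store and an object load; U is assumed to be the sun.misc.Unsafe instance from the earlier sketch, and the method is purely hypothetical.

// The new value escapes by being stored into the field, and the returned
// reference is a load of whatever object was stored there before, so the
// node needs both a store edge and a load edge during escape analysis.
static Object swapValue(Object box, long offset, Object newValue) {
    return U.getAndSetObject(box, offset, newValue);
}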
View File

@ -371,6 +371,8 @@ private:
_nodes.at_put(n->_idx, ptn);
}
// Utility function for nodes that load an object
void add_objload_to_connection_graph(Node *n, Unique_Node_List *delayed_worklist);
// Create PointsToNode node and add it to Connection Graph.
void add_node_to_connection_graph(Node *n, Unique_Node_List *delayed_worklist);

View File

@ -65,6 +65,8 @@ class LibraryCallKit : public GraphKit {
private:
LibraryIntrinsic* _intrinsic; // the library intrinsic being called
const TypeOopPtr* sharpen_unsafe_type(Compile::AliasType* alias_type, const TypePtr *adr_type, bool is_native_ptr = false);
public:
LibraryCallKit(JVMState* caller, LibraryIntrinsic* intrinsic)
: GraphKit(caller),
@ -241,7 +243,8 @@ class LibraryCallKit : public GraphKit {
Node* src, Node* src_offset,
Node* dest, Node* dest_offset,
Node* copy_length, bool dest_uninitialized);
bool inline_unsafe_CAS(BasicType type);
typedef enum { LS_xadd, LS_xchg, LS_cmpxchg } LoadStoreKind;
bool inline_unsafe_load_store(BasicType type, LoadStoreKind kind);
bool inline_unsafe_ordered_store(BasicType type);
bool inline_fp_conversions(vmIntrinsics::ID id);
bool inline_numberOfLeadingZeros(vmIntrinsics::ID id);
@ -290,6 +293,11 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) {
case vmIntrinsics::_compareTo:
case vmIntrinsics::_equals:
case vmIntrinsics::_equalsC:
case vmIntrinsics::_getAndAddInt:
case vmIntrinsics::_getAndAddLong:
case vmIntrinsics::_getAndSetInt:
case vmIntrinsics::_getAndSetLong:
case vmIntrinsics::_getAndSetObject:
break; // InlineNatives does not control String.compareTo
case vmIntrinsics::_Reference_get:
break; // InlineNatives does not control Reference.get
@ -369,6 +377,42 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) {
// across safepoint since GC can change it value.
break;
case vmIntrinsics::_compareAndSwapObject:
#ifdef _LP64
if (!UseCompressedOops && !Matcher::match_rule_supported(Op_CompareAndSwapP)) return NULL;
#endif
break;
case vmIntrinsics::_compareAndSwapLong:
if (!Matcher::match_rule_supported(Op_CompareAndSwapL)) return NULL;
break;
case vmIntrinsics::_getAndAddInt:
if (!Matcher::match_rule_supported(Op_GetAndAddI)) return NULL;
break;
case vmIntrinsics::_getAndAddLong:
if (!Matcher::match_rule_supported(Op_GetAndAddL)) return NULL;
break;
case vmIntrinsics::_getAndSetInt:
if (!Matcher::match_rule_supported(Op_GetAndSetI)) return NULL;
break;
case vmIntrinsics::_getAndSetLong:
if (!Matcher::match_rule_supported(Op_GetAndSetL)) return NULL;
break;
case vmIntrinsics::_getAndSetObject:
#ifdef _LP64
if (!UseCompressedOops && !Matcher::match_rule_supported(Op_GetAndSetP)) return NULL;
if (UseCompressedOops && !Matcher::match_rule_supported(Op_GetAndSetN)) return NULL;
break;
#else
if (!Matcher::match_rule_supported(Op_GetAndSetP)) return NULL;
break;
#endif
default:
assert(id <= vmIntrinsics::LAST_COMPILER_INLINE, "caller responsibility");
assert(id != vmIntrinsics::_Object_init && id != vmIntrinsics::_invoke, "enum out of order?");
@ -620,11 +664,11 @@ bool LibraryCallKit::try_to_inline() {
return inline_unsafe_prefetch(!is_native_ptr, is_store, is_static);
case vmIntrinsics::_compareAndSwapObject:
return inline_unsafe_CAS(T_OBJECT);
return inline_unsafe_load_store(T_OBJECT, LS_cmpxchg);
case vmIntrinsics::_compareAndSwapInt:
return inline_unsafe_CAS(T_INT);
return inline_unsafe_load_store(T_INT, LS_cmpxchg);
case vmIntrinsics::_compareAndSwapLong:
return inline_unsafe_CAS(T_LONG);
return inline_unsafe_load_store(T_LONG, LS_cmpxchg);
case vmIntrinsics::_putOrderedObject:
return inline_unsafe_ordered_store(T_OBJECT);
@ -633,6 +677,17 @@ bool LibraryCallKit::try_to_inline() {
case vmIntrinsics::_putOrderedLong:
return inline_unsafe_ordered_store(T_LONG);
case vmIntrinsics::_getAndAddInt:
return inline_unsafe_load_store(T_INT, LS_xadd);
case vmIntrinsics::_getAndAddLong:
return inline_unsafe_load_store(T_LONG, LS_xadd);
case vmIntrinsics::_getAndSetInt:
return inline_unsafe_load_store(T_INT, LS_xchg);
case vmIntrinsics::_getAndSetLong:
return inline_unsafe_load_store(T_LONG, LS_xchg);
case vmIntrinsics::_getAndSetObject:
return inline_unsafe_load_store(T_OBJECT, LS_xchg);
case vmIntrinsics::_currentThread:
return inline_native_currentThread();
case vmIntrinsics::_isInterrupted:
@ -2301,6 +2356,43 @@ void LibraryCallKit::insert_pre_barrier(Node* base_oop, Node* offset,
// Interpret Unsafe.fieldOffset cookies correctly:
extern jlong Unsafe_field_offset_to_byte_offset(jlong field_offset);
const TypeOopPtr* LibraryCallKit::sharpen_unsafe_type(Compile::AliasType* alias_type, const TypePtr *adr_type, bool is_native_ptr) {
// Attempt to infer a sharper value type from the offset and base type.
ciKlass* sharpened_klass = NULL;
// See if it is an instance field, with an object type.
if (alias_type->field() != NULL) {
assert(!is_native_ptr, "native pointer op cannot use a java address");
if (alias_type->field()->type()->is_klass()) {
sharpened_klass = alias_type->field()->type()->as_klass();
}
}
// See if it is a narrow oop array.
if (adr_type->isa_aryptr()) {
if (adr_type->offset() >= objArrayOopDesc::base_offset_in_bytes()) {
const TypeOopPtr *elem_type = adr_type->is_aryptr()->elem()->isa_oopptr();
if (elem_type != NULL) {
sharpened_klass = elem_type->klass();
}
}
}
if (sharpened_klass != NULL) {
const TypeOopPtr* tjp = TypeOopPtr::make_from_klass(sharpened_klass);
#ifndef PRODUCT
if (PrintIntrinsics || PrintInlining || PrintOptoInlining) {
tty->print(" from base type: "); adr_type->dump();
tty->print(" sharpened value: "); tjp->dump();
}
#endif
// Sharpen the value type.
return tjp;
}
return NULL;
}
bool LibraryCallKit::inline_unsafe_access(bool is_native_ptr, bool is_store, BasicType type, bool is_volatile) {
if (callee()->is_static()) return false; // caller must have the capability!
@ -2430,39 +2522,9 @@ bool LibraryCallKit::inline_unsafe_access(bool is_native_ptr, bool is_store, Bas
offset != top() && heap_base_oop != top();
if (!is_store && type == T_OBJECT) {
// Attempt to infer a sharper value type from the offset and base type.
ciKlass* sharpened_klass = NULL;
// See if it is an instance field, with an object type.
if (alias_type->field() != NULL) {
assert(!is_native_ptr, "native pointer op cannot use a java address");
if (alias_type->field()->type()->is_klass()) {
sharpened_klass = alias_type->field()->type()->as_klass();
}
}
// See if it is a narrow oop array.
if (adr_type->isa_aryptr()) {
if (adr_type->offset() >= objArrayOopDesc::base_offset_in_bytes()) {
const TypeOopPtr *elem_type = adr_type->is_aryptr()->elem()->isa_oopptr();
if (elem_type != NULL) {
sharpened_klass = elem_type->klass();
}
}
}
if (sharpened_klass != NULL) {
const TypeOopPtr* tjp = TypeOopPtr::make_from_klass(sharpened_klass);
// Sharpen the value type.
const TypeOopPtr* tjp = sharpen_unsafe_type(alias_type, adr_type, is_native_ptr);
if (tjp != NULL) {
value_type = tjp;
#ifndef PRODUCT
if (PrintIntrinsics || PrintInlining || PrintOptoInlining) {
tty->print(" from base type: "); adr_type->dump();
tty->print(" sharpened value: "); value_type->dump();
}
#endif
}
}
@ -2673,9 +2735,9 @@ bool LibraryCallKit::inline_unsafe_prefetch(bool is_native_ptr, bool is_store, b
return true;
}
//----------------------------inline_unsafe_CAS----------------------------
//----------------------------inline_unsafe_load_store----------------------------
bool LibraryCallKit::inline_unsafe_CAS(BasicType type) {
bool LibraryCallKit::inline_unsafe_load_store(BasicType type, LoadStoreKind kind) {
// This basic scheme here is the same as inline_unsafe_access, but
// differs in enough details that combining them would make the code
// overly confusing. (This is a true fact! I originally combined
@ -2686,37 +2748,47 @@ bool LibraryCallKit::inline_unsafe_CAS(BasicType type) {
if (callee()->is_static()) return false; // caller must have the capability!
#ifndef PRODUCT
BasicType rtype;
{
ResourceMark rm;
// Check the signatures.
ciSignature* sig = signature();
rtype = sig->return_type()->basic_type();
if (kind == LS_xadd || kind == LS_xchg) {
// Check the signatures.
#ifdef ASSERT
BasicType rtype = sig->return_type()->basic_type();
assert(rtype == T_BOOLEAN, "CAS must return boolean");
assert(sig->count() == 4, "CAS has 4 arguments");
assert(sig->type_at(0)->basic_type() == T_OBJECT, "CAS base is object");
assert(sig->type_at(1)->basic_type() == T_LONG, "CAS offset is long");
assert(rtype == type, "get and set must return the expected type");
assert(sig->count() == 3, "get and set has 3 arguments");
assert(sig->type_at(0)->basic_type() == T_OBJECT, "get and set base is object");
assert(sig->type_at(1)->basic_type() == T_LONG, "get and set offset is long");
assert(sig->type_at(2)->basic_type() == type, "get and set must take expected type as new value/delta");
#endif // ASSERT
} else if (kind == LS_cmpxchg) {
// Check the signatures.
#ifdef ASSERT
assert(rtype == T_BOOLEAN, "CAS must return boolean");
assert(sig->count() == 4, "CAS has 4 arguments");
assert(sig->type_at(0)->basic_type() == T_OBJECT, "CAS base is object");
assert(sig->type_at(1)->basic_type() == T_LONG, "CAS offset is long");
#endif // ASSERT
} else {
ShouldNotReachHere();
}
}
#endif //PRODUCT
// number of stack slots per value argument (1 or 2)
int type_words = type2size[type];
// Cannot inline wide CAS on machines that don't support it natively
if (type2aelembytes(type) > BytesPerInt && !VM_Version::supports_cx8())
return false;
C->set_has_unsafe_access(true); // Mark eventual nmethod as "unsafe".
// Argument words: "this" plus oop plus offset plus oldvalue plus newvalue;
int nargs = 1 + 1 + 2 + type_words + type_words;
// Argument words: "this" plus oop plus offset (plus oldvalue) plus newvalue/delta;
int nargs = 1 + 1 + 2 + ((kind == LS_cmpxchg) ? type_words : 0) + type_words;
// pop arguments: newval, oldval, offset, base, and receiver
// pop arguments: newval, offset, base, and receiver
debug_only(int saved_sp = _sp);
_sp += nargs;
Node* newval = (type_words == 1) ? pop() : pop_pair();
Node* oldval = (type_words == 1) ? pop() : pop_pair();
Node* oldval = (kind == LS_cmpxchg) ? ((type_words == 1) ? pop() : pop_pair()) : NULL;
Node *offset = pop_pair();
Node *base = pop();
Node *receiver = pop();
@ -2740,16 +2812,24 @@ bool LibraryCallKit::inline_unsafe_CAS(BasicType type) {
Node* adr = make_unsafe_address(base, offset);
const TypePtr *adr_type = _gvn.type(adr)->isa_ptr();
// (Unlike inline_unsafe_access, there seems no point in trying
// to refine types. Just use the coarse types here.
// For CAS, unlike inline_unsafe_access, there seems no point in
// trying to refine types. Just use the coarse types here.
const Type *value_type = Type::get_const_basic_type(type);
Compile::AliasType* alias_type = C->alias_type(adr_type);
assert(alias_type->index() != Compile::AliasIdxBot, "no bare pointers here");
if (kind == LS_xchg && type == T_OBJECT) {
const TypeOopPtr* tjp = sharpen_unsafe_type(alias_type, adr_type);
if (tjp != NULL) {
value_type = tjp;
}
}
int alias_idx = C->get_alias_index(adr_type);
// Memory-model-wise, a CAS acts like a little synchronized block,
// so needs barriers on each side. These don't translate into
// actual barriers on most machines, but we still need rest of
// Memory-model-wise, a LoadStore acts like a little synchronized
// block, so needs barriers on each side. These don't translate
// into actual barriers on most machines, but we still need rest of
// compiler to respect ordering.
insert_mem_bar(Op_MemBarRelease);
@ -2762,13 +2842,29 @@ bool LibraryCallKit::inline_unsafe_CAS(BasicType type) {
// For now, we handle only those cases that actually exist: ints,
// longs, and Object. Adding others should be straightforward.
Node* cas;
Node* load_store;
switch(type) {
case T_INT:
cas = _gvn.transform(new (C, 5) CompareAndSwapINode(control(), mem, adr, newval, oldval));
if (kind == LS_xadd) {
load_store = _gvn.transform(new (C, 4) GetAndAddINode(control(), mem, adr, newval, adr_type));
} else if (kind == LS_xchg) {
load_store = _gvn.transform(new (C, 4) GetAndSetINode(control(), mem, adr, newval, adr_type));
} else if (kind == LS_cmpxchg) {
load_store = _gvn.transform(new (C, 5) CompareAndSwapINode(control(), mem, adr, newval, oldval));
} else {
ShouldNotReachHere();
}
break;
case T_LONG:
cas = _gvn.transform(new (C, 5) CompareAndSwapLNode(control(), mem, adr, newval, oldval));
if (kind == LS_xadd) {
load_store = _gvn.transform(new (C, 4) GetAndAddLNode(control(), mem, adr, newval, adr_type));
} else if (kind == LS_xchg) {
load_store = _gvn.transform(new (C, 4) GetAndSetLNode(control(), mem, adr, newval, adr_type));
} else if (kind == LS_cmpxchg) {
load_store = _gvn.transform(new (C, 5) CompareAndSwapLNode(control(), mem, adr, newval, oldval));
} else {
ShouldNotReachHere();
}
break;
case T_OBJECT:
// Transformation of a value which could be NULL pointer (CastPP #NULL)
@ -2778,7 +2874,6 @@ bool LibraryCallKit::inline_unsafe_CAS(BasicType type) {
newval = _gvn.makecon(TypePtr::NULL_PTR);
// Reference stores need a store barrier.
// (They don't if CAS fails, but it isn't worth checking.)
pre_barrier(true /* do_load*/,
control(), base, adr, alias_idx, newval, value_type->make_oopptr(),
NULL /* pre_val*/,
@ -2786,32 +2881,50 @@ bool LibraryCallKit::inline_unsafe_CAS(BasicType type) {
#ifdef _LP64
if (adr->bottom_type()->is_ptr_to_narrowoop()) {
Node *newval_enc = _gvn.transform(new (C, 2) EncodePNode(newval, newval->bottom_type()->make_narrowoop()));
Node *oldval_enc = _gvn.transform(new (C, 2) EncodePNode(oldval, oldval->bottom_type()->make_narrowoop()));
cas = _gvn.transform(new (C, 5) CompareAndSwapNNode(control(), mem, adr,
newval_enc, oldval_enc));
if (kind == LS_xchg) {
load_store = _gvn.transform(new (C, 4) GetAndSetNNode(control(), mem, adr,
newval_enc, adr_type, value_type->make_narrowoop()));
} else {
assert(kind == LS_cmpxchg, "wrong LoadStore operation");
Node *oldval_enc = _gvn.transform(new (C, 2) EncodePNode(oldval, oldval->bottom_type()->make_narrowoop()));
load_store = _gvn.transform(new (C, 5) CompareAndSwapNNode(control(), mem, adr,
newval_enc, oldval_enc));
}
} else
#endif
{
cas = _gvn.transform(new (C, 5) CompareAndSwapPNode(control(), mem, adr, newval, oldval));
if (kind == LS_xchg) {
load_store = _gvn.transform(new (C, 4) GetAndSetPNode(control(), mem, adr, newval, adr_type, value_type->is_oopptr()));
} else {
assert(kind == LS_cmpxchg, "wrong LoadStore operation");
load_store = _gvn.transform(new (C, 5) CompareAndSwapPNode(control(), mem, adr, newval, oldval));
}
}
post_barrier(control(), cas, base, adr, alias_idx, newval, T_OBJECT, true);
post_barrier(control(), load_store, base, adr, alias_idx, newval, T_OBJECT, true);
break;
default:
ShouldNotReachHere();
break;
}
// SCMemProjNodes represent the memory state of CAS. Their main
// role is to prevent CAS nodes from being optimized away when their
// results aren't used.
Node* proj = _gvn.transform( new (C, 1) SCMemProjNode(cas));
// SCMemProjNodes represent the memory state of a LoadStore. Their
// main role is to prevent LoadStore nodes from being optimized away
// when their results aren't used.
Node* proj = _gvn.transform( new (C, 1) SCMemProjNode(load_store));
set_memory(proj, alias_idx);
// Add the trailing membar surrounding the access
insert_mem_bar(Op_MemBarCPUOrder);
insert_mem_bar(Op_MemBarAcquire);
push(cas);
#ifdef _LP64
if (type == T_OBJECT && adr->bottom_type()->is_ptr_to_narrowoop() && kind == LS_xchg) {
load_store = _gvn.transform(new (C, 2) DecodeNNode(load_store, load_store->bottom_type()->make_ptr()));
}
#endif
assert(type2size[load_store->bottom_type()->basic_type()] == type2size[rtype], "result type should match");
push_node(load_store->bottom_type()->basic_type(), load_store);
return true;
}

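As a usage-level check of the semantics inline_unsafe_load_store must preserve: the getAndSet family returns the previous value with full ordering on both sides. In JDK 8 the java.util.concurrent.atomic classes are expected to funnel into these same Unsafe calls, though that mapping is an assumption here and not shown by this diff.

import java.util.concurrent.atomic.AtomicReference;

class GetAndSetSemanticsDemo {
    public static void main(String[] args) {
        AtomicReference<String> ref = new AtomicReference<>("old");
        // Returns the previous value; ordering corresponds to the
        // MemBarRelease / MemBarAcquire pair emitted around the LoadStore node.
        String prev = ref.getAndSet("new");
        System.out.println(prev + " -> " + ref.get()); // prints "old -> new"
    }
}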
View File

@ -2134,10 +2134,10 @@ void Matcher::find_shared( Node *n ) {
case Op_CompareAndSwapP:
case Op_CompareAndSwapN: { // Convert trinary to binary-tree
Node *newval = n->in(MemNode::ValueIn );
Node *oldval = n->in(LoadStoreNode::ExpectedIn);
Node *oldval = n->in(LoadStoreConditionalNode::ExpectedIn);
Node *pair = new (C, 3) BinaryNode( oldval, newval );
n->set_req(MemNode::ValueIn,pair);
n->del_req(LoadStoreNode::ExpectedIn);
n->del_req(LoadStoreConditionalNode::ExpectedIn);
break;
}
case Op_CMoveD: // Convert trinary to binary-tree

View File

@ -2552,14 +2552,38 @@ const Type * SCMemProjNode::Value( PhaseTransform *phase ) const
}
//=============================================================================
LoadStoreNode::LoadStoreNode( Node *c, Node *mem, Node *adr, Node *val, Node *ex ) : Node(5) {
//----------------------------------LoadStoreNode------------------------------
LoadStoreNode::LoadStoreNode( Node *c, Node *mem, Node *adr, Node *val, const TypePtr* at, const Type* rt, uint required )
: Node(required),
_type(rt),
_adr_type(at)
{
init_req(MemNode::Control, c );
init_req(MemNode::Memory , mem);
init_req(MemNode::Address, adr);
init_req(MemNode::ValueIn, val);
init_req( ExpectedIn, ex );
init_class_id(Class_LoadStore);
}
uint LoadStoreNode::ideal_reg() const {
return _type->ideal_reg();
}
bool LoadStoreNode::result_not_used() const {
for( DUIterator_Fast imax, i = fast_outs(imax); i < imax; i++ ) {
Node *x = fast_out(i);
if (x->Opcode() == Op_SCMemProj) continue;
return false;
}
return true;
}
uint LoadStoreNode::size_of() const { return sizeof(*this); }
//=============================================================================
//----------------------------------LoadStoreConditionalNode--------------------
LoadStoreConditionalNode::LoadStoreConditionalNode( Node *c, Node *mem, Node *adr, Node *val, Node *ex ) : LoadStoreNode(c, mem, adr, val, NULL, TypeInt::BOOL, 5) {
init_req(ExpectedIn, ex );
}
//=============================================================================

View File

@ -657,23 +657,36 @@ public:
//------------------------------LoadStoreNode---------------------------
// Note: is_Mem() method returns 'true' for this class.
class LoadStoreNode : public Node {
private:
const Type* const _type; // What kind of value is loaded?
const TypePtr* _adr_type; // What kind of memory is being addressed?
virtual uint size_of() const; // Size is bigger
public:
LoadStoreNode( Node *c, Node *mem, Node *adr, Node *val, const TypePtr* at, const Type* rt, uint required );
virtual bool depends_only_on_test() const { return false; }
virtual uint match_edge(uint idx) const { return idx == MemNode::Address || idx == MemNode::ValueIn; }
virtual const Type *bottom_type() const { return _type; }
virtual uint ideal_reg() const;
virtual const class TypePtr *adr_type() const { return _adr_type; } // returns bottom_type of address
bool result_not_used() const;
};
class LoadStoreConditionalNode : public LoadStoreNode {
public:
enum {
ExpectedIn = MemNode::ValueIn+1 // One more input than MemNode
};
LoadStoreNode( Node *c, Node *mem, Node *adr, Node *val, Node *ex);
virtual bool depends_only_on_test() const { return false; }
virtual const Type *bottom_type() const { return TypeInt::BOOL; }
virtual uint ideal_reg() const { return Op_RegI; }
virtual uint match_edge(uint idx) const { return idx == MemNode::Address || idx == MemNode::ValueIn; }
LoadStoreConditionalNode(Node *c, Node *mem, Node *adr, Node *val, Node *ex);
};
//------------------------------StorePConditionalNode---------------------------
// Conditionally store pointer to memory, if no change since prior
// load-locked. Sets flags for success or failure of the store.
class StorePConditionalNode : public LoadStoreNode {
class StorePConditionalNode : public LoadStoreConditionalNode {
public:
StorePConditionalNode( Node *c, Node *mem, Node *adr, Node *val, Node *ll ) : LoadStoreNode(c, mem, adr, val, ll) { }
StorePConditionalNode( Node *c, Node *mem, Node *adr, Node *val, Node *ll ) : LoadStoreConditionalNode(c, mem, adr, val, ll) { }
virtual int Opcode() const;
// Produces flags
virtual uint ideal_reg() const { return Op_RegFlags; }
@ -682,9 +695,9 @@ public:
//------------------------------StoreIConditionalNode---------------------------
// Conditionally store int to memory, if no change since prior
// load-locked. Sets flags for success or failure of the store.
class StoreIConditionalNode : public LoadStoreNode {
class StoreIConditionalNode : public LoadStoreConditionalNode {
public:
StoreIConditionalNode( Node *c, Node *mem, Node *adr, Node *val, Node *ii ) : LoadStoreNode(c, mem, adr, val, ii) { }
StoreIConditionalNode( Node *c, Node *mem, Node *adr, Node *val, Node *ii ) : LoadStoreConditionalNode(c, mem, adr, val, ii) { }
virtual int Opcode() const;
// Produces flags
virtual uint ideal_reg() const { return Op_RegFlags; }
@ -693,9 +706,9 @@ public:
//------------------------------StoreLConditionalNode---------------------------
// Conditionally store long to memory, if no change since prior
// load-locked. Sets flags for success or failure of the store.
class StoreLConditionalNode : public LoadStoreNode {
class StoreLConditionalNode : public LoadStoreConditionalNode {
public:
StoreLConditionalNode( Node *c, Node *mem, Node *adr, Node *val, Node *ll ) : LoadStoreNode(c, mem, adr, val, ll) { }
StoreLConditionalNode( Node *c, Node *mem, Node *adr, Node *val, Node *ll ) : LoadStoreConditionalNode(c, mem, adr, val, ll) { }
virtual int Opcode() const;
// Produces flags
virtual uint ideal_reg() const { return Op_RegFlags; }
@ -703,32 +716,75 @@ public:
//------------------------------CompareAndSwapLNode---------------------------
class CompareAndSwapLNode : public LoadStoreNode {
class CompareAndSwapLNode : public LoadStoreConditionalNode {
public:
CompareAndSwapLNode( Node *c, Node *mem, Node *adr, Node *val, Node *ex) : LoadStoreNode(c, mem, adr, val, ex) { }
CompareAndSwapLNode( Node *c, Node *mem, Node *adr, Node *val, Node *ex) : LoadStoreConditionalNode(c, mem, adr, val, ex) { }
virtual int Opcode() const;
};
//------------------------------CompareAndSwapINode---------------------------
class CompareAndSwapINode : public LoadStoreNode {
class CompareAndSwapINode : public LoadStoreConditionalNode {
public:
CompareAndSwapINode( Node *c, Node *mem, Node *adr, Node *val, Node *ex) : LoadStoreNode(c, mem, adr, val, ex) { }
CompareAndSwapINode( Node *c, Node *mem, Node *adr, Node *val, Node *ex) : LoadStoreConditionalNode(c, mem, adr, val, ex) { }
virtual int Opcode() const;
};
//------------------------------CompareAndSwapPNode---------------------------
class CompareAndSwapPNode : public LoadStoreNode {
class CompareAndSwapPNode : public LoadStoreConditionalNode {
public:
CompareAndSwapPNode( Node *c, Node *mem, Node *adr, Node *val, Node *ex) : LoadStoreNode(c, mem, adr, val, ex) { }
CompareAndSwapPNode( Node *c, Node *mem, Node *adr, Node *val, Node *ex) : LoadStoreConditionalNode(c, mem, adr, val, ex) { }
virtual int Opcode() const;
};
//------------------------------CompareAndSwapNNode---------------------------
class CompareAndSwapNNode : public LoadStoreNode {
class CompareAndSwapNNode : public LoadStoreConditionalNode {
public:
CompareAndSwapNNode( Node *c, Node *mem, Node *adr, Node *val, Node *ex) : LoadStoreNode(c, mem, adr, val, ex) { }
CompareAndSwapNNode( Node *c, Node *mem, Node *adr, Node *val, Node *ex) : LoadStoreConditionalNode(c, mem, adr, val, ex) { }
virtual int Opcode() const;
};
//------------------------------GetAndAddINode---------------------------
class GetAndAddINode : public LoadStoreNode {
public:
GetAndAddINode( Node *c, Node *mem, Node *adr, Node *val, const TypePtr* at ) : LoadStoreNode(c, mem, adr, val, at, TypeInt::INT, 4) { }
virtual int Opcode() const;
};
//------------------------------GetAndAddLNode---------------------------
class GetAndAddLNode : public LoadStoreNode {
public:
GetAndAddLNode( Node *c, Node *mem, Node *adr, Node *val, const TypePtr* at ) : LoadStoreNode(c, mem, adr, val, at, TypeLong::LONG, 4) { }
virtual int Opcode() const;
};
//------------------------------GetAndSetINode---------------------------
class GetAndSetINode : public LoadStoreNode {
public:
GetAndSetINode( Node *c, Node *mem, Node *adr, Node *val, const TypePtr* at ) : LoadStoreNode(c, mem, adr, val, at, TypeInt::INT, 4) { }
virtual int Opcode() const;
};
//------------------------------GetAndSetLNode---------------------------
class GetAndSetLNode : public LoadStoreNode {
public:
GetAndSetLNode( Node *c, Node *mem, Node *adr, Node *val, const TypePtr* at ) : LoadStoreNode(c, mem, adr, val, at, TypeLong::LONG, 4) { }
virtual int Opcode() const;
};
//------------------------------GetAndSetPNode---------------------------
class GetAndSetPNode : public LoadStoreNode {
public:
GetAndSetPNode( Node *c, Node *mem, Node *adr, Node *val, const TypePtr* at, const Type* t ) : LoadStoreNode(c, mem, adr, val, at, t, 4) { }
virtual int Opcode() const;
};
//------------------------------GetAndSetNNode---------------------------
class GetAndSetNNode : public LoadStoreNode {
public:
GetAndSetNNode( Node *c, Node *mem, Node *adr, Node *val, const TypePtr* at, const Type* t ) : LoadStoreNode(c, mem, adr, val, at, t, 4) { }
virtual int Opcode() const;
};
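
The split above separates the conditional load/stores (StoreXConditional, CompareAndSwapX), which can fail and report success via flags or a boolean, from the new GetAndAdd/GetAndSet nodes, which always succeed and produce the previous value. A minimal Java sketch of the user-level operations these nodes are meant to back (class and mapping below are illustrative, inferred from the node names rather than spelled out in this hunk):

    import java.util.concurrent.atomic.AtomicInteger;
    import java.util.concurrent.atomic.AtomicLong;
    import java.util.concurrent.atomic.AtomicReference;

    class AtomicShapes {
        static final AtomicInteger counter = new AtomicInteger();
        static final AtomicLong    stamp   = new AtomicLong();
        static final AtomicReference<String> tag = new AtomicReference<>("init");

        static void demo() {
            // Conditional: may fail, result is a boolean -> CompareAndSwapINode.
            boolean swapped = counter.compareAndSet(0, 1);
            // Unconditional: always succeeds, returns the old value.
            int  oldCount = counter.getAndAdd(1);   // -> GetAndAddINode
            long oldStamp = stamp.getAndSet(42L);   // -> GetAndSetLNode
            String oldTag = tag.getAndSet("done");  // -> GetAndSetPNode (GetAndSetNNode with compressed oops)
        }
    }
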

View File

@ -1869,7 +1869,9 @@ void Compile::ScheduleAndBundle() {
if (!do_scheduling())
return;
assert(MaxVectorSize <= 8, "scheduling code works only with pairs");
// Scheduling code works only with pairs (8 bytes) maximum.
if (max_vector_size() > 8)
return;
NOT_PRODUCT( TracePhase t2("isched", &_t_instrSched, TimeCompiler); )

View File

@ -179,6 +179,7 @@ void SuperWord::find_adjacent_refs() {
for (int i = 0; i < _block.length(); i++) {
Node* n = _block.at(i);
if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
n->Opcode() != Op_LoadUI2L &&
is_java_primitive(n->as_Mem()->memory_type())) {
int align = memory_alignment(n->as_Mem(), 0);
if (align != bottom_align) {
@ -481,12 +482,19 @@ int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
int vw = vector_width_in_bytes(mem_ref);
assert(vw > 1, "sanity");
int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
int iv_adjustment = (stride_sign * vw - (offset % vw)) % vw;
// At least one iteration is executed in the pre-loop by default. As a result,
// several iterations are needed to align the memory operations in the main loop
// even if the offset is 0.
int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
int elt_size = align_to_ref_p.memory_size();
assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
int iv_adjustment = iv_adjustment_in_bytes/elt_size;
#ifndef PRODUCT
if (TraceSuperWord)
tty->print_cr("\noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d",
offset, iv_adjustment, align_to_ref_p.memory_size(), scale, iv_stride(), vw);
offset, iv_adjustment, elt_size, scale, iv_stride(), vw);
#endif
return iv_adjustment;
}
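
A small worked example (all numbers are illustrative assumptions, not taken from the hunk): with a 16-byte vector width over a float array (elt_size = 4), offset = 4 and a positive stride,

    iv_adjustment_in_bytes = 1 * 16 - (4 % 16) = 12
    iv_adjustment          = 12 / 4            = 3

so the pre-loop has to peel 3 scalar iterations (elements), not 12, before the main loop's memory accesses are aligned; memory_alignment() below compensates by multiplying the adjustment back by p.memory_size().
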
@ -1350,11 +1358,14 @@ void SuperWord::output() {
insert_extracts(_packset.at(i));
}
Compile* C = _phase->C;
uint max_vlen_in_bytes = 0;
for (int i = 0; i < _block.length(); i++) {
Node* n = _block.at(i);
Node_List* p = my_pack(n);
if (p && n == executed_last(p)) {
uint vlen = p->size();
uint vlen_in_bytes = 0;
Node* vn = NULL;
Node* low_adr = p->at(0);
Node* first = executed_first(p);
@ -1364,7 +1375,8 @@ void SuperWord::output() {
Node* mem = first->in(MemNode::Memory);
Node* adr = low_adr->in(MemNode::Address);
const TypePtr* atyp = n->adr_type();
vn = LoadVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
vn = LoadVectorNode::make(C, opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_LoadVector()->memory_size();
} else if (n->is_Store()) {
// Promote value to be stored to vector
Node* val = vector_opd(p, MemNode::ValueIn);
@ -1372,7 +1384,8 @@ void SuperWord::output() {
Node* mem = first->in(MemNode::Memory);
Node* adr = low_adr->in(MemNode::Address);
const TypePtr* atyp = n->adr_type();
vn = StoreVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen);
vn = StoreVectorNode::make(C, opc, ctl, mem, adr, atyp, val, vlen);
vlen_in_bytes = vn->as_StoreVector()->memory_size();
} else if (n->req() == 3) {
// Promote operands to vector
Node* in1 = vector_opd(p, 1);
@ -1383,7 +1396,8 @@ void SuperWord::output() {
in1 = in2;
in2 = tmp;
}
vn = VectorNode::make(_phase->C, opc, in1, in2, vlen, velt_basic_type(n));
vn = VectorNode::make(C, opc, in1, in2, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else {
ShouldNotReachHere();
}
@ -1395,6 +1409,10 @@ void SuperWord::output() {
_igvn.replace_node(pm, vn);
}
_igvn._worklist.push(vn);
if (vlen_in_bytes > max_vlen_in_bytes) {
max_vlen_in_bytes = vlen_in_bytes;
}
#ifdef ASSERT
if (TraceNewVectors) {
tty->print("new Vector node: ");
@ -1403,6 +1421,7 @@ void SuperWord::output() {
#endif
}
}
C->set_max_vector_size(max_vlen_in_bytes);
}
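
output() now also records the widest vector actually emitted, via C->set_max_vector_size(), which later phases consult: instruction scheduling bails out above 8 bytes, and get_poll_stub() below picks the dedicated wide-vector safepoint blob for methods flagged as having wide vectors. A hedged Java sketch of a loop that would drive this value up, assuming 32-byte AVX vectors of float on the target machine:

    static void incr(float[] a, float v) {
        for (int i = 0; i < a.length; i++) {
            a[i] += v;   // packed into 8-float (32-byte) vector adds under the assumption above
        }
    }
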
//------------------------------vector_opd---------------------------
@ -1439,7 +1458,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
}
assert(opd->bottom_type()->isa_int(), "int type only");
// Move non-constant shift count into XMM register.
cnt = new (_phase->C, 2) MoveI2FNode(cnt);
cnt = new (C, 2) MoveI2FNode(cnt);
}
if (cnt != opd) {
_phase->_igvn.register_new_node_with_optimizer(cnt);
@ -1480,10 +1499,10 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
_phase->_igvn.register_new_node_with_optimizer(pk);
_phase->set_ctrl(pk, _phase->get_ctrl(opd));
#ifdef ASSERT
if (TraceNewVectors) {
tty->print("new Vector node: ");
pk->dump();
}
if (TraceNewVectors) {
tty->print("new Vector node: ");
pk->dump();
}
#endif
return pk;
}
@ -1805,7 +1824,7 @@ void SuperWord::compute_vector_element_type() {
//------------------------------memory_alignment---------------------------
// Alignment within a vector memory reference
int SuperWord::memory_alignment(MemNode* s, int iv_adjust_in_bytes) {
int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
SWPointer p(s, this);
if (!p.valid()) {
return bottom_align;
@ -1815,7 +1834,7 @@ int SuperWord::memory_alignment(MemNode* s, int iv_adjust_in_bytes) {
return bottom_align; // No vectors for this type
}
int offset = p.offset_in_bytes();
offset += iv_adjust_in_bytes;
offset += iv_adjust*p.memory_size();
int off_rem = offset % vw;
int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
return off_mod;
@ -1838,7 +1857,7 @@ const Type* SuperWord::container_type(Node* n) {
bool SuperWord::same_velt_type(Node* n1, Node* n2) {
const Type* vt1 = velt_type(n1);
const Type* vt2 = velt_type(n1);
const Type* vt2 = velt_type(n2);
if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) {
// Compare vectors element sizes for integer types.
return data_size(n1) == data_size(n2);

View File

@ -400,7 +400,7 @@ class SuperWord : public ResourceObj {
// Return the node executed last in pack p.
Node* executed_last(Node_List* p);
// Alignment within a vector memory reference
int memory_alignment(MemNode* s, int iv_adjust_in_bytes);
int memory_alignment(MemNode* s, int iv_adjust);
// (Start, end] half-open range defining which operands are vector
void vector_opd_range(Node* n, uint* start, uint* end);
// Smallest type containing range of values

View File

@ -88,6 +88,7 @@ RuntimeStub* SharedRuntime::_resolve_virtual_call_blob;
RuntimeStub* SharedRuntime::_resolve_static_call_blob;
DeoptimizationBlob* SharedRuntime::_deopt_blob;
SafepointBlob* SharedRuntime::_polling_page_vectors_safepoint_handler_blob;
SafepointBlob* SharedRuntime::_polling_page_safepoint_handler_blob;
SafepointBlob* SharedRuntime::_polling_page_return_handler_blob;
@ -104,8 +105,14 @@ void SharedRuntime::generate_stubs() {
_resolve_virtual_call_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_virtual_call_C), "resolve_virtual_call");
_resolve_static_call_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_static_call_C), "resolve_static_call");
_polling_page_safepoint_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), false);
_polling_page_return_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), true);
#ifdef COMPILER2
// Vectors are generated only by C2.
if (is_wide_vector(MaxVectorSize)) {
_polling_page_vectors_safepoint_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), POLL_AT_VECTOR_LOOP);
}
#endif // COMPILER2
_polling_page_safepoint_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), POLL_AT_LOOP);
_polling_page_return_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), POLL_AT_RETURN);
generate_deopt_blob();
@ -535,10 +542,15 @@ address SharedRuntime::get_poll_stub(address pc) {
"Only polling locations are used for safepoint");
bool at_poll_return = ((nmethod*)cb)->is_at_poll_return(pc);
bool has_wide_vectors = ((nmethod*)cb)->has_wide_vectors();
if (at_poll_return) {
assert(SharedRuntime::polling_page_return_handler_blob() != NULL,
"polling page return stub not created yet");
stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
} else if (has_wide_vectors) {
assert(SharedRuntime::polling_page_vectors_safepoint_handler_blob() != NULL,
"polling page vectors safepoint stub not created yet");
stub = SharedRuntime::polling_page_vectors_safepoint_handler_blob()->entry_point();
} else {
assert(SharedRuntime::polling_page_safepoint_handler_blob() != NULL,
"polling page safepoint stub not created yet");
@ -1618,6 +1630,31 @@ methodHandle SharedRuntime::reresolve_call_site(JavaThread *thread, TRAPS) {
return callee_method;
}
#ifdef ASSERT
void SharedRuntime::check_member_name_argument_is_last_argument(methodHandle method,
const BasicType* sig_bt,
const VMRegPair* regs) {
ResourceMark rm;
const int total_args_passed = method->size_of_parameters();
const VMRegPair* regs_with_member_name = regs;
VMRegPair* regs_without_member_name = NEW_RESOURCE_ARRAY(VMRegPair, total_args_passed - 1);
const int member_arg_pos = total_args_passed - 1;
assert(member_arg_pos >= 0 && member_arg_pos < total_args_passed, "oob");
assert(sig_bt[member_arg_pos] == T_OBJECT, "dispatch argument must be an object");
const bool is_outgoing = method->is_method_handle_intrinsic();
int comp_args_on_stack = java_calling_convention(sig_bt, regs_without_member_name, total_args_passed - 1, is_outgoing);
for (int i = 0; i < member_arg_pos; i++) {
VMReg a = regs_with_member_name[i].first();
VMReg b = regs_without_member_name[i].first();
assert(a->value() == b->value(), err_msg_res("register allocation mismatch: a=%d, b=%d", a->value(), b->value()));
}
assert(regs_with_member_name[member_arg_pos].first()->is_valid(), "bad member arg");
}
#endif
// ---------------------------------------------------------------------------
// We are calling the interpreter via a c2i. Normally this would mean that
// we were called by a compiled method. However we could have lost a race
@ -2423,6 +2460,7 @@ AdapterHandlerEntry* AdapterHandlerLibrary::get_adapter(methodHandle method) {
#ifndef PRODUCT
// debugging support
if (PrintAdapterHandlers || PrintStubCode) {
ttyLocker ttyl;
entry->print_adapter_on(tty);
tty->print_cr("i2c argument handler #%d for: %s %s (%d bytes generated)",
_adapters->number_of_entries(), (method->is_static() ? "static" : "receiver"),
@ -2430,8 +2468,10 @@ AdapterHandlerEntry* AdapterHandlerLibrary::get_adapter(methodHandle method) {
tty->print_cr("c2i argument handler starts at %p",entry->get_c2i_entry());
if (Verbose || PrintStubCode) {
address first_pc = entry->base_address();
if (first_pc != NULL)
if (first_pc != NULL) {
Disassembler::decode(first_pc, first_pc + insts_size);
tty->cr();
}
}
}
#endif
@ -2546,10 +2586,10 @@ nmethod *AdapterHandlerLibrary::create_native_wrapper(methodHandle method, int c
MacroAssembler _masm(&buffer);
// Fill in the signature array, for the calling-convention call.
int total_args_passed = method->size_of_parameters();
const int total_args_passed = method->size_of_parameters();
BasicType* sig_bt = NEW_RESOURCE_ARRAY(BasicType,total_args_passed);
VMRegPair* regs = NEW_RESOURCE_ARRAY(VMRegPair,total_args_passed);
BasicType* sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_args_passed);
VMRegPair* regs = NEW_RESOURCE_ARRAY(VMRegPair, total_args_passed);
int i=0;
if( !method->is_static() ) // Pass in receiver first
sig_bt[i++] = T_OBJECT;
@ -2559,7 +2599,7 @@ nmethod *AdapterHandlerLibrary::create_native_wrapper(methodHandle method, int c
if( ss.type() == T_LONG || ss.type() == T_DOUBLE )
sig_bt[i++] = T_VOID; // Longs & doubles take 2 Java slots
}
assert( i==total_args_passed, "" );
assert(i == total_args_passed, "");
BasicType ret_type = ss.type();
// Now get the compiled-Java layout as input (or output) arguments.
@ -2572,9 +2612,8 @@ nmethod *AdapterHandlerLibrary::create_native_wrapper(methodHandle method, int c
nm = SharedRuntime::generate_native_wrapper(&_masm,
method,
compile_id,
total_args_passed,
comp_args_on_stack,
sig_bt,regs,
sig_bt,
regs,
ret_type);
}
}

View File

@ -62,6 +62,7 @@ class SharedRuntime: AllStatic {
static DeoptimizationBlob* _deopt_blob;
static SafepointBlob* _polling_page_vectors_safepoint_handler_blob;
static SafepointBlob* _polling_page_safepoint_handler_blob;
static SafepointBlob* _polling_page_return_handler_blob;
@ -75,7 +76,8 @@ class SharedRuntime: AllStatic {
#endif // !PRODUCT
private:
static SafepointBlob* generate_handler_blob(address call_ptr, bool cause_return);
enum { POLL_AT_RETURN, POLL_AT_LOOP, POLL_AT_VECTOR_LOOP };
static SafepointBlob* generate_handler_blob(address call_ptr, int poll_type);
static RuntimeStub* generate_resolve_blob(address destination, const char* name);
public:
@ -223,6 +225,7 @@ class SharedRuntime: AllStatic {
static SafepointBlob* polling_page_return_handler_blob() { return _polling_page_return_handler_blob; }
static SafepointBlob* polling_page_safepoint_handler_blob() { return _polling_page_safepoint_handler_blob; }
static SafepointBlob* polling_page_vectors_safepoint_handler_blob() { return _polling_page_vectors_safepoint_handler_blob; }
// Counters
#ifndef PRODUCT
@ -345,7 +348,11 @@ class SharedRuntime: AllStatic {
// the bottom of the frame the first 16 words will be skipped and SharedInfo::stack0
// will be just above it. (
// return value is the maximum number of VMReg stack slots the convention will use.
static int java_calling_convention(const BasicType *sig_bt, VMRegPair *regs, int total_args_passed, int is_outgoing);
static int java_calling_convention(const BasicType* sig_bt, VMRegPair* regs, int total_args_passed, int is_outgoing);
static void check_member_name_argument_is_last_argument(methodHandle method,
const BasicType* sig_bt,
const VMRegPair* regs) NOT_DEBUG_RETURN;
// Ditto except for calling C
static int c_calling_convention(const BasicType *sig_bt, VMRegPair *regs, int total_args_passed);
@ -412,6 +419,10 @@ class SharedRuntime: AllStatic {
// when an interrupt occurs.
static uint out_preserve_stack_slots();
// Is the vector's size (in bytes) bigger than the size saved by default?
// For example, on x86 only the 16-byte XMM registers are saved by default.
static bool is_wide_vector(int size);
// Save and restore a native result
static void save_native_result(MacroAssembler *_masm, BasicType ret_type, int frame_slots );
static void restore_native_result(MacroAssembler *_masm, BasicType ret_type, int frame_slots );
@ -425,13 +436,11 @@ class SharedRuntime: AllStatic {
// The wrapper may contain special-case code if the given method
// is a JNI critical method, or a compiled method handle adapter,
// such as _invokeBasic, _linkToVirtual, etc.
static nmethod *generate_native_wrapper(MacroAssembler* masm,
static nmethod* generate_native_wrapper(MacroAssembler* masm,
methodHandle method,
int compile_id,
int total_args_passed,
int max_arg,
BasicType *sig_bt,
VMRegPair *regs,
BasicType* sig_bt,
VMRegPair* regs,
BasicType ret_type );
// Block before entering a JNI critical method

View File

@ -45,6 +45,10 @@
const char* Abstract_VM_Version::_s_vm_release = Abstract_VM_Version::vm_release();
const char* Abstract_VM_Version::_s_internal_vm_info_string = Abstract_VM_Version::internal_vm_info_string();
bool Abstract_VM_Version::_supports_cx8 = false;
bool Abstract_VM_Version::_supports_atomic_getset4 = false;
bool Abstract_VM_Version::_supports_atomic_getset8 = false;
bool Abstract_VM_Version::_supports_atomic_getadd4 = false;
bool Abstract_VM_Version::_supports_atomic_getadd8 = false;
unsigned int Abstract_VM_Version::_logical_processors_per_package = 1U;
int Abstract_VM_Version::_reserve_for_allocation_prefetch = 0;

View File

@ -37,6 +37,10 @@ class Abstract_VM_Version: AllStatic {
static const char* _s_internal_vm_info_string;
// These are set by machine-dependent initializations
static bool _supports_cx8;
static bool _supports_atomic_getset4;
static bool _supports_atomic_getset8;
static bool _supports_atomic_getadd4;
static bool _supports_atomic_getadd8;
static unsigned int _logical_processors_per_package;
static int _vm_major_version;
static int _vm_minor_version;
@ -75,6 +79,13 @@ class Abstract_VM_Version: AllStatic {
// does HW support an 8-byte compare-exchange operation?
static bool supports_cx8() {return _supports_cx8;}
// does HW support atomic get-and-set or atomic get-and-add? Used
// to guide intrinsification decisions for Unsafe atomic ops
static bool supports_atomic_getset4() {return _supports_atomic_getset4;}
static bool supports_atomic_getset8() {return _supports_atomic_getset8;}
static bool supports_atomic_getadd4() {return _supports_atomic_getadd4;}
static bool supports_atomic_getadd8() {return _supports_atomic_getadd8;}
static unsigned int logical_processors_per_package() {
return _logical_processors_per_package;
}
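
These flags let each platform advertise which atomic read-modify-write widths it can implement directly, so the compiler can choose between intrinsifying the operation and falling back to a compare-and-swap loop. A rough Java-level illustration of the four widths (the flag-to-operation pairing is inferred from the field names, not defined in this hunk):

    class AtomicWidths {
        static void demo() {
            java.util.concurrent.atomic.AtomicInteger i4 = new java.util.concurrent.atomic.AtomicInteger();
            java.util.concurrent.atomic.AtomicLong    i8 = new java.util.concurrent.atomic.AtomicLong();
            i4.getAndSet(7);     // 4-byte get-and-set  ~ _supports_atomic_getset4
            i8.getAndSet(7L);    // 8-byte get-and-set  ~ _supports_atomic_getset8
            i4.getAndAdd(1);     // 4-byte get-and-add  ~ _supports_atomic_getadd4
            i8.getAndAdd(1L);    // 8-byte get-and-add  ~ _supports_atomic_getadd8
        }
    }
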

View File

@ -0,0 +1,189 @@
/*
* Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
/**
* @test
* @bug 7196199
* @summary java/text/Bidi/Bug6665028.java failed: Bidi run count incorrect
*
* @run main/othervm/timeout=400 -Xmx32m -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:CompileCommand=exclude,Test7196199.test -XX:+SafepointALot -XX:GuaranteedSafepointInterval=100 Test7196199
*/
public class Test7196199 {
private static final int ARRLEN = 97;
private static final int ITERS = 5000;
private static final int INI_ITERS = 1000;
private static final int SFP_ITERS = 10000;
private static final float SFP_ITERS_F = 10000.f;
private static final float VALUE = 15.f;
public static void main(String args[]) {
int errn = test();
if (errn > 0) {
System.err.println("FAILED: " + errn + " errors");
System.exit(97);
}
System.out.println("PASSED");
}
static int test() {
float[] a0 = new float[ARRLEN];
float[] a1 = new float[ARRLEN];
// Initialize
for (int i=0; i<ARRLEN; i++) {
a0[i] = 0.f;
a1[i] = (float)i;
}
System.out.println("Warmup");
for (int i=0; i<INI_ITERS; i++) {
test_incrc(a0);
test_incrv(a0, VALUE);
test_addc(a0, a1);
test_addv(a0, a1, VALUE);
}
// Test and verify results
System.out.println("Verification");
int errn = 0;
for (int i=0; i<ARRLEN; i++)
a0[i] = 0.f;
System.out.println(" test_incrc");
for (int j=0; j<ITERS; j++) {
test_incrc(a0);
for (int i=0; i<ARRLEN; i++) {
errn += verify("test_incrc: ", i, a0[i], VALUE*SFP_ITERS_F);
a0[i] = 0.f; // Reset
}
}
System.out.println(" test_incrv");
for (int j=0; j<ITERS; j++) {
test_incrv(a0, VALUE);
for (int i=0; i<ARRLEN; i++) {
errn += verify("test_incrv: ", i, a0[i], VALUE*SFP_ITERS_F);
a0[i] = 0.f; // Reset
}
}
System.out.println(" test_addc");
for (int j=0; j<ITERS; j++) {
test_addc(a0, a1);
for (int i=0; i<ARRLEN; i++) {
errn += verify("test_addc: ", i, a0[i], ((float)i + VALUE)*SFP_ITERS_F);
a0[i] = 0.f; // Reset
}
}
System.out.println(" test_addv");
for (int j=0; j<ITERS; j++) {
test_addv(a0, a1, VALUE);
for (int i=0; i<ARRLEN; i++) {
errn += verify("test_addv: ", i, a0[i], ((float)i + VALUE)*SFP_ITERS_F);
a0[i] = 0.f; // Reset
}
}
if (errn > 0)
return errn;
System.out.println("Time");
long start, end;
start = System.currentTimeMillis();
for (int i=0; i<INI_ITERS; i++) {
test_incrc(a0);
}
end = System.currentTimeMillis();
System.out.println("test_incrc: " + (end - start));
start = System.currentTimeMillis();
for (int i=0; i<INI_ITERS; i++) {
test_incrv(a0, VALUE);
}
end = System.currentTimeMillis();
System.out.println("test_incrv: " + (end - start));
start = System.currentTimeMillis();
for (int i=0; i<INI_ITERS; i++) {
test_addc(a0, a1);
}
end = System.currentTimeMillis();
System.out.println("test_addc: " + (end - start));
start = System.currentTimeMillis();
for (int i=0; i<INI_ITERS; i++) {
test_addv(a0, a1, VALUE);
}
end = System.currentTimeMillis();
System.out.println("test_addv: " + (end - start));
return errn;
}
static void test_incrc(float[] a0) {
// Non-counted loop with safepoint.
for (long l = 0; l < SFP_ITERS; l++) {
// Counted and vectorized loop.
for (int i = 0; i < a0.length; i+=1) {
a0[i] += VALUE;
}
}
}
static void test_incrv(float[] a0, float b) {
// Non-counted loop with safepoint.
for (long l = 0; l < SFP_ITERS; l++) {
// Counted and vectorized loop.
for (int i = 0; i < a0.length; i+=1) {
a0[i] += b;
}
}
}
static void test_addc(float[] a0, float[] a1) {
// Non-counted loop with safepoint.
for (long l = 0; l < SFP_ITERS; l++) {
// Counted and vectorized loop.
for (int i = 0; i < a0.length; i+=1) {
a0[i] += a1[i]+VALUE;
}
}
}
static void test_addv(float[] a0, float[] a1, float b) {
// Non-counted loop with safepoint.
for (long l = 0; l < SFP_ITERS; l++) {
// Counted and vectorized loop.
for (int i = 0; i < a0.length; i+=1) {
a0[i] += a1[i]+b;
}
}
}
static int verify(String text, int i, float elem, float val) {
if (elem != val) {
System.err.println(text + "[" + i + "] = " + elem + " != " + val);
return 1;
}
return 0;
}
}