Merge

2026-02-28 11:10:26 +00:00 · 2015-08-21 09:12:42 +02:00 · 2015-08-21 09:12:42 +02:00 · a9e232a8ef
commit a9e232a8ef
parent 33b2496286 10c3342331
31 changed files with 1893 additions and 812 deletions
--- a/hotspot/src/cpu/aarch64/vm/aarch64.ad
+++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad
--- a/hotspot/src/cpu/aarch64/vm/aarch64_ad.m4
+++ b/hotspot/src/cpu/aarch64/vm/aarch64_ad.m4
@ -42,7 +42,7 @@ instruct $2$1_reg_$4_reg(iReg$1NoSp dst,
              as_Register($src1$$reg),
              as_Register($src2$$reg),
              Assembler::$5,
-              $src3$$constant & 0x3f);
+              $src3$$constant & ifelse($1,I,0x1f,0x3f));
  %}

  ins_pipe(ialu_reg_reg_shift);
@ -87,7 +87,7 @@ dnl into this canonical form.
              as_Register($src1$$reg),
              as_Register($src2$$reg),
              Assembler::$5,
-              $src3$$constant & 0x3f);
+              $src3$$constant & ifelse($1,I,0x1f,0x3f));
  %}

  ins_pipe(ialu_reg_reg_shift);
--- a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.cpp
+++ b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.cpp
@ -268,7 +268,7 @@ void entry(CodeBuffer *cb) {
    __ ldar(r21, r28);                                 //       ldar    x21, [x28]

 // LoadStoreExclusiveOp
-    __ stxrw(r24, r24, r7);                            //       stxr    w24, w24, [x7]
+    __ stxrw(r21, r24, r7);                            //       stxr    w21, w24, [x7]
    __ stlxrw(r21, r26, r28);                          //       stlxr   w21, w26, [x28]
    __ ldxrw(r21, r6);                                 //       ldxr    w21, [x6]
    __ ldaxrw(r15, r30);                               //       ldaxr   w15, [x30]
@ -299,7 +299,7 @@ void entry(CodeBuffer *cb) {

 // LoadStoreExclusiveOp
    __ ldxpw(r25, r4, r22);                            //       ldxp    w25, w4, [x22]
-    __ ldaxpw(r14, r14, r15);                          //       ldaxp   w14, w14, [x15]
+    __ ldaxpw(r13, r14, r15);                          //       ldaxp   w13, w14, [x15]
    __ stxpw(r20, r26, r8, r10);                       //       stxp    w20, w26, w8, [x10]
    __ stlxpw(r23, r18, r18, r18);                     //       stlxp   w23, w18, w18, [x18]

@ -773,7 +773,7 @@ Disassembly of section .text:
 260:   c85fffbb        ldaxr   x27, [x29]
 264:   c89fffa0        stlr    x0, [x29]
 268:   c8dfff95        ldar    x21, [x28]
- 26c:   88187cf8        stxr    w24, w24, [x7]
+ 26c:   88157cf8        stxr    w21, w24, [x7]
 270:   8815ff9a        stlxr   w21, w26, [x28]
 274:   885f7cd5        ldxr    w21, [x6]
 278:   885fffcf        ldaxr   w15, [x30]
@ -796,7 +796,7 @@ Disassembly of section .text:
 2bc:   c82870bb        stxp    w8, x27, x28, [x5]
 2c0:   c825b8c8        stlxp   w5, x8, x14, [x6]
 2c4:   887f12d9        ldxp    w25, w4, [x22]
- 2c8:   887fb9ee        ldaxp   w14, w14, [x15]
+ 2c8:   887fb9ed        ldaxp   w13, w14, [x15]
 2cc:   8834215a        stxp    w20, w26, w8, [x10]
 2d0:   8837ca52        stlxp   w23, w18, w18, [x18]
 2d4:   f806317e        str     x30, [x11,#99]
@ -1085,13 +1085,13 @@ Disassembly of section .text:
    0xd444c320,     0xd503201f,     0xd69f03e0,     0xd6bf03e0,
    0xd5033fdf,     0xd5033f9f,     0xd5033abf,     0xd61f0040,
    0xd63f00a0,     0xc8147c55,     0xc805fcfd,     0xc85f7e05,
-    0xc85fffbb,     0xc89fffa0,     0xc8dfff95,     0x88187cf8,
+    0xc85fffbb,     0xc89fffa0,     0xc8dfff95,     0x88157cf8,
    0x8815ff9a,     0x885f7cd5,     0x885fffcf,     0x889ffc73,
    0x88dffc56,     0x48127c0f,     0x480bff85,     0x485f7cdd,
    0x485ffcf2,     0x489fff99,     0x48dffe62,     0x080a7c3e,
    0x0814fed5,     0x085f7c59,     0x085ffcb8,     0x089ffc70,
    0x08dfffb6,     0xc87f0a68,     0xc87fcdc7,     0xc82870bb,
-    0xc825b8c8,     0x887f12d9,     0x887fb9ee,     0x8834215a,
+    0xc825b8c8,     0x887f12d9,     0x887fb9ed,     0x8834215a,
    0x8837ca52,     0xf806317e,     0xb81b3337,     0x39000dc2,
    0x78005149,     0xf84391f4,     0xb85b220c,     0x385fd356,
    0x785d127e,     0x389f4149,     0x79801e3c,     0x79c014a3,
--- a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp
+++ b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp
@ -1106,13 +1106,13 @@ public:

 #define INSN4(NAME, sz, op, o0) /* Four registers */                    \
  void NAME(Register Rs, Register Rt1, Register Rt2, Register Rn) {     \
-    assert(Rs != Rn, "unpredictable instruction");                  \
+    guarantee(Rs != Rn && Rs != Rt1 && Rs != Rt2, "unpredictable instruction"); \
    load_store_exclusive(Rs, Rt1, Rt2, Rn, sz, op, o0);                 \
  }

 #define INSN3(NAME, sz, op, o0) /* Three registers */                   \
  void NAME(Register Rs, Register Rt, Register Rn) {                    \
-    assert(Rs != Rn, "unpredictable instruction");                  \
+    guarantee(Rs != Rn && Rs != Rt, "unpredictable instruction");       \
    load_store_exclusive(Rs, Rt, (Register)0b11111, Rn, sz, op, o0);    \
  }

@ -1124,6 +1124,7 @@ public:

 #define INSN_FOO(NAME, sz, op, o0) /* Three registers, encoded differently */ \
  void NAME(Register Rt1, Register Rt2, Register Rn) {                  \
+    guarantee(Rt1 != Rt2, "unpredictable instruction");                 \
    load_store_exclusive((Register)0b11111, Rt1, Rt2, Rn, sz, op, o0);  \
  }

--- a/hotspot/src/cpu/aarch64/vm/interp_masm_aarch64.cpp
+++ b/hotspot/src/cpu/aarch64/vm/interp_masm_aarch64.cpp
@ -611,6 +611,7 @@ void InterpreterMacroAssembler::lock_object(Register lock_reg)
    Label done;

    const Register swap_reg = r0;
+    const Register tmp = c_rarg2;
    const Register obj_reg = c_rarg3; // Will contain the oop

    const int obj_offset = BasicObjectLock::obj_offset_in_bytes();
@ -624,7 +625,7 @@ void InterpreterMacroAssembler::lock_object(Register lock_reg)
    ldr(obj_reg, Address(lock_reg, obj_offset));

    if (UseBiasedLocking) {
-      biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch2, false, done, &slow_case);
+      biased_locking_enter(lock_reg, obj_reg, swap_reg, tmp, false, done, &slow_case);
    }

    // Load (object->mark() | 1) into swap_reg
@ -643,7 +644,7 @@ void InterpreterMacroAssembler::lock_object(Register lock_reg)
      cmpxchgptr(swap_reg, lock_reg, obj_reg, rscratch1, fast, &fail);
      bind(fast);
      atomic_incw(Address((address)BiasedLocking::fast_path_entry_count_addr()),
-                  rscratch2, rscratch1);
+                  rscratch2, rscratch1, tmp);
      b(done);
      bind(fail);
    } else {
@ -671,7 +672,7 @@ void InterpreterMacroAssembler::lock_object(Register lock_reg)
    if (PrintBiasedLockingStatistics) {
      br(Assembler::NE, slow_case);
      atomic_incw(Address((address)BiasedLocking::fast_path_entry_count_addr()),
-                  rscratch2, rscratch1);
+                  rscratch2, rscratch1, tmp);
    }
    br(Assembler::EQ, done);

--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
@ -34,6 +34,7 @@
 #include "memory/resourceArea.hpp"
 #include "nativeInst_aarch64.hpp"
 #include "oops/klass.inline.hpp"
+#include "oops/oop.inline.hpp"
 #include "opto/compile.hpp"
 #include "opto/node.hpp"
 #include "runtime/biasedLocking.hpp"
@ -398,11 +399,7 @@ int MacroAssembler::biased_locking_enter(Register lock_reg,
  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

-  bool need_tmp_reg = false;
-  if (tmp_reg == noreg) {
-    tmp_reg = rscratch2;
-  }
-  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1);
+  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
@ -432,7 +429,7 @@ int MacroAssembler::biased_locking_enter(Register lock_reg,
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
-    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1);
+    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
@ -485,7 +482,7 @@ int MacroAssembler::biased_locking_enter(Register lock_reg,
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
-                  tmp_reg, rscratch1);
+                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);
@ -511,7 +508,7 @@ int MacroAssembler::biased_locking_enter(Register lock_reg,
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
-                  tmp_reg, rscratch1);
+                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);
@ -539,7 +536,7 @@ int MacroAssembler::biased_locking_enter(Register lock_reg,
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
-                  rscratch1);
+                  rscratch1, rscratch2);
    }
    bind(nope);
  }
@ -1640,15 +1637,15 @@ Address MacroAssembler::form_address(Register Rd, Register base, long byte_offse
  return Address(Rd);
 }

-void MacroAssembler::atomic_incw(Register counter_addr, Register tmp) {
+void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
  Label retry_load;
  bind(retry_load);
  // flush and load exclusive from the memory location
  ldxrw(tmp, counter_addr);
  addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp2 will be zero
-  stxrw(tmp, tmp, counter_addr);
-  cbnzw(tmp, retry_load);
+  stxrw(tmp2, tmp, counter_addr);
+  cbnzw(tmp2, retry_load);
 }


@ -2021,6 +2018,14 @@ void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement)
  }
 }

+void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
+  if (decrement.is_register()) {
+    subw(Rd, Rn, decrement.as_register());
+  } else {
+    subw(Rd, Rn, decrement.as_constant());
+  }
+}
+
 void MacroAssembler::reinit_heapbase()
 {
  if (UseCompressedOops) {
@ -2110,7 +2115,7 @@ static bool different(Register a, RegisterOrConstant b, Register c) {
    return a != b.as_register() && a != c && b.as_register() != c;
 }

-#define ATOMIC_OP(LDXR, OP, STXR)                                       \
+#define ATOMIC_OP(LDXR, OP, IOP, STXR)                                       \
 void MacroAssembler::atomic_##OP(Register prev, RegisterOrConstant incr, Register addr) { \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
@ -2120,14 +2125,15 @@ void MacroAssembler::atomic_##OP(Register prev, RegisterOrConstant incr, Registe
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  OP(rscratch1, result, incr);                                          \
-  STXR(rscratch1, rscratch1, addr);                                     \
-  cbnzw(rscratch1, retry_load);                                         \
-  if (prev->is_valid() && prev != result)                               \
-    mov(prev, result);                                                  \
+  STXR(rscratch2, rscratch1, addr);                                     \
+  cbnzw(rscratch2, retry_load);                                         \
+  if (prev->is_valid() && prev != result) {                             \
+    IOP(prev, rscratch1, incr);                                         \
+  }                                                                     \
 }

-ATOMIC_OP(ldxr, add, stxr)
-ATOMIC_OP(ldxrw, addw, stxrw)
+ATOMIC_OP(ldxr, add, sub, stxr)
+ATOMIC_OP(ldxrw, addw, subw, stxrw)

 #undef ATOMIC_OP

--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
@ -107,9 +107,7 @@ class MacroAssembler: public Assembler {
  // Biased locking support
  // lock_reg and obj_reg must be loaded up with the appropriate values.
  // swap_reg is killed.
-  // tmp_reg is optional. If it is supplied (i.e., != noreg) it will
-  // be killed; if not supplied, push/pop will be used internally to
-  // allocate a temporary (inefficient, avoid if possible).
+  // tmp_reg must be supplied and must not be rscratch1 or rscratch2
  // Optional slow case is for implementations (interpreter and C1) which branch to
  // slow case directly. Leaves condition codes set for C2's Fast_Lock node.
  // Returns offset of first potentially-faulting instruction for null
@ -126,10 +124,10 @@ class MacroAssembler: public Assembler {

  // Helper functions for statistics gathering.
  // Unconditional atomic increment.
-  void atomic_incw(Register counter_addr, Register tmp);
-  void atomic_incw(Address counter_addr, Register tmp1, Register tmp2) {
+  void atomic_incw(Register counter_addr, Register tmp, Register tmp2);
+  void atomic_incw(Address counter_addr, Register tmp1, Register tmp2, Register tmp3) {
    lea(tmp1, counter_addr);
-    atomic_incw(tmp1, tmp2);
+    atomic_incw(tmp1, tmp2, tmp3);
  }
  // Load Effective Address
  void lea(Register r, const Address &a) {
@ -1057,6 +1055,7 @@ public:
  void add(Register Rd, Register Rn, RegisterOrConstant increment);
  void addw(Register Rd, Register Rn, RegisterOrConstant increment);
  void sub(Register Rd, Register Rn, RegisterOrConstant decrement);
+  void subw(Register Rd, Register Rn, RegisterOrConstant decrement);

  void adrp(Register reg1, const Address &dest, unsigned long &byte_offset);

--- a/hotspot/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp
+++ b/hotspot/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp
@ -1774,6 +1774,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
  const Register obj_reg  = r19;  // Will contain the oop
  const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
  const Register old_hdr  = r13;  // value of old header at unlock time
+  const Register tmp = c_rarg3;

  Label slow_path_lock;
  Label lock_done;
@ -1795,7 +1796,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
    __ ldr(obj_reg, Address(oop_handle_reg, 0));

    if (UseBiasedLocking) {
-      __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch2, false, lock_done, &slow_path_lock);
+      __ biased_locking_enter(lock_reg, obj_reg, swap_reg, tmp, false, lock_done, &slow_path_lock);
    }

    // Load (object->mark() | 1) into swap_reg %r0
--- a/hotspot/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp
+++ b/hotspot/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp
@ -1913,15 +1913,18 @@ address TemplateInterpreterGenerator::generate_trace_code(TosState state) {
 }

 void TemplateInterpreterGenerator::count_bytecode() {
+  Register rscratch3 = r0;
  __ push(rscratch1);
  __ push(rscratch2);
+  __ push(rscratch3);
  Label L;
  __ mov(rscratch2, (address) &BytecodeCounter::_counter_value);
  __ bind(L);
  __ ldxr(rscratch1, rscratch2);
  __ add(rscratch1, rscratch1, 1);
-  __ stxr(rscratch1, rscratch1, rscratch2);
-  __ cbnzw(rscratch1, L);
+  __ stxr(rscratch3, rscratch1, rscratch2);
+  __ cbnzw(rscratch3, L);
+  __ pop(rscratch3);
  __ pop(rscratch2);
  __ pop(rscratch1);
 }
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp
@ -1674,6 +1674,13 @@ void Assembler::cvtsi2ssl(XMMRegister dst, Address src) {
  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3, true);
 }

+void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, true);
+  emit_int8(0x2A);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3);
@ -6604,13 +6611,6 @@ void Assembler::cvtsi2sdq(XMMRegister dst, Address src) {
  emit_operand(dst, src);
 }

-void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
-  NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, true);
-  emit_int8(0x2A);
-  emit_int8((unsigned char)(0xC0 | encode));
-}
-
 void Assembler::cvtsi2ssq(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  if (VM_Version::supports_evex()) {
--- a/hotspot/src/cpu/x86/vm/interp_masm_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/interp_masm_x86.cpp
@ -355,8 +355,8 @@ void InterpreterMacroAssembler::load_earlyret_value(TosState state) {
    case ctos:                                   // fall through
    case stos:                                   // fall through
    case itos: movl(rax, val_addr);                 break;
-    case ftos: movflt(xmm0, val_addr);              break;
-    case dtos: movdbl(xmm0, val_addr);              break;
+    case ftos: load_float(val_addr);                break;
+    case dtos: load_double(val_addr);               break;
    case vtos: /* nothing to do */                  break;
    default  : ShouldNotReachHere();
  }
@ -376,8 +376,8 @@ void InterpreterMacroAssembler::load_earlyret_value(TosState state) {
    case ctos:                                     // fall through
    case stos:                                     // fall through
    case itos: movl(rax, val_addr);                   break;
-    case ftos: fld_s(val_addr);                       break;
-    case dtos: fld_d(val_addr);                       break;
+    case ftos: load_float(val_addr);                  break;
+    case dtos: load_double(val_addr);                 break;
    case vtos: /* nothing to do */                    break;
    default  : ShouldNotReachHere();
  }
@ -578,6 +578,26 @@ void InterpreterMacroAssembler::push_i(Register r) {
  push(r);
 }

+void InterpreterMacroAssembler::push_f(XMMRegister r) {
+  subptr(rsp, wordSize);
+  movflt(Address(rsp, 0), r);
+}
+
+void InterpreterMacroAssembler::pop_f(XMMRegister r) {
+  movflt(r, Address(rsp, 0));
+  addptr(rsp, wordSize);
+}
+
+void InterpreterMacroAssembler::push_d(XMMRegister r) {
+  subptr(rsp, 2 * wordSize);
+  movdbl(Address(rsp, 0), r);
+}
+
+void InterpreterMacroAssembler::pop_d(XMMRegister r) {
+  movdbl(r, Address(rsp, 0));
+  addptr(rsp, 2 * Interpreter::stackElementSize);
+}
+
 #ifdef _LP64
 void InterpreterMacroAssembler::pop_i(Register r) {
  // XXX can't use pop currently, upper half non clean
@ -590,31 +610,11 @@ void InterpreterMacroAssembler::pop_l(Register r) {
  addptr(rsp, 2 * Interpreter::stackElementSize);
 }

-void InterpreterMacroAssembler::pop_f(XMMRegister r) {
-  movflt(r, Address(rsp, 0));
-  addptr(rsp, wordSize);
-}
-
-void InterpreterMacroAssembler::pop_d(XMMRegister r) {
-  movdbl(r, Address(rsp, 0));
-  addptr(rsp, 2 * Interpreter::stackElementSize);
-}
-
 void InterpreterMacroAssembler::push_l(Register r) {
  subptr(rsp, 2 * wordSize);
  movq(Address(rsp, 0), r);
 }

-void InterpreterMacroAssembler::push_f(XMMRegister r) {
-  subptr(rsp, wordSize);
-  movflt(Address(rsp, 0), r);
-}
-
-void InterpreterMacroAssembler::push_d(XMMRegister r) {
-  subptr(rsp, 2 * wordSize);
-  movdbl(Address(rsp, 0), r);
-}
-
 void InterpreterMacroAssembler::pop(TosState state) {
  switch (state) {
  case atos: pop_ptr();                 break;
@ -623,8 +623,8 @@ void InterpreterMacroAssembler::pop(TosState state) {
  case stos:
  case itos: pop_i();                   break;
  case ltos: pop_l();                   break;
-  case ftos: pop_f();                   break;
-  case dtos: pop_d();                   break;
+  case ftos: pop_f(xmm0);               break;
+  case dtos: pop_d(xmm0);               break;
  case vtos: /* nothing to do */        break;
  default:   ShouldNotReachHere();
  }
@ -640,8 +640,8 @@ void InterpreterMacroAssembler::push(TosState state) {
  case stos:
  case itos: push_i();                  break;
  case ltos: push_l();                  break;
-  case ftos: push_f();                  break;
-  case dtos: push_d();                  break;
+  case ftos: push_f(xmm0);              break;
+  case dtos: push_d(xmm0);              break;
  case vtos: /* nothing to do */        break;
  default  : ShouldNotReachHere();
  }
@ -675,8 +675,20 @@ void InterpreterMacroAssembler::pop(TosState state) {
    case stos:                                               // fall through
    case itos: pop_i(rax);                                   break;
    case ltos: pop_l(rax, rdx);                              break;
-    case ftos: pop_f();                                      break;
-    case dtos: pop_d();                                      break;
+    case ftos:
+      if (UseSSE >= 1) {
+        pop_f(xmm0);
+      } else {
+        pop_f();
+      }
+      break;
+    case dtos:
+      if (UseSSE >= 2) {
+        pop_d(xmm0);
+      } else {
+        pop_d();
+      }
+      break;
    case vtos: /* nothing to do */                           break;
    default  : ShouldNotReachHere();
  }
@ -695,7 +707,7 @@ void InterpreterMacroAssembler::push_f() {
  fstp_s(Address(rsp, 0));
 }

-void InterpreterMacroAssembler::push_d(Register r) {
+void InterpreterMacroAssembler::push_d() {
  // Do not schedule for no AGI! Never write beyond rsp!
  subptr(rsp, 2 * wordSize);
  fstp_d(Address(rsp, 0));
@ -711,8 +723,20 @@ void InterpreterMacroAssembler::push(TosState state) {
    case stos:                                               // fall through
    case itos: push_i(rax);                                    break;
    case ltos: push_l(rax, rdx);                               break;
-    case ftos: push_f();                                       break;
-    case dtos: push_d(rax);                                    break;
+    case ftos:
+      if (UseSSE >= 1) {
+        push_f(xmm0);
+      } else {
+        push_f();
+      }
+      break;
+    case dtos:
+      if (UseSSE >= 2) {
+        push_d(xmm0);
+      } else {
+        push_d();
+      }
+      break;
    case vtos: /* nothing to do */                             break;
    default  : ShouldNotReachHere();
  }
@ -995,22 +1019,6 @@ void InterpreterMacroAssembler::remove_activation(
  leave();                           // remove frame anchor
  pop(ret_addr);                     // get return address
  mov(rsp, rbx);                     // set sp to sender sp
-#ifndef _LP64
-  if (UseSSE) {
-    // float and double are returned in xmm register in SSE-mode
-    if (state == ftos && UseSSE >= 1) {
-      subptr(rsp, wordSize);
-      fstp_s(Address(rsp, 0));
-      movflt(xmm0, Address(rsp, 0));
-      addptr(rsp, wordSize);
-    } else if (state == dtos && UseSSE >= 2) {
-      subptr(rsp, 2*wordSize);
-      fstp_d(Address(rsp, 0));
-      movdbl(xmm0, Address(rsp, 0));
-      addptr(rsp, 2*wordSize);
-    }
-  }
-#endif // _LP64
 }
 #endif // !CC_INTERP

@ -1783,7 +1791,10 @@ void InterpreterMacroAssembler::verify_oop(Register reg, TosState state) {

 void InterpreterMacroAssembler::verify_FPU(int stack_depth, TosState state) {
 #ifndef _LP64
-  if (state == ftos || state == dtos) MacroAssembler::verify_FPU(stack_depth);
+  if ((state == ftos && UseSSE < 1) ||
+      (state == dtos && UseSSE < 2)) {
+    MacroAssembler::verify_FPU(stack_depth);
+  }
 #endif
 }

--- a/hotspot/src/cpu/x86/vm/interp_masm_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/interp_masm_x86.hpp
@ -140,20 +140,20 @@ class InterpreterMacroAssembler: public MacroAssembler {
  void push_ptr(Register r = rax);
  void push_i(Register r = rax);

+  void push_f(XMMRegister r);
+  void pop_f(XMMRegister r);
+  void pop_d(XMMRegister r);
+  void push_d(XMMRegister r);
 #ifdef _LP64
  void pop_l(Register r = rax);
-  void pop_f(XMMRegister r = xmm0);
-  void pop_d(XMMRegister r = xmm0);
  void push_l(Register r = rax);
-  void push_f(XMMRegister r = xmm0);
-  void push_d(XMMRegister r = xmm0);
 #else
  void pop_l(Register lo = rax, Register hi = rdx);
  void pop_f();
  void pop_d();

  void push_l(Register lo = rax, Register hi = rdx);
-  void push_d(Register r = rax);
+  void push_d();
  void push_f();
 #endif // _LP64

--- a/hotspot/src/cpu/x86/vm/interpreterGenerator_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/interpreterGenerator_x86.hpp
@ -42,6 +42,12 @@
  address generate_Reference_get_entry();
  address generate_CRC32_update_entry();
  address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
+#ifndef _LP64
+  address generate_Float_intBitsToFloat_entry();
+  address generate_Float_floatToRawIntBits_entry();
+  address generate_Double_longBitsToDouble_entry();
+  address generate_Double_doubleToRawLongBits_entry();
+#endif
  void lock_method(void);
  void generate_stack_overflow_check(void);

--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
@ -3314,6 +3314,42 @@ void MacroAssembler::fpop() {
  fincstp();
 }

+void MacroAssembler::load_float(Address src) {
+  if (UseSSE >= 1) {
+    movflt(xmm0, src);
+  } else {
+    LP64_ONLY(ShouldNotReachHere());
+    NOT_LP64(fld_s(src));
+  }
+}
+
+void MacroAssembler::store_float(Address dst) {
+  if (UseSSE >= 1) {
+    movflt(dst, xmm0);
+  } else {
+    LP64_ONLY(ShouldNotReachHere());
+    NOT_LP64(fstp_s(dst));
+  }
+}
+
+void MacroAssembler::load_double(Address src) {
+  if (UseSSE >= 2) {
+    movdbl(xmm0, src);
+  } else {
+    LP64_ONLY(ShouldNotReachHere());
+    NOT_LP64(fld_d(src));
+  }
+}
+
+void MacroAssembler::store_double(Address dst) {
+  if (UseSSE >= 2) {
+    movdbl(dst, xmm0);
+  } else {
+    LP64_ONLY(ShouldNotReachHere());
+    NOT_LP64(fstp_d(dst));
+  }
+}
+
 void MacroAssembler::fremr(Register tmp) {
  save_rax(tmp);
  { Label L;
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp
@ -471,6 +471,22 @@ class MacroAssembler: public Assembler {
  // Pop ST (ffree & fincstp combined)
  void fpop();

+  // Load float value from 'src'. If UseSSE >= 1, the value is loaded into
+  // register xmm0. Otherwise, the value is loaded onto the FPU stack.
+  void load_float(Address src);
+
+  // Store float value to 'dst'. If UseSSE >= 1, the value is stored
+  // from register xmm0. Otherwise, the value is stored from the FPU stack.
+  void store_float(Address dst);
+
+  // Load double value from 'src'. If UseSSE >= 2, the value is loaded into
+  // register xmm0. Otherwise, the value is loaded onto the FPU stack.
+  void load_double(Address src);
+
+  // Store double value to 'dst'. If UseSSE >= 2, the value is stored
+  // from register xmm0. Otherwise, the value is stored from the FPU stack.
+  void store_double(Address dst);
+
  // pushes double TOS element of FPU stack on CPU stack; pops from FPU stack
  void push_fTOS();

--- a/hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp
+++ b/hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp
@ -170,22 +170,12 @@ address TemplateInterpreterGenerator::generate_return_entry_for(TosState state,
    __ MacroAssembler::verify_FPU(0, "generate_return_entry_for compiled");
  }

-  // In SSE mode, interpreter returns FP results in xmm0 but they need
-  // to end up back on the FPU so it can operate on them.
-  if (state == ftos && UseSSE >= 1) {
-    __ subptr(rsp, wordSize);
-    __ movflt(Address(rsp, 0), xmm0);
-    __ fld_s(Address(rsp, 0));
-    __ addptr(rsp, wordSize);
-  } else if (state == dtos && UseSSE >= 2) {
-    __ subptr(rsp, 2*wordSize);
-    __ movdbl(Address(rsp, 0), xmm0);
-    __ fld_d(Address(rsp, 0));
-    __ addptr(rsp, 2*wordSize);
+  if (state == ftos) {
+    __ MacroAssembler::verify_FPU(UseSSE >= 1 ? 0 : 1, "generate_return_entry_for in interpreter");
+  } else if (state == dtos) {
+    __ MacroAssembler::verify_FPU(UseSSE >= 2 ? 0 : 1, "generate_return_entry_for in interpreter");
  }

-  __ MacroAssembler::verify_FPU(state == ftos || state == dtos ? 1 : 0, "generate_return_entry_for in interpreter");
-
  // Restore stack bottom in case i2c adjusted stack
  __ movptr(rsp, Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize));
  // and NULL it as marker that rsp is now tos until next java call
@ -217,21 +207,12 @@ address TemplateInterpreterGenerator::generate_return_entry_for(TosState state,
 address TemplateInterpreterGenerator::generate_deopt_entry_for(TosState state, int step) {
  address entry = __ pc();

-  // In SSE mode, FP results are in xmm0
-  if (state == ftos && UseSSE > 0) {
-    __ subptr(rsp, wordSize);
-    __ movflt(Address(rsp, 0), xmm0);
-    __ fld_s(Address(rsp, 0));
-    __ addptr(rsp, wordSize);
-  } else if (state == dtos && UseSSE >= 2) {
-    __ subptr(rsp, 2*wordSize);
-    __ movdbl(Address(rsp, 0), xmm0);
-    __ fld_d(Address(rsp, 0));
-    __ addptr(rsp, 2*wordSize);
+  if (state == ftos) {
+    __ MacroAssembler::verify_FPU(UseSSE >= 1 ? 0 : 1, "generate_deopt_entry_for in interpreter");
+  } else if (state == dtos) {
+    __ MacroAssembler::verify_FPU(UseSSE >= 2 ? 0 : 1, "generate_deopt_entry_for in interpreter");
  }

-  __ MacroAssembler::verify_FPU(state == ftos || state == dtos ? 1 : 0, "generate_deopt_entry_for in interpreter");
-
  // The stack is not extended by deopt but we must NULL last_sp as this
  // entry is like a "return".
  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
@ -735,7 +716,7 @@ address InterpreterGenerator::generate_CRC32_update_entry() {
  if (UseCRC32Intrinsics) {
    address entry = __ pc();

-    // rbx,: Method*
+    // rbx: Method*
    // rsi: senderSP must be preserved for slow path, set SP to it on fast path
    // rdx: scratch
    // rdi: scratch
@ -841,6 +822,124 @@ address InterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractInterpret
  return generate_native_entry(false);
 }

+/**
+ * Method entry for static native method:
+ *    java.lang.Float.intBitsToFloat(int bits)
+ */
+address InterpreterGenerator::generate_Float_intBitsToFloat_entry() {
+  address entry;
+
+  if (UseSSE >= 1) {
+    entry = __ pc();
+
+    // rsi: the sender's SP
+
+    // Skip safepoint check (compiler intrinsic versions of this method
+    // do not perform safepoint checks either).
+
+    // Load 'bits' into xmm0 (interpreter returns results in xmm0)
+    __ movflt(xmm0, Address(rsp, wordSize));
+
+    // Return
+    __ pop(rdi); // get return address
+    __ mov(rsp, rsi); // set rsp to the sender's SP
+    __ jmp(rdi);
+  } else {
+    entry = generate_native_entry(false);
+  }
+
+  return entry;
+}
+
+/**
+ * Method entry for static native method:
+ *    java.lang.Float.floatToRawIntBits(float value)
+ */
+address InterpreterGenerator::generate_Float_floatToRawIntBits_entry() {
+  address entry;
+
+  if (UseSSE >= 1) {
+    entry = __ pc();
+
+    // rsi: the sender's SP
+
+    // Skip safepoint check (compiler intrinsic versions of this method
+    // do not perform safepoint checks either).
+
+    // Load the parameter (a floating-point value) into rax.
+    __ movl(rax, Address(rsp, wordSize));
+
+    // Return
+    __ pop(rdi); // get return address
+    __ mov(rsp, rsi); // set rsp to the sender's SP
+    __ jmp(rdi);
+  } else {
+    entry = generate_native_entry(false);
+  }
+
+  return entry;
+}
+
+
+/**
+ * Method entry for static native method:
+ *    java.lang.Double.longBitsToDouble(long bits)
+ */
+address InterpreterGenerator::generate_Double_longBitsToDouble_entry() {
+  address entry;
+
+   if (UseSSE >= 2) {
+     entry = __ pc();
+
+     // rsi: the sender's SP
+
+     // Skip safepoint check (compiler intrinsic versions of this method
+     // do not perform safepoint checks either).
+
+     // Load 'bits' into xmm0 (interpreter returns results in xmm0)
+     __ movdbl(xmm0, Address(rsp, wordSize));
+
+     // Return
+     __ pop(rdi); // get return address
+     __ mov(rsp, rsi); // set rsp to the sender's SP
+     __ jmp(rdi);
+   } else {
+     entry = generate_native_entry(false);
+   }
+
+   return entry;
+}
+
+/**
+ * Method entry for static native method:
+ *    java.lang.Double.doubleToRawLongBits(double value)
+ */
+address InterpreterGenerator::generate_Double_doubleToRawLongBits_entry() {
+  address entry;
+
+  if (UseSSE >= 2) {
+    entry = __ pc();
+
+    // rsi: the sender's SP
+
+    // Skip safepoint check (compiler intrinsic versions of this method
+    // do not perform safepoint checks either).
+
+    // Load the parameter (a double value, two words) into rdx:rax.
+    __ movl(rdx, Address(rsp, 2*wordSize));
+    __ movl(rax, Address(rsp, wordSize));
+
+    // Return
+    __ pop(rdi); // get return address
+    __ mov(rsp, rsi); // set rsp to the sender's SP
+    __ jmp(rdi);
+  } else {
+    entry = generate_native_entry(false);
+  }
+
+  return entry;
+}
+
 //
 // Interpreter stub for calling a native method. (asm interpreter)
 // This sets up a somewhat different looking stack for calling the native method
@ -1090,7 +1189,7 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) {
              double_handler.addr());
    __ jcc(Assembler::notEqual, L);
    __ bind(push_double);
-    __ push(dtos);
+    __ push_d(); // FP values are returned using the FPU, so push FPU contents (even if UseSSE > 0).
    __ bind(L);
  }
  __ push(ltos);
--- a/hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp
+++ b/hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp
@ -1707,10 +1707,10 @@ void TemplateInterpreterGenerator::set_vtos_entry_points(Template* t,
                                                         address& vep) {
  assert(t->is_valid() && t->tos_in() == vtos, "illegal template");
  Label L;
-  aep = __ pc();  __ push_ptr();  __ jmp(L);
-  fep = __ pc();  __ push_f();    __ jmp(L);
-  dep = __ pc();  __ push_d();    __ jmp(L);
-  lep = __ pc();  __ push_l();    __ jmp(L);
+  aep = __ pc();  __ push_ptr();   __ jmp(L);
+  fep = __ pc();  __ push_f(xmm0); __ jmp(L);
+  dep = __ pc();  __ push_d(xmm0); __ jmp(L);
+  lep = __ pc();  __ push_l();     __ jmp(L);
  bep = cep = sep =
  iep = __ pc();  __ push_i();
  vep = __ pc();
--- a/hotspot/src/cpu/x86/vm/templateTable_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/templateTable_x86.cpp
@ -349,53 +349,60 @@ void TemplateTable::lconst(int value) {

 void TemplateTable::fconst(int value) {
  transition(vtos, ftos);
+  if (UseSSE >= 1) {
+    static float one = 1.0f, two = 2.0f;
+    switch (value) {
+    case 0:
+      __ xorps(xmm0, xmm0);
+      break;
+    case 1:
+      __ movflt(xmm0, ExternalAddress((address) &one));
+      break;
+    case 2:
+      __ movflt(xmm0, ExternalAddress((address) &two));
+      break;
+    default:
+      ShouldNotReachHere();
+      break;
+    }
+  } else {
 #ifdef _LP64
-  static float one = 1.0f, two = 2.0f;
-  switch (value) {
-  case 0:
-    __ xorps(xmm0, xmm0);
-    break;
-  case 1:
-    __ movflt(xmm0, ExternalAddress((address) &one));
-    break;
-  case 2:
-    __ movflt(xmm0, ExternalAddress((address) &two));
-    break;
-  default:
    ShouldNotReachHere();
-    break;
-  }
 #else
-         if (value == 0) { __ fldz();
-  } else if (value == 1) { __ fld1();
-  } else if (value == 2) { __ fld1(); __ fld1(); __ faddp(); // should do a better solution here
-  } else                 { ShouldNotReachHere();
+           if (value == 0) { __ fldz();
+    } else if (value == 1) { __ fld1();
+    } else if (value == 2) { __ fld1(); __ fld1(); __ faddp(); // should do a better solution here
+    } else                 { ShouldNotReachHere();
+    }
+#endif // _LP64
  }
-#endif
 }

 void TemplateTable::dconst(int value) {
  transition(vtos, dtos);
+  if (UseSSE >= 2) {
+    static double one = 1.0;
+    switch (value) {
+    case 0:
+      __ xorpd(xmm0, xmm0);
+      break;
+    case 1:
+      __ movdbl(xmm0, ExternalAddress((address) &one));
+      break;
+    default:
+      ShouldNotReachHere();
+      break;
+    }
+  } else {
 #ifdef _LP64
-  static double one = 1.0;
-  switch (value) {
-  case 0:
-    __ xorpd(xmm0, xmm0);
-    break;
-  case 1:
-    __ movdbl(xmm0, ExternalAddress((address) &one));
-    break;
-  default:
    ShouldNotReachHere();
-    break;
-  }
-
 #else
-         if (value == 0) { __ fldz();
-  } else if (value == 1) { __ fld1();
-  } else                 { ShouldNotReachHere();
-  }
+           if (value == 0) { __ fldz();
+    } else if (value == 1) { __ fld1();
+    } else                 { ShouldNotReachHere();
+    }
 #endif
+  }
 }

 void TemplateTable::bipush() {
@ -454,8 +461,7 @@ void TemplateTable::ldc(bool wide) {
  __ jccb(Assembler::notEqual, notFloat);

  // ftos
-  LP64_ONLY(__ movflt(xmm0, Address(rcx, rbx, Address::times_8, base_offset)));
-  NOT_LP64(__ fld_s(    Address(rcx, rbx, Address::times_ptr, base_offset)));
+  __ load_float(Address(rcx, rbx, Address::times_ptr, base_offset));
  __ push(ftos);
  __ jmp(Done);

@ -522,8 +528,7 @@ void TemplateTable::ldc2_w() {
  __ jccb(Assembler::notEqual, Long);

  // dtos
-  LP64_ONLY(__ movdbl(xmm0, Address(rcx, rbx, Address::times_8, base_offset)));
-  NOT_LP64(__ fld_d(    Address(rcx, rbx, Address::times_ptr, base_offset)));
+  __ load_double(Address(rcx, rbx, Address::times_ptr, base_offset));
  __ push(dtos);

  __ jmpb(Done);
@ -617,15 +622,13 @@ void TemplateTable::lload() {
 void TemplateTable::fload() {
  transition(vtos, ftos);
  locals_index(rbx);
-  LP64_ONLY(__ movflt(xmm0, faddress(rbx)));
-  NOT_LP64(__ fld_s(faddress(rbx)));
+  __ load_float(faddress(rbx));
 }

 void TemplateTable::dload() {
  transition(vtos, dtos);
  locals_index(rbx);
-  LP64_ONLY(__ movdbl(xmm0, daddress(rbx)));
-  NOT_LP64(__ fld_d(daddress(rbx)));
+  __ load_double(daddress(rbx));
 }

 void TemplateTable::aload() {
@ -657,15 +660,13 @@ void TemplateTable::wide_lload() {
 void TemplateTable::wide_fload() {
  transition(vtos, ftos);
  locals_index_wide(rbx);
-  LP64_ONLY(__ movflt(xmm0, faddress(rbx)));
-  NOT_LP64(__ fld_s(faddress(rbx)));
+  __ load_float(faddress(rbx));
 }

 void TemplateTable::wide_dload() {
  transition(vtos, dtos);
  locals_index_wide(rbx);
-  LP64_ONLY(__ movdbl(xmm0, daddress(rbx)));
-  NOT_LP64(__ fld_d(daddress(rbx)));
+  __ load_double(daddress(rbx));
 }

 void TemplateTable::wide_aload() {
@ -726,10 +727,9 @@ void TemplateTable::faload() {
  // rax: index
  // rdx: array
  index_check(rdx, rax); // kills rbx
-  LP64_ONLY(__ movflt(xmm0, Address(rdx, rax,
-                         Address::times_4,
-                         arrayOopDesc::base_offset_in_bytes(T_FLOAT))));
-  NOT_LP64(__ fld_s(Address(rdx, rax, Address::times_4, arrayOopDesc::base_offset_in_bytes(T_FLOAT))));
+  __ load_float(Address(rdx, rax,
+                        Address::times_4,
+                        arrayOopDesc::base_offset_in_bytes(T_FLOAT)));
 }

 void TemplateTable::daload() {
@ -737,10 +737,9 @@ void TemplateTable::daload() {
  // rax: index
  // rdx: array
  index_check(rdx, rax); // kills rbx
-  LP64_ONLY(__ movdbl(xmm0, Address(rdx, rax,
-                          Address::times_8,
-                          arrayOopDesc::base_offset_in_bytes(T_DOUBLE))));
-  NOT_LP64(__ fld_d(Address(rdx, rax, Address::times_8, arrayOopDesc::base_offset_in_bytes(T_DOUBLE))));
+  __ load_double(Address(rdx, rax,
+                         Address::times_8,
+                         arrayOopDesc::base_offset_in_bytes(T_DOUBLE)));
 }

 void TemplateTable::aaload() {
@ -807,14 +806,12 @@ void TemplateTable::lload(int n) {

 void TemplateTable::fload(int n) {
  transition(vtos, ftos);
-  LP64_ONLY(__ movflt(xmm0, faddress(n)));
-  NOT_LP64(__ fld_s(faddress(n)));
+  __ load_float(faddress(n));
 }

 void TemplateTable::dload(int n) {
  transition(vtos, dtos);
-  LP64_ONLY(__ movdbl(xmm0, daddress(n)));
-  NOT_LP64(__ fld_d(daddress(n)));
+  __ load_double(daddress(n));
 }

 void TemplateTable::aload(int n) {
@ -919,15 +916,13 @@ void TemplateTable::lstore() {
 void TemplateTable::fstore() {
  transition(ftos, vtos);
  locals_index(rbx);
-  LP64_ONLY(__ movflt(faddress(rbx), xmm0));
-  NOT_LP64(__ fstp_s(faddress(rbx)));
+  __ store_float(faddress(rbx));
 }

 void TemplateTable::dstore() {
  transition(dtos, vtos);
  locals_index(rbx);
-  LP64_ONLY(__ movdbl(daddress(rbx), xmm0));
-  NOT_LP64(__ fstp_d(daddress(rbx)));
+  __ store_double(daddress(rbx));
 }

 void TemplateTable::astore() {
@ -956,7 +951,7 @@ void TemplateTable::wide_lstore() {
 void TemplateTable::wide_fstore() {
 #ifdef _LP64
  transition(vtos, vtos);
-  __ pop_f();
+  __ pop_f(xmm0);
  locals_index_wide(rbx);
  __ movflt(faddress(rbx), xmm0);
 #else
@ -967,7 +962,7 @@ void TemplateTable::wide_fstore() {
 void TemplateTable::wide_dstore() {
 #ifdef _LP64
  transition(vtos, vtos);
-  __ pop_d();
+  __ pop_d(xmm0);
  locals_index_wide(rbx);
  __ movdbl(daddress(rbx), xmm0);
 #else
@ -1011,29 +1006,21 @@ void TemplateTable::lastore() {
 void TemplateTable::fastore() {
  transition(ftos, vtos);
  __ pop_i(rbx);
-  // xmm0: value
+  // value is in UseSSE >= 1 ? xmm0 : ST(0)
  // rbx:  index
  // rdx:  array
  index_check(rdx, rbx); // prefer index in rbx
-  LP64_ONLY(__ movflt(Address(rdx, rbx,
-                   Address::times_4,
-                   arrayOopDesc::base_offset_in_bytes(T_FLOAT)),
-           xmm0));
-  NOT_LP64(__ fstp_s(Address(rdx, rbx, Address::times_4, arrayOopDesc::base_offset_in_bytes(T_FLOAT))));
+  __ store_float(Address(rdx, rbx, Address::times_4, arrayOopDesc::base_offset_in_bytes(T_FLOAT)));
 }

 void TemplateTable::dastore() {
  transition(dtos, vtos);
  __ pop_i(rbx);
-  // xmm0: value
+  // value is in UseSSE >= 2 ? xmm0 : ST(0)
  // rbx:  index
  // rdx:  array
  index_check(rdx, rbx); // prefer index in rbx
-  LP64_ONLY(__ movdbl(Address(rdx, rbx,
-                   Address::times_8,
-                   arrayOopDesc::base_offset_in_bytes(T_DOUBLE)),
-           xmm0));
-  NOT_LP64(__ fstp_d(Address(rdx, rbx, Address::times_8, arrayOopDesc::base_offset_in_bytes(T_DOUBLE))));
+  __ store_double(Address(rdx, rbx, Address::times_8, arrayOopDesc::base_offset_in_bytes(T_DOUBLE)));
 }

 void TemplateTable::aastore() {
@ -1134,14 +1121,12 @@ void TemplateTable::lstore(int n) {

 void TemplateTable::fstore(int n) {
  transition(ftos, vtos);
-  LP64_ONLY(__ movflt(faddress(n), xmm0));
-  NOT_LP64(__ fstp_s(faddress(n)));
+  __ store_float(faddress(n));
 }

 void TemplateTable::dstore(int n) {
  transition(dtos, vtos);
-  LP64_ONLY(__ movdbl(daddress(n), xmm0));
-  NOT_LP64(__ fstp_d(daddress(n)));
+  __ store_double(daddress(n));
 }


@ -1425,82 +1410,127 @@ void TemplateTable::lushr() {

 void TemplateTable::fop2(Operation op) {
  transition(ftos, ftos);
+
+  if (UseSSE >= 1) {
+    switch (op) {
+    case add:
+      __ addss(xmm0, at_rsp());
+      __ addptr(rsp, Interpreter::stackElementSize);
+      break;
+    case sub:
+      __ movflt(xmm1, xmm0);
+      __ pop_f(xmm0);
+      __ subss(xmm0, xmm1);
+      break;
+    case mul:
+      __ mulss(xmm0, at_rsp());
+      __ addptr(rsp, Interpreter::stackElementSize);
+      break;
+    case div:
+      __ movflt(xmm1, xmm0);
+      __ pop_f(xmm0);
+      __ divss(xmm0, xmm1);
+      break;
+    case rem:
+      // On x86_64 platforms the SharedRuntime::frem method is called to perform the
+      // modulo operation. The frem method calls the function
+      // double fmod(double x, double y) in math.h. The documentation of fmod states:
+      // "If x or y is a NaN, a NaN is returned." without specifying what type of NaN
+      // (signalling or quiet) is returned.
+      //
+      // On x86_32 platforms the FPU is used to perform the modulo operation. The
+      // reason is that on 32-bit Windows the sign of modulo operations diverges from
+      // what is considered the standard (e.g., -0.0f % -3.14f is 0.0f and not -0.0f).
+      // The fprem instruction used on x86_32 is functionally equivalent to
+      // SharedRuntime::frem in that it returns a NaN.
 #ifdef _LP64
-  switch (op) {
-  case add:
-    __ addss(xmm0, at_rsp());
-    __ addptr(rsp, Interpreter::stackElementSize);
-    break;
-  case sub:
-    __ movflt(xmm1, xmm0);
-    __ pop_f(xmm0);
-    __ subss(xmm0, xmm1);
-    break;
-  case mul:
-    __ mulss(xmm0, at_rsp());
-    __ addptr(rsp, Interpreter::stackElementSize);
-    break;
-  case div:
-    __ movflt(xmm1, xmm0);
-    __ pop_f(xmm0);
-    __ divss(xmm0, xmm1);
-    break;
-  case rem:
-    __ movflt(xmm1, xmm0);
-    __ pop_f(xmm0);
-    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::frem), 2);
-    break;
-  default:
-    ShouldNotReachHere();
-    break;
-  }
+      __ movflt(xmm1, xmm0);
+      __ pop_f(xmm0);
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::frem), 2);
 #else
-  switch (op) {
+      __ push_f(xmm0);
+      __ pop_f();
+      __ fld_s(at_rsp());
+      __ fremr(rax);
+      __ f2ieee();
+      __ pop(rax);  // pop second operand off the stack
+      __ push_f();
+      __ pop_f(xmm0);
+#endif
+      break;
+    default:
+      ShouldNotReachHere();
+      break;
+    }
+  } else {
+#ifdef _LP64
+    ShouldNotReachHere();
+#else
+    switch (op) {
    case add: __ fadd_s (at_rsp());                break;
    case sub: __ fsubr_s(at_rsp());                break;
    case mul: __ fmul_s (at_rsp());                break;
    case div: __ fdivr_s(at_rsp());                break;
    case rem: __ fld_s  (at_rsp()); __ fremr(rax); break;
    default : ShouldNotReachHere();
+    }
+    __ f2ieee();
+    __ pop(rax);  // pop second operand off the stack
+#endif // _LP64
  }
-  __ f2ieee();
-  __ pop(rax);  // pop float thing off
-#endif
 }

 void TemplateTable::dop2(Operation op) {
  transition(dtos, dtos);
+  if (UseSSE >= 2) {
+    switch (op) {
+    case add:
+      __ addsd(xmm0, at_rsp());
+      __ addptr(rsp, 2 * Interpreter::stackElementSize);
+      break;
+    case sub:
+      __ movdbl(xmm1, xmm0);
+      __ pop_d(xmm0);
+      __ subsd(xmm0, xmm1);
+      break;
+    case mul:
+      __ mulsd(xmm0, at_rsp());
+      __ addptr(rsp, 2 * Interpreter::stackElementSize);
+      break;
+    case div:
+      __ movdbl(xmm1, xmm0);
+      __ pop_d(xmm0);
+      __ divsd(xmm0, xmm1);
+      break;
+    case rem:
+      // Similar to fop2(), the modulo operation is performed using the
+      // SharedRuntime::drem method (on x86_64 platforms) or using the
+      // FPU (on x86_32 platforms) for the same reasons as mentioned in fop2().
 #ifdef _LP64
-  switch (op) {
-  case add:
-    __ addsd(xmm0, at_rsp());
-    __ addptr(rsp, 2 * Interpreter::stackElementSize);
-    break;
-  case sub:
-    __ movdbl(xmm1, xmm0);
-    __ pop_d(xmm0);
-    __ subsd(xmm0, xmm1);
-    break;
-  case mul:
-    __ mulsd(xmm0, at_rsp());
-    __ addptr(rsp, 2 * Interpreter::stackElementSize);
-    break;
-  case div:
-    __ movdbl(xmm1, xmm0);
-    __ pop_d(xmm0);
-    __ divsd(xmm0, xmm1);
-    break;
-  case rem:
-    __ movdbl(xmm1, xmm0);
-    __ pop_d(xmm0);
-    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::drem), 2);
-    break;
-  default:
-    ShouldNotReachHere();
-    break;
-  }
+      __ movdbl(xmm1, xmm0);
+      __ pop_d(xmm0);
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::drem), 2);
 #else
-  switch (op) {
+      __ push_d(xmm0);
+      __ pop_d();
+      __ fld_d(at_rsp());
+      __ fremr(rax);
+      __ d2ieee();
+      __ pop(rax);
+      __ pop(rdx);
+      __ push_d();
+      __ pop_d(xmm0);
+#endif
+      break;
+    default:
+      ShouldNotReachHere();
+      break;
+    }
+  } else {
+#ifdef _LP64
+    ShouldNotReachHere();
+#else
+    switch (op) {
    case add: __ fadd_d (at_rsp());                break;
    case sub: __ fsubr_d(at_rsp());                break;
    case mul: {
@ -1543,12 +1573,13 @@ void TemplateTable::dop2(Operation op) {
    }
    case rem: __ fld_d  (at_rsp()); __ fremr(rax); break;
    default : ShouldNotReachHere();
-  }
-  __ d2ieee();
-  // Pop double precision number from rsp.
-  __ pop(rax);
-  __ pop(rdx);
+    }
+    __ d2ieee();
+    // Pop double precision number from rsp.
+    __ pop(rax);
+    __ pop(rdx);
 #endif
+  }
 }

 void TemplateTable::ineg() {
@ -1562,7 +1593,6 @@ void TemplateTable::lneg() {
  NOT_LP64(__ lneg(rdx, rax));
 }

-#ifdef _LP64
 // Note: 'double' and 'long long' have 32-bits alignment on x86.
 static jlong* double_quadword(jlong *adr, jlong lo, jlong hi) {
  // Use the expression (adr)&(~0xF) to provide 128-bits aligned address
@ -1577,26 +1607,30 @@ static jlong* double_quadword(jlong *adr, jlong lo, jlong hi) {
 // Buffer for 128-bits masks used by SSE instructions.
 static jlong float_signflip_pool[2*2];
 static jlong double_signflip_pool[2*2];
-#endif

 void TemplateTable::fneg() {
  transition(ftos, ftos);
-#ifdef _LP64
-  static jlong *float_signflip  = double_quadword(&float_signflip_pool[1], 0x8000000080000000, 0x8000000080000000);
-  __ xorps(xmm0, ExternalAddress((address) float_signflip));
-#else
-  __ fchs();
-#endif
+  if (UseSSE >= 1) {
+    static jlong *float_signflip  = double_quadword(&float_signflip_pool[1], 0x8000000080000000, 0x8000000080000000);
+    __ xorps(xmm0, ExternalAddress((address) float_signflip));
+  } else {
+    LP64_ONLY(ShouldNotReachHere());
+    NOT_LP64(__ fchs());
+  }
 }

 void TemplateTable::dneg() {
  transition(dtos, dtos);
+  if (UseSSE >= 2) {
+    static jlong *double_signflip  = double_quadword(&double_signflip_pool[1], 0x8000000000000000, 0x8000000000000000);
+    __ xorpd(xmm0, ExternalAddress((address) double_signflip));
+  } else {
 #ifdef _LP64
-  static jlong *double_signflip  = double_quadword(&double_signflip_pool[1], 0x8000000000000000, 0x8000000000000000);
-  __ xorpd(xmm0, ExternalAddress((address) double_signflip));
+    ShouldNotReachHere();
 #else
-  __ fchs();
+    __ fchs();
 #endif
+  }
 }

 void TemplateTable::iinc() {
@ -1798,18 +1832,26 @@ void TemplateTable::convert() {
      __ extend_sign(rdx, rax);
      break;
    case Bytecodes::_i2f:
-      __ push(rax);          // store int on tos
-      __ fild_s(at_rsp());   // load int to ST0
-      __ f2ieee();           // truncate to float size
-      __ pop(rcx);           // adjust rsp
+      if (UseSSE >= 1) {
+        __ cvtsi2ssl(xmm0, rax);
+      } else {
+        __ push(rax);          // store int on tos
+        __ fild_s(at_rsp());   // load int to ST0
+        __ f2ieee();           // truncate to float size
+        __ pop(rcx);           // adjust rsp
+      }
      break;
    case Bytecodes::_i2d:
+      if (UseSSE >= 2) {
+        __ cvtsi2sdl(xmm0, rax);
+      } else {
      __ push(rax);          // add one slot for d2ieee()
      __ push(rax);          // store int on tos
      __ fild_s(at_rsp());   // load int to ST0
      __ d2ieee();           // truncate to double size
      __ pop(rcx);           // adjust rsp
      __ pop(rcx);
+      }
      break;
    case Bytecodes::_i2b:
      __ shll(rax, 24);      // truncate upper 24 bits
@ -1829,50 +1871,102 @@ void TemplateTable::convert() {
      /* nothing to do */
      break;
    case Bytecodes::_l2f:
+      // On 64-bit platforms, the cvtsi2ssq instruction is used to convert
+      // 64-bit long values to floats. On 32-bit platforms it is not possible
+      // to use that instruction with 64-bit operands, therefore the FPU is
+      // used to perform the conversion.
      __ push(rdx);          // store long on tos
      __ push(rax);
      __ fild_d(at_rsp());   // load long to ST0
      __ f2ieee();           // truncate to float size
      __ pop(rcx);           // adjust rsp
      __ pop(rcx);
+      if (UseSSE >= 1) {
+        __ push_f();
+        __ pop_f(xmm0);
+      }
      break;
    case Bytecodes::_l2d:
+      // On 32-bit platforms the FPU is used for conversion because on
+      // 32-bit platforms it is not possible to use the cvtsi2sdq
+      // instruction with 64-bit operands.
      __ push(rdx);          // store long on tos
      __ push(rax);
      __ fild_d(at_rsp());   // load long to ST0
      __ d2ieee();           // truncate to double size
      __ pop(rcx);           // adjust rsp
      __ pop(rcx);
+      if (UseSSE >= 2) {
+        __ push_d();
+        __ pop_d(xmm0);
+      }
      break;
    case Bytecodes::_f2i:
-      __ push(rcx);          // reserve space for argument
-      __ fstp_s(at_rsp());   // pass float argument on stack
+      // SharedRuntime::f2i does not differentiate between sNaNs and qNaNs
+      // as it returns 0 for any NaN.
+      if (UseSSE >= 1) {
+        __ push_f(xmm0);
+      } else {
+        __ push(rcx);          // reserve space for argument
+        __ fstp_s(at_rsp());   // pass float argument on stack
+      }
      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::f2i), 1);
      break;
    case Bytecodes::_f2l:
-      __ push(rcx);          // reserve space for argument
-      __ fstp_s(at_rsp());   // pass float argument on stack
+      // SharedRuntime::f2l does not differentiate between sNaNs and qNaNs
+      // as it returns 0 for any NaN.
+      if (UseSSE >= 1) {
+       __ push_f(xmm0);
+      } else {
+        __ push(rcx);          // reserve space for argument
+        __ fstp_s(at_rsp());   // pass float argument on stack
+      }
      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::f2l), 1);
      break;
    case Bytecodes::_f2d:
-      /* nothing to do */
+      if (UseSSE < 1) {
+        /* nothing to do */
+      } else if (UseSSE == 1) {
+        __ push_f(xmm0);
+        __ pop_f();
+      } else { // UseSSE >= 2
+        __ cvtss2sd(xmm0, xmm0);
+      }
      break;
    case Bytecodes::_d2i:
-      __ push(rcx);          // reserve space for argument
-      __ push(rcx);
-      __ fstp_d(at_rsp());   // pass double argument on stack
+      if (UseSSE >= 2) {
+        __ push_d(xmm0);
+      } else {
+        __ push(rcx);          // reserve space for argument
+        __ push(rcx);
+        __ fstp_d(at_rsp());   // pass double argument on stack
+      }
      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::d2i), 2);
      break;
    case Bytecodes::_d2l:
-      __ push(rcx);          // reserve space for argument
-      __ push(rcx);
-      __ fstp_d(at_rsp());   // pass double argument on stack
+      if (UseSSE >= 2) {
+        __ push_d(xmm0);
+      } else {
+        __ push(rcx);          // reserve space for argument
+        __ push(rcx);
+        __ fstp_d(at_rsp());   // pass double argument on stack
+      }
      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::d2l), 2);
      break;
    case Bytecodes::_d2f:
-      __ push(rcx);          // reserve space for f2ieee()
-      __ f2ieee();           // truncate to float size
-      __ pop(rcx);           // adjust rsp
+      if (UseSSE <= 1) {
+        __ push(rcx);          // reserve space for f2ieee()
+        __ f2ieee();           // truncate to float size
+        __ pop(rcx);           // adjust rsp
+        if (UseSSE == 1) {
+          // The cvtsd2ss instruction is not available if UseSSE==1, therefore
+          // the conversion is performed using the FPU in this case.
+          __ push_f();
+          __ pop_f(xmm0);
+        }
+      } else { // UseSSE >= 2
+        __ cvtsd2ss(xmm0, xmm0);
+      }
      break;
    default             :
      ShouldNotReachHere();
@ -1901,42 +1995,47 @@ void TemplateTable::lcmp() {
 }

 void TemplateTable::float_cmp(bool is_float, int unordered_result) {
+  if ((is_float && UseSSE >= 1) ||
+      (!is_float && UseSSE >= 2)) {
+    Label done;
+    if (is_float) {
+      // XXX get rid of pop here, use ... reg, mem32
+      __ pop_f(xmm1);
+      __ ucomiss(xmm1, xmm0);
+    } else {
+      // XXX get rid of pop here, use ... reg, mem64
+      __ pop_d(xmm1);
+      __ ucomisd(xmm1, xmm0);
+    }
+    if (unordered_result < 0) {
+      __ movl(rax, -1);
+      __ jccb(Assembler::parity, done);
+      __ jccb(Assembler::below, done);
+      __ setb(Assembler::notEqual, rdx);
+      __ movzbl(rax, rdx);
+    } else {
+      __ movl(rax, 1);
+      __ jccb(Assembler::parity, done);
+      __ jccb(Assembler::above, done);
+      __ movl(rax, 0);
+      __ jccb(Assembler::equal, done);
+      __ decrementl(rax);
+    }
+    __ bind(done);
+  } else {
 #ifdef _LP64
-  Label done;
-  if (is_float) {
-    // XXX get rid of pop here, use ... reg, mem32
-    __ pop_f(xmm1);
-    __ ucomiss(xmm1, xmm0);
-  } else {
-    // XXX get rid of pop here, use ... reg, mem64
-    __ pop_d(xmm1);
-    __ ucomisd(xmm1, xmm0);
-  }
-  if (unordered_result < 0) {
-    __ movl(rax, -1);
-    __ jccb(Assembler::parity, done);
-    __ jccb(Assembler::below, done);
-    __ setb(Assembler::notEqual, rdx);
-    __ movzbl(rax, rdx);
-  } else {
-    __ movl(rax, 1);
-    __ jccb(Assembler::parity, done);
-    __ jccb(Assembler::above, done);
-    __ movl(rax, 0);
-    __ jccb(Assembler::equal, done);
-    __ decrementl(rax);
-  }
-  __ bind(done);
+    ShouldNotReachHere();
 #else
-  if (is_float) {
-    __ fld_s(at_rsp());
-  } else {
-    __ fld_d(at_rsp());
-    __ pop(rdx);
+    if (is_float) {
+      __ fld_s(at_rsp());
+    } else {
+      __ fld_d(at_rsp());
+      __ pop(rdx);
+    }
+    __ pop(rcx);
+    __ fcmp2int(rax, unordered_result < 0);
+#endif // _LP64
  }
-  __ pop(rcx);
-  __ fcmp2int(rax, unordered_result < 0);
-#endif
 }

 void TemplateTable::branch(bool is_jsr, bool is_wide) {
@ -2748,8 +2847,7 @@ void TemplateTable::getfield_or_static(int byte_no, bool is_static, RewriteContr
  __ jcc(Assembler::notEqual, notFloat);
  // ftos

-  LP64_ONLY(__ movflt(xmm0, field));
-  NOT_LP64(__ fld_s(field));
+  __ load_float(field);
  __ push(ftos);
  // Rewrite bytecode to be faster
  if (!is_static && rc == may_rewrite) {
@ -2763,8 +2861,7 @@ void TemplateTable::getfield_or_static(int byte_no, bool is_static, RewriteContr
  __ jcc(Assembler::notEqual, notDouble);
 #endif
  // dtos
-  LP64_ONLY(__ movdbl(xmm0, field));
-  NOT_LP64(__ fld_d(field));
+  __ load_double(field);
  __ push(dtos);
  // Rewrite bytecode to be faster
  if (!is_static && rc == may_rewrite) {
@ -3046,8 +3143,7 @@ void TemplateTable::putfield_or_static(int byte_no, bool is_static, RewriteContr
  {
    __ pop(ftos);
    if (!is_static) pop_and_check_object(obj);
-    NOT_LP64( __ fstp_s(field);)
-    LP64_ONLY( __ movflt(field, xmm0);)
+    __ store_float(field);
    if (!is_static && rc == may_rewrite) {
      patch_bytecode(Bytecodes::_fast_fputfield, bc, rbx, true, byte_no);
    }
@ -3064,8 +3160,7 @@ void TemplateTable::putfield_or_static(int byte_no, bool is_static, RewriteContr
  {
    __ pop(dtos);
    if (!is_static) pop_and_check_object(obj);
-    NOT_LP64( __ fstp_d(field);)
-    LP64_ONLY( __ movdbl(field, xmm0);)
+    __ store_double(field);
    if (!is_static && rc == may_rewrite) {
      patch_bytecode(Bytecodes::_fast_dputfield, bc, rbx, true, byte_no);
    }
@ -3123,8 +3218,8 @@ void TemplateTable::jvmti_post_fast_field_mod() {
    case Bytecodes::_fast_sputfield: // fall through
    case Bytecodes::_fast_cputfield: // fall through
    case Bytecodes::_fast_iputfield: __ push_i(rax); break;
-    case Bytecodes::_fast_dputfield: __ push_d(); break;
-    case Bytecodes::_fast_fputfield: __ push_f(); break;
+    case Bytecodes::_fast_dputfield: __ push(dtos); break;
+    case Bytecodes::_fast_fputfield: __ push(ftos); break;
    case Bytecodes::_fast_lputfield: __ push_l(rax); break;

    default:
@ -3147,8 +3242,8 @@ void TemplateTable::jvmti_post_fast_field_mod() {
    case Bytecodes::_fast_sputfield: // fall through
    case Bytecodes::_fast_cputfield: // fall through
    case Bytecodes::_fast_iputfield: __ pop_i(rax); break;
-    case Bytecodes::_fast_dputfield: __ pop_d(); break;
-    case Bytecodes::_fast_fputfield: __ pop_f(); break;
+    case Bytecodes::_fast_dputfield: __ pop(dtos); break;
+    case Bytecodes::_fast_fputfield: __ pop(ftos); break;
    case Bytecodes::_fast_lputfield: __ pop_l(rax); break;
    }
    __ bind(L2);
@ -3212,12 +3307,10 @@ void TemplateTable::fast_storefield(TosState state) {
    __ movw(field, rax);
    break;
  case Bytecodes::_fast_fputfield:
-    NOT_LP64( __ fstp_s(field); )
-    LP64_ONLY( __ movflt(field, xmm0);)
+    __ store_float(field);
    break;
  case Bytecodes::_fast_dputfield:
-    NOT_LP64( __ fstp_d(field); )
-    LP64_ONLY( __ movdbl(field, xmm0);)
+    __ store_double(field);
    break;
  default:
    ShouldNotReachHere();
@ -3302,12 +3395,10 @@ void TemplateTable::fast_accessfield(TosState state) {
    __ load_unsigned_short(rax, field);
    break;
  case Bytecodes::_fast_fgetfield:
-    LP64_ONLY(__ movflt(xmm0, field));
-    NOT_LP64(__ fld_s(field));
+    __ load_float(field);
    break;
  case Bytecodes::_fast_dgetfield:
-    LP64_ONLY(__ movdbl(xmm0, field));
-    NOT_LP64(__ fld_d(field));
+    __ load_double(field);
    break;
  default:
    ShouldNotReachHere();
@ -3347,8 +3438,7 @@ void TemplateTable::fast_xaccess(TosState state) {
    __ verify_oop(rax);
    break;
  case ftos:
-    LP64_ONLY(__ movflt(xmm0, field));
-    NOT_LP64(__ fld_s(field));
+    __ load_float(field);
    break;
  default:
    ShouldNotReachHere();
--- a/hotspot/src/share/vm/compiler/compileBroker.cpp
+++ b/hotspot/src/share/vm/compiler/compileBroker.cpp
@ -1399,6 +1399,28 @@ nmethod* CompileBroker::compile_method(methodHandle method, int osr_bci,
  // do the compilation
  if (method->is_native()) {
    if (!PreferInterpreterNativeStubs || method->is_method_handle_intrinsic()) {
+      // The following native methods:
+      //
+      // java.lang.Float.intBitsToFloat
+      // java.lang.Float.floatToRawIntBits
+      // java.lang.Double.longBitsToDouble
+      // java.lang.Double.doubleToRawLongBits
+      //
+      // are called through the interpreter even if interpreter native stubs
+      // are not preferred (i.e., calling through adapter handlers is preferred).
+      // The reason is that on x86_32 signaling NaNs (sNaNs) are not preserved
+      // if the version of the methods from the native libraries is called.
+      // As the interpreter and the C2-intrinsified version of the methods preserve
+      // sNaNs, that would result in inconsistent handling of sNaNs.
+      if ((UseSSE >= 1 &&
+          (method->intrinsic_id() == vmIntrinsics::_intBitsToFloat ||
+           method->intrinsic_id() == vmIntrinsics::_floatToRawIntBits)) ||
+          (UseSSE >= 2 &&
+           (method->intrinsic_id() == vmIntrinsics::_longBitsToDouble ||
+            method->intrinsic_id() == vmIntrinsics::_doubleToRawLongBits))) {
+        return NULL;
+      }
+
      // To properly handle the appendix argument for out-of-line calls we are using a small trampoline that
      // pops off the appendix argument and jumps to the target (see gen_special_dispatch in SharedRuntime).
      //
--- a/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp
+++ b/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp
@ -90,6 +90,10 @@ class AbstractInterpreter: AllStatic {
    java_util_zip_CRC32_update,                                 // implementation of java.util.zip.CRC32.update()
    java_util_zip_CRC32_updateBytes,                            // implementation of java.util.zip.CRC32.updateBytes()
    java_util_zip_CRC32_updateByteBuffer,                       // implementation of java.util.zip.CRC32.updateByteBuffer()
+    java_lang_Float_intBitsToFloat,                             // implementation of java.lang.Float.intBitsToFloat()
+    java_lang_Float_floatToRawIntBits,                          // implementation of java.lang.Float.floatToRawIntBits()
+    java_lang_Double_longBitsToDouble,                          // implementation of java.lang.Double.longBitsToDouble()
+    java_lang_Double_doubleToRawLongBits,                       // implementation of java.lang.Double.doubleToRawLongBits()
    number_of_method_entries,
    invalid = -1
  };
--- a/hotspot/src/share/vm/interpreter/interpreter.cpp
+++ b/hotspot/src/share/vm/interpreter/interpreter.cpp
@ -234,7 +234,15 @@ AbstractInterpreter::MethodKind AbstractInterpreter::method_kind(methodHandle m)
      case vmIntrinsics::_updateByteBufferCRC32  : return java_util_zip_CRC32_updateByteBuffer;
    }
  }
-#endif
+
+  switch(m->intrinsic_id()) {
+  case vmIntrinsics::_intBitsToFloat:      return java_lang_Float_intBitsToFloat;
+  case vmIntrinsics::_floatToRawIntBits:   return java_lang_Float_floatToRawIntBits;
+  case vmIntrinsics::_longBitsToDouble:    return java_lang_Double_longBitsToDouble;
+  case vmIntrinsics::_doubleToRawLongBits: return java_lang_Double_doubleToRawLongBits;
+  }
+
+#endif // CC_INTERP

  // Native method?
  // Note: This test must come _before_ the test for intrinsic
@ -559,6 +567,25 @@ address InterpreterGenerator::generate_method_entry(
                                           : // fall thru
  case Interpreter::java_util_zip_CRC32_updateByteBuffer
                                           : entry_point = generate_CRC32_updateBytes_entry(kind); break;
+#if defined(TARGET_ARCH_x86) && !defined(_LP64)
+  // On x86_32 platforms, a special entry is generated for the following four methods.
+  // On other platforms the regular native entry (generate_native_entry) is used to enter these methods.
+  case Interpreter::java_lang_Float_intBitsToFloat
+                                           : entry_point = generate_Float_intBitsToFloat_entry(); break;
+  case Interpreter::java_lang_Float_floatToRawIntBits
+                                           : entry_point = generate_Float_floatToRawIntBits_entry(); break;
+  case Interpreter::java_lang_Double_longBitsToDouble
+                                           : entry_point = generate_Double_longBitsToDouble_entry(); break;
+  case Interpreter::java_lang_Double_doubleToRawLongBits
+                                           : entry_point = generate_Double_doubleToRawLongBits_entry(); break;
+#else
+  case Interpreter::java_lang_Float_intBitsToFloat:
+  case Interpreter::java_lang_Float_floatToRawIntBits:
+  case Interpreter::java_lang_Double_longBitsToDouble:
+  case Interpreter::java_lang_Double_doubleToRawLongBits:
+    entry_point = generate_native_entry(false);
+    break;
+#endif // defined(TARGET_ARCH_x86) && !defined(_LP64)
 #endif // CC_INTERP
  default:
    fatal(err_msg("unexpected method kind: %d", kind));
--- a/hotspot/src/share/vm/interpreter/templateInterpreter.cpp
+++ b/hotspot/src/share/vm/interpreter/templateInterpreter.cpp
@ -397,34 +397,39 @@ void TemplateInterpreterGenerator::generate_all() {

      // all non-native method kinds
      method_entry(zerolocals)
-        method_entry(zerolocals_synchronized)
-        method_entry(empty)
-        method_entry(accessor)
-        method_entry(abstract)
-        method_entry(java_lang_math_sin  )
-        method_entry(java_lang_math_cos  )
-        method_entry(java_lang_math_tan  )
-        method_entry(java_lang_math_abs  )
-        method_entry(java_lang_math_sqrt )
-        method_entry(java_lang_math_log  )
-        method_entry(java_lang_math_log10)
-        method_entry(java_lang_math_exp  )
-        method_entry(java_lang_math_pow  )
-        method_entry(java_lang_ref_reference_get)
+      method_entry(zerolocals_synchronized)
+      method_entry(empty)
+      method_entry(accessor)
+      method_entry(abstract)
+      method_entry(java_lang_math_sin  )
+      method_entry(java_lang_math_cos  )
+      method_entry(java_lang_math_tan  )
+      method_entry(java_lang_math_abs  )
+      method_entry(java_lang_math_sqrt )
+      method_entry(java_lang_math_log  )
+      method_entry(java_lang_math_log10)
+      method_entry(java_lang_math_exp  )
+      method_entry(java_lang_math_pow  )
+      method_entry(java_lang_ref_reference_get)

-        if (UseCRC32Intrinsics) {
-          method_entry(java_util_zip_CRC32_update)
-            method_entry(java_util_zip_CRC32_updateBytes)
-            method_entry(java_util_zip_CRC32_updateByteBuffer)
-            }
+      if (UseCRC32Intrinsics) {
+        method_entry(java_util_zip_CRC32_update)
+        method_entry(java_util_zip_CRC32_updateBytes)
+        method_entry(java_util_zip_CRC32_updateByteBuffer)
+      }
+
+      method_entry(java_lang_Float_intBitsToFloat);
+      method_entry(java_lang_Float_floatToRawIntBits);
+      method_entry(java_lang_Double_longBitsToDouble);
+      method_entry(java_lang_Double_doubleToRawLongBits);

      initialize_method_handle_entries();

      // all native method kinds (must be one contiguous block)
      Interpreter::_native_entry_begin = Interpreter::code()->code_end();
      method_entry(native)
-        method_entry(native_synchronized)
-        Interpreter::_native_entry_end = Interpreter::code()->code_end();
+      method_entry(native_synchronized)
+      Interpreter::_native_entry_end = Interpreter::code()->code_end();

 #undef method_entry

--- a/hotspot/src/share/vm/memory/metaspace.hpp
+++ b/hotspot/src/share/vm/memory/metaspace.hpp
@ -254,7 +254,7 @@ class Metaspace : public CHeapObj<mtClass> {
  // Debugging support
  void verify();

-  static void print_compressed_class_space(outputStream* st, const char* requested_addr = 0);
+  static void print_compressed_class_space(outputStream* st, const char* requested_addr = 0) NOT_LP64({});

  class AllocRecordClosure :  public StackObj {
  public:
--- a/hotspot/src/share/vm/opto/chaitin.cpp
+++ b/hotspot/src/share/vm/opto/chaitin.cpp
@ -990,9 +990,13 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
        // FOUR registers!
 #ifdef ASSERT
        if (is_vect) {
-          assert(lrgmask.is_aligned_sets(lrg.num_regs()), "vector should be aligned");
-          assert(!lrg._fat_proj, "sanity");
-          assert(RegMask::num_registers(kreg) == lrg.num_regs(), "sanity");
+          if (lrg.num_regs() != 0) {
+            assert(lrgmask.is_aligned_sets(lrg.num_regs()), "vector should be aligned");
+            assert(!lrg._fat_proj, "sanity");
+            assert(RegMask::num_registers(kreg) == lrg.num_regs(), "sanity");
+          } else {
+            assert(n->is_Phi(), "not all inputs processed only if Phi");
+          }
        }
 #endif
        if (!is_vect && lrg.num_regs() == 2 && !lrg._fat_proj && rm.is_misaligned_pair()) {
--- a/hotspot/src/share/vm/opto/compile.hpp
+++ b/hotspot/src/share/vm/opto/compile.hpp
@ -93,7 +93,7 @@ class NodeCloneInfo {
 public:

  void set_idx(node_idx_t idx) {
-    _idx_clone_orig = _idx_clone_orig & 0xFFFFFFFF00000000 | idx;
+    _idx_clone_orig = _idx_clone_orig & CONST64(0xFFFFFFFF00000000) | idx;
  }
  node_idx_t idx() const { return (node_idx_t)(_idx_clone_orig & 0xFFFFFFFF); }

--- a/hotspot/src/share/vm/utilities/globalDefinitions_gcc.hpp
+++ b/hotspot/src/share/vm/utilities/globalDefinitions_gcc.hpp
@ -161,7 +161,7 @@ typedef uint64_t julong;


 //----------------------------------------------------------------------------------------------------
-// Constant for jlong (specifying an long long canstant is C++ compiler specific)
+// Constant for jlong (specifying a long long constant is C++ compiler specific)

 // Build a 64bit integer constant
 #define CONST64(x)  (x ## LL)
--- a/hotspot/src/share/vm/utilities/globalDefinitions_sparcWorks.hpp
+++ b/hotspot/src/share/vm/utilities/globalDefinitions_sparcWorks.hpp
@ -178,7 +178,7 @@ typedef unsigned long long julong;


 //----------------------------------------------------------------------------------------------------
-// Constant for jlong (specifying an long long constant is C++ compiler specific)
+// Constant for jlong (specifying a long long constant is C++ compiler specific)

 // Build a 64bit integer constant
 #define CONST64(x)  (x ## LL)
--- a/hotspot/src/share/vm/utilities/globalDefinitions_visCPP.hpp
+++ b/hotspot/src/share/vm/utilities/globalDefinitions_visCPP.hpp
@ -148,9 +148,9 @@ inline int g_isfinite(jfloat  f)                 { return _finite(f); }
 inline int g_isfinite(jdouble f)                 { return _finite(f); }

 //----------------------------------------------------------------------------------------------------
-// Constant for jlong (specifying an long long constant is C++ compiler specific)
+// Constant for jlong (specifying a long long constant is C++ compiler specific)

-// Build a 64bit integer constant on with Visual C++
+// Build a 64bit integer constant with Visual C++
 #define  CONST64(x) (x ##  i64)
 #define UCONST64(x) (x ## ui64)

--- a/hotspot/src/share/vm/utilities/globalDefinitions_xlc.hpp
+++ b/hotspot/src/share/vm/utilities/globalDefinitions_xlc.hpp
@ -108,7 +108,7 @@ typedef uint64_t julong;


 //----------------------------------------------------------------------------------------------------
-// Constant for jlong (specifying an long long canstant is C++ compiler specific)
+// Constant for jlong (specifying a long long constant is C++ compiler specific)

 // Build a 64bit integer constant
 #define CONST64(x)  (x ## LL)
--- a/hotspot/test/compiler/floatingpoint/NaNTest.java
+++ b/hotspot/test/compiler/floatingpoint/NaNTest.java
@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+/**
+ * @test
+ * @bug 8076373
+ * @summary Verify if signaling NaNs are preserved.
+ * @run main NaNTest
+ */
+public class NaNTest { // Round-trips fixed NaN bit patterns through the Float/Double bit-conversion methods and checks they come back unchanged.
+    static void testFloat() { // Throws RuntimeException if the int -> float -> int round trip alters the bit pattern.
+        int originalValue = 0x7f800001; // exponent bits all ones, mantissa == 1: the signaling NaN pattern under test
+        int readBackValue = Float.floatToRawIntBits(Float.intBitsToFloat(originalValue)); // convert to float and straight back to raw bits
+        if (originalValue != readBackValue) { // any bit difference fails the test
+            String errorMessage = String.format("Original and read back float values mismatch\n0x%X 0x%X\n",
+                                                originalValue,
+                                                readBackValue);
+            throw new RuntimeException(errorMessage);
+        } else { // success path only logs the two (identical) values
+            System.out.printf("Written and read back float values match\n0x%X 0x%X\n",
+                              originalValue,
+                              readBackValue);
+        }
+    }
+
+    static void testDouble() { // Throws RuntimeException if the long -> double -> long round trip alters the bit pattern.
+        long originalValue = 0xFFF0000000000001L; // sign bit set, exponent bits all ones, mantissa == 1: the signaling NaN pattern under test
+        long readBackValue = Double.doubleToRawLongBits(Double.longBitsToDouble(originalValue)); // convert to double and straight back to raw bits
+        if (originalValue != readBackValue) { // any bit difference fails the test
+            String errorMessage = String.format("Original and read back double values mismatch\n0x%X 0x%X\n",
+                                                originalValue,
+                                                readBackValue);
+            throw new RuntimeException(errorMessage);
+        } else { // success path only logs the two (identical) values
+            System.out.printf("Written and read back double values match\n0x%X 0x%X\n",
+                              originalValue,
+                              readBackValue);
+        }
+
+    }
+
+    public static void main(String args[]) { // Runs the float check, then the double check; an exception from either propagates out of main.
+        System.out.println("### NanTest started");
+
+        testFloat();
+        testDouble();
+
+        System.out.println("### NanTest ended");
+    }
+}
--- a/hotspot/test/compiler/regalloc/TestVectorRegAlloc.java
+++ b/hotspot/test/compiler/regalloc/TestVectorRegAlloc.java
@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8131969
+ * @summary assert in register allocation code when vector Phi for a loop is processed because code assumes all inputs already processed
+ * @run main/othervm -Xbatch TestVectorRegAlloc
+ *
+ */
+
+public class TestVectorRegAlloc { // Regression test body: main() calls test() many times; passing means no VM assert/crash (nothing is checked explicitly).
+
+    static int test_helper_i; // call counter incremented by test_helper()
+    static boolean test_helper() { // Returns false on every 8th call and true otherwise, bounding the outer loop in test().
+        test_helper_i++;
+        return (test_helper_i & 7) != 0; // low three bits are zero only when the counter is a multiple of 8
+    }
+
+    static void test(double[] src, double[] dst, boolean flag) { // Writes src[i] + j, then src[i] + (flag ? j : 0), into dst on each outer iteration.
+        double j = 0.0; // value carried around the outer while loop
+        while(test_helper()) { // outer loop: runs until test_helper() returns false
+            for (int i = 0; i < src.length; i++) {
+                dst[i] = src[i] + j;
+            }
+            // Loop will be unswitched and ReplicateD of zero will be
+            // split through the Phi of outer loop
+            for (int i = 0; i < src.length; i++) {
+                double k;
+                if (flag) { // loop-invariant condition inside the inner loop
+                    k = j;
+                } else {
+                    k = 0;
+                }
+                dst[i] = src[i] + k;
+            }
+            j++; // changes the carried value for the next outer iteration
+        }
+    }
+
+    static public void main(String[] args) { // Drives test() 20000 times on two zero-filled 10-element arrays.
+        double[] src = new double[10];
+        double[] dst = new double[10];
+        for (int i = 0; i < 20000; i++) { // presumably enough calls to get test() JIT-compiled - TODO confirm
+            test(src, dst, (i % 2) == 0); // flag alternates so both values of the invariant condition are seen
+        }
+    }
+}