Merge branch 'openjdk:master' into JDK-8370196

Zihao Lin 2025-12-02 23:37:48 +08:00 committed by GitHub
commit 30fa1f0380
398 changed files with 10129 additions and 7659 deletions

@ -50,7 +50,14 @@ AC_DEFUN([FLAGS_SETUP_LDFLAGS_HELPER],
# add -z,relro (mark relocations read only) for all libs
# add -z,now ("full relro" - more of the Global Offset Table GOT is marked read only)
# add --no-as-needed to disable default --as-needed link flag on some GCC toolchains
# add --icf=all (Identical Code Folding — merges identical functions)
BASIC_LDFLAGS="-Wl,-z,defs -Wl,-z,relro -Wl,-z,now -Wl,--no-as-needed -Wl,--exclude-libs,ALL"
if test "x$LINKER_TYPE" = "xgold"; then
if test x$DEBUG_LEVEL = xrelease; then
BASIC_LDFLAGS="$BASIC_LDFLAGS -Wl,--icf=all"
fi
fi
# Linux : remove unused code+data in link step
if test "x$ENABLE_LINKTIME_GC" = xtrue; then
if test "x$OPENJDK_TARGET_CPU" = xs390x; then

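As an aside on what the new gold-only flag buys: a minimal C++ sketch of identical code folding (illustrative only, not JDK code; the function names are made up). Under -Wl,--icf=all the linker may merge byte-for-byte identical functions into a single copy, so their addresses can compare equal at run time; the change above restricts the flag to release builds with the gold linker.

// Illustrative sketch: build with g++ -O2 -fuse-ld=gold -Wl,--icf=all
#include <cstdio>

static int add_a(int x, int y) { return x + y; }  // identical bodies,
static int add_b(int x, int y) { return x + y; }  // candidates for folding

int main() {
  // With ICF the two pointers may compare equal; without it they differ.
  std::printf("folded to one address? %s\n", add_a == add_b ? "yes" : "no");
  return 0;
}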
@ -516,6 +516,7 @@ AC_DEFUN([TOOLCHAIN_EXTRACT_LD_VERSION],
if [ [[ "$LINKER_VERSION_STRING" == *gold* ]] ]; then
[ LINKER_VERSION_NUMBER=`$ECHO $LINKER_VERSION_STRING | \
$SED -e 's/.* \([0-9][0-9]*\(\.[0-9][0-9]*\)*\).*) .*/\1/'` ]
LINKER_TYPE=gold
else
[ LINKER_VERSION_NUMBER=`$ECHO $LINKER_VERSION_STRING | \
$SED -e 's/.* \([0-9][0-9]*\(\.[0-9][0-9]*\)*\).*/\1/'` ]

@ -170,6 +170,7 @@ ifeq ($(call check-jvm-feature, compiler2), true)
ifeq ($(HOTSPOT_TARGET_CPU_ARCH), aarch64)
AD_SRC_FILES += $(call uniq, $(wildcard $(foreach d, $(AD_SRC_ROOTS), \
$d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_CPU_ARCH)_vector.ad \
$d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_CPU_ARCH)_atomic.ad \
)))
endif

@ -237,7 +237,7 @@ ifeq ($(ENABLE_HEADLESS_ONLY), false)
DISABLED_WARNINGS_gcc_dgif_lib.c := sign-compare, \
DISABLED_WARNINGS_gcc_jcmaster.c := implicit-fallthrough, \
DISABLED_WARNINGS_gcc_jdphuff.c := shift-negative-value, \
DISABLED_WARNINGS_gcc_png.c := maybe-uninitialized unused-function, \
DISABLED_WARNINGS_gcc_png.c := maybe-uninitialized, \
DISABLED_WARNINGS_gcc_pngerror.c := maybe-uninitialized, \
DISABLED_WARNINGS_gcc_splashscreen_gfx_impl.c := implicit-fallthrough \
maybe-uninitialized, \
@ -248,7 +248,6 @@ ifeq ($(ENABLE_HEADLESS_ONLY), false)
DISABLED_WARNINGS_clang := deprecated-non-prototype, \
DISABLED_WARNINGS_clang_dgif_lib.c := sign-compare, \
DISABLED_WARNINGS_clang_gzwrite.c := format-nonliteral, \
DISABLED_WARNINGS_clang_png.c := unused-function, \
DISABLED_WARNINGS_clang_splashscreen_impl.c := sign-compare \
unused-but-set-variable unused-function, \
DISABLED_WARNINGS_clang_splashscreen_png.c := \

File diff suppressed because it is too large.

@ -0,0 +1,909 @@
// Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2016, 2021, Red Hat Inc. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//
// BEGIN This file is automatically generated. Do not edit --------------
// Sundry CAS operations. Note that release is always true,
// regardless of the memory ordering of the CAS. This is because we
// need the volatile case to be sequentially consistent but there is
// no trailing StoreLoad barrier emitted by C2. Unfortunately we
// can't check the type of memory ordering here, so we always emit a
// STLXR.
// This section is generated from aarch64_atomic_ad.m4
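As a plain-C++ point of comparison for the rationale above (a sketch, not HotSpot code and not part of the generated file): the "volatile" case corresponds to a sequentially consistent compare-exchange, which AArch64 compilers typically lower to an acquiring load-exclusive paired with a releasing store-exclusive (or a CASAL), so the store side always carries release semantics, matching the hard-coded /*release*/ true in the patterns below.

#include <atomic>

// Sketch only: seq_cst ordering on both the success and the failure path.
bool cas_seq_cst(std::atomic<long>& cell, long expected, long desired) {
  return cell.compare_exchange_strong(expected, desired,
                                      std::memory_order_seq_cst);
}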
instruct compareAndExchangeB(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval, rFlagsReg cr) %{
match(Set res (CompareAndExchangeB mem (Binary oldval newval)));
ins_cost(2*VOLATILE_REF_COST);
effect(TEMP_DEF res, KILL cr);
format %{
"cmpxchgb $res = $mem, $oldval, $newval\t# (byte) if $mem == $oldval then $mem <-- $newval"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::byte, /*acquire*/ false, /*release*/ true,
/*weak*/ false, $res$$Register);
__ sxtbw($res$$Register, $res$$Register);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndExchangeS(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval, rFlagsReg cr) %{
match(Set res (CompareAndExchangeS mem (Binary oldval newval)));
ins_cost(2*VOLATILE_REF_COST);
effect(TEMP_DEF res, KILL cr);
format %{
"cmpxchgs $res = $mem, $oldval, $newval\t# (short) if $mem == $oldval then $mem <-- $newval"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::halfword, /*acquire*/ false, /*release*/ true,
/*weak*/ false, $res$$Register);
__ sxthw($res$$Register, $res$$Register);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndExchangeI(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval, rFlagsReg cr) %{
match(Set res (CompareAndExchangeI mem (Binary oldval newval)));
ins_cost(2*VOLATILE_REF_COST);
effect(TEMP_DEF res, KILL cr);
format %{
"cmpxchgw $res = $mem, $oldval, $newval\t# (int) if $mem == $oldval then $mem <-- $newval"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::word, /*acquire*/ false, /*release*/ true,
/*weak*/ false, $res$$Register);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndExchangeL(iRegLNoSp res, indirect mem, iRegL oldval, iRegL newval, rFlagsReg cr) %{
match(Set res (CompareAndExchangeL mem (Binary oldval newval)));
ins_cost(2*VOLATILE_REF_COST);
effect(TEMP_DEF res, KILL cr);
format %{
"cmpxchg $res = $mem, $oldval, $newval\t# (long) if $mem == $oldval then $mem <-- $newval"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::xword, /*acquire*/ false, /*release*/ true,
/*weak*/ false, $res$$Register);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndExchangeN(iRegNNoSp res, indirect mem, iRegN oldval, iRegN newval, rFlagsReg cr) %{
predicate(n->as_LoadStore()->barrier_data() == 0);
match(Set res (CompareAndExchangeN mem (Binary oldval newval)));
ins_cost(2*VOLATILE_REF_COST);
effect(TEMP_DEF res, KILL cr);
format %{
"cmpxchgw $res = $mem, $oldval, $newval\t# (narrow oop) if $mem == $oldval then $mem <-- $newval"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::word, /*acquire*/ false, /*release*/ true,
/*weak*/ false, $res$$Register);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndExchangeP(iRegPNoSp res, indirect mem, iRegP oldval, iRegP newval, rFlagsReg cr) %{
predicate(n->as_LoadStore()->barrier_data() == 0);
match(Set res (CompareAndExchangeP mem (Binary oldval newval)));
ins_cost(2*VOLATILE_REF_COST);
effect(TEMP_DEF res, KILL cr);
format %{
"cmpxchg $res = $mem, $oldval, $newval\t# (ptr) if $mem == $oldval then $mem <-- $newval"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::xword, /*acquire*/ false, /*release*/ true,
/*weak*/ false, $res$$Register);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndExchangeBAcq(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval, rFlagsReg cr) %{
predicate(needs_acquiring_load_exclusive(n));
match(Set res (CompareAndExchangeB mem (Binary oldval newval)));
ins_cost(VOLATILE_REF_COST);
effect(TEMP_DEF res, KILL cr);
format %{
"cmpxchgb_acq $res = $mem, $oldval, $newval\t# (byte) if $mem == $oldval then $mem <-- $newval"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::byte, /*acquire*/ true, /*release*/ true,
/*weak*/ false, $res$$Register);
__ sxtbw($res$$Register, $res$$Register);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndExchangeSAcq(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval, rFlagsReg cr) %{
predicate(needs_acquiring_load_exclusive(n));
match(Set res (CompareAndExchangeS mem (Binary oldval newval)));
ins_cost(VOLATILE_REF_COST);
effect(TEMP_DEF res, KILL cr);
format %{
"cmpxchgs_acq $res = $mem, $oldval, $newval\t# (short) if $mem == $oldval then $mem <-- $newval"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::halfword, /*acquire*/ true, /*release*/ true,
/*weak*/ false, $res$$Register);
__ sxthw($res$$Register, $res$$Register);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndExchangeIAcq(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval, rFlagsReg cr) %{
predicate(needs_acquiring_load_exclusive(n));
match(Set res (CompareAndExchangeI mem (Binary oldval newval)));
ins_cost(VOLATILE_REF_COST);
effect(TEMP_DEF res, KILL cr);
format %{
"cmpxchgw_acq $res = $mem, $oldval, $newval\t# (int) if $mem == $oldval then $mem <-- $newval"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::word, /*acquire*/ true, /*release*/ true,
/*weak*/ false, $res$$Register);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndExchangeLAcq(iRegLNoSp res, indirect mem, iRegL oldval, iRegL newval, rFlagsReg cr) %{
predicate(needs_acquiring_load_exclusive(n));
match(Set res (CompareAndExchangeL mem (Binary oldval newval)));
ins_cost(VOLATILE_REF_COST);
effect(TEMP_DEF res, KILL cr);
format %{
"cmpxchg_acq $res = $mem, $oldval, $newval\t# (long) if $mem == $oldval then $mem <-- $newval"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::xword, /*acquire*/ true, /*release*/ true,
/*weak*/ false, $res$$Register);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndExchangeNAcq(iRegNNoSp res, indirect mem, iRegN oldval, iRegN newval, rFlagsReg cr) %{
predicate(needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() == 0);
match(Set res (CompareAndExchangeN mem (Binary oldval newval)));
ins_cost(VOLATILE_REF_COST);
effect(TEMP_DEF res, KILL cr);
format %{
"cmpxchgw_acq $res = $mem, $oldval, $newval\t# (narrow oop) if $mem == $oldval then $mem <-- $newval"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::word, /*acquire*/ true, /*release*/ true,
/*weak*/ false, $res$$Register);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndExchangePAcq(iRegPNoSp res, indirect mem, iRegP oldval, iRegP newval, rFlagsReg cr) %{
predicate(needs_acquiring_load_exclusive(n) && (n->as_LoadStore()->barrier_data() == 0));
match(Set res (CompareAndExchangeP mem (Binary oldval newval)));
ins_cost(VOLATILE_REF_COST);
effect(TEMP_DEF res, KILL cr);
format %{
"cmpxchg_acq $res = $mem, $oldval, $newval\t# (ptr) if $mem == $oldval then $mem <-- $newval"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::xword, /*acquire*/ true, /*release*/ true,
/*weak*/ false, $res$$Register);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndSwapB(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval, rFlagsReg cr) %{
match(Set res (CompareAndSwapB mem (Binary oldval newval)));
ins_cost(2*VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchgb $res = $mem, $oldval, $newval\t# (byte) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::byte, /*acquire*/ false, /*release*/ true,
/*weak*/ false, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndSwapS(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval, rFlagsReg cr) %{
match(Set res (CompareAndSwapS mem (Binary oldval newval)));
ins_cost(2*VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchgs $res = $mem, $oldval, $newval\t# (short) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::halfword, /*acquire*/ false, /*release*/ true,
/*weak*/ false, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndSwapI(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval, rFlagsReg cr) %{
match(Set res (CompareAndSwapI mem (Binary oldval newval)));
ins_cost(2*VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchgw $res = $mem, $oldval, $newval\t# (int) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::word, /*acquire*/ false, /*release*/ true,
/*weak*/ false, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndSwapL(iRegINoSp res, indirect mem, iRegL oldval, iRegL newval, rFlagsReg cr) %{
match(Set res (CompareAndSwapL mem (Binary oldval newval)));
ins_cost(2*VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchg $res = $mem, $oldval, $newval\t# (long) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::xword, /*acquire*/ false, /*release*/ true,
/*weak*/ false, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndSwapN(iRegINoSp res, indirect mem, iRegN oldval, iRegN newval, rFlagsReg cr) %{
predicate(n->as_LoadStore()->barrier_data() == 0);
match(Set res (CompareAndSwapN mem (Binary oldval newval)));
ins_cost(2*VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchgw $res = $mem, $oldval, $newval\t# (narrow oop) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::word, /*acquire*/ false, /*release*/ true,
/*weak*/ false, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndSwapP(iRegINoSp res, indirect mem, iRegP oldval, iRegP newval, rFlagsReg cr) %{
predicate(n->as_LoadStore()->barrier_data() == 0);
match(Set res (CompareAndSwapP mem (Binary oldval newval)));
ins_cost(2*VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchg $res = $mem, $oldval, $newval\t# (ptr) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::xword, /*acquire*/ false, /*release*/ true,
/*weak*/ false, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndSwapBAcq(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval, rFlagsReg cr) %{
predicate(needs_acquiring_load_exclusive(n));
match(Set res (CompareAndSwapB mem (Binary oldval newval)));
ins_cost(VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchgb_acq $res = $mem, $oldval, $newval\t# (byte) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::byte, /*acquire*/ true, /*release*/ true,
/*weak*/ false, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndSwapSAcq(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval, rFlagsReg cr) %{
predicate(needs_acquiring_load_exclusive(n));
match(Set res (CompareAndSwapS mem (Binary oldval newval)));
ins_cost(VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchgs_acq $res = $mem, $oldval, $newval\t# (short) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::halfword, /*acquire*/ true, /*release*/ true,
/*weak*/ false, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndSwapIAcq(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval, rFlagsReg cr) %{
predicate(needs_acquiring_load_exclusive(n));
match(Set res (CompareAndSwapI mem (Binary oldval newval)));
ins_cost(VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchgw_acq $res = $mem, $oldval, $newval\t# (int) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::word, /*acquire*/ true, /*release*/ true,
/*weak*/ false, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndSwapLAcq(iRegINoSp res, indirect mem, iRegL oldval, iRegL newval, rFlagsReg cr) %{
predicate(needs_acquiring_load_exclusive(n));
match(Set res (CompareAndSwapL mem (Binary oldval newval)));
ins_cost(VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchg_acq $res = $mem, $oldval, $newval\t# (long) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::xword, /*acquire*/ true, /*release*/ true,
/*weak*/ false, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndSwapNAcq(iRegINoSp res, indirect mem, iRegN oldval, iRegN newval, rFlagsReg cr) %{
predicate(needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() == 0);
match(Set res (CompareAndSwapN mem (Binary oldval newval)));
ins_cost(VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchgw_acq $res = $mem, $oldval, $newval\t# (narrow oop) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::word, /*acquire*/ true, /*release*/ true,
/*weak*/ false, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct compareAndSwapPAcq(iRegINoSp res, indirect mem, iRegP oldval, iRegP newval, rFlagsReg cr) %{
predicate(needs_acquiring_load_exclusive(n) && (n->as_LoadStore()->barrier_data() == 0));
match(Set res (CompareAndSwapP mem (Binary oldval newval)));
ins_cost(VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchg_acq $res = $mem, $oldval, $newval\t# (ptr) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::xword, /*acquire*/ true, /*release*/ true,
/*weak*/ false, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct weakCompareAndSwapB(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval, rFlagsReg cr) %{
match(Set res (WeakCompareAndSwapB mem (Binary oldval newval)));
ins_cost(2*VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchgb_weak $res = $mem, $oldval, $newval\t# (byte) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::byte, /*acquire*/ false, /*release*/ true,
/*weak*/ true, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct weakCompareAndSwapS(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval, rFlagsReg cr) %{
match(Set res (WeakCompareAndSwapS mem (Binary oldval newval)));
ins_cost(2*VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchgs_weak $res = $mem, $oldval, $newval\t# (short) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::halfword, /*acquire*/ false, /*release*/ true,
/*weak*/ true, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct weakCompareAndSwapI(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval, rFlagsReg cr) %{
match(Set res (WeakCompareAndSwapI mem (Binary oldval newval)));
ins_cost(2*VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchgw_weak $res = $mem, $oldval, $newval\t# (int) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::word, /*acquire*/ false, /*release*/ true,
/*weak*/ true, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct weakCompareAndSwapL(iRegINoSp res, indirect mem, iRegL oldval, iRegL newval, rFlagsReg cr) %{
match(Set res (WeakCompareAndSwapL mem (Binary oldval newval)));
ins_cost(2*VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchg_weak $res = $mem, $oldval, $newval\t# (long) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::xword, /*acquire*/ false, /*release*/ true,
/*weak*/ true, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct weakCompareAndSwapN(iRegINoSp res, indirect mem, iRegN oldval, iRegN newval, rFlagsReg cr) %{
predicate(n->as_LoadStore()->barrier_data() == 0);
match(Set res (WeakCompareAndSwapN mem (Binary oldval newval)));
ins_cost(2*VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchgw_weak $res = $mem, $oldval, $newval\t# (narrow oop) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::word, /*acquire*/ false, /*release*/ true,
/*weak*/ true, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct weakCompareAndSwapP(iRegINoSp res, indirect mem, iRegP oldval, iRegP newval, rFlagsReg cr) %{
predicate(n->as_LoadStore()->barrier_data() == 0);
match(Set res (WeakCompareAndSwapP mem (Binary oldval newval)));
ins_cost(2*VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchg_weak $res = $mem, $oldval, $newval\t# (ptr) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::xword, /*acquire*/ false, /*release*/ true,
/*weak*/ true, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct weakCompareAndSwapBAcq(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval, rFlagsReg cr) %{
predicate(needs_acquiring_load_exclusive(n));
match(Set res (WeakCompareAndSwapB mem (Binary oldval newval)));
ins_cost(VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchgb_acq_weak $res = $mem, $oldval, $newval\t# (byte) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::byte, /*acquire*/ true, /*release*/ true,
/*weak*/ true, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct weakCompareAndSwapSAcq(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval, rFlagsReg cr) %{
predicate(needs_acquiring_load_exclusive(n));
match(Set res (WeakCompareAndSwapS mem (Binary oldval newval)));
ins_cost(VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchgs_acq_weak $res = $mem, $oldval, $newval\t# (short) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::halfword, /*acquire*/ true, /*release*/ true,
/*weak*/ true, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct weakCompareAndSwapIAcq(iRegINoSp res, indirect mem, iRegI oldval, iRegI newval, rFlagsReg cr) %{
predicate(needs_acquiring_load_exclusive(n));
match(Set res (WeakCompareAndSwapI mem (Binary oldval newval)));
ins_cost(VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchgw_acq_weak $res = $mem, $oldval, $newval\t# (int) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::word, /*acquire*/ true, /*release*/ true,
/*weak*/ true, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct weakCompareAndSwapLAcq(iRegINoSp res, indirect mem, iRegL oldval, iRegL newval, rFlagsReg cr) %{
predicate(needs_acquiring_load_exclusive(n));
match(Set res (WeakCompareAndSwapL mem (Binary oldval newval)));
ins_cost(VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchg_acq_weak $res = $mem, $oldval, $newval\t# (long) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::xword, /*acquire*/ true, /*release*/ true,
/*weak*/ true, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct weakCompareAndSwapNAcq(iRegINoSp res, indirect mem, iRegN oldval, iRegN newval, rFlagsReg cr) %{
predicate(needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() == 0);
match(Set res (WeakCompareAndSwapN mem (Binary oldval newval)));
ins_cost(VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchgw_acq_weak $res = $mem, $oldval, $newval\t# (narrow oop) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::word, /*acquire*/ true, /*release*/ true,
/*weak*/ true, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
instruct weakCompareAndSwapPAcq(iRegINoSp res, indirect mem, iRegP oldval, iRegP newval, rFlagsReg cr) %{
predicate(needs_acquiring_load_exclusive(n) && (n->as_LoadStore()->barrier_data() == 0));
match(Set res (WeakCompareAndSwapP mem (Binary oldval newval)));
ins_cost(VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchg_acq_weak $res = $mem, $oldval, $newval\t# (ptr) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::xword, /*acquire*/ true, /*release*/ true,
/*weak*/ true, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}
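The Weak* patterns above pass /*weak*/ true to cmpxchg, i.e. a CAS that may fail spuriously, leaving the retry to the caller. A hedged std::atomic sketch of that contract (illustrative, not HotSpot code):

#include <atomic>

// A weak CAS can report failure even though the value matched, so the
// caller loops; the strong patterns earlier in this file do not need to.
long cas_weak_retry(std::atomic<long>& cell, long expected, long desired) {
  long observed = expected;
  while (!cell.compare_exchange_weak(observed, desired)) {
    if (observed != expected) {
      break;              // genuine mismatch: stop and report what we saw
    }
    observed = expected;  // spurious failure: retry with the same expected
  }
  return observed;        // equals 'expected' exactly when the swap happened
}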
instruct getAndSetI(indirect mem, iRegI newval, iRegINoSp oldval) %{
match(Set oldval (GetAndSetI mem newval));
ins_cost(2*VOLATILE_REF_COST);
format %{ "atomic_xchgw $oldval, $newval, [$mem]" %}
ins_encode %{
__ atomic_xchgw($oldval$$Register, $newval$$Register, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndSetL(indirect mem, iRegL newval, iRegLNoSp oldval) %{
match(Set oldval (GetAndSetL mem newval));
ins_cost(2*VOLATILE_REF_COST);
format %{ "atomic_xchg $oldval, $newval, [$mem]" %}
ins_encode %{
__ atomic_xchg($oldval$$Register, $newval$$Register, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndSetN(indirect mem, iRegN newval, iRegNNoSp oldval) %{
predicate(n->as_LoadStore()->barrier_data() == 0);
match(Set oldval (GetAndSetN mem newval));
ins_cost(2*VOLATILE_REF_COST);
format %{ "atomic_xchgw $oldval, $newval, [$mem]" %}
ins_encode %{
__ atomic_xchgw($oldval$$Register, $newval$$Register, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndSetP(indirect mem, iRegP newval, iRegPNoSp oldval) %{
predicate(n->as_LoadStore()->barrier_data() == 0);
match(Set oldval (GetAndSetP mem newval));
ins_cost(2*VOLATILE_REF_COST);
format %{ "atomic_xchg $oldval, $newval, [$mem]" %}
ins_encode %{
__ atomic_xchg($oldval$$Register, $newval$$Register, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndSetIAcq(indirect mem, iRegI newval, iRegINoSp oldval) %{
predicate(needs_acquiring_load_exclusive(n));
match(Set oldval (GetAndSetI mem newval));
ins_cost(2*VOLATILE_REF_COST);
format %{ "atomic_xchgw_acq $oldval, $newval, [$mem]" %}
ins_encode %{
__ atomic_xchgalw($oldval$$Register, $newval$$Register, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndSetLAcq(indirect mem, iRegL newval, iRegLNoSp oldval) %{
predicate(needs_acquiring_load_exclusive(n));
match(Set oldval (GetAndSetL mem newval));
ins_cost(2*VOLATILE_REF_COST);
format %{ "atomic_xchg_acq $oldval, $newval, [$mem]" %}
ins_encode %{
__ atomic_xchgal($oldval$$Register, $newval$$Register, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndSetNAcq(indirect mem, iRegN newval, iRegNNoSp oldval) %{
predicate(needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() == 0);
match(Set oldval (GetAndSetN mem newval));
ins_cost(2*VOLATILE_REF_COST);
format %{ "atomic_xchgw_acq $oldval, $newval, [$mem]" %}
ins_encode %{
__ atomic_xchgalw($oldval$$Register, $newval$$Register, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndSetPAcq(indirect mem, iRegP newval, iRegPNoSp oldval) %{
predicate(needs_acquiring_load_exclusive(n) && (n->as_LoadStore()->barrier_data() == 0));
match(Set oldval (GetAndSetP mem newval));
ins_cost(2*VOLATILE_REF_COST);
format %{ "atomic_xchg_acq $oldval, $newval, [$mem]" %}
ins_encode %{
__ atomic_xchgal($oldval$$Register, $newval$$Register, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndAddI(indirect mem, iRegINoSp newval, iRegIorL2I incr) %{
match(Set newval (GetAndAddI mem incr));
ins_cost(2*VOLATILE_REF_COST+1);
format %{ "get_and_addI $newval, [$mem], $incr" %}
ins_encode %{
__ atomic_addw($newval$$Register, $incr$$Register, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndAddIAcq(indirect mem, iRegINoSp newval, iRegIorL2I incr) %{
predicate(needs_acquiring_load_exclusive(n));
match(Set newval (GetAndAddI mem incr));
ins_cost(VOLATILE_REF_COST+1);
format %{ "get_and_addI_acq $newval, [$mem], $incr" %}
ins_encode %{
__ atomic_addalw($newval$$Register, $incr$$Register, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndAddINoRes(indirect mem, Universe dummy, iRegIorL2I incr) %{
predicate(n->as_LoadStore()->result_not_used());
match(Set dummy (GetAndAddI mem incr));
ins_cost(2*VOLATILE_REF_COST);
format %{ "get_and_addI noreg, [$mem], $incr" %}
ins_encode %{
__ atomic_addw(noreg, $incr$$Register, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndAddIAcqNoRes(indirect mem, Universe dummy, iRegIorL2I incr) %{
predicate(n->as_LoadStore()->result_not_used() && needs_acquiring_load_exclusive(n));
match(Set dummy (GetAndAddI mem incr));
ins_cost(VOLATILE_REF_COST);
format %{ "get_and_addI_acq noreg, [$mem], $incr" %}
ins_encode %{
__ atomic_addalw(noreg, $incr$$Register, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndAddIConst(indirect mem, iRegINoSp newval, immIAddSub incr) %{
match(Set newval (GetAndAddI mem incr));
ins_cost(2*VOLATILE_REF_COST+1);
format %{ "get_and_addI $newval, [$mem], $incr" %}
ins_encode %{
__ atomic_addw($newval$$Register, $incr$$constant, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndAddIAcqConst(indirect mem, iRegINoSp newval, immIAddSub incr) %{
predicate(needs_acquiring_load_exclusive(n));
match(Set newval (GetAndAddI mem incr));
ins_cost(VOLATILE_REF_COST+1);
format %{ "get_and_addI_acq $newval, [$mem], $incr" %}
ins_encode %{
__ atomic_addalw($newval$$Register, $incr$$constant, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndAddINoResConst(indirect mem, Universe dummy, immIAddSub incr) %{
predicate(n->as_LoadStore()->result_not_used());
match(Set dummy (GetAndAddI mem incr));
ins_cost(2*VOLATILE_REF_COST);
format %{ "get_and_addI noreg, [$mem], $incr" %}
ins_encode %{
__ atomic_addw(noreg, $incr$$constant, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndAddIAcqNoResConst(indirect mem, Universe dummy, immIAddSub incr) %{
predicate(n->as_LoadStore()->result_not_used() && needs_acquiring_load_exclusive(n));
match(Set dummy (GetAndAddI mem incr));
ins_cost(VOLATILE_REF_COST);
format %{ "get_and_addI_acq noreg, [$mem], $incr" %}
ins_encode %{
__ atomic_addalw(noreg, $incr$$constant, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndAddL(indirect mem, iRegLNoSp newval, iRegL incr) %{
match(Set newval (GetAndAddL mem incr));
ins_cost(2*VOLATILE_REF_COST+1);
format %{ "get_and_addL $newval, [$mem], $incr" %}
ins_encode %{
__ atomic_add($newval$$Register, $incr$$Register, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndAddLAcq(indirect mem, iRegLNoSp newval, iRegL incr) %{
predicate(needs_acquiring_load_exclusive(n));
match(Set newval (GetAndAddL mem incr));
ins_cost(VOLATILE_REF_COST+1);
format %{ "get_and_addL_acq $newval, [$mem], $incr" %}
ins_encode %{
__ atomic_addal($newval$$Register, $incr$$Register, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndAddLNoRes(indirect mem, Universe dummy, iRegL incr) %{
predicate(n->as_LoadStore()->result_not_used());
match(Set dummy (GetAndAddL mem incr));
ins_cost(2*VOLATILE_REF_COST);
format %{ "get_and_addL noreg, [$mem], $incr" %}
ins_encode %{
__ atomic_add(noreg, $incr$$Register, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndAddLAcqNoRes(indirect mem, Universe dummy, iRegL incr) %{
predicate(n->as_LoadStore()->result_not_used() && needs_acquiring_load_exclusive(n));
match(Set dummy (GetAndAddL mem incr));
ins_cost(VOLATILE_REF_COST);
format %{ "get_and_addL_acq noreg, [$mem], $incr" %}
ins_encode %{
__ atomic_addal(noreg, $incr$$Register, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndAddLConst(indirect mem, iRegLNoSp newval, immLAddSub incr) %{
match(Set newval (GetAndAddL mem incr));
ins_cost(2*VOLATILE_REF_COST+1);
format %{ "get_and_addL $newval, [$mem], $incr" %}
ins_encode %{
__ atomic_add($newval$$Register, $incr$$constant, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndAddLAcqConst(indirect mem, iRegLNoSp newval, immLAddSub incr) %{
predicate(needs_acquiring_load_exclusive(n));
match(Set newval (GetAndAddL mem incr));
ins_cost(VOLATILE_REF_COST+1);
format %{ "get_and_addL_acq $newval, [$mem], $incr" %}
ins_encode %{
__ atomic_addal($newval$$Register, $incr$$constant, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndAddLNoResConst(indirect mem, Universe dummy, immLAddSub incr) %{
predicate(n->as_LoadStore()->result_not_used());
match(Set dummy (GetAndAddL mem incr));
ins_cost(2*VOLATILE_REF_COST);
format %{ "get_and_addL noreg, [$mem], $incr" %}
ins_encode %{
__ atomic_add(noreg, $incr$$constant, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}
instruct getAndAddLAcqNoResConst(indirect mem, Universe dummy, immLAddSub incr) %{
predicate(n->as_LoadStore()->result_not_used() && needs_acquiring_load_exclusive(n));
match(Set dummy (GetAndAddL mem incr));
ins_cost(VOLATILE_REF_COST);
format %{ "get_and_addL_acq noreg, [$mem], $incr" %}
ins_encode %{
__ atomic_addal(noreg, $incr$$constant, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}

@ -0,0 +1,246 @@
// Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2016, 2021, Red Hat Inc. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//
// BEGIN This file is automatically generated. Do not edit --------------
// Sundry CAS operations. Note that release is always true,
// regardless of the memory ordering of the CAS. This is because we
// need the volatile case to be sequentially consistent but there is
// no trailing StoreLoad barrier emitted by C2. Unfortunately we
// can't check the type of memory ordering here, so we always emit a
// STLXR.
// This section is generated from aarch64_atomic_ad.m4
dnl Return Arg1 with two spaces before it. We need this because m4
dnl strips leading spaces from macro args.
define(`INDENT', ` $1')dnl
dnl
dnl
dnl
dnl ====================== CompareAndExchange*
dnl
define(`CAE_INSN1',
`
instruct compareAndExchange$1$7(iReg$2NoSp res, indirect mem, iReg$2 oldval, iReg$2 newval, rFlagsReg cr) %{
ifelse($7,Acq,INDENT(predicate(needs_acquiring_load_exclusive(n));),`dnl')
match(Set res (CompareAndExchange$1 mem (Binary oldval newval)));
ins_cost(`'ifelse($7,Acq,,2*)VOLATILE_REF_COST);
effect(TEMP_DEF res, KILL cr);
format %{
"cmpxchg$5`'ifelse($7,Acq,_acq,) $res = $mem, $oldval, $newval\t# ($3) if $mem == $oldval then $mem <-- $newval"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::$4, /*acquire*/ ifelse($7,Acq,true,false), /*release*/ true,
/*weak*/ false, $res$$Register);
__ $6($res$$Register, $res$$Register);
%}
ins_pipe(pipe_slow);
%}')dnl
define(`CAE_INSN2',
`
instruct compareAndExchange$1$6(iReg$2NoSp res, indirect mem, iReg$2 oldval, iReg$2 newval, rFlagsReg cr) %{
ifelse($1$6,PAcq,INDENT(predicate(needs_acquiring_load_exclusive(n) && (n->as_LoadStore()->barrier_data() == 0));),
$1$6,NAcq,INDENT(predicate(needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() == 0);),
$1,P,INDENT(predicate(n->as_LoadStore()->barrier_data() == 0);),
$1,N,INDENT(predicate(n->as_LoadStore()->barrier_data() == 0);),
$6,Acq,INDENT(predicate(needs_acquiring_load_exclusive(n));),
`dnl')
match(Set res (CompareAndExchange$1 mem (Binary oldval newval)));
ins_cost(`'ifelse($6,Acq,,2*)VOLATILE_REF_COST);
effect(TEMP_DEF res, KILL cr);
format %{
"cmpxchg$5`'ifelse($6,Acq,_acq,) $res = $mem, $oldval, $newval\t# ($3) if $mem == $oldval then $mem <-- $newval"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::$4, /*acquire*/ ifelse($6,Acq,true,false), /*release*/ true,
/*weak*/ false, $res$$Register);
%}
ins_pipe(pipe_slow);
%}')dnl
dnl
CAE_INSN1(B, I, byte, byte, b, sxtbw, )
CAE_INSN1(S, I, short, halfword, s, sxthw, )
CAE_INSN2(I, I, int, word, w, , )
CAE_INSN2(L, L, long, xword, , , )
CAE_INSN2(N, N, narrow oop, word, w, , )
CAE_INSN2(P, P, ptr, xword, , , )
dnl
CAE_INSN1(B, I, byte, byte, b, sxtbw, Acq)
CAE_INSN1(S, I, short, halfword, s, sxthw, Acq)
CAE_INSN2(I, I, int, word, w, Acq)
CAE_INSN2(L, L, long, xword, , Acq)
CAE_INSN2(N, N, narrow oop, word, w, Acq)
CAE_INSN2(P, P, ptr, xword, , Acq)
dnl
dnl
dnl
dnl ====================== (Weak)CompareAndSwap*
dnl
define(`CAS_INSN1',
`
instruct ifelse($7,Weak,'weakCompare`,'compare`)AndSwap$1$6(iRegINoSp res, indirect mem, iReg$2 oldval, iReg$2 newval, rFlagsReg cr) %{
ifelse($6,Acq,INDENT(predicate(needs_acquiring_load_exclusive(n));),`dnl')
match(Set res ($7CompareAndSwap$1 mem (Binary oldval newval)));
ins_cost(`'ifelse($6,Acq,,2*)VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchg$5`'ifelse($6,Acq,_acq,)`'ifelse($7,Weak,_weak) $res = $mem, $oldval, $newval\t# ($3) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::$4, /*acquire*/ ifelse($6,Acq,true,false), /*release*/ true,
/*weak*/ ifelse($7,Weak,true,false), noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}')dnl
dnl
define(`CAS_INSN2',
`
instruct ifelse($7,Weak,'weakCompare`,'compare`)AndSwap$1$6(iRegINoSp res, indirect mem, iReg$2 oldval, iReg$2 newval, rFlagsReg cr) %{
ifelse($1$6,PAcq,INDENT(predicate(needs_acquiring_load_exclusive(n) && (n->as_LoadStore()->barrier_data() == 0));),
$1$6,NAcq,INDENT(predicate(needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() == 0);),
$1,P,INDENT(predicate(n->as_LoadStore()->barrier_data() == 0);),
$1,N,INDENT(predicate(n->as_LoadStore()->barrier_data() == 0);),
$6,Acq,INDENT(predicate(needs_acquiring_load_exclusive(n));),
`dnl')
match(Set res ($7CompareAndSwap$1 mem (Binary oldval newval)));
ins_cost(`'ifelse($6,Acq,,2*)VOLATILE_REF_COST);
effect(KILL cr);
format %{
"cmpxchg$5`'ifelse($6,Acq,_acq,)`'ifelse($7,Weak,_weak) $res = $mem, $oldval, $newval\t# ($3) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::$4, /*acquire*/ ifelse($6,Acq,true,false), /*release*/ true,
/*weak*/ ifelse($7,Weak,true,false), noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}')dnl
dnl
CAS_INSN1(B, I, byte, byte, b, , )
CAS_INSN1(S, I, short, halfword, s, , )
CAS_INSN2(I, I, int, word, w, , )
CAS_INSN2(L, L, long, xword, , , )
CAS_INSN2(N, N, narrow oop, word, w, , )
CAS_INSN2(P, P, ptr, xword, , , )
dnl
CAS_INSN1(B, I, byte, byte, b, Acq, )
CAS_INSN1(S, I, short, halfword, s, Acq, )
CAS_INSN2(I, I, int, word, w, Acq, )
CAS_INSN2(L, L, long, xword, , Acq, )
CAS_INSN2(N, N, narrow oop, word, w, Acq, )
CAS_INSN2(P, P, ptr, xword, , Acq, )
dnl
CAS_INSN1(B, I, byte, byte, b, , Weak)
CAS_INSN1(S, I, short, halfword, s, , Weak)
CAS_INSN2(I, I, int, word, w, , Weak)
CAS_INSN2(L, L, long, xword, , , Weak)
CAS_INSN2(N, N, narrow oop, word, w, , Weak)
CAS_INSN2(P, P, ptr, xword, , , Weak)
dnl
CAS_INSN1(B, I, byte, byte, b, Acq, Weak)
CAS_INSN1(S, I, short, halfword, s, Acq, Weak)
CAS_INSN2(I, I, int, word, w, Acq, Weak)
CAS_INSN2(L, L, long, xword, , Acq, Weak)
CAS_INSN2(N, N, narrow oop, word, w, Acq, Weak)
CAS_INSN2(P, P, ptr, xword, , Acq, Weak)
dnl
dnl
dnl
dnl ====================== GetAndSet*
dnl
define(`GAS_INSN1',
`
instruct getAndSet$1$3(indirect mem, iReg$1 newval, iReg$1NoSp oldval) %{
ifelse($1$3,PAcq,INDENT(predicate(needs_acquiring_load_exclusive(n) && (n->as_LoadStore()->barrier_data() == 0));),
$1$3,NAcq,INDENT(predicate(needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() == 0);),
$1,P,INDENT(predicate(n->as_LoadStore()->barrier_data() == 0);),
$1,N,INDENT(predicate(n->as_LoadStore()->barrier_data() == 0);),
$3,Acq,INDENT(predicate(needs_acquiring_load_exclusive(n));),
`dnl')
match(Set oldval (GetAndSet$1 mem newval));
ins_cost(`'ifelse($4,Acq,,2*)VOLATILE_REF_COST);
format %{ "atomic_xchg$2`'ifelse($3,Acq,_acq) $oldval, $newval, [$mem]" %}
ins_encode %{
__ atomic_xchg`'ifelse($3,Acq,al)$2($oldval$$Register, $newval$$Register, as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}')dnl
dnl
GAS_INSN1(I, w, )
GAS_INSN1(L, , )
GAS_INSN1(N, w, )
GAS_INSN1(P, , )
dnl
GAS_INSN1(I, w, Acq)
GAS_INSN1(L, , Acq)
GAS_INSN1(N, w, Acq)
GAS_INSN1(P, , Acq)
dnl
dnl
dnl
dnl ====================== GetAndAdd*
dnl
define(`GAA_INSN1',
`
instruct getAndAdd$1$4$5$6(indirect mem, `'ifelse($5,NoRes,Universe dummy,iReg$1NoSp newval), `'ifelse($6,Const,imm$1AddSub incr,iReg$2 incr)) %{
ifelse($4$5,AcqNoRes,INDENT(predicate(n->as_LoadStore()->result_not_used() && needs_acquiring_load_exclusive(n));),
$5,NoRes,INDENT(predicate(n->as_LoadStore()->result_not_used());),
$4,Acq,INDENT(predicate(needs_acquiring_load_exclusive(n));),
`dnl')
match(Set ifelse($5,NoRes,dummy,newval) (GetAndAdd$1 mem incr));
ins_cost(`'ifelse($4,Acq,,2*)VOLATILE_REF_COST`'ifelse($5,NoRes,,+1));
format %{ "get_and_add$1`'ifelse($4,Acq,_acq) `'ifelse($5,NoRes,noreg,$newval), [$mem], $incr" %}
ins_encode %{
__ atomic_add`'ifelse($4,Acq,al)$3(`'ifelse($5,NoRes,noreg,$newval$$Register), `'ifelse($6,Const,$incr$$constant,$incr$$Register), as_Register($mem$$base));
%}
ins_pipe(pipe_serial);
%}')dnl
dnl
dnl
GAA_INSN1(I, IorL2I, w, , , )
GAA_INSN1(I, IorL2I, w, Acq, , )
GAA_INSN1(I, IorL2I, w, , NoRes, )
GAA_INSN1(I, IorL2I, w, Acq, NoRes, )
GAA_INSN1(I, I, w, , , Const)
GAA_INSN1(I, I, w, Acq, , Const)
GAA_INSN1(I, I, w, , NoRes, Const)
GAA_INSN1(I, I, w, Acq, NoRes, Const)
dnl
GAA_INSN1(L, L, , , , )
GAA_INSN1(L, L, , Acq, , )
GAA_INSN1(L, L, , , NoRes, )
GAA_INSN1(L, L, , Acq, NoRes, )
GAA_INSN1(L, L, , , , Const)
GAA_INSN1(L, L, , Acq, , Const)
GAA_INSN1(L, L, , , NoRes, Const)
GAA_INSN1(L, L, , Acq, NoRes, Const)
dnl

@ -1,161 +0,0 @@
dnl Copyright (c) 2016, 2021, Red Hat Inc. All rights reserved.
dnl DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
dnl
dnl This code is free software; you can redistribute it and/or modify it
dnl under the terms of the GNU General Public License version 2 only, as
dnl published by the Free Software Foundation.
dnl
dnl This code is distributed in the hope that it will be useful, but WITHOUT
dnl ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
dnl FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl version 2 for more details (a copy is included in the LICENSE file that
dnl accompanied this code).
dnl
dnl You should have received a copy of the GNU General Public License version
dnl 2 along with this work; if not, write to the Free Software Foundation,
dnl Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
dnl
dnl Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
dnl or visit www.oracle.com if you need additional information or have any
dnl questions.
dnl
dnl
dnl Process this file with m4 cas.m4 to generate the CAE and wCAS
dnl instructions used in aarch64.ad.
dnl
// BEGIN This section of the file is automatically generated. Do not edit --------------
// Sundry CAS operations. Note that release is always true,
// regardless of the memory ordering of the CAS. This is because we
// need the volatile case to be sequentially consistent but there is
// no trailing StoreLoad barrier emitted by C2. Unfortunately we
// can't check the type of memory ordering here, so we always emit a
// STLXR.
// This section is generated from cas.m4
dnl Return Arg1 with two spaces before it. We need this because m4
dnl strips leading spaces from macro args.
define(`INDENT', ` $1')dnl
dnl
define(`CAS_INSN',
`
// This pattern is generated automatically from cas.m4.
// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE
instruct compareAndExchange$1$6(iReg$2NoSp res, indirect mem, iReg$2 oldval, iReg$2 newval, rFlagsReg cr) %{
ifelse($1$6,PAcq,INDENT(predicate(needs_acquiring_load_exclusive(n) && (n->as_LoadStore()->barrier_data() == 0));),
$1$6,NAcq,INDENT(predicate(needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() == 0);),
$1,P,INDENT(predicate(n->as_LoadStore()->barrier_data() == 0);),
$1,N,INDENT(predicate(n->as_LoadStore()->barrier_data() == 0);),
$6,Acq,INDENT(predicate(needs_acquiring_load_exclusive(n));),
`dnl')
match(Set res (CompareAndExchange$1 mem (Binary oldval newval)));
ifelse($6,Acq,'ins_cost(VOLATILE_REF_COST);`,'ins_cost(2 * VOLATILE_REF_COST);`)
effect(TEMP_DEF res, KILL cr);
format %{
"cmpxchg$5`'ifelse($6,Acq,_acq,) $res = $mem, $oldval, $newval\t# ($3, weak) if $mem == $oldval then $mem <-- $newval"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::$4, /*acquire*/ ifelse($6,Acq,true,false), /*release*/ true,
/*weak*/ false, $res$$Register);
%}
ins_pipe(pipe_slow);
%}')dnl
define(`CAS_INSN4',
`
// This pattern is generated automatically from cas.m4.
// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE
instruct compareAndExchange$1$7(iReg$2NoSp res, indirect mem, iReg$2 oldval, iReg$2 newval, rFlagsReg cr) %{
ifelse($7,Acq,INDENT(predicate(needs_acquiring_load_exclusive(n));),`dnl')
match(Set res (CompareAndExchange$1 mem (Binary oldval newval)));
ifelse($7,Acq,'ins_cost(VOLATILE_REF_COST);`,'ins_cost(2 * VOLATILE_REF_COST);`)
effect(TEMP_DEF res, KILL cr);
format %{
"cmpxchg$5`'ifelse($7,Acq,_acq,) $res = $mem, $oldval, $newval\t# ($3, weak) if $mem == $oldval then $mem <-- $newval"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::$4, /*acquire*/ ifelse($7,Acq,true,false), /*release*/ true,
/*weak*/ false, $res$$Register);
__ $6($res$$Register, $res$$Register);
%}
ins_pipe(pipe_slow);
%}')dnl
CAS_INSN4(B,I,byte,byte,b,sxtbw)
CAS_INSN4(S,I,short,halfword,s,sxthw)
CAS_INSN(I,I,int,word,w)
CAS_INSN(L,L,long,xword)
CAS_INSN(N,N,narrow oop,word,w)
CAS_INSN(P,P,ptr,xword)
dnl
CAS_INSN4(B,I,byte,byte,b,sxtbw,Acq)
CAS_INSN4(S,I,short,halfword,s,sxthw,Acq)
CAS_INSN(I,I,int,word,w,Acq)
CAS_INSN(L,L,long,xword,,Acq)
CAS_INSN(N,N,narrow oop,word,w,Acq)
CAS_INSN(P,P,ptr,xword,,Acq)
dnl
define(`CAS_INSN2',
`
// This pattern is generated automatically from cas.m4.
// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE
instruct weakCompareAndSwap$1$6(iRegINoSp res, indirect mem, iReg$2 oldval, iReg$2 newval, rFlagsReg cr) %{
ifelse($6,Acq,INDENT(predicate(needs_acquiring_load_exclusive(n));),`dnl')
match(Set res (WeakCompareAndSwap$1 mem (Binary oldval newval)));
ifelse($6,Acq,'ins_cost(VOLATILE_REF_COST);`,'ins_cost(2 * VOLATILE_REF_COST);`)
effect(KILL cr);
format %{
"cmpxchg$5`'ifelse($6,Acq,_acq,) $res = $mem, $oldval, $newval\t# ($3, weak) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::$4, /*acquire*/ ifelse($6,Acq,true,false), /*release*/ true,
/*weak*/ true, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}')dnl
define(`CAS_INSN3',
`
// This pattern is generated automatically from cas.m4.
// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE
instruct weakCompareAndSwap$1$6(iRegINoSp res, indirect mem, iReg$2 oldval, iReg$2 newval, rFlagsReg cr) %{
ifelse($1$6,PAcq,INDENT(predicate(needs_acquiring_load_exclusive(n) && (n->as_LoadStore()->barrier_data() == 0));),
$1$6,NAcq,INDENT(predicate(needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() == 0);),
$1,P,INDENT(predicate(n->as_LoadStore()->barrier_data() == 0);),
$1,N,INDENT(predicate(n->as_LoadStore()->barrier_data() == 0);),
$6,Acq,INDENT(predicate(needs_acquiring_load_exclusive(n));),
`dnl')
match(Set res (WeakCompareAndSwap$1 mem (Binary oldval newval)));
ifelse($6,Acq,'ins_cost(VOLATILE_REF_COST);`,'ins_cost(2 * VOLATILE_REF_COST);`)
effect(KILL cr);
format %{
"cmpxchg$5`'ifelse($6,Acq,_acq,) $res = $mem, $oldval, $newval\t# ($3, weak) if $mem == $oldval then $mem <-- $newval"
"csetw $res, EQ\t# $res <-- (EQ ? 1 : 0)"
%}
ins_encode %{
__ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register,
Assembler::$4, /*acquire*/ ifelse($6,Acq,true,false), /*release*/ true,
/*weak*/ true, noreg);
__ csetw($res$$Register, Assembler::EQ);
%}
ins_pipe(pipe_slow);
%}')dnl
CAS_INSN2(B,I,byte,byte,b)
CAS_INSN2(S,I,short,halfword,s)
CAS_INSN3(I,I,int,word,w)
CAS_INSN3(L,L,long,xword)
CAS_INSN3(N,N,narrow oop,word,w)
CAS_INSN3(P,P,ptr,xword)
CAS_INSN2(B,I,byte,byte,b,Acq)
CAS_INSN2(S,I,short,halfword,s,Acq)
CAS_INSN3(I,I,int,word,w,Acq)
CAS_INSN3(L,L,long,xword,,Acq)
CAS_INSN3(N,N,narrow oop,word,w,Acq)
CAS_INSN3(P,P,ptr,xword,,Acq)
dnl
// END This section of the file is automatically generated. Do not edit --------------

@ -1063,6 +1063,10 @@ bool Matcher::is_reg2reg_move(MachNode* m) {
return false;
}
bool Matcher::is_register_biasing_candidate(const MachNode* mdef, int oper_index) {
return false;
}
bool Matcher::is_generic_vector(MachOper* opnd) {
ShouldNotReachHere(); // generic vector operands not supported
return false;

@ -157,6 +157,9 @@ inline D AtomicAccess::PlatformAdd<8>::add_then_fetch(D volatile* dest, I add_va
return result;
}
template<>
struct AtomicAccess::PlatformXchg<1> : AtomicAccess::XchgUsingCmpxchg<1> {};
template<>
template<typename T>
inline T AtomicAccess::PlatformXchg<4>::operator()(T volatile* dest,

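The new one-byte specialization above delegates to XchgUsingCmpxchg, which, as the name suggests, builds exchange out of compare-and-swap. A simplified sketch of that technique (illustrative; not the actual HotSpot helper):

#include <atomic>

template <typename T>
T xchg_via_cas(std::atomic<T>& dest, T new_value) {
  T old = dest.load(std::memory_order_relaxed);
  // compare_exchange_weak refreshes 'old' with the current value whenever
  // it fails, so the loop converges on a successful swap.
  while (!dest.compare_exchange_weak(old, new_value,
                                     std::memory_order_seq_cst,
                                     std::memory_order_relaxed)) {
  }
  return old;  // the previous value, which an exchange must return
}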
@ -2383,6 +2383,10 @@ bool Matcher::is_reg2reg_move(MachNode* m) {
return false;
}
bool Matcher::is_register_biasing_candidate(const MachNode* mdef, int oper_index) {
return false;
}
bool Matcher::is_generic_vector(MachOper* opnd) {
ShouldNotReachHere(); // generic vector operands not supported
return false;

@ -2053,6 +2053,10 @@ bool Matcher::is_reg2reg_move(MachNode* m) {
return false;
}
bool Matcher::is_register_biasing_candidate(const MachNode* mdef, int oper_index) {
return false;
}
bool Matcher::is_generic_vector(MachOper* opnd) {
ShouldNotReachHere(); // generic vector operands not supported
return false;

@ -1865,6 +1865,10 @@ bool Matcher::is_reg2reg_move(MachNode* m) {
return false;
}
bool Matcher::is_register_biasing_candidate(const MachNode* mdef, int oper_index) {
return false;
}
bool Matcher::is_generic_vector(MachOper* opnd) {
ShouldNotReachHere(); // generic vector operands not supported
return false;

@ -89,10 +89,10 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm
void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators,
Register addr, Register count, Register tmp) {
Label done;
Label L_done;
__ testptr(count, count);
__ jcc(Assembler::zero, done);
__ jccb(Assembler::zero, L_done);
// Calculate end address in "count".
Address::ScaleFactor scale = UseCompressedOops ? Address::times_4 : Address::times_8;
@ -111,31 +111,31 @@ void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* mas
__ shrptr(count, CardTable::card_shift());
__ addptr(count, tmp);
Label loop;
Label L_loop;
// Iterate from start card to end card (inclusive).
__ bind(loop);
__ bind(L_loop);
Label is_clean_card;
Label L_is_clean_card;
if (UseCondCardMark) {
__ cmpb(Address(addr, 0), G1CardTable::clean_card_val());
__ jcc(Assembler::equal, is_clean_card);
__ jccb(Assembler::equal, L_is_clean_card);
} else {
__ movb(Address(addr, 0), G1CardTable::dirty_card_val());
}
Label next_card;
__ bind(next_card);
Label L_next_card;
__ bind(L_next_card);
__ addptr(addr, sizeof(CardTable::CardValue));
__ cmpptr(addr, count);
__ jcc(Assembler::belowEqual, loop);
__ jmp(done);
__ jccb(Assembler::belowEqual, L_loop);
__ jmpb(L_done);
__ bind(is_clean_card);
// Card was clean. Dirty card and go to next..
__ bind(L_is_clean_card);
// Card was clean. Dirty card and go to next.
__ movb(Address(addr, 0), G1CardTable::dirty_card_val());
__ jmp(next_card);
__ jmpb(L_next_card);
__ bind(done);
__ bind(L_done);
}
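A plain-C++ outline of the card-range walk above (simplified sketch; the clean/dirty values and the card addresses are stand-ins, not the real G1 constants):

#include <cstdint>

inline void dirty_cards_for_range(uint8_t* card_for_start,
                                  uint8_t* card_for_last,  // inclusive
                                  bool use_cond_card_mark) {
  const uint8_t clean_card = 0xff;  // stand-in for G1CardTable::clean_card_val()
  const uint8_t dirty_card = 0x00;  // stand-in for G1CardTable::dirty_card_val()
  for (uint8_t* card = card_for_start; card <= card_for_last; ++card) {
    if (use_cond_card_mark && *card != clean_card) {
      continue;  // already marked; skip the store (the point of UseCondCardMark)
    }
    *card = dirty_card;
  }
}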
void G1BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
@ -157,22 +157,6 @@ void G1BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorator
}
}
static void generate_queue_insertion(MacroAssembler* masm, ByteSize index_offset, ByteSize buffer_offset, Label& runtime,
const Register thread, const Register value, const Register temp) {
// This code assumes that buffer index is pointer sized.
STATIC_ASSERT(in_bytes(SATBMarkQueue::byte_width_of_index()) == sizeof(intptr_t));
// Can we store a value in the given thread's buffer?
// (The index field is typed as size_t.)
__ movptr(temp, Address(thread, in_bytes(index_offset))); // temp := *(index address)
__ testptr(temp, temp); // index == 0?
__ jcc(Assembler::zero, runtime); // jump to runtime if index == 0 (full buffer)
// The buffer is not full, store value into it.
__ subptr(temp, wordSize); // temp := next index
__ movptr(Address(thread, in_bytes(index_offset)), temp); // *(index address) := next index
__ addptr(temp, Address(thread, in_bytes(buffer_offset))); // temp := buffer address + next index
__ movptr(Address(temp, 0), value); // *(buffer address + next index) := value
}
static void generate_pre_barrier_fast_path(MacroAssembler* masm,
const Register thread) {
Address in_progress(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset()));
@ -190,21 +174,40 @@ static void generate_pre_barrier_slow_path(MacroAssembler* masm,
const Register pre_val,
const Register thread,
const Register tmp,
Label& done,
Label& runtime) {
Label& L_done) {
Address index_addr(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset()));
Address buffer_addr(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_buffer_offset()));
// This code assumes that buffer index is pointer sized.
STATIC_ASSERT(in_bytes(SATBMarkQueue::byte_width_of_index()) == sizeof(intptr_t));
Label L_runtime;
// Do we need to load the previous value?
if (obj != noreg) {
__ load_heap_oop(pre_val, Address(obj, 0), noreg, AS_RAW);
}
// Is the previous value null?
__ cmpptr(pre_val, NULL_WORD);
__ jcc(Assembler::equal, done);
generate_queue_insertion(masm,
G1ThreadLocalData::satb_mark_queue_index_offset(),
G1ThreadLocalData::satb_mark_queue_buffer_offset(),
runtime,
thread, pre_val, tmp);
__ jmp(done);
__ testptr(pre_val, pre_val);
__ jcc(Assembler::equal, L_done);
// Can we store a value in the given thread's buffer?
// (The index field is typed as size_t.)
__ movptr(tmp, index_addr); // temp := *(index address)
__ testptr(tmp, tmp); // index == 0?
__ jccb(Assembler::zero, L_runtime); // jump to runtime if index == 0 (full buffer)
// The buffer is not full, store value into it.
__ subptr(tmp, wordSize); // temp := next index
__ movptr(index_addr, tmp); // *(index address) := next index
__ addptr(tmp, buffer_addr); // temp := buffer address + next index
__ movptr(Address(tmp, 0), pre_val); // *(buffer address + next index) := value
// Jump out if done, or fall through to the runtime call.
// "L_done" is far away, so jump cannot be short.
__ jmp(L_done);
__ bind(L_runtime);
}
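The slow path above now inlines the SATB queue insertion that previously lived in generate_queue_insertion. As a reading aid, a self-contained sketch of the same enqueue logic in plain C++; field names are illustrative and do not reflect the actual G1ThreadLocalData layout:

#include <cstddef>

// Thread-local SATB queue reduced to the two fields the barrier touches.
struct ToySatbQueue {
  size_t index;   // byte offset of the next free slot; 0 means "buffer full"
  void** buffer;  // base address of the thread-local buffer
};

// Returns true if the previous value was handled locally, false if the
// runtime must be called to flush the buffer and enqueue the value.
inline bool toy_satb_enqueue(ToySatbQueue* q, void* pre_val) {
  if (pre_val == nullptr) {
    return true;                 // null previous values are never recorded
  }
  if (q->index == 0) {
    return false;                // buffer full, take the runtime slow path
  }
  q->index -= sizeof(void*);     // claim the next slot (index counts down)
  void** slot = reinterpret_cast<void**>(
      reinterpret_cast<char*>(q->buffer) + q->index);
  *slot = pre_val;
  return true;
}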
void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm,
@ -219,7 +222,6 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm,
const Register thread = r15_thread;
Label done;
Label runtime;
assert(pre_val != noreg, "check this code");
@ -231,9 +233,7 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm,
generate_pre_barrier_fast_path(masm, thread);
// If marking is not active (*(mark queue active address) == 0), jump to done
__ jcc(Assembler::equal, done);
generate_pre_barrier_slow_path(masm, obj, pre_val, thread, tmp, done, runtime);
__ bind(runtime);
generate_pre_barrier_slow_path(masm, obj, pre_val, thread, tmp, done);
// Determine and save the live input values
__ push_call_clobbered_registers();
@ -272,23 +272,23 @@ static void generate_post_barrier(MacroAssembler* masm,
const Register store_addr,
const Register new_val,
const Register tmp1,
Label& done,
bool new_val_may_be_null) {
assert_different_registers(store_addr, new_val, tmp1, noreg);
Register thread = r15_thread;
Label L_done;
// Does store cross heap regions?
__ movptr(tmp1, store_addr); // tmp1 := store address
__ xorptr(tmp1, new_val); // tmp1 := store address ^ new value
__ shrptr(tmp1, G1HeapRegion::LogOfHRGrainBytes); // ((store address ^ new value) >> LogOfHRGrainBytes) == 0?
__ jcc(Assembler::equal, done);
__ jccb(Assembler::equal, L_done);
// Crosses regions, storing null?
if (new_val_may_be_null) {
__ cmpptr(new_val, NULL_WORD); // new value == null?
__ jcc(Assembler::equal, done);
__ testptr(new_val, new_val); // new value == null?
__ jccb(Assembler::equal, L_done);
}
__ movptr(tmp1, store_addr); // tmp1 := store address
@ -298,20 +298,19 @@ static void generate_post_barrier(MacroAssembler* masm,
__ addptr(tmp1, card_table_addr); // tmp1 := card address
if (UseCondCardMark) {
__ cmpb(Address(tmp1, 0), G1CardTable::clean_card_val()); // *(card address) == clean_card_val?
__ jcc(Assembler::notEqual, done);
__ jccb(Assembler::notEqual, L_done);
}
// Storing a region crossing, non-null oop, card is clean.
// Dirty card.
__ movb(Address(tmp1, 0), G1CardTable::dirty_card_val()); // *(card address) := dirty_card_val
__ bind(L_done);
}
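The filtering done by generate_post_barrier can be summarized in plain C++ as below. This is a sketch that assumes 4 MB regions; the real shift comes from G1HeapRegion::LogOfHRGrainBytes, and the subsequent card lookup and dirtying follow the same pattern as the array stub earlier:

#include <cstdint>

const int kLogRegionBytes = 22;  // assumption: 4 MB heap regions

// Returns true if the store needs a card mark at all.
inline bool toy_post_barrier_needs_card_mark(const void* store_addr,
                                             const void* new_val) {
  uintptr_t p = reinterpret_cast<uintptr_t>(store_addr);
  uintptr_t q = reinterpret_cast<uintptr_t>(new_val);
  if (((p ^ q) >> kLogRegionBytes) == 0) {
    return false;                // same region, never recorded
  }
  if (new_val == nullptr) {
    return false;                // null stores create no cross-region reference
  }
  return true;                   // cross-region, non-null: dirty the card
}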
void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register tmp) {
Label done;
generate_post_barrier(masm, store_addr, new_val, tmp, done, true /* new_val_may_be_null */);
__ bind(done);
generate_post_barrier(masm, store_addr, new_val, tmp, true /* new_val_may_be_null */);
}
#if defined(COMPILER2)
@ -354,7 +353,6 @@ void G1BarrierSetAssembler::g1_write_barrier_pre_c2(MacroAssembler* masm,
void G1BarrierSetAssembler::generate_c2_pre_barrier_stub(MacroAssembler* masm,
G1PreBarrierStubC2* stub) const {
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
Label runtime;
Register obj = stub->obj();
Register pre_val = stub->pre_val();
Register thread = stub->thread();
@ -362,9 +360,8 @@ void G1BarrierSetAssembler::generate_c2_pre_barrier_stub(MacroAssembler* masm,
assert(stub->tmp2() == noreg, "not needed in this platform");
__ bind(*stub->entry());
generate_pre_barrier_slow_path(masm, obj, pre_val, thread, tmp, *stub->continuation(), runtime);
generate_pre_barrier_slow_path(masm, obj, pre_val, thread, tmp, *stub->continuation());
__ bind(runtime);
generate_c2_barrier_runtime_call(masm, stub, pre_val, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_pre_entry));
__ jmp(*stub->continuation());
}
@ -374,9 +371,7 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm,
Register new_val,
Register tmp,
bool new_val_may_be_null) {
Label done;
generate_post_barrier(masm, store_addr, new_val, tmp, done, new_val_may_be_null);
__ bind(done);
generate_post_barrier(masm, store_addr, new_val, tmp, new_val_may_be_null);
}
#endif // COMPILER2
@ -449,7 +444,7 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier
ce->mem2reg(stub->addr(), stub->pre_val(), T_OBJECT, stub->patch_code(), stub->info(), false /*wide*/);
}
__ cmpptr(pre_val_reg, NULL_WORD);
__ testptr(pre_val_reg, pre_val_reg);
__ jcc(Assembler::equal, *stub->continuation());
ce->store_parameter(stub->pre_val()->as_register(), 0);
__ call(RuntimeAddress(bs->pre_barrier_c1_runtime_code_blob()->code_begin()));
@ -465,9 +460,7 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm,
Register thread,
Register tmp1,
Register tmp2 /* unused on x86 */) {
Label done;
generate_post_barrier(masm, store_addr, new_val, tmp1, done, true /* new_val_may_be_null */);
masm->bind(done);
generate_post_barrier(masm, store_addr, new_val, tmp1, true /* new_val_may_be_null */);
}
#define __ sasm->
@ -490,8 +483,7 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler*
Address queue_index(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset()));
Address buffer(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_buffer_offset()));
Label done;
Label runtime;
Label L_done, L_runtime;
// Is marking still active?
if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
@ -500,13 +492,13 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler*
assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
__ cmpb(queue_active, 0);
}
__ jcc(Assembler::equal, done);
__ jcc(Assembler::equal, L_done);
// Can we store original value in the thread's buffer?
__ movptr(tmp, queue_index);
__ testptr(tmp, tmp);
__ jcc(Assembler::zero, runtime);
__ jccb(Assembler::zero, L_runtime);
__ subptr(tmp, wordSize);
__ movptr(queue_index, tmp);
__ addptr(tmp, buffer);
@ -514,9 +506,9 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler*
// prev_val (rax)
__ load_parameter(0, pre_val);
__ movptr(Address(tmp, 0), pre_val);
__ jmp(done);
__ jmp(L_done);
__ bind(runtime);
__ bind(L_runtime);
__ push_call_clobbered_registers();
@ -526,7 +518,7 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler*
__ pop_call_clobbered_registers();
__ bind(done);
__ bind(L_done);
__ pop_ppx(rdx);
__ pop_ppx(rax);

View File

@ -3524,10 +3524,10 @@ void StubGenerator::aesgcm_avx512(Register in, Register len, Register ct, Regist
false, true, false, false, false, ghashin_offset, aesout_offset, HashKey_32);
ghash16_avx512(false, true, false, false, true, in, pos, avx512_subkeyHtbl, AAD_HASHx, SHUF_MASK, stack_offset, 16 * 16, 0, HashKey_16);
__ addl(pos, 16 * 16);
__ bind(MESG_BELOW_32_BLKS);
__ subl(len, 16 * 16);
__ addl(pos, 16 * 16);
gcm_enc_dec_last_avx512(len, in, pos, AAD_HASHx, SHUF_MASK, avx512_subkeyHtbl, ghashin_offset, HashKey_16, true, true);
__ bind(GHASH_DONE);
@ -4016,13 +4016,15 @@ void StubGenerator::aesgcm_avx2(Register in, Register len, Register ct, Register
const Register rounds = r10;
const XMMRegister ctr_blockx = xmm9;
const XMMRegister aad_hashx = xmm8;
Label encrypt_done, encrypt_by_8_new, encrypt_by_8;
Label encrypt_done, encrypt_by_8_new, encrypt_by_8, exit;
// This routine should be called only for message sizes of 128 bytes or more.
// Macro flow:
// process 8 16-byte blocks in initial_num_blocks.
// process 8 16-byte blocks at a time until all are done ('encrypt_by_8_new' followed by 'ghash_last_8').
__ xorl(pos, pos);
__ cmpl(len, 128);
__ jcc(Assembler::less, exit);
//Generate 8 constants for htbl
generateHtbl_8_block_avx2(subkeyHtbl);
@ -4090,6 +4092,7 @@ void StubGenerator::aesgcm_avx2(Register in, Register len, Register ct, Register
__ vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
__ vpxor(xmm13, xmm13, xmm13, Assembler::AVX_128bit);
__ bind(exit);
}
#undef __

View File

@ -2633,6 +2633,70 @@ bool Matcher::supports_vector_calling_convention(void) {
return EnableVectorSupport;
}
static bool is_ndd_demotable(const MachNode* mdef) {
return ((mdef->flags() & Node::PD::Flag_ndd_demotable) != 0);
}
static bool is_ndd_demotable_commutative(const MachNode* mdef) {
return ((mdef->flags() & Node::PD::Flag_ndd_demotable_commutative) != 0);
}
static bool is_demotion_candidate(const MachNode* mdef) {
return (is_ndd_demotable(mdef) || is_ndd_demotable_commutative(mdef));
}
bool Matcher::is_register_biasing_candidate(const MachNode* mdef,
int oper_index) {
if (mdef == nullptr) {
return false;
}
if (mdef->num_opnds() <= oper_index || mdef->operand_index(oper_index) < 0 ||
mdef->in(mdef->operand_index(oper_index)) == nullptr) {
assert(oper_index != 1 || !is_demotion_candidate(mdef), "%s", mdef->Name());
assert(oper_index != 2 || !is_ndd_demotable_commutative(mdef), "%s", mdef->Name());
return false;
}
// A complex memory operand covers multiple incoming edges needed for
// address computation. Biasing the def towards any address component will
// not result in NDD demotion by the assembler.
if (mdef->operand_num_edges(oper_index) != 1) {
assert(!is_ndd_demotable(mdef), "%s", mdef->Name());
return false;
}
// Demotion candidate must be register mask compatible with definition.
const RegMask& oper_mask = mdef->in_RegMask(mdef->operand_index(oper_index));
if (!oper_mask.overlap(mdef->out_RegMask())) {
assert(!is_demotion_candidate(mdef), "%s", mdef->Name());
return false;
}
switch (oper_index) {
// The first operand of a MachNode matched by an Intel APX NDD selection
// pattern can share its assigned register with the definition operand if
// their live ranges do not overlap. In that case the assembler can demote
// the instruction to a legacy map0/map1 encoding, replacing its 4-byte
// extended EVEX prefix with a shorter REX/REX2 prefix. Demotion candidates
// are decorated with a special flag by the instruction selector.
case 1:
return is_demotion_candidate(mdef);
// The definition operand of a commutative operation can also be biased
// towards the second operand.
case 2:
return is_ndd_demotable_commutative(mdef);
// The current scheme selects at most two biasing candidates.
default:
assert(false, "unhandled operand index: %s", mdef->Name());
break;
}
return false;
}
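To make the demotion described above concrete: an APX NDD form such as eaddl r10d, r10d, r11d (4-byte extended EVEX prefix) carries no extra information once the definition and the first source share a register, so the assembler may emit the legacy addl r10d, r11d (shorter REX/REX2 encoding) instead. A toy illustration of that check with hypothetical names; the real decision is split between the flags declared below and the assembler's encoder:

#include <cstdint>

enum ToyNodeFlags : uint64_t {
  Toy_ndd_demotable             = uint64_t(1) << 12,
  Toy_ndd_demotable_commutative = uint64_t(1) << 13,
};

struct ToyMachNode {
  uint64_t flags;
  int def_reg;    // register chosen for the definition
  int src1_reg;   // register chosen for the first source operand
};

// Demotion is possible once a flagged node has its definition biased onto
// the first source register.
inline bool toy_can_demote(const ToyMachNode& n) {
  const uint64_t mask = Toy_ndd_demotable | Toy_ndd_demotable_commutative;
  return (n.flags & mask) != 0 && n.def_reg == n.src1_reg;
}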
OptoRegPair Matcher::vector_return_value(uint ideal_reg) {
assert(EnableVectorSupport, "sanity");
int lo = XMM0_num;
@ -2812,7 +2876,7 @@ static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_byte
class Node::PD {
public:
enum NodeFlags {
enum NodeFlags : uint64_t {
Flag_intel_jcc_erratum = Node::_last_flag << 1,
Flag_sets_carry_flag = Node::_last_flag << 2,
Flag_sets_parity_flag = Node::_last_flag << 3,
@ -2824,7 +2888,9 @@ public:
Flag_clears_zero_flag = Node::_last_flag << 9,
Flag_clears_overflow_flag = Node::_last_flag << 10,
Flag_clears_sign_flag = Node::_last_flag << 11,
_last_flag = Flag_clears_sign_flag
Flag_ndd_demotable = Node::_last_flag << 12,
Flag_ndd_demotable_commutative = Node::_last_flag << 13,
_last_flag = Flag_ndd_demotable_commutative
};
};
@ -9801,7 +9867,7 @@ instruct addI_rReg_ndd(rRegI dst, rRegI src1, rRegI src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (AddI src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag, PD::Flag_ndd_demotable_commutative);
format %{ "eaddl $dst, $src1, $src2\t# int ndd" %}
ins_encode %{
@ -9829,7 +9895,7 @@ instruct addI_rReg_rReg_imm_ndd(rRegI dst, rRegI src1, immI src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (AddI src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag, PD::Flag_ndd_demotable);
format %{ "eaddl $dst, $src1, $src2\t# int ndd" %}
ins_encode %{
@ -9872,7 +9938,7 @@ instruct addI_rReg_rReg_mem_ndd(rRegI dst, rRegI src1, memory src2, rFlagsReg cr
predicate(UseAPX);
match(Set dst (AddI src1 (LoadI src2)));
effect(KILL cr);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag, PD::Flag_ndd_demotable_commutative);
ins_cost(150);
format %{ "eaddl $dst, $src1, $src2\t# int ndd" %}
@ -9929,6 +9995,7 @@ instruct incI_rReg_ndd(rRegI dst, rRegI src, immI_1 val, rFlagsReg cr)
predicate(UseAPX && UseIncDec);
match(Set dst (AddI src val));
effect(KILL cr);
flag(PD::Flag_ndd_demotable);
format %{ "eincl $dst, $src\t# int ndd" %}
ins_encode %{
@ -9983,6 +10050,7 @@ instruct decI_rReg_ndd(rRegI dst, rRegI src, immI_M1 val, rFlagsReg cr)
predicate(UseAPX && UseIncDec);
match(Set dst (AddI src val));
effect(KILL cr);
flag(PD::Flag_ndd_demotable);
format %{ "edecl $dst, $src\t# int ndd" %}
ins_encode %{
@ -10089,7 +10157,7 @@ instruct addL_rReg_ndd(rRegL dst, rRegL src1, rRegL src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (AddL src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag, PD::Flag_ndd_demotable_commutative);
format %{ "eaddq $dst, $src1, $src2\t# long ndd" %}
ins_encode %{
@ -10117,7 +10185,7 @@ instruct addL_rReg_rReg_imm_ndd(rRegL dst, rRegL src1, immL32 src2, rFlagsReg cr
predicate(UseAPX);
match(Set dst (AddL src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag, PD::Flag_ndd_demotable);
format %{ "eaddq $dst, $src1, $src2\t# long ndd" %}
ins_encode %{
@ -10160,7 +10228,7 @@ instruct addL_rReg_rReg_mem_ndd(rRegL dst, rRegL src1, memory src2, rFlagsReg cr
predicate(UseAPX);
match(Set dst (AddL src1 (LoadL src2)));
effect(KILL cr);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag, PD::Flag_ndd_demotable_commutative);
ins_cost(150);
format %{ "eaddq $dst, $src1, $src2\t# long ndd" %}
@ -10216,6 +10284,7 @@ instruct incL_rReg_ndd(rRegL dst, rRegI src, immL1 val, rFlagsReg cr)
predicate(UseAPX && UseIncDec);
match(Set dst (AddL src val));
effect(KILL cr);
flag(PD::Flag_ndd_demotable);
format %{ "eincq $dst, $src\t# long ndd" %}
ins_encode %{
@ -10270,6 +10339,7 @@ instruct decL_rReg_ndd(rRegL dst, rRegL src, immL_M1 val, rFlagsReg cr)
predicate(UseAPX && UseIncDec);
match(Set dst (AddL src val));
effect(KILL cr);
flag(PD::Flag_ndd_demotable);
format %{ "edecq $dst, $src\t# long ndd" %}
ins_encode %{
@ -10984,7 +11054,7 @@ instruct subI_rReg_ndd(rRegI dst, rRegI src1, rRegI src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (SubI src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag, PD::Flag_ndd_demotable);
format %{ "esubl $dst, $src1, $src2\t# int ndd" %}
ins_encode %{
@ -10998,7 +11068,7 @@ instruct subI_rReg_rReg_imm_ndd(rRegI dst, rRegI src1, immI src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (SubI src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag, PD::Flag_ndd_demotable);
format %{ "esubl $dst, $src1, $src2\t# int ndd" %}
ins_encode %{
@ -11041,7 +11111,7 @@ instruct subI_rReg_rReg_mem_ndd(rRegI dst, rRegI src1, memory src2, rFlagsReg cr
predicate(UseAPX);
match(Set dst (SubI src1 (LoadI src2)));
effect(KILL cr);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag, PD::Flag_ndd_demotable);
ins_cost(150);
format %{ "esubl $dst, $src1, $src2\t# int ndd" %}
@ -11099,7 +11169,7 @@ instruct subL_rReg_ndd(rRegL dst, rRegL src1, rRegL src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (SubL src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag, PD::Flag_ndd_demotable);
format %{ "esubq $dst, $src1, $src2\t# long ndd" %}
ins_encode %{
@ -11113,7 +11183,7 @@ instruct subL_rReg_rReg_imm_ndd(rRegL dst, rRegL src1, immL32 src2, rFlagsReg cr
predicate(UseAPX);
match(Set dst (SubL src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag, PD::Flag_ndd_demotable);
format %{ "esubq $dst, $src1, $src2\t# long ndd" %}
ins_encode %{
@ -11156,7 +11226,7 @@ instruct subL_rReg_rReg_mem_ndd(rRegL dst, rRegL src1, memory src2, rFlagsReg cr
predicate(UseAPX);
match(Set dst (SubL src1 (LoadL src2)));
effect(KILL cr);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_carry_flag, PD::Flag_sets_parity_flag, PD::Flag_ndd_demotable);
ins_cost(150);
format %{ "esubq $dst, $src1, $src2\t# long ndd" %}
@ -11228,7 +11298,7 @@ instruct negI_rReg_ndd(rRegI dst, rRegI src, immI_0 zero, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (SubI zero src));
effect(KILL cr);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_ndd_demotable);
format %{ "enegl $dst, $src\t# int ndd" %}
ins_encode %{
@ -11256,7 +11326,7 @@ instruct negI_rReg_2_ndd(rRegI dst, rRegI src, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (NegI src));
effect(KILL cr);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_ndd_demotable);
format %{ "enegl $dst, $src\t# int ndd" %}
ins_encode %{
@ -11297,7 +11367,7 @@ instruct negL_rReg_ndd(rRegL dst, rRegL src, immL0 zero, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (SubL zero src));
effect(KILL cr);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_ndd_demotable);
format %{ "enegq $dst, $src\t# long ndd" %}
ins_encode %{
@ -11325,7 +11395,7 @@ instruct negL_rReg_2_ndd(rRegL dst, rRegL src, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (NegL src));
effect(KILL cr);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag);
flag(PD::Flag_sets_overflow_flag, PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_ndd_demotable);
format %{ "enegq $dst, $src\t# long ndd" %}
ins_encode %{
@ -11370,6 +11440,7 @@ instruct mulI_rReg_ndd(rRegI dst, rRegI src1, rRegI src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (MulI src1 src2));
effect(KILL cr);
flag(PD::Flag_ndd_demotable_commutative);
ins_cost(300);
format %{ "eimull $dst, $src1, $src2\t# int ndd" %}
@ -11411,6 +11482,7 @@ instruct mulI_rReg_rReg_mem_ndd(rRegI dst, rRegI src1, memory src2, rFlagsReg cr
predicate(UseAPX);
match(Set dst (MulI src1 (LoadI src2)));
effect(KILL cr);
flag(PD::Flag_ndd_demotable);
ins_cost(350);
format %{ "eimull $dst, $src1, $src2\t# int ndd" %}
@ -11462,6 +11534,7 @@ instruct mulL_rReg_ndd(rRegL dst, rRegL src1, rRegL src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (MulL src1 src2));
effect(KILL cr);
flag(PD::Flag_ndd_demotable_commutative);
ins_cost(300);
format %{ "eimulq $dst, $src1, $src2\t# long ndd" %}
@ -11503,6 +11576,7 @@ instruct mulL_rReg_rReg_mem_ndd(rRegL dst, rRegL src1, memory src2, rFlagsReg cr
predicate(UseAPX);
match(Set dst (MulL src1 (LoadL src2)));
effect(KILL cr);
flag(PD::Flag_ndd_demotable_commutative);
ins_cost(350);
format %{ "eimulq $dst, $src1, $src2 \t# long" %}
@ -11777,6 +11851,7 @@ instruct salI_rReg_immI2_ndd(rRegI dst, rRegI src, immI2 shift, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (LShiftI src shift));
effect(KILL cr);
flag(PD::Flag_ndd_demotable);
format %{ "esall $dst, $src, $shift\t# int(ndd)" %}
ins_encode %{
@ -11805,6 +11880,7 @@ instruct salI_rReg_imm_ndd(rRegI dst, rRegI src, immI8 shift, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (LShiftI src shift));
effect(KILL cr);
flag(PD::Flag_ndd_demotable);
format %{ "esall $dst, $src, $shift\t# int (ndd)" %}
ins_encode %{
@ -11911,6 +11987,7 @@ instruct sarI_rReg_imm_ndd(rRegI dst, rRegI src, immI8 shift, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (RShiftI src shift));
effect(KILL cr);
flag(PD::Flag_ndd_demotable);
format %{ "esarl $dst, $src, $shift\t# int (ndd)" %}
ins_encode %{
@ -12017,6 +12094,7 @@ instruct shrI_rReg_imm_ndd(rRegI dst, rRegI src, immI8 shift, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (URShiftI src shift));
effect(KILL cr);
flag(PD::Flag_ndd_demotable);
format %{ "eshrl $dst, $src, $shift\t # int (ndd)" %}
ins_encode %{
@ -12124,6 +12202,7 @@ instruct salL_rReg_immI2_ndd(rRegL dst, rRegL src, immI2 shift, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (LShiftL src shift));
effect(KILL cr);
flag(PD::Flag_ndd_demotable);
format %{ "esalq $dst, $src, $shift\t# long (ndd)" %}
ins_encode %{
@ -12152,6 +12231,7 @@ instruct salL_rReg_imm_ndd(rRegL dst, rRegL src, immI8 shift, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (LShiftL src shift));
effect(KILL cr);
flag(PD::Flag_ndd_demotable);
format %{ "esalq $dst, $src, $shift\t# long (ndd)" %}
ins_encode %{
@ -12258,6 +12338,7 @@ instruct sarL_rReg_imm_ndd(rRegL dst, rRegL src, immI shift, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (RShiftL src shift));
effect(KILL cr);
flag(PD::Flag_ndd_demotable);
format %{ "esarq $dst, $src, $shift\t# long (ndd)" %}
ins_encode %{
@ -12364,6 +12445,7 @@ instruct shrL_rReg_imm_ndd(rRegL dst, rRegL src, immI8 shift, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (URShiftL src shift));
effect(KILL cr);
flag(PD::Flag_ndd_demotable);
format %{ "eshrq $dst, $src, $shift\t# long (ndd)" %}
ins_encode %{
@ -12535,6 +12617,7 @@ instruct rolI_rReg_Var_ndd(rRegI dst, rRegI src, rcx_RegI shift, rFlagsReg cr)
predicate(UseAPX && n->bottom_type()->basic_type() == T_INT);
match(Set dst (RotateLeft src shift));
effect(KILL cr);
flag(PD::Flag_ndd_demotable);
format %{ "eroll $dst, $src, $shift\t# rotate left (int ndd)" %}
ins_encode %{
@ -12599,6 +12682,7 @@ instruct rorI_rReg_Var_ndd(rRegI dst, rRegI src, rcx_RegI shift, rFlagsReg cr)
predicate(UseAPX && n->bottom_type()->basic_type() == T_INT);
match(Set dst (RotateRight src shift));
effect(KILL cr);
flag(PD::Flag_ndd_demotable);
format %{ "erorl $dst, $src, $shift\t# rotate right(int ndd)" %}
ins_encode %{
@ -12651,6 +12735,7 @@ instruct rolL_rReg_Var(rRegL dst, rcx_RegI shift, rFlagsReg cr)
predicate(!UseAPX && n->bottom_type()->basic_type() == T_LONG);
match(Set dst (RotateLeft dst shift));
effect(KILL cr);
format %{ "rolq $dst, $shift" %}
ins_encode %{
__ rolq($dst$$Register);
@ -12664,6 +12749,7 @@ instruct rolL_rReg_Var_ndd(rRegL dst, rRegL src, rcx_RegI shift, rFlagsReg cr)
predicate(UseAPX && n->bottom_type()->basic_type() == T_LONG);
match(Set dst (RotateLeft src shift));
effect(KILL cr);
flag(PD::Flag_ndd_demotable);
format %{ "erolq $dst, $src, $shift\t# rotate left(long ndd)" %}
ins_encode %{
@ -12728,6 +12814,7 @@ instruct rorL_rReg_Var_ndd(rRegL dst, rRegL src, rcx_RegI shift, rFlagsReg cr)
predicate(UseAPX && n->bottom_type()->basic_type() == T_LONG);
match(Set dst (RotateRight src shift));
effect(KILL cr);
flag(PD::Flag_ndd_demotable);
format %{ "erorq $dst, $src, $shift\t# rotate right(long ndd)" %}
ins_encode %{
@ -12805,7 +12892,7 @@ instruct andI_rReg_ndd(rRegI dst, rRegI src1, rRegI src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (AndI src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable_commutative);
format %{ "eandl $dst, $src1, $src2\t# int ndd" %}
ins_encode %{
@ -12898,7 +12985,7 @@ instruct andI_rReg_rReg_imm_ndd(rRegI dst, rRegI src1, immI src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (AndI src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable);
format %{ "eandl $dst, $src1, $src2\t# int ndd" %}
ins_encode %{
@ -12942,7 +13029,7 @@ instruct andI_rReg_rReg_mem_ndd(rRegI dst, rRegI src1, memory src2, rFlagsReg cr
predicate(UseAPX);
match(Set dst (AndI src1 (LoadI src2)));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable_commutative);
ins_cost(150);
format %{ "eandl $dst, $src1, $src2\t# int ndd" %}
@ -13142,7 +13229,7 @@ instruct orI_rReg_ndd(rRegI dst, rRegI src1, rRegI src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (OrI src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable_commutative);
format %{ "eorl $dst, $src1, $src2\t# int ndd" %}
ins_encode %{
@ -13171,7 +13258,7 @@ instruct orI_rReg_rReg_imm_ndd(rRegI dst, rRegI src1, immI src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (OrI src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable);
format %{ "eorl $dst, $src1, $src2\t# int ndd" %}
ins_encode %{
@ -13185,7 +13272,7 @@ instruct orI_rReg_imm_rReg_ndd(rRegI dst, immI src1, rRegI src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (OrI src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable);
format %{ "eorl $dst, $src2, $src1\t# int ndd" %}
ins_encode %{
@ -13229,7 +13316,7 @@ instruct orI_rReg_rReg_mem_ndd(rRegI dst, rRegI src1, memory src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (OrI src1 (LoadI src2)));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable);
ins_cost(150);
format %{ "eorl $dst, $src1, $src2\t# int ndd" %}
@ -13305,7 +13392,7 @@ instruct xorI_rReg_ndd(rRegI dst, rRegI src1, rRegI src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (XorI src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable_commutative);
format %{ "exorl $dst, $src1, $src2\t# int ndd" %}
ins_encode %{
@ -13331,6 +13418,7 @@ instruct xorI_rReg_im1_ndd(rRegI dst, rRegI src, immI_M1 imm)
%{
match(Set dst (XorI src imm));
predicate(UseAPX);
flag(PD::Flag_ndd_demotable);
format %{ "enotl $dst, $src" %}
ins_encode %{
@ -13361,7 +13449,7 @@ instruct xorI_rReg_rReg_imm_ndd(rRegI dst, rRegI src1, immI src2, rFlagsReg cr)
predicate(UseAPX && n->in(2)->bottom_type()->is_int()->get_con() != -1);
match(Set dst (XorI src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable);
format %{ "exorl $dst, $src1, $src2\t# int ndd" %}
ins_encode %{
@ -13407,7 +13495,7 @@ instruct xorI_rReg_rReg_mem_ndd(rRegI dst, rRegI src1, memory src2, rFlagsReg cr
predicate(UseAPX);
match(Set dst (XorI src1 (LoadI src2)));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable);
ins_cost(150);
format %{ "exorl $dst, $src1, $src2\t# int ndd" %}
@ -13486,7 +13574,7 @@ instruct andL_rReg_ndd(rRegL dst, rRegL src1, rRegL src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (AndL src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable_commutative);
format %{ "eandq $dst, $src1, $src2\t# long ndd" %}
ins_encode %{
@ -13542,7 +13630,7 @@ instruct andL_rReg_rReg_imm_ndd(rRegL dst, rRegL src1, immL32 src2, rFlagsReg cr
predicate(UseAPX);
match(Set dst (AndL src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable);
format %{ "eandq $dst, $src1, $src2\t# long ndd" %}
ins_encode %{
@ -13586,7 +13674,7 @@ instruct andL_rReg_rReg_mem_ndd(rRegL dst, rRegL src1, memory src2, rFlagsReg cr
predicate(UseAPX);
match(Set dst (AndL src1 (LoadL src2)));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable_commutative);
ins_cost(150);
format %{ "eandq $dst, $src1, $src2\t# long ndd" %}
@ -13789,7 +13877,7 @@ instruct orL_rReg_ndd(rRegL dst, rRegL src1, rRegL src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (OrL src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable_commutative);
format %{ "eorq $dst, $src1, $src2\t# long ndd" %}
ins_encode %{
@ -13844,7 +13932,7 @@ instruct orL_rReg_rReg_imm_ndd(rRegL dst, rRegL src1, immL32 src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (OrL src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable);
format %{ "eorq $dst, $src1, $src2\t# long ndd" %}
ins_encode %{
@ -13858,7 +13946,7 @@ instruct orL_rReg_imm_rReg_ndd(rRegL dst, immL32 src1, rRegL src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (OrL src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable);
format %{ "eorq $dst, $src2, $src1\t# long ndd" %}
ins_encode %{
@ -13903,7 +13991,7 @@ instruct orL_rReg_rReg_mem_ndd(rRegL dst, rRegL src1, memory src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (OrL src1 (LoadL src2)));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable_commutative);
ins_cost(150);
format %{ "eorq $dst, $src1, $src2\t# long ndd" %}
@ -13982,7 +14070,7 @@ instruct xorL_rReg_ndd(rRegL dst, rRegL src1, rRegL src2, rFlagsReg cr)
predicate(UseAPX);
match(Set dst (XorL src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable_commutative);
format %{ "exorq $dst, $src1, $src2\t# long ndd" %}
ins_encode %{
@ -14008,6 +14096,7 @@ instruct xorL_rReg_im1_ndd(rRegL dst,rRegL src, immL_M1 imm)
%{
predicate(UseAPX);
match(Set dst (XorL src imm));
flag(PD::Flag_ndd_demotable);
format %{ "enotq $dst, $src" %}
ins_encode %{
@ -14038,7 +14127,7 @@ instruct xorL_rReg_rReg_imm(rRegL dst, rRegL src1, immL32 src2, rFlagsReg cr)
predicate(UseAPX && n->in(2)->bottom_type()->is_long()->get_con() != -1L);
match(Set dst (XorL src1 src2));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable);
format %{ "exorq $dst, $src1, $src2\t# long ndd" %}
ins_encode %{
@ -14084,7 +14173,7 @@ instruct xorL_rReg_rReg_mem_ndd(rRegL dst, rRegL src1, memory src2, rFlagsReg cr
predicate(UseAPX);
match(Set dst (XorL src1 (LoadL src2)));
effect(KILL cr);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag);
flag(PD::Flag_sets_sign_flag, PD::Flag_sets_zero_flag, PD::Flag_sets_parity_flag, PD::Flag_clears_overflow_flag, PD::Flag_clears_carry_flag, PD::Flag_ndd_demotable_commutative);
ins_cost(150);
format %{ "exorq $dst, $src1, $src2\t# long ndd" %}
@ -16539,6 +16628,7 @@ instruct minI_rReg_ndd(rRegI dst, rRegI src1, rRegI src2)
predicate(UseAPX);
match(Set dst (MinI src1 src2));
effect(DEF dst, USE src1, USE src2);
flag(PD::Flag_ndd_demotable);
ins_cost(200);
expand %{
@ -16590,6 +16680,7 @@ instruct maxI_rReg_ndd(rRegI dst, rRegI src1, rRegI src2)
predicate(UseAPX);
match(Set dst (MaxI src1 src2));
effect(DEF dst, USE src1, USE src2);
flag(PD::Flag_ndd_demotable);
ins_cost(200);
expand %{

View File

@ -1038,6 +1038,8 @@ static void* dll_load_library(const char *filename, int *eno, char *ebuf, int eb
dflags |= RTLD_MEMBER;
}
Events::log_dll_message(nullptr, "Attempting to load shared library %s", filename);
void* result;
const char* error_report = nullptr;
JFR_ONLY(NativeLibraryLoadEvent load_event(filename, &result);)

View File

@ -1035,6 +1035,8 @@ void *os::Bsd::dlopen_helper(const char *filename, int mode, char *ebuf, int ebu
int rtn = fegetenv(&default_fenv);
assert(rtn == 0, "fegetenv must succeed");
Events::log_dll_message(nullptr, "Attempting to load shared library %s", filename);
void* result;
JFR_ONLY(NativeLibraryLoadEvent load_event(filename, &result);)
result = ::dlopen(filename, RTLD_LAZY);

View File

@ -159,9 +159,7 @@ physical_memory_size_type os::Linux::_physical_memory = 0;
address os::Linux::_initial_thread_stack_bottom = nullptr;
uintptr_t os::Linux::_initial_thread_stack_size = 0;
int (*os::Linux::_pthread_getcpuclockid)(pthread_t, clockid_t *) = nullptr;
pthread_t os::Linux::_main_thread;
bool os::Linux::_supports_fast_thread_cpu_time = false;
const char * os::Linux::_libc_version = nullptr;
const char * os::Linux::_libpthread_version = nullptr;
@ -1475,29 +1473,6 @@ void os::Linux::capture_initial_stack(size_t max_size) {
////////////////////////////////////////////////////////////////////////////////
// time support
void os::Linux::fast_thread_clock_init() {
clockid_t clockid;
struct timespec tp;
int (*pthread_getcpuclockid_func)(pthread_t, clockid_t *) =
(int(*)(pthread_t, clockid_t *)) dlsym(RTLD_DEFAULT, "pthread_getcpuclockid");
// Switch to using fast clocks for thread cpu time if
// the clock_getres() returns 0 error code.
// Note, that some kernels may support the current thread
// clock (CLOCK_THREAD_CPUTIME_ID) but not the clocks
// returned by the pthread_getcpuclockid().
// If the fast POSIX clocks are supported then the clock_getres()
// must return at least tp.tv_sec == 0 which means a resolution
// better than 1 sec. This is extra check for reliability.
if (pthread_getcpuclockid_func &&
pthread_getcpuclockid_func(_main_thread, &clockid) == 0 &&
clock_getres(clockid, &tp) == 0 && tp.tv_sec == 0) {
_supports_fast_thread_cpu_time = true;
_pthread_getcpuclockid = pthread_getcpuclockid_func;
}
}
// thread_id is kernel thread id (similar to Solaris LWP id)
intx os::current_thread_id() { return os::Linux::gettid(); }
int os::current_process_id() {
@ -1900,6 +1875,8 @@ void * os::Linux::dlopen_helper(const char *filename, char *ebuf, int ebuflen) {
assert(rtn == 0, "fegetenv must succeed");
#endif // IA32
Events::log_dll_message(nullptr, "Attempting to load shared library %s", filename);
void* result;
JFR_ONLY(NativeLibraryLoadEvent load_event(filename, &result);)
result = ::dlopen(filename, RTLD_LAZY);
@ -4328,7 +4305,7 @@ OSReturn os::get_native_priority(const Thread* const thread,
// For reference, please, see IEEE Std 1003.1-2004:
// http://www.unix.org/single_unix_specification
jlong os::Linux::fast_thread_cpu_time(clockid_t clockid) {
jlong os::Linux::total_thread_cpu_time(clockid_t clockid) {
struct timespec tp;
int status = clock_gettime(clockid, &tp);
assert(status == 0, "clock_gettime error: %s", os::strerror(errno));
@ -4556,8 +4533,6 @@ jint os::init_2(void) {
os::Posix::init_2();
Linux::fast_thread_clock_init();
if (PosixSignals::init() == JNI_ERR) {
return JNI_ERR;
}
@ -4985,14 +4960,14 @@ int os::open(const char *path, int oflag, int mode) {
return fd;
}
static jlong slow_thread_cpu_time(Thread *thread, bool user_sys_cpu_time);
static jlong user_thread_cpu_time(Thread *thread);
static jlong fast_cpu_time(Thread *thread) {
static jlong total_thread_cpu_time(Thread *thread) {
clockid_t clockid;
int rc = os::Linux::pthread_getcpuclockid(thread->osthread()->pthread_id(),
int rc = pthread_getcpuclockid(thread->osthread()->pthread_id(),
&clockid);
if (rc == 0) {
return os::Linux::fast_thread_cpu_time(clockid);
return os::Linux::total_thread_cpu_time(clockid);
} else {
// It's possible to encounter a terminated native thread that failed
// to detach itself from the VM - which should result in ESRCH.
@ -5009,41 +4984,31 @@ static jlong fast_cpu_time(Thread *thread) {
// the fast estimate available on the platform.
jlong os::current_thread_cpu_time() {
if (os::Linux::supports_fast_thread_cpu_time()) {
return os::Linux::fast_thread_cpu_time(CLOCK_THREAD_CPUTIME_ID);
} else {
// return user + sys since the cost is the same
return slow_thread_cpu_time(Thread::current(), true /* user + sys */);
}
return os::Linux::total_thread_cpu_time(CLOCK_THREAD_CPUTIME_ID);
}
jlong os::thread_cpu_time(Thread* thread) {
// consistent with what current_thread_cpu_time() returns
if (os::Linux::supports_fast_thread_cpu_time()) {
return fast_cpu_time(thread);
} else {
return slow_thread_cpu_time(thread, true /* user + sys */);
}
return total_thread_cpu_time(thread);
}
jlong os::current_thread_cpu_time(bool user_sys_cpu_time) {
if (user_sys_cpu_time && os::Linux::supports_fast_thread_cpu_time()) {
return os::Linux::fast_thread_cpu_time(CLOCK_THREAD_CPUTIME_ID);
if (user_sys_cpu_time) {
return os::Linux::total_thread_cpu_time(CLOCK_THREAD_CPUTIME_ID);
} else {
return slow_thread_cpu_time(Thread::current(), user_sys_cpu_time);
return user_thread_cpu_time(Thread::current());
}
}
jlong os::thread_cpu_time(Thread *thread, bool user_sys_cpu_time) {
if (user_sys_cpu_time && os::Linux::supports_fast_thread_cpu_time()) {
return fast_cpu_time(thread);
if (user_sys_cpu_time) {
return total_thread_cpu_time(thread);
} else {
return slow_thread_cpu_time(thread, user_sys_cpu_time);
return user_thread_cpu_time(thread);
}
}
// -1 on error.
static jlong slow_thread_cpu_time(Thread *thread, bool user_sys_cpu_time) {
static jlong user_thread_cpu_time(Thread *thread) {
pid_t tid = thread->osthread()->thread_id();
char *s;
char stat[2048];
@ -5080,11 +5045,8 @@ static jlong slow_thread_cpu_time(Thread *thread, bool user_sys_cpu_time) {
&ldummy, &ldummy, &ldummy, &ldummy, &ldummy,
&user_time, &sys_time);
if (count != 13) return -1;
if (user_sys_cpu_time) {
return ((jlong)sys_time + (jlong)user_time) * (1000000000 / os::Posix::clock_tics_per_second());
} else {
return (jlong)user_time * (1000000000 / os::Posix::clock_tics_per_second());
}
return (jlong)user_time * (1000000000 / os::Posix::clock_tics_per_second());
}
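The refactoring above removes the fast-clock capability probe and queries the per-thread CPU clock directly. A standalone sketch of that query, using only the POSIX calls the new code relies on:

#include <pthread.h>
#include <stdint.h>
#include <time.h>

// Returns the thread's combined user+system CPU time in nanoseconds,
// or -1 on error (e.g. ESRCH for a thread that has already terminated).
inline int64_t total_thread_cpu_time_ns(pthread_t tid) {
  clockid_t cid;
  if (pthread_getcpuclockid(tid, &cid) != 0) {
    return -1;
  }
  struct timespec tp;
  if (clock_gettime(cid, &tp) != 0) {
    return -1;
  }
  return (int64_t)tp.tv_sec * 1000000000LL + tp.tv_nsec;
}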
void os::current_thread_cpu_time_info(jvmtiTimerInfo *info_ptr) {
@ -5163,7 +5125,7 @@ int os::get_core_path(char* buffer, size_t bufferSize) {
if (core_pattern[0] == '|') {
written = jio_snprintf(buffer, bufferSize,
"\"%s\" (or dumping to %s/core.%d)",
"\"%s\" (alternatively, falling back to %s/core.%d)",
&core_pattern[1], p, current_process_id());
} else if (pid_pos != nullptr) {
*pid_pos = '\0';

View File

@ -32,16 +32,12 @@
class os::Linux {
friend class os;
static int (*_pthread_getcpuclockid)(pthread_t, clockid_t *);
static address _initial_thread_stack_bottom;
static uintptr_t _initial_thread_stack_size;
static const char *_libc_version;
static const char *_libpthread_version;
static bool _supports_fast_thread_cpu_time;
static GrowableArray<int>* _cpu_to_node;
static GrowableArray<int>* _nindex_to_node;
@ -146,18 +142,7 @@ class os::Linux {
static bool manually_expand_stack(JavaThread * t, address addr);
static void expand_stack_to(address bottom);
// fast POSIX clocks support
static void fast_thread_clock_init(void);
static int pthread_getcpuclockid(pthread_t tid, clockid_t *clock_id) {
return _pthread_getcpuclockid ? _pthread_getcpuclockid(tid, clock_id) : -1;
}
static bool supports_fast_thread_cpu_time() {
return _supports_fast_thread_cpu_time;
}
static jlong fast_thread_cpu_time(clockid_t clockid);
static jlong total_thread_cpu_time(clockid_t clockid);
static jlong sendfile(int out_fd, int in_fd, jlong* offset, jlong count);

View File

@ -50,7 +50,14 @@ ProcSmapsParser::~ProcSmapsParser() {
bool ProcSmapsParser::read_line() {
_line[0] = '\0';
return ::fgets(_line, _linelen, _f) != nullptr;
if (::fgets(_line, _linelen, _f) == nullptr) {
// On error or EOF, ensure deterministic empty buffer
_line[0] = '\0';
return false;
} else {
return true;
}
}
bool ProcSmapsParser::is_header_line() {
@ -101,8 +108,6 @@ void ProcSmapsParser::scan_additional_line(ProcSmapsInfo& out) {
}
}
// Starts or continues parsing. Returns true on success,
// false on EOF or on error.
bool ProcSmapsParser::parse_next(ProcSmapsInfo& out) {
// Information about a single mapping spans several lines.
@ -117,15 +122,13 @@ bool ProcSmapsParser::parse_next(ProcSmapsInfo& out) {
assert(is_header_line(), "Not a header line: \"%s\".", _line);
scan_header_line(out);
// Now read until we encounter the next header line or EOF or an error.
bool ok = false, stop = false;
do {
ok = read_line();
stop = !ok || is_header_line();
if (!stop) {
scan_additional_line(out);
while (true) {
bool ok = read_line();
if (!ok || is_header_line()) {
break; // EOF or next header
}
} while (!stop);
scan_additional_line(out);
}
return ok;
return true; // always return true if a mapping was parsed
}
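The rewritten parse_next() consumes one header line, absorbs detail lines until the next header or EOF, and returns true whenever a mapping was parsed. A toy parser with the same shape, using a simplified header test and a hypothetical record format:

#include <cstdio>

struct ToyMappingParser {
  FILE* f;
  char line[512];

  explicit ToyMappingParser(FILE* file) : f(file) { line[0] = '\0'; }

  bool read_line() {
    if (std::fgets(line, sizeof(line), f) == nullptr) {
      line[0] = '\0';              // deterministic empty buffer on EOF/error
      return false;
    }
    return true;
  }

  // Assumption for this sketch: detail lines are indented, headers are not.
  bool is_header_line() const { return line[0] != '\0' && line[0] != ' '; }

  // Returns true iff one record (header plus trailing detail lines) was read.
  bool parse_next() {
    if (line[0] == '\0' && !read_line()) {
      return false;                // EOF before any header
    }
    // scan_header_line(out) would run here.
    while (read_line() && !is_header_line()) {
      // scan_additional_line(out) would run here.
    }
    return true;
  }
};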

View File

@ -84,8 +84,7 @@ public:
ProcSmapsParser(FILE* f);
~ProcSmapsParser();
// Starts or continues parsing. Returns true on success,
// false on EOF or on error.
// Starts or continues parsing. Returns true iff a mapping was parsed.
bool parse_next(ProcSmapsInfo& out);
};

View File

@ -108,41 +108,60 @@ size_t os::_os_min_stack_allowed = PTHREAD_STACK_MIN;
// Check core dump limit and report possible place where core can be found
void os::check_core_dump_prerequisites(char* buffer, size_t bufferSize, bool check_only) {
stringStream buf(buffer, bufferSize);
if (!FLAG_IS_DEFAULT(CreateCoredumpOnCrash) && !CreateCoredumpOnCrash) {
jio_snprintf(buffer, bufferSize, "CreateCoredumpOnCrash is disabled from command line");
VMError::record_coredump_status(buffer, false);
buf.print("CreateCoredumpOnCrash is disabled from command line");
VMError::record_coredump_status(buf.freeze(), false);
} else {
struct rlimit rlim;
bool success = true;
bool warn = true;
char core_path[PATH_MAX];
if (get_core_path(core_path, PATH_MAX) <= 0) {
jio_snprintf(buffer, bufferSize, "core.%d (may not exist)", current_process_id());
// In the warning message, let the user know.
if (check_only) {
buf.print("the core path couldn't be determined. It commonly defaults to ");
}
buf.print("core.%d%s", current_process_id(), check_only ? "" : " (may not exist)");
#ifdef LINUX
} else if (core_path[0] == '"') { // redirect to user process
jio_snprintf(buffer, bufferSize, "Core dumps may be processed with %s", core_path);
if (check_only) {
buf.print("core dumps may be further processed by the following: ");
} else {
buf.print("Determined by the following: ");
}
buf.print("%s", core_path);
#endif
} else if (getrlimit(RLIMIT_CORE, &rlim) != 0) {
jio_snprintf(buffer, bufferSize, "%s (may not exist)", core_path);
if (check_only) {
buf.print("the rlimit couldn't be determined. If resource limits permit, the core dump will be located at ");
}
buf.print("%s%s", core_path, check_only ? "" : " (may not exist)");
} else {
switch(rlim.rlim_cur) {
case RLIM_INFINITY:
jio_snprintf(buffer, bufferSize, "%s", core_path);
buf.print("%s", core_path);
warn = false;
break;
case 0:
jio_snprintf(buffer, bufferSize, "Core dumps have been disabled. To enable core dumping, try \"ulimit -c unlimited\" before starting Java again");
buf.print("%s dumps have been disabled. To enable core dumping, try \"ulimit -c unlimited\" before starting Java again", check_only ? "core" : "Core");
success = false;
break;
default:
jio_snprintf(buffer, bufferSize, "%s (max size " UINT64_FORMAT " k). To ensure a full core dump, try \"ulimit -c unlimited\" before starting Java again", core_path, uint64_t(rlim.rlim_cur) / K);
if (check_only) {
buf.print("core dumps are constrained ");
} else {
buf.print( "%s ", core_path);
}
buf.print( "(max size " UINT64_FORMAT " k). To ensure a full core dump, try \"ulimit -c unlimited\" before starting Java again", uint64_t(rlim.rlim_cur) / K);
break;
}
}
const char* result = buf.freeze();
if (!check_only) {
VMError::record_coredump_status(buffer, success);
VMError::record_coredump_status(result, success);
} else if (warn) {
warning("CreateCoredumpOnCrash specified, but %s", buffer);
warning("CreateCoredumpOnCrash specified, but %s", result);
}
}
}
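The message selection above is driven by the RLIMIT_CORE soft limit. A minimal POSIX sketch of that probe, independent of the stringStream plumbing:

#include <sys/resource.h>
#include <cstdio>

// Mirrors the three cases handled above: unlimited, disabled, or truncated.
inline void report_core_limit() {
  struct rlimit rl;
  if (getrlimit(RLIMIT_CORE, &rl) != 0) {
    std::puts("core limit could not be determined");
  } else if (rl.rlim_cur == RLIM_INFINITY) {
    std::puts("full core dumps are enabled");
  } else if (rl.rlim_cur == 0) {
    std::puts("core dumps are disabled; try \"ulimit -c unlimited\"");
  } else {
    std::printf("core dumps are truncated to %llu kB\n",
                (unsigned long long)(rl.rlim_cur / 1024));
  }
}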

View File

@ -1715,6 +1715,8 @@ static int _print_module(const char* fname, address base_address,
// same architecture as Hotspot is running on
void * os::dll_load(const char *name, char *ebuf, int ebuflen) {
log_info(os)("attempting shared library load of %s", name);
Events::log_dll_message(nullptr, "Attempting to load shared library %s", name);
void* result;
JFR_ONLY(NativeLibraryLoadEvent load_event(name, &result);)
result = LoadLibrary(name);

View File

@ -50,11 +50,9 @@ double SharedRuntime::fmod_winx64(double x, double y)
hx ^= sx; /* |x| */
hy &= 0x7fffffff; /* |y| */
#pragma warning( disable : 4146 )
/* purge off exception values */
if ((hy | ly) == 0 || (hx >= 0x7ff00000) || /* y=0,or x not finite */
((hy | ((ly | -ly) >> 31))>0x7ff00000)) /* or y is NaN */
#pragma warning( default : 4146 )
((hy | ((ly | -ly) >> 31))>0x7ff00000)) /* or y is NaN */
return (x*y) / (x*y);
if (hx <= hy) {
if ((hx<hy) || (lx<ly)) return x; /* |x|<|y| return x */

View File

@ -52,12 +52,16 @@ struct AtomicAccess::PlatformAdd {
}
};
template<>
struct AtomicAccess::PlatformXchg<1> : AtomicAccess::XchgUsingCmpxchg<1> {};
template<size_t byte_size>
template<typename T>
inline T AtomicAccess::PlatformXchg<byte_size>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(byte_size == sizeof(T));
STATIC_ASSERT(byte_size == 4 || byte_size == 8);
T res = __atomic_exchange_n(dest, exchange_value, __ATOMIC_RELEASE);
FULL_MEM_BARRIER;
return res;
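This file, and the platform files that follow, derive the new PlatformXchg<1> specialization from XchgUsingCmpxchg<1>, i.e. a compare-and-swap retry loop. A hedged sketch of that fallback using compiler builtins rather than the actual HotSpot helper:

#include <cstdint>

// A 1-byte exchange built from compare-and-swap.
inline uint8_t toy_xchg_using_cas(volatile uint8_t* dest, uint8_t new_val) {
  uint8_t expected = __atomic_load_n(dest, __ATOMIC_RELAXED);
  // A failed CAS refreshes 'expected' with the current value, so just retry.
  while (!__atomic_compare_exchange_n(dest, &expected, new_val,
                                      /* weak */ true,
                                      __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) {
  }
  return expected;
}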

View File

@ -52,6 +52,9 @@ inline D AtomicAccess::PlatformAdd<4>::fetch_then_add(D volatile* dest, I add_va
return old_value;
}
template<>
struct AtomicAccess::PlatformXchg<1> : AtomicAccess::XchgUsingCmpxchg<1> {};
template<>
template<typename T>
inline T AtomicAccess::PlatformXchg<4>::operator()(T volatile* dest,

View File

@ -66,6 +66,9 @@ inline D AtomicAccess::PlatformAdd<8>::add_then_fetch(D volatile* dest, I add_va
return res;
}
template<>
struct AtomicAccess::PlatformXchg<1> : AtomicAccess::XchgUsingCmpxchg<1> {};
template<>
template<typename T>
inline T AtomicAccess::PlatformXchg<4>::operator()(T volatile* dest,

View File

@ -113,6 +113,9 @@ inline D AtomicAccess::PlatformAdd<8>::fetch_then_add(D volatile* dest, I add_va
return atomic_fastcall(stub, dest, add_value);
}
template<>
struct AtomicAccess::PlatformXchg<1> : AtomicAccess::XchgUsingCmpxchg<1> {};
template<>
template<typename T>
inline T AtomicAccess::PlatformXchg<4>::operator()(T volatile* dest,

View File

@ -118,6 +118,8 @@ inline D AtomicAccess::PlatformAdd<4>::add_then_fetch(D volatile* dest, I add_va
return add_using_helper<int32_t>(ARMAtomicFuncs::_add_func, dest, add_value);
}
template<>
struct AtomicAccess::PlatformXchg<1> : AtomicAccess::XchgUsingCmpxchg<1> {};
template<>
template<typename T>

View File

@ -152,6 +152,9 @@ inline T AtomicAccess::PlatformCmpxchg<4>::operator()(T volatile* dest __attribu
}
#endif
template<>
struct AtomicAccess::PlatformXchg<1> : AtomicAccess::XchgUsingCmpxchg<1> {};
template<size_t byte_size>
template<typename T>
inline T AtomicAccess::PlatformXchg<byte_size>::operator()(T volatile* dest,
@ -164,6 +167,7 @@ inline T AtomicAccess::PlatformXchg<byte_size>::operator()(T volatile* dest,
#endif
STATIC_ASSERT(byte_size == sizeof(T));
STATIC_ASSERT(byte_size == 4 || byte_size == 8);
if (order != memory_order_relaxed) {
FULL_MEM_BARRIER;

View File

@ -209,6 +209,9 @@ inline D AtomicAccess::PlatformAdd<8>::add_then_fetch(D volatile* dest, I inc,
//
// The return value is the (unchanged) value from memory as it was when the
// replacement succeeded.
template<>
struct AtomicAccess::PlatformXchg<1> : AtomicAccess::XchgUsingCmpxchg<1> {};
template<>
template<typename T>
inline T AtomicAccess::PlatformXchg<4>::operator()(T volatile* dest,

View File

@ -52,6 +52,9 @@ inline D AtomicAccess::PlatformAdd<4>::fetch_then_add(D volatile* dest, I add_va
return old_value;
}
template<>
struct AtomicAccess::PlatformXchg<1> : AtomicAccess::XchgUsingCmpxchg<1> {};
template<>
template<typename T>
inline T AtomicAccess::PlatformXchg<4>::operator()(T volatile* dest,

View File

@ -65,6 +65,9 @@ inline D AtomicAccess::PlatformAdd<8>::add_then_fetch(D volatile* dest, I add_va
return res;
}
template<>
struct AtomicAccess::PlatformXchg<1> : AtomicAccess::XchgUsingCmpxchg<1> {};
template<>
template<typename T>
inline T AtomicAccess::PlatformXchg<4>::operator()(T volatile* dest,

View File

@ -68,6 +68,9 @@ DEFINE_INTRINSIC_ADD(InterlockedAdd64, __int64)
#undef DEFINE_INTRINSIC_ADD
template<>
struct AtomicAccess::PlatformXchg<1> : AtomicAccess::XchgUsingCmpxchg<1> {};
#define DEFINE_INTRINSIC_XCHG(IntrinsicName, IntrinsicType) \
template<> \
template<typename T> \
@ -75,6 +78,8 @@ DEFINE_INTRINSIC_ADD(InterlockedAdd64, __int64)
T exchange_value, \
atomic_memory_order order) const { \
STATIC_ASSERT(sizeof(IntrinsicType) == sizeof(T)); \
STATIC_ASSERT(sizeof(IntrinsicType) == 4 || \
sizeof(IntrinsicType) == 8); \
return PrimitiveConversions::cast<T>( \
IntrinsicName(reinterpret_cast<IntrinsicType volatile *>(dest), \
PrimitiveConversions::cast<IntrinsicType>(exchange_value))); \

View File

@ -70,6 +70,9 @@ DEFINE_INTRINSIC_ADD(InterlockedAdd64, __int64)
#undef DEFINE_INTRINSIC_ADD
template<>
struct AtomicAccess::PlatformXchg<1> : AtomicAccess::XchgUsingCmpxchg<1> {};
#define DEFINE_INTRINSIC_XCHG(IntrinsicName, IntrinsicType) \
template<> \
template<typename T> \
@ -77,6 +80,8 @@ DEFINE_INTRINSIC_ADD(InterlockedAdd64, __int64)
T exchange_value, \
atomic_memory_order order) const { \
STATIC_ASSERT(sizeof(IntrinsicType) == sizeof(T)); \
STATIC_ASSERT(sizeof(IntrinsicType) == 4 || \
sizeof(IntrinsicType) == 8); \
return PrimitiveConversions::cast<T>( \
IntrinsicName(reinterpret_cast<IntrinsicType volatile *>(dest), \
PrimitiveConversions::cast<IntrinsicType>(exchange_value))); \

View File

@ -73,7 +73,7 @@ void ResolutionErrorTable::add_entry(const constantPoolHandle& pool, int cp_inde
ResolutionErrorKey key(pool(), cp_index);
ResolutionErrorEntry *entry = new ResolutionErrorEntry(error, message, cause, cause_msg);
_resolution_error_table->put(key, entry);
_resolution_error_table->put_when_absent(key, entry);
}
// create new nest host error entry
@ -85,7 +85,7 @@ void ResolutionErrorTable::add_entry(const constantPoolHandle& pool, int cp_inde
ResolutionErrorKey key(pool(), cp_index);
ResolutionErrorEntry *entry = new ResolutionErrorEntry(message);
_resolution_error_table->put(key, entry);
_resolution_error_table->put_when_absent(key, entry);
}
// find entry in the table
@ -126,6 +126,15 @@ ResolutionErrorEntry::~ResolutionErrorEntry() {
}
}
void ResolutionErrorEntry::set_nest_host_error(const char* message) {
// If a message is already set, free it.
if (nest_host_error() != nullptr) {
FREE_C_HEAP_ARRAY(char, _nest_host_error);
}
_nest_host_error = message;
}
class ResolutionErrorDeleteIterate : StackObj {
ConstantPool* p;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2005, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2005, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -91,10 +91,7 @@ class ResolutionErrorEntry : public CHeapObj<mtClass> {
~ResolutionErrorEntry();
// The incoming nest host error message is already in the C-Heap.
void set_nest_host_error(const char* message) {
_nest_host_error = message;
}
void set_nest_host_error(const char* message);
Symbol* error() const { return _error; }
const char* message() const { return _message; }

View File

@ -1864,14 +1864,19 @@ void SystemDictionary::add_nest_host_error(const constantPoolHandle& pool,
{
MutexLocker ml(Thread::current(), SystemDictionary_lock);
ResolutionErrorEntry* entry = ResolutionErrorTable::find_entry(pool, which);
if (entry != nullptr && entry->nest_host_error() == nullptr) {
if (entry == nullptr) {
// Only add a new entry to the resolution error table if one hasn't been found for this
// constant pool index. In this case resolution succeeded but there's an error in this nest host
// that we use the table to record.
assert(pool->resolved_klass_at(which) != nullptr, "klass should be resolved if there is no entry");
ResolutionErrorTable::add_entry(pool, which, message);
} else {
// An existing entry means we had a true resolution failure (LinkageError) with our nest host, but we
// still want to add the error message for the higher-level access checks to report. We should
// only reach here under the same error condition, so we can ignore the potential race with setting
// the message. If we see it is already set then we can ignore it.
// the message, and set it again.
assert(entry->nest_host_error() == nullptr || strcmp(entry->nest_host_error(), message) == 0, "should be the same message");
entry->set_nest_host_error(message);
} else {
ResolutionErrorTable::add_entry(pool, which, message);
}
}
}
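The control flow above is an insert-or-update under a single lock: create a table entry only when the nest host actually failed to resolve earlier, otherwise refresh the message on the existing entry. A hedged sketch with std::map standing in for the resolution error table (key and value types are made up):

#include <map>
#include <mutex>
#include <string>

// Illustrative insert-or-update sketch; not the HotSpot table API.
std::mutex table_lock;
std::map<int, std::string> nest_host_errors;

void record_nest_host_error(int cp_index, const std::string& message) {
  std::lock_guard<std::mutex> guard(table_lock);
  auto it = nest_host_errors.find(cp_index);
  if (it == nest_host_errors.end()) {
    nest_host_errors.emplace(cp_index, message);  // no prior resolution failure
  } else {
    it->second = message;                         // existing entry: refresh message
  }
}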

View File

@ -1010,8 +1010,10 @@ void CompilationMemoryStatistic::print_error_report(outputStream* st) {
oom_stats->print_peak_state_on(st);
st->cr();
}
st->print_cr("Compiler Memory Statistic, 10 most expensive compilations:");
print_all_by_size(st, false, false, 0, 10);
if (Thread::current_or_null_safe() != nullptr) {
st->print_cr("Compiler Memory Statistic, 10 most expensive compilations:");
print_all_by_size(st, false, false, 0, 10);
}
}
void CompilationMemoryStatistic::print_final_report(outputStream* st) {

View File

@ -33,10 +33,10 @@
#include "utilities/align.hpp"
G1CollectedHeap* G1AllocRegion::_g1h = nullptr;
G1HeapRegion* G1AllocRegion::_dummy_region = nullptr;
Atomic<G1HeapRegion*> G1AllocRegion::_dummy_region;
void G1AllocRegion::setup(G1CollectedHeap* g1h, G1HeapRegion* dummy_region) {
assert(_dummy_region == nullptr, "should be set once");
assert(_dummy_region.load_relaxed() == nullptr, "should be set once");
assert(dummy_region != nullptr, "pre-condition");
assert(dummy_region->free() == 0, "pre-condition");
@ -46,11 +46,11 @@ void G1AllocRegion::setup(G1CollectedHeap* g1h, G1HeapRegion* dummy_region) {
assert(dummy_region->par_allocate(1, 1, &assert_tmp) == nullptr, "should fail");
_g1h = g1h;
_dummy_region = dummy_region;
_dummy_region.release_store(dummy_region);
}
size_t G1AllocRegion::fill_up_remaining_space(G1HeapRegion* alloc_region) {
assert(alloc_region != nullptr && alloc_region != _dummy_region,
assert(alloc_region != nullptr && alloc_region != _dummy_region.load_relaxed(),
"pre-condition");
size_t result = 0;
@ -111,13 +111,13 @@ size_t G1AllocRegion::retire_internal(G1HeapRegion* alloc_region, bool fill_up)
}
size_t G1AllocRegion::retire(bool fill_up) {
assert_alloc_region(_alloc_region != nullptr, "not initialized properly");
assert_alloc_region(_alloc_region.load_relaxed() != nullptr, "not initialized properly");
size_t waste = 0;
trace("retiring");
G1HeapRegion* alloc_region = _alloc_region;
if (alloc_region != _dummy_region) {
G1HeapRegion* alloc_region = _alloc_region.load_acquire();
if (alloc_region != _dummy_region.load_relaxed()) {
waste = retire_internal(alloc_region, fill_up);
reset_alloc_region();
}
@ -127,7 +127,7 @@ size_t G1AllocRegion::retire(bool fill_up) {
}
HeapWord* G1AllocRegion::new_alloc_region_and_allocate(size_t word_size) {
assert_alloc_region(_alloc_region == _dummy_region, "pre-condition");
assert_alloc_region(_alloc_region.load_relaxed() == _dummy_region.load_relaxed(), "pre-condition");
trace("attempting region allocation");
G1HeapRegion* new_alloc_region = allocate_new_region(word_size);
@ -138,7 +138,6 @@ HeapWord* G1AllocRegion::new_alloc_region_and_allocate(size_t word_size) {
HeapWord* result = new_alloc_region->allocate(word_size);
assert_alloc_region(result != nullptr, "the allocation should have succeeded");
OrderAccess::storestore();
// Note that we first perform the allocation and then we store the
// region in _alloc_region. This is the reason why an active region
// can never be empty.
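With the switch from OrderAccess::storestore() to release_store, the guarantee is the usual release/acquire publication pattern: initialize first, publish second, and acquiring readers see the initialization. A minimal stand-alone sketch using std::atomic (types and names are illustrative, not the HotSpot Atomic<> wrapper):

#include <atomic>

struct Region { int free_words; };

std::atomic<Region*> g_alloc_region{nullptr};

void publish(Region* r) {
  r->free_words = 128;                                 // initialize the region first
  g_alloc_region.store(r, std::memory_order_release);  // then publish it
}

Region* reader() {
  Region* r = g_alloc_region.load(std::memory_order_acquire);
  // If r is non-null, the initialization above is guaranteed to be visible.
  return r;
}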
@ -154,16 +153,16 @@ HeapWord* G1AllocRegion::new_alloc_region_and_allocate(size_t word_size) {
void G1AllocRegion::init() {
trace("initializing");
assert_alloc_region(_alloc_region == nullptr, "pre-condition");
assert_alloc_region(_dummy_region != nullptr, "should have been set");
_alloc_region = _dummy_region;
assert_alloc_region(_alloc_region.load_relaxed() == nullptr, "pre-condition");
assert_alloc_region(_dummy_region.load_relaxed() != nullptr, "should have been set");
_alloc_region.release_store(_dummy_region.load_relaxed());
_count = 0;
trace("initialized");
}
void G1AllocRegion::set(G1HeapRegion* alloc_region) {
trace("setting");
assert_alloc_region(_alloc_region == _dummy_region && _count == 0, "pre-condition");
assert_alloc_region(_alloc_region.load_relaxed() == _dummy_region.load_relaxed() && _count == 0, "pre-condition");
update_alloc_region(alloc_region);
trace("set");
@ -175,19 +174,19 @@ void G1AllocRegion::update_alloc_region(G1HeapRegion* alloc_region) {
// maintain the "the alloc region cannot be empty" invariant.
assert_alloc_region(alloc_region != nullptr && !alloc_region->is_empty(), "pre-condition");
_alloc_region = alloc_region;
_alloc_region.release_store(alloc_region);
_count += 1;
trace("updated");
}
G1HeapRegion* G1AllocRegion::release() {
trace("releasing");
G1HeapRegion* alloc_region = _alloc_region;
G1HeapRegion* alloc_region = _alloc_region.load_acquire();
retire(false /* fill_up */);
assert_alloc_region(_alloc_region == _dummy_region, "post-condition of retire()");
_alloc_region = nullptr;
assert_alloc_region(_alloc_region.load_relaxed() == _dummy_region.load_relaxed(), "post-condition of retire()");
_alloc_region.store_relaxed(nullptr);
trace("released");
return (alloc_region == _dummy_region) ? nullptr : alloc_region;
return (alloc_region == _dummy_region.load_relaxed()) ? nullptr : alloc_region;
}
#ifndef PRODUCT
@ -211,12 +210,13 @@ void G1AllocRegion::trace(const char* str, size_t min_word_size, size_t desired_
out->print("%s: %u ", _name, _count);
if (_alloc_region == nullptr) {
G1HeapRegion* alloc_region = _alloc_region.load_acquire();
if (alloc_region == nullptr) {
out->print("null");
} else if (_alloc_region == _dummy_region) {
} else if (alloc_region == _dummy_region.load_relaxed()) {
out->print("DUMMY");
} else {
out->print(HR_FORMAT, HR_FORMAT_PARAMS(_alloc_region));
out->print(HR_FORMAT, HR_FORMAT_PARAMS(alloc_region));
}
out->print(" : %s", str);
@ -235,7 +235,7 @@ void G1AllocRegion::trace(const char* str, size_t min_word_size, size_t desired_
#endif // PRODUCT
G1AllocRegion::G1AllocRegion(const char* name, uint node_index)
: _alloc_region(nullptr),
: _alloc_region(),
_count(0),
_name(name),
_node_index(node_index)
@ -250,7 +250,7 @@ void MutatorAllocRegion::retire_region(G1HeapRegion* alloc_region) {
}
void MutatorAllocRegion::init() {
assert(_retained_alloc_region == nullptr, "Pre-condition");
assert(_retained_alloc_region.load_relaxed() == nullptr, "Pre-condition");
G1AllocRegion::init();
_wasted_bytes = 0;
}
@ -261,8 +261,9 @@ bool MutatorAllocRegion::should_retain(G1HeapRegion* region) {
return false;
}
if (_retained_alloc_region != nullptr &&
free_bytes < _retained_alloc_region->free()) {
G1HeapRegion* retained_alloc_region = _retained_alloc_region.load_acquire();
if (retained_alloc_region != nullptr &&
free_bytes < retained_alloc_region->free()) {
return false;
}
@ -278,10 +279,11 @@ size_t MutatorAllocRegion::retire(bool fill_up) {
// free than the currently retained region.
if (should_retain(current_region)) {
trace("mutator retained");
if (_retained_alloc_region != nullptr) {
waste = retire_internal(_retained_alloc_region, true);
G1HeapRegion* retained_alloc_region = _retained_alloc_region.load_acquire();
if (retained_alloc_region != nullptr) {
waste = retire_internal(retained_alloc_region, true);
}
_retained_alloc_region = current_region;
_retained_alloc_region.release_store(current_region);
} else {
waste = retire_internal(current_region, fill_up);
}
@ -300,7 +302,7 @@ size_t MutatorAllocRegion::used_in_alloc_regions() {
used += hr->used();
}
hr = _retained_alloc_region;
hr = _retained_alloc_region.load_acquire();
if (hr != nullptr) {
used += hr->used();
}
@ -313,9 +315,10 @@ G1HeapRegion* MutatorAllocRegion::release() {
// The retained alloc region must be retired and this must be
// done after the above call to release the mutator alloc region,
// since it might update the _retained_alloc_region member.
if (_retained_alloc_region != nullptr) {
_wasted_bytes += retire_internal(_retained_alloc_region, false);
_retained_alloc_region = nullptr;
G1HeapRegion* retained_alloc_region = _retained_alloc_region.load_acquire();
if (retained_alloc_region != nullptr) {
_wasted_bytes += retire_internal(retained_alloc_region, false);
_retained_alloc_region.store_relaxed(nullptr);
}
log_debug(gc, alloc, region)("Mutator Allocation stats, regions: %u, wasted size: %zu%s (%4.1f%%)",
count(),

View File

@ -29,6 +29,7 @@
#include "gc/g1/g1HeapRegion.hpp"
#include "gc/g1/g1HeapRegionAttr.hpp"
#include "gc/g1/g1NUMA.hpp"
#include "runtime/atomic.hpp"
class G1CollectedHeap;
@ -40,8 +41,6 @@ class G1CollectedHeap;
// replaced.
class G1AllocRegion : public CHeapObj<mtGC> {
private:
// The active allocating region we are currently allocating out
// of. The invariant is that if this object is initialized (i.e.,
// init() has been called and release() has not) then _alloc_region
@ -52,7 +51,7 @@ private:
// then _alloc_region is null and this object should not be used to
// satisfy allocation requests (it was done this way to force the
// correct use of init() and release()).
G1HeapRegion* volatile _alloc_region;
Atomic<G1HeapRegion*> _alloc_region;
// It keeps track of the distinct number of regions that are used
// for allocation in the active interval of this object, i.e.,
@ -71,7 +70,7 @@ private:
// == end()). When we don't have a valid active region we make
// _alloc_region point to this. This allows us to skip checking
// whether the _alloc_region is null or not.
static G1HeapRegion* _dummy_region;
static Atomic<G1HeapRegion*> _dummy_region;
// After a region is allocated by alloc_new_region, this
// method is used to set it as the active alloc_region
@ -124,9 +123,9 @@ public:
static void setup(G1CollectedHeap* g1h, G1HeapRegion* dummy_region);
G1HeapRegion* get() const {
G1HeapRegion * hr = _alloc_region;
G1HeapRegion * hr = _alloc_region.load_acquire();
// Make sure that the dummy region does not escape this class.
return (hr == _dummy_region) ? nullptr : hr;
return (hr == _dummy_region.load_relaxed()) ? nullptr : hr;
}
uint count() { return _count; }
@ -177,7 +176,7 @@ private:
// Retained allocation region. Used to lower the waste generated
// during mutation by having two active regions if the free space
// in a region about to be retired still could fit a TLAB.
G1HeapRegion* volatile _retained_alloc_region;
Atomic<G1HeapRegion*> _retained_alloc_region;
// Decide if the region should be retained, based on the free size
// in it and the free size in the currently retained region, if any.

View File

@ -32,13 +32,13 @@
#define assert_alloc_region(p, message) \
do { \
assert((p), "[%s] %s c: %u r: " PTR_FORMAT, \
_name, (message), _count, p2i(_alloc_region) \
_name, (message), _count, p2i(_alloc_region.load_relaxed()) \
); \
} while (0)
inline void G1AllocRegion::reset_alloc_region() {
_alloc_region = _dummy_region;
_alloc_region.store_relaxed(_dummy_region.load_relaxed());
}
inline HeapWord* G1AllocRegion::par_allocate(G1HeapRegion* alloc_region, size_t word_size) {
@ -51,7 +51,7 @@ inline HeapWord* G1AllocRegion::par_allocate(G1HeapRegion* alloc_region, size_t
inline HeapWord* G1AllocRegion::attempt_allocation(size_t min_word_size,
size_t desired_word_size,
size_t* actual_word_size) {
G1HeapRegion* alloc_region = _alloc_region;
G1HeapRegion* alloc_region = _alloc_region.load_acquire();
assert_alloc_region(alloc_region != nullptr && !alloc_region->is_empty(), "not initialized properly");
HeapWord* result = alloc_region->par_allocate(min_word_size, desired_word_size, actual_word_size);
@ -97,8 +97,9 @@ inline HeapWord* G1AllocRegion::attempt_allocation_using_new_region(size_t min_w
inline HeapWord* MutatorAllocRegion::attempt_retained_allocation(size_t min_word_size,
size_t desired_word_size,
size_t* actual_word_size) {
if (_retained_alloc_region != nullptr) {
HeapWord* result = _retained_alloc_region->par_allocate(min_word_size, desired_word_size, actual_word_size);
G1HeapRegion* retained_alloc_region = _retained_alloc_region.load_acquire();
if (retained_alloc_region != nullptr) {
HeapWord* result = retained_alloc_region->par_allocate(min_word_size, desired_word_size, actual_word_size);
if (result != nullptr) {
trace("alloc retained", min_word_size, desired_word_size, *actual_word_size, result);
return result;

View File

@ -77,10 +77,11 @@ void G1Arguments::initialize_alignments() {
}
size_t G1Arguments::conservative_max_heap_alignment() {
if (FLAG_IS_DEFAULT(G1HeapRegionSize)) {
return G1HeapRegion::max_ergonomics_size();
}
return G1HeapRegion::max_region_size();
const size_t region_size = FLAG_IS_DEFAULT(G1HeapRegionSize)
? G1HeapRegion::max_ergonomics_size()
: G1HeapRegion::max_region_size();
return calculate_heap_alignment(region_size);
}
void G1Arguments::initialize_verification_types() {

View File

@ -2355,7 +2355,8 @@ static void print_region_type(outputStream* st, const char* type, uint count, bo
}
void G1CollectedHeap::print_heap_on(outputStream* st) const {
size_t heap_used = Heap_lock->owned_by_self() ? used() : used_unlocked();
size_t heap_used = (Thread::current_or_null_safe() != nullptr &&
Heap_lock->owned_by_self()) ? used() : used_unlocked();
st->print("%-20s", "garbage-first heap");
st->print(" total reserved %zuK, committed %zuK, used %zuK",
_hrm.reserved().byte_size()/K, capacity()/K, heap_used/K);

View File

@ -611,23 +611,24 @@ void G1RemSet::scan_collection_set_code_roots(G1ParScanThreadState* pss,
G1GCPhaseTimes::GCParPhases coderoots_phase,
G1GCPhaseTimes::GCParPhases objcopy_phase) {
EventGCPhaseParallel event;
Tickspan code_root_scan_time;
Tickspan code_root_trim_partially_time;
G1EvacPhaseWithTrimTimeTracker timer(pss, code_root_scan_time, code_root_trim_partially_time);
G1GCPhaseTimes* p = _g1h->phase_times();
{
G1EvacPhaseWithTrimTimeTracker timer(pss, code_root_scan_time, code_root_trim_partially_time);
G1ScanCodeRootsClosure cl(_scan_state, pss, worker_id);
// Code roots work distribution occurs inside the iteration method. So scan all collection
// set regions for all threads.
_g1h->collection_set_iterate_increment_from(&cl, worker_id);
G1ScanCodeRootsClosure cl(_scan_state, pss, worker_id);
// Code roots work distribution occurs inside the iteration method. So scan all collection
// set regions for all threads.
_g1h->collection_set_iterate_increment_from(&cl, worker_id);
p->record_or_add_thread_work_item(coderoots_phase, worker_id, cl.code_roots_scanned(), G1GCPhaseTimes::CodeRootsScannedNMethods);
}
p->record_or_add_time_secs(coderoots_phase, worker_id, code_root_scan_time.seconds());
p->add_time_secs(objcopy_phase, worker_id, code_root_trim_partially_time.seconds());
p->record_or_add_thread_work_item(coderoots_phase, worker_id, cl.code_roots_scanned(), G1GCPhaseTimes::CodeRootsScannedNMethods);
event.commit(GCId::current(), worker_id, G1GCPhaseTimes::phase_name(coderoots_phase));
}
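The reshuffle above matters because the trim-time tracker only has valid totals once its destructor has run, so the recording calls now sit outside the inner scope. A small illustrative RAII-timer sketch of the same shape (standard C++, not the G1 tracker API):

#include <chrono>
#include <cstdio>

// The elapsed time is only written in the destructor, so it must be
// read after the inner scope has closed.
struct ScopeTimer {
  double& out_seconds;
  std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
  explicit ScopeTimer(double& out) : out_seconds(out) {}
  ~ScopeTimer() {
    out_seconds = std::chrono::duration<double>(
        std::chrono::steady_clock::now() - start).count();
  }
};

int main() {
  double scan_seconds = 0.0;
  {
    ScopeTimer timer(scan_seconds);  // starts timing
    // ... scanning work would happen here ...
  }                                  // destructor runs; scan_seconds is now valid
  std::printf("scan took %f s\n", scan_seconds);
  return 0;
}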

View File

@ -37,8 +37,45 @@
#include "utilities/defaultStream.hpp"
#include "utilities/powerOfTwo.hpp"
size_t ParallelArguments::conservative_max_heap_alignment() {
return compute_heap_alignment();
static size_t num_young_spaces() {
// When using NUMA, we create one MutableNUMASpace for each NUMA node
const size_t num_eden_spaces = UseNUMA ? os::numa_get_groups_num() : 1;
// The young generation must have room for eden + two survivors
return num_eden_spaces + 2;
}
static size_t num_old_spaces() {
return 1;
}
void ParallelArguments::initialize_alignments() {
// Initialize card size before initializing alignments
CardTable::initialize_card_size();
const size_t card_table_alignment = CardTable::ct_max_alignment_constraint();
SpaceAlignment = ParallelScavengeHeap::default_space_alignment();
if (UseLargePages) {
const size_t total_spaces = num_young_spaces() + num_old_spaces();
const size_t page_size = os::page_size_for_region_unaligned(MaxHeapSize, total_spaces);
ParallelScavengeHeap::set_desired_page_size(page_size);
if (page_size == os::vm_page_size()) {
log_warning(gc, heap)("MaxHeapSize (%zu) must be large enough for %zu * page-size; Disabling UseLargePages for heap",
MaxHeapSize, total_spaces);
}
if (page_size > SpaceAlignment) {
SpaceAlignment = page_size;
}
HeapAlignment = lcm(page_size, card_table_alignment);
} else {
assert(is_aligned(SpaceAlignment, os::vm_page_size()), "");
ParallelScavengeHeap::set_desired_page_size(os::vm_page_size());
HeapAlignment = card_table_alignment;
}
}
void ParallelArguments::initialize() {
@ -98,49 +135,36 @@ void ParallelArguments::initialize() {
FullGCForwarding::initialize_flags(heap_reserved_size_bytes());
}
void ParallelArguments::initialize_alignments() {
// Initialize card size before initializing alignments
CardTable::initialize_card_size();
SpaceAlignment = ParallelScavengeHeap::default_space_alignment();
HeapAlignment = compute_heap_alignment();
}
size_t ParallelArguments::conservative_max_heap_alignment() {
// The card marking array and the offset arrays for old generations are
// committed in os pages as well. Make sure they are entirely full (to
// avoid partial page problems), e.g. if 512 bytes of heap correspond to a
// 1-byte entry and the os page size is 4096, the maximum heap size should
// be 512*4096 = 2MB aligned.
void ParallelArguments::initialize_heap_flags_and_sizes_one_pass() {
// Do basic sizing work
GenArguments::initialize_heap_flags_and_sizes();
}
size_t alignment = CardTable::ct_max_alignment_constraint();
void ParallelArguments::initialize_heap_flags_and_sizes() {
initialize_heap_flags_and_sizes_one_pass();
if (!UseLargePages) {
ParallelScavengeHeap::set_desired_page_size(os::vm_page_size());
return;
if (UseLargePages) {
// In presence of large pages we have to make sure that our
// alignment is large page aware.
alignment = lcm(os::large_page_size(), alignment);
}
// If using large-page, need to update SpaceAlignment so that spaces are page-size aligned.
const size_t min_pages = 4; // 1 for eden + 1 for each survivor + 1 for old
const size_t page_sz = os::page_size_for_region_aligned(MinHeapSize, min_pages);
ParallelScavengeHeap::set_desired_page_size(page_sz);
if (page_sz == os::vm_page_size()) {
log_warning(gc, heap)("MinHeapSize (%zu) must be large enough for 4 * page-size; Disabling UseLargePages for heap", MinHeapSize);
return;
}
// Space is largepage-aligned.
size_t new_alignment = page_sz;
if (new_alignment != SpaceAlignment) {
SpaceAlignment = new_alignment;
// Redo everything from the start
initialize_heap_flags_and_sizes_one_pass();
}
}
size_t ParallelArguments::heap_reserved_size_bytes() {
return MaxHeapSize;
return alignment;
}
CollectedHeap* ParallelArguments::create_heap() {
return new ParallelScavengeHeap();
}
size_t ParallelArguments::young_gen_size_lower_bound() {
return num_young_spaces() * SpaceAlignment;
}
size_t ParallelArguments::old_gen_size_lower_bound() {
return num_old_spaces() * SpaceAlignment;
}
size_t ParallelArguments::heap_reserved_size_bytes() {
return MaxHeapSize;
}

View File

@ -1,5 +1,6 @@
/*
* Copyright (c) 2017, Red Hat, Inc. and/or its affiliates.
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -25,21 +26,16 @@
#ifndef SHARE_GC_PARALLEL_PARALLELARGUMENTS_HPP
#define SHARE_GC_PARALLEL_PARALLELARGUMENTS_HPP
#include "gc/shared/gcArguments.hpp"
#include "gc/shared/genArguments.hpp"
class CollectedHeap;
class ParallelArguments : public GenArguments {
private:
virtual void initialize_alignments();
virtual void initialize_heap_flags_and_sizes();
void initialize_heap_flags_and_sizes_one_pass();
virtual void initialize();
virtual size_t conservative_max_heap_alignment();
virtual CollectedHeap* create_heap();
virtual size_t young_gen_size_lower_bound();
virtual size_t old_gen_size_lower_bound();
public:
static size_t heap_reserved_size_bytes();

View File

@ -307,9 +307,13 @@ HeapWord* ParallelScavengeHeap::mem_allocate_cas_noexpand(size_t size, bool is_t
HeapWord* ParallelScavengeHeap::mem_allocate_work(size_t size, bool is_tlab) {
for (uint loop_count = 0; /* empty */; ++loop_count) {
HeapWord* result = mem_allocate_cas_noexpand(size, is_tlab);
if (result != nullptr) {
return result;
HeapWord* result;
{
ConditionalMutexLocker locker(Heap_lock, !is_init_completed());
result = mem_allocate_cas_noexpand(size, is_tlab);
if (result != nullptr) {
return result;
}
}
// Read total_collections() under the lock so that multiple
@ -326,10 +330,15 @@ HeapWord* ParallelScavengeHeap::mem_allocate_work(size_t size, bool is_tlab) {
}
if (!is_init_completed()) {
// Can't do GC; try heap expansion to satisfy the request.
result = expand_heap_and_allocate(size, is_tlab);
if (result != nullptr) {
return result;
// Double-checked locking; this ensures that is_init_completed() does not

// transition while expanding the heap.
MonitorLocker ml(InitCompleted_lock, Monitor::_no_safepoint_check_flag);
if (!is_init_completed()) {
// Can't do GC; try heap expansion to satisfy the request.
result = expand_heap_and_allocate(size, is_tlab);
if (result != nullptr) {
return result;
}
}
}
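The added MonitorLocker is classic double-checked locking: a cheap unlocked check, then a re-check under the lock so is_init_completed() cannot flip while the expansion runs. A minimal sketch of the idiom with std::atomic and std::mutex (names are illustrative):

#include <atomic>
#include <mutex>

std::atomic<bool> init_completed{false};
std::mutex init_lock;

bool run_pre_init_expansion() {
  if (!init_completed.load(std::memory_order_acquire)) {    // first check, no lock
    std::lock_guard<std::mutex> guard(init_lock);
    if (!init_completed.load(std::memory_order_relaxed)) {  // re-check under the lock
      // Safe to run the initialization-only path here.
      return true;
    }
  }
  return false;
}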

View File

@ -1,5 +1,6 @@
/*
* Copyright (c) 2017, Red Hat, Inc. and/or its affiliates.
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -27,11 +28,49 @@
#include "gc/shared/fullGCForwarding.hpp"
#include "gc/shared/gcArguments.hpp"
static size_t compute_heap_alignment() {
// The card marking array and the offset arrays for old generations are
// committed in os pages as well. Make sure they are entirely full (to
// avoid partial page problems), e.g. if 512 bytes of heap correspond to a
// 1-byte entry and the os page size is 4096, the maximum heap size should
// be 512*4096 = 2MB aligned.
size_t alignment = CardTable::ct_max_alignment_constraint();
if (UseLargePages) {
// In presence of large pages we have to make sure that our
// alignment is large page aware.
alignment = lcm(os::large_page_size(), alignment);
}
return alignment;
}
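The comment's arithmetic can be made concrete: with 512 bytes of heap per card-table byte and a 4 KiB OS page, one full page of card-table entries covers 2 MiB of heap, and large pages raise the result to an lcm of the two alignments. A small illustrative calculation (all values are assumptions, not HotSpot constants):

#include <cstddef>
#include <cstdio>
#include <numeric>  // std::lcm

int main() {
  const size_t bytes_per_card_entry = 512;
  const size_t os_page_size = 4096;
  const size_t card_alignment = bytes_per_card_entry * os_page_size;  // 2 MiB
  const size_t large_page_size = 2 * 1024 * 1024;                     // assumed
  const size_t heap_alignment = std::lcm(card_alignment, large_page_size);
  std::printf("card alignment %zu, heap alignment %zu\n",
              card_alignment, heap_alignment);
  return 0;
}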
void SerialArguments::initialize_alignments() {
// Initialize card size before initializing alignments
CardTable::initialize_card_size();
SpaceAlignment = (size_t)Generation::GenGrain;
HeapAlignment = compute_heap_alignment();
}
void SerialArguments::initialize() {
GCArguments::initialize();
FullGCForwarding::initialize_flags(MaxHeapSize);
}
size_t SerialArguments::conservative_max_heap_alignment() {
return MAX2((size_t)Generation::GenGrain, compute_heap_alignment());
}
CollectedHeap* SerialArguments::create_heap() {
return new SerialHeap();
}
size_t SerialArguments::young_gen_size_lower_bound() {
// The young generation must be aligned and have room for eden + two survivors
return 3 * SpaceAlignment;
}
size_t SerialArguments::old_gen_size_lower_bound() {
return SpaceAlignment;
}

View File

@ -1,5 +1,6 @@
/*
* Copyright (c) 2017, Red Hat, Inc. and/or its affiliates.
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -27,12 +28,14 @@
#include "gc/shared/genArguments.hpp"
class CollectedHeap;
class SerialArguments : public GenArguments {
private:
virtual void initialize_alignments();
virtual void initialize();
virtual size_t conservative_max_heap_alignment();
virtual CollectedHeap* create_heap();
virtual size_t young_gen_size_lower_bound();
virtual size_t old_gen_size_lower_bound();
};
#endif // SHARE_GC_SERIAL_SERIALARGUMENTS_HPP

View File

@ -304,9 +304,12 @@ HeapWord* SerialHeap::mem_allocate_work(size_t size, bool is_tlab) {
HeapWord* result = nullptr;
for (uint try_count = 1; /* break */; try_count++) {
result = mem_allocate_cas_noexpand(size, is_tlab);
if (result != nullptr) {
break;
{
ConditionalMutexLocker locker(Heap_lock, !is_init_completed());
result = mem_allocate_cas_noexpand(size, is_tlab);
if (result != nullptr) {
break;
}
}
uint gc_count_before; // Read inside the Heap_lock locked region.
{
@ -320,10 +323,15 @@ HeapWord* SerialHeap::mem_allocate_work(size_t size, bool is_tlab) {
}
if (!is_init_completed()) {
// Can't do GC; try heap expansion to satisfy the request.
result = expand_heap_and_allocate(size, is_tlab);
if (result != nullptr) {
return result;
// Double-checked locking; this ensures that is_init_completed() does not
// transition while expanding the heap.
MonitorLocker ml(InitCompleted_lock, Monitor::_no_safepoint_check_flag);
if (!is_init_completed()) {
// Can't do GC; try heap expansion to satisfy the request.
result = expand_heap_and_allocate(size, is_tlab);
if (result != nullptr) {
return result;
}
}
}

View File

@ -27,6 +27,7 @@
#include "cppstdlib/limits.hpp"
#include "gc/shared/freeListAllocator.hpp"
#include "runtime/atomic.hpp"
#include "utilities/debug.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/lockFreeStack.hpp"
@ -38,7 +39,7 @@ class BufferNode {
InternalSizeType _index;
InternalSizeType _capacity;
BufferNode* volatile _next;
Atomic<BufferNode*> _next;
void* _buffer[1]; // Pseudo flexible array member.
BufferNode(InternalSizeType capacity)
@ -58,11 +59,11 @@ public:
return std::numeric_limits<InternalSizeType>::max();
}
static BufferNode* volatile* next_ptr(BufferNode& bn) { return &bn._next; }
static Atomic<BufferNode*>* next_ptr(BufferNode& bn) { return &bn._next; }
typedef LockFreeStack<BufferNode, &next_ptr> Stack;
BufferNode* next() const { return _next; }
void set_next(BufferNode* n) { _next = n; }
BufferNode* next() const { return _next.load_relaxed(); }
void set_next(BufferNode* n) { _next.store_relaxed(n); }
size_t index() const { return _index; }
void set_index(size_t i) {

View File

@ -62,24 +62,6 @@ void GCArguments::initialize_heap_sizes() {
initialize_size_info();
}
size_t GCArguments::compute_heap_alignment() {
// The card marking array and the offset arrays for old generations are
// committed in os pages as well. Make sure they are entirely full (to
// avoid partial page problems), e.g. if 512 bytes of heap correspond to a
// 1-byte entry and the os page size is 4096, the maximum heap size should
// be 512*4096 = 2MB aligned.
size_t alignment = CardTable::ct_max_alignment_constraint();
if (UseLargePages) {
// In presence of large pages we have to make sure that our
// alignment is large page aware.
alignment = lcm(os::large_page_size(), alignment);
}
return alignment;
}
#ifdef ASSERT
void GCArguments::assert_flags() {
assert(InitialHeapSize <= MaxHeapSize, "Ergonomics decided on incompatible initial and maximum heap sizes");

View File

@ -45,6 +45,8 @@ protected:
public:
virtual void initialize();
// Return the (conservative) maximum heap alignment
virtual size_t conservative_max_heap_alignment() = 0;
// Used by heap size heuristics to determine max
@ -59,8 +61,6 @@ public:
}
void initialize_heap_sizes();
static size_t compute_heap_alignment();
};
#endif // SHARE_GC_SHARED_GCARGUMENTS_HPP

View File

@ -25,6 +25,7 @@
#include "runtime/mutex.hpp"
#include "runtime/mutexLocker.hpp"
#include "runtime/os.hpp"
#include "runtime/thread.hpp"
#include "utilities/ostream.hpp"
stringStream* GCLogPrecious::_lines = nullptr;
@ -83,7 +84,8 @@ void GCLogPrecious::print_on_error(outputStream* st) {
return;
}
if (!_lock->try_lock_without_rank_check()) {
if (Thread::current_or_null_safe() == nullptr ||
!_lock->try_lock_without_rank_check()) {
st->print_cr("<Skipped>\n");
return;
}

View File

@ -291,7 +291,7 @@
"size on systems with small physical memory size") \
range(0.0, 100.0) \
\
product(double, InitialRAMPercentage, 0.2, \
product(double, InitialRAMPercentage, 0.0, \
"Percentage of real memory used for initial heap size") \
range(0.0, 100.0) \
\

View File

@ -42,17 +42,6 @@ size_t MaxOldSize = 0;
// See more in JDK-8346005
size_t OldSize = ScaleForWordSize(4*M);
size_t GenArguments::conservative_max_heap_alignment() { return (size_t)Generation::GenGrain; }
static size_t young_gen_size_lower_bound() {
// The young generation must be aligned and have room for eden + two survivors
return 3 * SpaceAlignment;
}
static size_t old_gen_size_lower_bound() {
return SpaceAlignment;
}
size_t GenArguments::scale_by_NewRatio_aligned(size_t base_size, size_t alignment) {
return align_down_bounded(base_size / (NewRatio + 1), alignment);
}
@ -64,13 +53,6 @@ static size_t bound_minus_alignment(size_t desired_size,
return MIN2(desired_size, max_minus);
}
void GenArguments::initialize_alignments() {
// Initialize card size before initializing alignments
CardTable::initialize_card_size();
SpaceAlignment = (size_t)Generation::GenGrain;
HeapAlignment = compute_heap_alignment();
}
void GenArguments::initialize_heap_flags_and_sizes() {
GCArguments::initialize_heap_flags_and_sizes();

View File

@ -38,17 +38,16 @@ extern size_t OldSize;
class GenArguments : public GCArguments {
friend class TestGenCollectorPolicy; // Testing
private:
virtual void initialize_alignments();
virtual void initialize_size_info();
// Return the (conservative) maximum heap alignment
virtual size_t conservative_max_heap_alignment();
DEBUG_ONLY(void assert_flags();)
DEBUG_ONLY(void assert_size_info();)
static size_t scale_by_NewRatio_aligned(size_t base_size, size_t alignment);
virtual size_t young_gen_size_lower_bound() = 0;
virtual size_t old_gen_size_lower_bound() = 0;
protected:
virtual void initialize_heap_flags_and_sizes();
};

View File

@ -250,7 +250,7 @@ static JVMFlag::Error MaxSizeForHeapAlignment(const char* name, size_t value, bo
} else
#endif
{
heap_alignment = GCArguments::compute_heap_alignment();
heap_alignment = Arguments::conservative_max_heap_alignment();
}
return MaxSizeForAlignment(name, value, heap_alignment, verbose);
@ -285,7 +285,7 @@ JVMFlag::Error SoftMaxHeapSizeConstraintFunc(size_t value, bool verbose) {
JVMFlag::Error HeapBaseMinAddressConstraintFunc(size_t value, bool verbose) {
// If an overflow happened in Arguments::set_heap_size(), MaxHeapSize will have too large a value.
// Check for this by ensuring that MaxHeapSize plus the requested min base address still fit within max_uintx.
if (UseCompressedOops && FLAG_IS_ERGO(MaxHeapSize) && (value > (max_uintx - MaxHeapSize))) {
if (value > (max_uintx - MaxHeapSize)) {
JVMFlag::printError(verbose,
"HeapBaseMinAddress (%zu) or MaxHeapSize (%zu) is too large. "
"Sum of them must be less than or equal to maximum of size_t (%zu)\n",

View File

@ -27,7 +27,6 @@
#include "logging/log.hpp"
#include "memory/allocation.inline.hpp"
#include "oops/oop.inline.hpp"
#include "runtime/atomicAccess.hpp"
#include "runtime/mutexLocker.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
@ -85,28 +84,28 @@ SATBMarkQueueSet::~SATBMarkQueueSet() {
// remains set until the count is reduced to zero.
// Increment count. If count > threshold, set flag, else maintain flag.
static void increment_count(volatile size_t* cfptr, size_t threshold) {
static void increment_count(Atomic<size_t>* cfptr, size_t threshold) {
size_t old;
size_t value = AtomicAccess::load(cfptr);
size_t value = cfptr->load_relaxed();
do {
old = value;
value += 2;
assert(value > old, "overflow");
if (value > threshold) value |= 1;
value = AtomicAccess::cmpxchg(cfptr, old, value);
value = cfptr->compare_exchange(old, value);
} while (value != old);
}
// Decrement count. If count == 0, clear flag, else maintain flag.
static void decrement_count(volatile size_t* cfptr) {
static void decrement_count(Atomic<size_t>* cfptr) {
size_t old;
size_t value = AtomicAccess::load(cfptr);
size_t value = cfptr->load_relaxed();
do {
assert((value >> 1) != 0, "underflow");
old = value;
value -= 2;
if (value <= 1) value = 0;
value = AtomicAccess::cmpxchg(cfptr, old, value);
value = cfptr->compare_exchange(old, value);
} while (value != old);
}
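Both helpers above pack the buffer count and the process flag into one word: the count occupies the upper bits (hence the steps of 2) and bit 0 is the flag. A hedged stand-alone sketch of the same encoding with std::atomic (names and the threshold convention are illustrative):

#include <atomic>
#include <cstddef>

std::atomic<std::size_t> count_and_flag{0};

void on_buffer_added(std::size_t encoded_threshold) {
  std::size_t old = count_and_flag.load(std::memory_order_relaxed);
  std::size_t desired;
  do {
    desired = old + 2;                  // bump the count
    if (desired > encoded_threshold) {
      desired |= 1;                     // over threshold: set the flag bit
    }
  } while (!count_and_flag.compare_exchange_weak(old, desired,
                                                 std::memory_order_relaxed));
}

std::size_t buffer_count()   { return count_and_flag.load() >> 1; }
bool        should_process() { return (count_and_flag.load() & 1) != 0; }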
@ -332,7 +331,7 @@ void SATBMarkQueueSet::print_all(const char* msg) {
#endif // PRODUCT
void SATBMarkQueueSet::abandon_completed_buffers() {
AtomicAccess::store(&_count_and_process_flag, size_t(0));
_count_and_process_flag.store_relaxed(0u);
BufferNode* buffers_to_delete = _list.pop_all();
while (buffers_to_delete != nullptr) {
BufferNode* bn = buffers_to_delete;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2001, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2001, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -29,6 +29,7 @@
#include "memory/allocation.hpp"
#include "memory/padded.hpp"
#include "oops/oopsHierarchy.hpp"
#include "runtime/atomic.hpp"
class Thread;
class Monitor;
@ -87,7 +88,7 @@ class SATBMarkQueueSet: public PtrQueueSet {
DEFINE_PAD_MINUS_SIZE(1, DEFAULT_PADDING_SIZE, 0);
PaddedEnd<BufferNode::Stack> _list;
volatile size_t _count_and_process_flag;
Atomic<size_t> _count_and_process_flag;
// These are rarely (if ever) changed, so same cache line as count.
size_t _process_completed_buffers_threshold;
size_t _buffer_enqueue_threshold;
@ -148,12 +149,12 @@ public:
// The number of buffers in the list. Racy and not updated atomically
// with the set of completed buffers.
size_t completed_buffers_num() const {
return _count_and_process_flag >> 1;
return _count_and_process_flag.load_relaxed() >> 1;
}
// Return true if completed buffers should be processed.
bool process_completed_buffers() const {
return (_count_and_process_flag & 1) != 0;
return (_count_and_process_flag.load_relaxed() & 1) != 0;
}
#ifndef PRODUCT

View File

@ -37,6 +37,7 @@
#include "runtime/globals_extension.hpp"
#include "runtime/java.hpp"
#include "utilities/defaultStream.hpp"
#include "utilities/powerOfTwo.hpp"
void ShenandoahArguments::initialize() {
#if !(defined AARCH64 || defined AMD64 || defined PPC64 || defined RISCV64)
@ -205,7 +206,7 @@ void ShenandoahArguments::initialize() {
}
size_t ShenandoahArguments::conservative_max_heap_alignment() {
size_t align = ShenandoahMaxRegionSize;
size_t align = next_power_of_2(ShenandoahMaxRegionSize);
if (UseLargePages) {
align = MAX2(align, os::large_page_size());
}
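next_power_of_2 rounds the maximum region size so the alignment stays a power of two before being combined with the large page size. A common bit-smearing way to compute such a round-up, shown only as an illustration (this is not the powerOfTwo.hpp implementation, whose exact semantics may differ):

#include <cstdint>

// Smear the high bit to the right, then add one, yielding the smallest
// power of two not below the input.
inline uint64_t round_up_power_of_2_sketch(uint64_t v) {
  if (v <= 1) {
    return 1;
  }
  v -= 1;
  v |= v >> 1;  v |= v >> 2;  v |= v >> 4;
  v |= v >> 8;  v |= v >> 16; v |= v >> 32;
  return v + 1;
}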

View File

@ -208,13 +208,13 @@ oop ShenandoahGenerationalHeap::evacuate_object(oop p, Thread* thread) {
assert(ShenandoahThreadLocalData::is_evac_allowed(thread), "must be enclosed in oom-evac scope");
ShenandoahHeapRegion* r = heap_region_containing(p);
assert(!r->is_humongous(), "never evacuate humongous objects");
ShenandoahHeapRegion* from_region = heap_region_containing(p);
assert(!from_region->is_humongous(), "never evacuate humongous objects");
ShenandoahAffiliation target_gen = r->affiliation();
// gc_generation() can change asynchronously and should not be used here.
assert(active_generation() != nullptr, "Error");
if (active_generation()->is_young() && target_gen == YOUNG_GENERATION) {
// Try to keep the object in the same generation
const ShenandoahAffiliation target_gen = from_region->affiliation();
if (target_gen == YOUNG_GENERATION) {
markWord mark = p->mark();
if (mark.is_marked()) {
// Already forwarded.
@ -224,26 +224,31 @@ oop ShenandoahGenerationalHeap::evacuate_object(oop p, Thread* thread) {
if (mark.has_displaced_mark_helper()) {
// We don't want to deal with MT here just to ensure we read the right mark word.
// Skip the potential promotion attempt for this one.
} else if (age_census()->is_tenurable(r->age() + mark.age())) {
oop result = try_evacuate_object(p, thread, r, OLD_GENERATION);
} else if (age_census()->is_tenurable(from_region->age() + mark.age())) {
// If the object is tenurable, try to promote it
oop result = try_evacuate_object<YOUNG_GENERATION, OLD_GENERATION>(p, thread, from_region->age());
// If we failed to promote this aged object, we'll fall through to code below and evacuate to young-gen.
if (result != nullptr) {
return result;
}
// If we failed to promote this aged object, we'll fall through to code below and evacuate to young-gen.
}
return try_evacuate_object<YOUNG_GENERATION, YOUNG_GENERATION>(p, thread, from_region->age());
}
return try_evacuate_object(p, thread, r, target_gen);
assert(target_gen == OLD_GENERATION, "Expected evacuation to old");
return try_evacuate_object<OLD_GENERATION, OLD_GENERATION>(p, thread, from_region->age());
}
// try_evacuate_object registers the object and dirties the associated remembered set information when evacuating
// to OLD_GENERATION.
oop ShenandoahGenerationalHeap::try_evacuate_object(oop p, Thread* thread, ShenandoahHeapRegion* from_region,
ShenandoahAffiliation target_gen) {
template<ShenandoahAffiliation FROM_GENERATION, ShenandoahAffiliation TO_GENERATION>
oop ShenandoahGenerationalHeap::try_evacuate_object(oop p, Thread* thread, uint from_region_age) {
bool alloc_from_lab = true;
bool has_plab = false;
HeapWord* copy = nullptr;
size_t size = ShenandoahForwarding::size(p);
bool is_promotion = (target_gen == OLD_GENERATION) && from_region->is_young();
constexpr bool is_promotion = (TO_GENERATION == OLD_GENERATION) && (FROM_GENERATION == YOUNG_GENERATION);
#ifdef ASSERT
if (ShenandoahOOMDuringEvacALot &&
@ -252,7 +257,7 @@ oop ShenandoahGenerationalHeap::try_evacuate_object(oop p, Thread* thread, Shena
} else {
#endif
if (UseTLAB) {
switch (target_gen) {
switch (TO_GENERATION) {
case YOUNG_GENERATION: {
copy = allocate_from_gclab(thread, size);
if ((copy == nullptr) && (size < ShenandoahThreadLocalData::gclab_size(thread))) {
@ -300,7 +305,7 @@ oop ShenandoahGenerationalHeap::try_evacuate_object(oop p, Thread* thread, Shena
if (copy == nullptr) {
// If we failed to allocate in LAB, we'll try a shared allocation.
if (!is_promotion || !has_plab || (size > PLAB::min_size())) {
ShenandoahAllocRequest req = ShenandoahAllocRequest::for_shared_gc(size, target_gen, is_promotion);
ShenandoahAllocRequest req = ShenandoahAllocRequest::for_shared_gc(size, TO_GENERATION, is_promotion);
copy = allocate_memory(req);
alloc_from_lab = false;
}
@ -314,8 +319,8 @@ oop ShenandoahGenerationalHeap::try_evacuate_object(oop p, Thread* thread, Shena
#endif
if (copy == nullptr) {
if (target_gen == OLD_GENERATION) {
if (from_region->is_young()) {
if (TO_GENERATION == OLD_GENERATION) {
if (FROM_GENERATION == YOUNG_GENERATION) {
// Signal that promotion failed. Will evacuate this old object somewhere in young gen.
old_generation()->handle_failed_promotion(thread, size);
return nullptr;
@ -327,14 +332,12 @@ oop ShenandoahGenerationalHeap::try_evacuate_object(oop p, Thread* thread, Shena
}
control_thread()->handle_alloc_failure_evac(size);
oom_evac_handler()->handle_out_of_memory_during_evacuation();
return ShenandoahBarrierSet::resolve_forwarded(p);
}
if (ShenandoahEvacTracking) {
evac_tracker()->begin_evacuation(thread, size * HeapWordSize, from_region->affiliation(), target_gen);
evac_tracker()->begin_evacuation(thread, size * HeapWordSize, FROM_GENERATION, TO_GENERATION);
}
// Copy the object:
@ -342,8 +345,8 @@ oop ShenandoahGenerationalHeap::try_evacuate_object(oop p, Thread* thread, Shena
oop copy_val = cast_to_oop(copy);
// Update the age of the evacuated object
if (target_gen == YOUNG_GENERATION && is_aging_cycle()) {
ShenandoahHeap::increase_object_age(copy_val, from_region->age() + 1);
if (TO_GENERATION == YOUNG_GENERATION && is_aging_cycle()) {
increase_object_age(copy_val, from_region_age + 1);
}
// Try to install the new forwarding pointer.
@ -360,18 +363,12 @@ oop ShenandoahGenerationalHeap::try_evacuate_object(oop p, Thread* thread, Shena
if (ShenandoahEvacTracking) {
// Record that the evacuation succeeded
evac_tracker()->end_evacuation(thread, size * HeapWordSize, from_region->affiliation(), target_gen);
evac_tracker()->end_evacuation(thread, size * HeapWordSize, FROM_GENERATION, TO_GENERATION);
}
if (target_gen == OLD_GENERATION) {
old_generation()->handle_evacuation(copy, size, from_region->is_young());
} else {
// When copying to the old generation above, we don't care
// about recording object age in the census stats.
assert(target_gen == YOUNG_GENERATION, "Error");
if (TO_GENERATION == OLD_GENERATION) {
old_generation()->handle_evacuation(copy, size);
}
shenandoah_assert_correct(nullptr, copy_val);
return copy_val;
} else {
// Failed to evacuate. We need to deal with the object that is left behind. Since this
// new allocation is certainly after TAMS, it will be considered live in the next cycle.
@ -382,7 +379,7 @@ oop ShenandoahGenerationalHeap::try_evacuate_object(oop p, Thread* thread, Shena
// For LAB allocations, it is enough to rollback the allocation ptr. Either the next
// object will overwrite this stale copy, or the filler object on LAB retirement will
// do this.
switch (target_gen) {
switch (TO_GENERATION) {
case YOUNG_GENERATION: {
ShenandoahThreadLocalData::gclab(thread)->undo_allocation(copy, size);
break;
@ -405,14 +402,16 @@ oop ShenandoahGenerationalHeap::try_evacuate_object(oop p, Thread* thread, Shena
// we have to keep the fwdptr initialized and pointing to our (stale) copy.
assert(size >= ShenandoahHeap::min_fill_size(), "previously allocated object known to be larger than min_size");
fill_with_object(copy, size);
shenandoah_assert_correct(nullptr, copy_val);
// For non-LAB allocations, the object has already been registered
}
shenandoah_assert_correct(nullptr, result);
return result;
}
shenandoah_assert_correct(nullptr, result);
return result;
}
template oop ShenandoahGenerationalHeap::try_evacuate_object<YOUNG_GENERATION, YOUNG_GENERATION>(oop p, Thread* thread, uint from_region_age);
template oop ShenandoahGenerationalHeap::try_evacuate_object<YOUNG_GENERATION, OLD_GENERATION>(oop p, Thread* thread, uint from_region_age);
template oop ShenandoahGenerationalHeap::try_evacuate_object<OLD_GENERATION, OLD_GENERATION>(oop p, Thread* thread, uint from_region_age);
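Templating on the source and target generation turns is_promotion into a compile-time constant, so each of the three instantiations above keeps only its own branch. A toy sketch of that pattern (the enum and function names are invented):

#include <cstdio>

enum Generation { YOUNG, OLD };

template <Generation FROM, Generation TO>
void evacuate() {
  constexpr bool is_promotion = (FROM == YOUNG) && (TO == OLD);
  if constexpr (is_promotion) {
    std::puts("promotion path");
  } else {
    std::puts("same-generation path");
  }
}

// Mirroring the three supported combinations above.
template void evacuate<YOUNG, YOUNG>();
template void evacuate<YOUNG, OLD>();
template void evacuate<OLD, OLD>();

int main() {
  evacuate<YOUNG, OLD>();
  return 0;
}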
inline HeapWord* ShenandoahGenerationalHeap::allocate_from_plab(Thread* thread, size_t size, bool is_promotion) {
assert(UseTLAB, "TLABs should be enabled");

View File

@ -87,7 +87,9 @@ public:
void update_region_ages(ShenandoahMarkingContext* ctx);
oop evacuate_object(oop p, Thread* thread) override;
oop try_evacuate_object(oop p, Thread* thread, ShenandoahHeapRegion* from_region, ShenandoahAffiliation target_gen);
template<ShenandoahAffiliation FROM_REGION, ShenandoahAffiliation TO_REGION>
oop try_evacuate_object(oop p, Thread* thread, uint from_region_age);
// In the generational mode, we will use these two functions for young, mixed, and global collections.
// For young and mixed, the generation argument will be the young generation, otherwise it will be the global generation.

View File

@ -34,4 +34,5 @@ inline bool ShenandoahGenerationalHeap::is_tenurable(const ShenandoahHeapRegion*
return _age_census->is_tenurable(r->age());
}
#endif // SHARE_GC_SHENANDOAH_SHENANDOAHGENERATIONALHEAP_INLINE_HPP

View File

@ -1015,7 +1015,7 @@ HeapWord* ShenandoahHeap::allocate_memory_under_lock(ShenandoahAllocRequest& req
// Record the plab configuration for this result and register the object.
if (result != nullptr && req.is_old()) {
old_generation()->configure_plab_for_current_thread(req);
if (req.type() == ShenandoahAllocRequest::_alloc_shared_gc) {
if (!req.is_lab_alloc()) {
// Register the newly allocated object while we're holding the global lock since there's no synchronization
// built in to the implementation of register_object(). There are potential races when multiple independent
// threads are allocating objects, some of which might span the same card region. For example, consider

View File

@ -619,7 +619,7 @@ void ShenandoahOldGeneration::log_failed_promotion(LogStream& ls, Thread* thread
}
}
void ShenandoahOldGeneration::handle_evacuation(HeapWord* obj, size_t words, bool promotion) {
void ShenandoahOldGeneration::handle_evacuation(HeapWord* obj, size_t words) const {
// Only register the copy of the object that won the evacuation race.
_card_scan->register_object_without_lock(obj);

View File

@ -179,7 +179,7 @@ public:
void log_failed_promotion(LogStream& ls, Thread* thread, size_t size) const;
// A successful evacuation re-dirties the cards and registers the object with the remembered set
void handle_evacuation(HeapWord* obj, size_t words, bool promotion);
void handle_evacuation(HeapWord* obj, size_t words) const;
// Clear the flag after it is consumed by the control thread
bool clear_failed_evacuation() {

View File

@ -1087,7 +1087,6 @@ private:
ZRelocateSmallAllocator _small_allocator;
ZRelocateMediumAllocator _medium_allocator;
const size_t _total_forwardings;
volatile size_t _numa_local_forwardings;
public:
ZRelocateTask(ZRelocationSet* relocation_set,
@ -1104,8 +1103,7 @@ public:
_medium_targets(medium_targets),
_small_allocator(_generation),
_medium_allocator(_generation, shared_medium_targets),
_total_forwardings(relocation_set->nforwardings()),
_numa_local_forwardings(0) {
_total_forwardings(relocation_set->nforwardings()) {
for (uint32_t i = 0; i < ZNUMA::count(); i++) {
ZRelocationSetParallelIterator* const iter = _iters->addr(i);
@ -1124,18 +1122,17 @@ public:
// Signal that we're not using the queue anymore. Used mostly for asserts.
_queue->deactivate();
if (ZNUMA::is_enabled()) {
log_debug(gc, reloc, numa)("Forwardings relocated NUMA-locally: %zu / %zu (%.0f%%)",
_numa_local_forwardings, _total_forwardings, percent_of(_numa_local_forwardings, _total_forwardings));
}
}
virtual void work() {
ZRelocateWork<ZRelocateSmallAllocator> small(&_small_allocator, _small_targets->addr(), _generation);
ZRelocateWork<ZRelocateMediumAllocator> medium(&_medium_allocator, _medium_targets->addr(), _generation);
const uint32_t num_nodes = ZNUMA::count();
uint32_t numa_local_forwardings_worker = 0;
const uint32_t start_node = ZNUMA::id();
uint32_t current_node = start_node;
bool has_affinity = false;
bool has_affinity_current_node = false;
const auto do_forwarding = [&](ZForwarding* forwarding) {
ZPage* const page = forwarding->page();
@ -1167,26 +1164,30 @@ public:
const auto do_forwarding_one_from_iter = [&]() {
ZForwarding* forwarding;
const uint32_t start_node = ZNUMA::id();
uint32_t current_node = start_node;
for (uint32_t i = 0; i < num_nodes; i++) {
for (;;) {
if (_iters->get(current_node).next_if(&forwarding, check_numa_local, current_node)) {
claim_and_do_forwarding(forwarding);
if (current_node == start_node) {
// Track if this forwarding was relocated on the local NUMA node
numa_local_forwardings_worker++;
// Set thread affinity for NUMA-local processing (if needed)
if (UseNUMA && !has_affinity_current_node) {
os::numa_set_thread_affinity(Thread::current(), ZNUMA::numa_id_to_node(current_node));
has_affinity = true;
has_affinity_current_node = true;
}
// Perform the forwarding task
claim_and_do_forwarding(forwarding);
return true;
}
// Check next node.
// No work found on the current node, move to the next node
current_node = (current_node + 1) % num_nodes;
}
has_affinity_current_node = false;
return false;
// If we've looped back to the starting node there's no more work to do
if (current_node == start_node) {
return false;
}
}
};
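The lambda above searches for work starting at the local NUMA node, walks the remaining nodes round-robin when nothing is found, and gives up once it wraps back to the start. A compact illustrative sketch of that loop shape (the claim callback is a made-up stand-in):

#include <cstdint>

bool claim_one(uint32_t num_nodes, uint32_t start_node,
               bool (*try_claim_on_node)(uint32_t node)) {
  uint32_t current = start_node;
  for (;;) {
    if (try_claim_on_node(current)) {
      return true;                        // found and claimed work on this node
    }
    current = (current + 1) % num_nodes;  // no work here: try the next node
    if (current == start_node) {
      return false;                       // wrapped around: nothing left anywhere
    }
  }
}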
for (;;) {
@ -1209,11 +1210,13 @@ public:
}
}
if (ZNUMA::is_enabled()) {
AtomicAccess::add(&_numa_local_forwardings, numa_local_forwardings_worker, memory_order_relaxed);
}
_queue->leave();
if (UseNUMA && has_affinity) {
// Restore the affinity of the thread so that it isn't bound to a specific
// node any more
os::numa_set_thread_affinity(Thread::current(), -1);
}
}
virtual void resize_workers(uint nworkers) {

View File

@ -130,7 +130,10 @@ static void z_verify_root_oop_object(zaddress addr, void* p) {
static void z_verify_old_oop(zpointer* p) {
const zpointer o = *p;
assert(o != zpointer::null, "Old should not contain raw null");
if (o == zpointer::null) {
guarantee(ZGeneration::young()->is_phase_mark_complete(), "Only possible when flip promoting");
guarantee(ZHeap::heap()->page(p)->is_allocating(), "Raw nulls only possible in allocating pages");
}
if (!z_is_null_relaxed(o)) {
if (ZPointer::is_mark_good(o)) {
// Even though the pointer is mark good, we can't verify that it should

View File

@ -217,7 +217,8 @@ static bool compute_top_frame(const JfrSampleRequest& request, frame& top_frame,
const PcDesc* const pc_desc = get_pc_desc(sampled_nm, sampled_pc);
if (is_valid(pc_desc)) {
intptr_t* const synthetic_sp = sender_sp - sampled_nm->frame_size();
top_frame = frame(synthetic_sp, synthetic_sp, sender_sp - 2, pc_desc->real_pc(sampled_nm), sampled_nm);
intptr_t* const synthetic_fp = sender_sp AARCH64_ONLY( - frame::sender_sp_offset);
top_frame = frame(synthetic_sp, synthetic_sp, synthetic_fp, pc_desc->real_pc(sampled_nm), sampled_nm);
in_continuation = is_in_continuation(top_frame, jt);
return true;
}

View File

@ -437,7 +437,7 @@ ReservedSpace HeapReserver::Instance::try_reserve_range(char *highest_start,
if (reserved.is_reserved()) {
if (reserved.base() >= aligned_heap_base_min_address &&
size <= (uintptr_t)(upper_bound - reserved.base())) {
size <= (size_t)(upper_bound - reserved.base())) {
// Got a successful reservation.
return reserved;
}
@ -546,16 +546,16 @@ ReservedHeapSpace HeapReserver::Instance::reserve_compressed_oops_heap(const siz
const size_t attach_point_alignment = lcm(alignment, os_attach_point_alignment);
char* aligned_heap_base_min_address = align_up((char*)HeapBaseMinAddress, alignment);
size_t noaccess_prefix = ((aligned_heap_base_min_address + size) > (char*)OopEncodingHeapMax) ?
uintptr_t aligned_heap_base_min_address = align_up(MAX2(HeapBaseMinAddress, alignment), alignment);
size_t noaccess_prefix = ((aligned_heap_base_min_address + size) > OopEncodingHeapMax) ?
noaccess_prefix_size : 0;
ReservedSpace reserved{};
// Attempt to alloc at user-given address.
if (!FLAG_IS_DEFAULT(HeapBaseMinAddress)) {
reserved = try_reserve_memory(size + noaccess_prefix, alignment, page_size, aligned_heap_base_min_address);
if (reserved.base() != aligned_heap_base_min_address) { // Enforce this exact address.
reserved = try_reserve_memory(size + noaccess_prefix, alignment, page_size, (char*)aligned_heap_base_min_address);
if (reserved.base() != (char*)aligned_heap_base_min_address) { // Enforce this exact address.
release(reserved);
reserved = {};
}
@ -575,38 +575,41 @@ ReservedHeapSpace HeapReserver::Instance::reserve_compressed_oops_heap(const siz
// Attempt to allocate so that we can run without base and scale (32-Bit unscaled compressed oops).
// Give it several tries from top of range to bottom.
if (aligned_heap_base_min_address + size <= (char *)UnscaledOopHeapMax) {
if (aligned_heap_base_min_address + size <= UnscaledOopHeapMax) {
// Calc address range within we try to attach (range of possible start addresses).
char* const highest_start = align_down((char *)UnscaledOopHeapMax - size, attach_point_alignment);
char* const lowest_start = align_up(aligned_heap_base_min_address, attach_point_alignment);
reserved = try_reserve_range(highest_start, lowest_start, attach_point_alignment,
aligned_heap_base_min_address, (char *)UnscaledOopHeapMax, size, alignment, page_size);
uintptr_t const highest_start = align_down(UnscaledOopHeapMax - size, attach_point_alignment);
uintptr_t const lowest_start = align_up(aligned_heap_base_min_address, attach_point_alignment);
assert(lowest_start <= highest_start, "lowest: " INTPTR_FORMAT " highest: " INTPTR_FORMAT ,
lowest_start, highest_start);
reserved = try_reserve_range((char*)highest_start, (char*)lowest_start, attach_point_alignment,
(char*)aligned_heap_base_min_address, (char*)UnscaledOopHeapMax, size, alignment, page_size);
}
// zerobased: Attempt to allocate in the lower 32G.
char *zerobased_max = (char *)OopEncodingHeapMax;
const uintptr_t zerobased_max = OopEncodingHeapMax;
// Give it several tries from top of range to bottom.
if (aligned_heap_base_min_address + size <= zerobased_max && // Zerobased theoretical possible.
((!reserved.is_reserved()) || // No previous try succeeded.
(reserved.end() > zerobased_max))) { // Unscaled delivered an arbitrary address.
(reserved.end() > (char*)zerobased_max))) { // Unscaled delivered an arbitrary address.
// Release previous reservation
release(reserved);
// Calc address range within we try to attach (range of possible start addresses).
char *const highest_start = align_down(zerobased_max - size, attach_point_alignment);
uintptr_t const highest_start = align_down(zerobased_max - size, attach_point_alignment);
// Need to be careful about size being guaranteed to be less
// than UnscaledOopHeapMax due to type constraints.
char *lowest_start = aligned_heap_base_min_address;
uint64_t unscaled_end = UnscaledOopHeapMax - size;
if (unscaled_end < UnscaledOopHeapMax) { // unscaled_end wrapped if size is large
lowest_start = MAX2(lowest_start, (char*)unscaled_end);
uintptr_t lowest_start = aligned_heap_base_min_address;
if (size < UnscaledOopHeapMax) {
lowest_start = MAX2<uintptr_t>(lowest_start, UnscaledOopHeapMax - size);
}
lowest_start = align_up(lowest_start, attach_point_alignment);
reserved = try_reserve_range(highest_start, lowest_start, attach_point_alignment,
aligned_heap_base_min_address, zerobased_max, size, alignment, page_size);
assert(lowest_start <= highest_start, "lowest: " INTPTR_FORMAT " highest: " INTPTR_FORMAT,
lowest_start, highest_start);
reserved = try_reserve_range((char*)highest_start, (char*)lowest_start, attach_point_alignment,
(char*)aligned_heap_base_min_address, (char*)zerobased_max, size, alignment, page_size);
}
// Now we go for heaps with base != 0. We need a noaccess prefix to efficiently
@ -616,17 +619,17 @@ ReservedHeapSpace HeapReserver::Instance::reserve_compressed_oops_heap(const siz
// Try to attach at addresses that are aligned to OopEncodingHeapMax. Disjointbase mode.
char** addresses = get_attach_addresses_for_disjoint_mode();
int i = 0;
while ((addresses[i] != nullptr) && // End of array not yet reached.
((!reserved.is_reserved()) || // No previous try succeeded.
(reserved.end() > zerobased_max && // Not zerobased or unscaled address.
// Not disjoint address.
while ((addresses[i] != nullptr) && // End of array not yet reached.
((!reserved.is_reserved()) || // No previous try succeeded.
(reserved.end() > (char*)zerobased_max && // Not zerobased or unscaled address.
// Not disjoint address.
!CompressedOops::is_disjoint_heap_base_address((address)reserved.base())))) {
// Release previous reservation
release(reserved);
char* const attach_point = addresses[i];
assert(attach_point >= aligned_heap_base_min_address, "Flag support broken");
assert((uintptr_t)attach_point >= aligned_heap_base_min_address, "Flag support broken");
reserved = try_reserve_memory(size + noaccess_prefix, alignment, page_size, attach_point);
i++;
}

View File

@ -326,7 +326,7 @@ bool RegionNode::is_unreachable_region(const PhaseGVN* phase) {
// First, cut the simple case of fallthrough region when NONE of
// region's phis references itself directly or through a data node.
if (is_possible_unsafe_loop(phase)) {
if (is_possible_unsafe_loop()) {
// If we have a possible unsafe loop, check if the region node is actually unreachable from root.
if (is_unreachable_from_root(phase)) {
_is_unreachable_region = true;
@ -336,7 +336,7 @@ bool RegionNode::is_unreachable_region(const PhaseGVN* phase) {
return false;
}
bool RegionNode::is_possible_unsafe_loop(const PhaseGVN* phase) const {
bool RegionNode::is_possible_unsafe_loop() const {
uint max = outcnt();
uint i;
for (i = 0; i < max; i++) {
@ -634,8 +634,8 @@ Node *RegionNode::Ideal(PhaseGVN *phase, bool can_reshape) {
}
} else if (can_reshape && cnt == 1) {
// Is it dead loop?
// If it is LoopNopde it had 2 (+1 itself) inputs and
// one of them was cut. The loop is dead if it was EntryContol.
// If it is LoopNode it had 2 (+1 itself) inputs and
// one of them was cut. The loop is dead if it was EntryControl.
// Loop node may have only one input because entry path
// is removed in PhaseIdealLoop::Dominators().
assert(!this->is_Loop() || cnt_orig <= 3, "Loop node should have 3 or less inputs");
@ -1392,7 +1392,7 @@ bool PhiNode::try_clean_memory_phi(PhaseIterGVN* igvn) {
}
assert(is_diamond_phi() > 0, "sanity");
assert(req() == 3, "same as region");
const Node* region = in(0);
RegionNode* region = in(0)->as_Region();
for (uint i = 1; i < 3; i++) {
Node* phi_input = in(i);
if (phi_input != nullptr && phi_input->is_MergeMem() && region->in(i)->outcnt() == 1) {
@ -1400,8 +1400,9 @@ bool PhiNode::try_clean_memory_phi(PhaseIterGVN* igvn) {
MergeMemNode* merge_mem = phi_input->as_MergeMem();
uint j = 3 - i;
Node* other_phi_input = in(j);
if (other_phi_input != nullptr && other_phi_input == merge_mem->base_memory()) {
if (other_phi_input != nullptr && other_phi_input == merge_mem->base_memory() && !is_data_loop(region, phi_input, igvn)) {
// merge_mem is a successor memory to other_phi_input, and is not pinned inside the diamond, so push it out.
// Only proceed if the transformation doesn't create a data loop.
// This will allow the diamond to collapse completely if there are no other phis left.
igvn->replace_node(this, merge_mem);
return true;

View File

@ -84,7 +84,7 @@ private:
bool _is_unreachable_region;
LoopStatus _loop_status;
bool is_possible_unsafe_loop(const PhaseGVN* phase) const;
bool is_possible_unsafe_loop() const;
bool is_unreachable_from_root(const PhaseGVN* phase) const;
public:
// Node layout (parallels PhiNode):

View File

@ -1471,6 +1471,65 @@ static OptoReg::Name find_first_set(LRG& lrg, RegMask& mask) {
return assigned;
}
OptoReg::Name PhaseChaitin::select_bias_lrg_color(LRG& lrg) {
uint bias_lrg1_idx = _lrg_map.find(lrg._copy_bias);
uint bias_lrg2_idx = _lrg_map.find(lrg._copy_bias2);
// If bias_lrg1 has a color
if (bias_lrg1_idx != 0 && !_ifg->_yanked->test(bias_lrg1_idx)) {
OptoReg::Name reg = lrgs(bias_lrg1_idx).reg();
// and it is legal for lrg
if (is_legal_reg(lrg, reg)) {
return reg;
}
}
// If bias_lrg2 has a color
if (bias_lrg2_idx != 0 && !_ifg->_yanked->test(bias_lrg2_idx)) {
OptoReg::Name reg = lrgs(bias_lrg2_idx).reg();
// and it is legal for lrg
if (is_legal_reg(lrg, reg)) {
return reg;
}
}
uint bias_lrg_idx = 0;
if (bias_lrg1_idx != 0 && bias_lrg2_idx != 0) {
// Since none of the bias live ranges are part of the IFG yet, constrain the
// definition mask with the bias live range with the least degrees of
// freedom. This will increase the chances of register sharing once the bias
// live range becomes part of the IFG.
lrgs(bias_lrg1_idx).compute_set_mask_size();
lrgs(bias_lrg2_idx).compute_set_mask_size();
bias_lrg_idx = lrgs(bias_lrg1_idx).degrees_of_freedom() >
lrgs(bias_lrg2_idx).degrees_of_freedom()
? bias_lrg2_idx
: bias_lrg1_idx;
} else if (bias_lrg1_idx != 0) {
bias_lrg_idx = bias_lrg1_idx;
} else if (bias_lrg2_idx != 0) {
bias_lrg_idx = bias_lrg2_idx;
}
// Register masks with an offset exclude all mask bits before the offset.
// Such masks are mainly used for allocation from stack slots. Constrain the
// register mask of the definition live range with the bias mask only if
// both masks have zero offset.
if (bias_lrg_idx != 0 && !lrg.mask().is_offset() &&
!lrgs(bias_lrg_idx).mask().is_offset()) {
// Choose a color which is legal for bias_lrg
ResourceMark rm(C->regmask_arena());
RegMask tempmask(lrg.mask(), C->regmask_arena());
tempmask.and_with(lrgs(bias_lrg_idx).mask());
tempmask.clear_to_sets(lrg.num_regs());
OptoReg::Name reg = find_first_set(lrg, tempmask);
if (OptoReg::is_valid(reg)) {
return reg;
}
}
return OptoReg::Bad;
}
// Choose a color using the biasing heuristic
OptoReg::Name PhaseChaitin::bias_color(LRG& lrg) {
@ -1492,25 +1551,10 @@ OptoReg::Name PhaseChaitin::bias_color(LRG& lrg) {
}
}
uint copy_lrg = _lrg_map.find(lrg._copy_bias);
if (copy_lrg != 0) {
// If he has a color,
if(!_ifg->_yanked->test(copy_lrg)) {
OptoReg::Name reg = lrgs(copy_lrg).reg();
// And it is legal for you,
if (is_legal_reg(lrg, reg)) {
return reg;
}
} else if (!lrg.mask().is_offset()) {
// Choose a color which is legal for him
ResourceMark rm(C->regmask_arena());
RegMask tempmask(lrg.mask(), C->regmask_arena());
tempmask.and_with(lrgs(copy_lrg).mask());
tempmask.clear_to_sets(lrg.num_regs());
OptoReg::Name reg = find_first_set(lrg, tempmask);
if (OptoReg::is_valid(reg))
return reg;
}
// Try biasing the color with non-interfering bias live range[s].
OptoReg::Name reg = select_bias_lrg_color(lrg);
if (OptoReg::is_valid(reg)) {
return reg;
}
// If no bias info exists, just go with the register selection ordering
@ -1524,7 +1568,7 @@ OptoReg::Name PhaseChaitin::bias_color(LRG& lrg) {
// CNC - Fun hack. Alternate 1st and 2nd selection. Enables post-allocate
// copy removal to remove many more copies, by preventing a just-assigned
// register from being repeatedly assigned.
OptoReg::Name reg = lrg.mask().find_first_elem();
reg = lrg.mask().find_first_elem();
if( (++_alternate & 1) && OptoReg::is_valid(reg) ) {
// This 'Remove; find; Insert' idiom is an expensive way to find the
// SECOND element in the mask.
@ -1640,6 +1684,27 @@ uint PhaseChaitin::Select( ) {
}
}
}
Node* def = lrg->_def;
if (lrg->is_singledef() && !lrg->_is_bound && def->is_Mach()) {
MachNode* mdef = def->as_Mach();
if (Matcher::is_register_biasing_candidate(mdef, 1)) {
Node* in1 = mdef->in(mdef->operand_index(1));
if (in1 != nullptr && lrg->_copy_bias == 0) {
lrg->_copy_bias = _lrg_map.find(in1);
}
}
// For commutative operations, def allocation can also be
// biased towards the LRG of the second input's def.
if (Matcher::is_register_biasing_candidate(mdef, 2)) {
Node* in2 = mdef->in(mdef->operand_index(2));
if (in2 != nullptr && lrg->_copy_bias2 == 0) {
lrg->_copy_bias2 = _lrg_map.find(in2);
}
}
}
//assert(is_infinite_stack == lrg->mask().is_infinite_stack(), "nbrs must not change InfiniteStackedness");
// Aligned pairs need aligned masks
assert(!lrg->_is_vector || !lrg->_fat_proj, "sanity");
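
Editor's note: the new select_bias_lrg_color above constrains the definition's register mask with the bias live range's mask and takes the first legal register. As a rough analogy only (RegMask is not a plain bitset, and the degrees-of-freedom tie-break and offset handling are simplified away), the mask-intersection step looks like this in Java, with BitSet standing in for RegMask and all names made up for illustration:

    import java.util.BitSet;

    public class BiasColorSketch {
        /**
         * Intersect the definition's allowed registers with the bias live
         * range's allowed registers and return the first register in the
         * intersection, or -1 if the masks do not overlap.
         */
        static int selectBiasedColor(BitSet defMask, BitSet biasMask) {
            BitSet temp = (BitSet) defMask.clone(); // do not clobber the original mask
            temp.and(biasMask);                     // tempmask.and_with(bias mask) analogue
            return temp.nextSetBit(0);              // find_first_set analogue
        }

        public static void main(String[] args) {
            BitSet def = new BitSet();
            def.set(0, 8);        // definition may live in r0..r7
            BitSet bias = new BitSet();
            bias.set(5, 12);      // bias live range prefers r5..r11
            System.out.println("biased color: r" + selectBiasedColor(def, bias)); // prints r5
        }
    }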

View File

@ -63,6 +63,7 @@ public:
uint _risk_bias; // Index of LRG which we want to avoid color
uint _copy_bias; // Index of LRG which we want to share color
uint _copy_bias2; // Index of second LRG which we want to share color
uint _next; // Index of next LRG in linked list
uint _prev; // Index of prev LRG in linked list
@ -703,6 +704,8 @@ private:
OptoReg::Name choose_color(LRG& lrg);
// Helper function which implements biasing heuristic
OptoReg::Name bias_color(LRG& lrg);
// Helper function which implements color biasing
OptoReg::Name select_bias_lrg_color(LRG& lrg);
// Split uncolorable live ranges
// Return new number of live ranges

View File

@ -35,6 +35,97 @@
#ifndef PRODUCT
// Support for printing properties
class PrintProperties
{
private:
IdealGraphPrinter* _printer;
public:
PrintProperties(IdealGraphPrinter* printer) : _printer(printer) {}
void print_node_properties(Node* node);
void print_lrg_properties(const LRG& lrg, const char* buffer);
void print_property(int flag, const char* name);
void print_property(int flag, const char* name, const char* val);
void print_property(int flag, const char* name, int val);
};
void PrintProperties::print_node_properties(Node* node) {
const jushort flags = node->flags();
print_property((flags & Node::Flag_is_Copy), "is_copy");
print_property((flags & Node::Flag_rematerialize), "rematerialize");
print_property((flags & Node::Flag_needs_anti_dependence_check), "needs_anti_dependence_check");
print_property((flags & Node::Flag_is_macro), "is_macro");
print_property((flags & Node::Flag_is_Con), "is_con");
print_property((flags & Node::Flag_is_cisc_alternate), "is_cisc_alternate");
print_property((flags & Node::Flag_is_dead_loop_safe), "is_dead_loop_safe");
print_property((flags & Node::Flag_may_be_short_branch), "may_be_short_branch");
print_property((flags & Node::Flag_has_call), "has_call");
print_property((flags & Node::Flag_has_swapped_edges), "has_swapped_edges");
Matcher* matcher = _printer->C->matcher();
if (matcher != nullptr) {
print_property(matcher->is_shared(node), "is_shared");
print_property(!(matcher->is_shared(node)), "is_shared", IdealGraphPrinter::FALSE_VALUE);
print_property(matcher->is_dontcare(node), "is_dontcare");
print_property(!(matcher->is_dontcare(node)), "is_dontcare", IdealGraphPrinter::FALSE_VALUE);
Node* old = matcher->find_old_node(node);
if (old != nullptr) {
print_property(true, "old_node_idx", old->_idx);
}
}
}
void PrintProperties::print_lrg_properties(const LRG &lrg, const char *buffer) {
print_property(true, "mask", buffer);
print_property(true, "mask_size", lrg.mask_size());
if (lrg._degree_valid) {
print_property(true, "degree", lrg.degree());
}
print_property(true, "num_regs", lrg.num_regs());
print_property(true, "reg_pressure", lrg.reg_pressure());
print_property(true, "cost", lrg._cost);
print_property(true, "area", lrg._area);
print_property(true, "score", lrg.score());
print_property((lrg._risk_bias != 0), "risk_bias", lrg._risk_bias);
print_property((lrg._copy_bias != 0), "copy_bias", lrg._copy_bias);
print_property((lrg._copy_bias2 != 0), "copy_bias2", lrg._copy_bias2);
print_property(lrg.is_singledef(), "is_singledef");
print_property(lrg.is_multidef(), "is_multidef");
print_property(lrg._is_oop, "is_oop");
print_property(lrg._is_float, "is_float");
print_property(lrg._is_vector, "is_vector");
print_property(lrg._is_predicate, "is_predicate");
print_property(lrg._is_scalable, "is_scalable");
print_property(lrg._was_spilled1, "was_spilled1");
print_property(lrg._was_spilled2, "was_spilled2");
print_property(lrg._direct_conflict, "direct_conflict");
print_property(lrg._fat_proj, "fat_proj");
print_property(lrg._was_lo, "_was_lo");
print_property(lrg._has_copy, "has_copy");
print_property(lrg._at_risk, "at_risk");
print_property(lrg._must_spill, "must_spill");
print_property(lrg._is_bound, "is_bound");
print_property((lrg._msize_valid && lrg._degree_valid && lrg.lo_degree()), "trivial");
}
void PrintProperties::print_property(int flag, const char* name) {
if (flag != 0) {
_printer->print_prop(name, IdealGraphPrinter::TRUE_VALUE);
}
}
void PrintProperties::print_property(int flag, const char* name, const char* val) {
if (flag != 0) {
_printer->print_prop(name, val);
}
}
void PrintProperties::print_property(int flag, const char* name, int val) {
if (flag != 0) {
_printer->print_prop(name, val);
}
}
// Constants
// Keep consistent with Java constants
const char *IdealGraphPrinter::INDENT = " ";
@ -522,54 +613,8 @@ void IdealGraphPrinter::visit_node(Node* n, bool edges) {
print_prop("jvms", buffer);
}
const jushort flags = node->flags();
if (flags & Node::Flag_is_Copy) {
print_prop("is_copy", "true");
}
if (flags & Node::Flag_rematerialize) {
print_prop("rematerialize", "true");
}
if (flags & Node::Flag_needs_anti_dependence_check) {
print_prop("needs_anti_dependence_check", "true");
}
if (flags & Node::Flag_is_macro) {
print_prop("is_macro", "true");
}
if (flags & Node::Flag_is_Con) {
print_prop("is_con", "true");
}
if (flags & Node::Flag_is_cisc_alternate) {
print_prop("is_cisc_alternate", "true");
}
if (flags & Node::Flag_is_dead_loop_safe) {
print_prop("is_dead_loop_safe", "true");
}
if (flags & Node::Flag_may_be_short_branch) {
print_prop("may_be_short_branch", "true");
}
if (flags & Node::Flag_has_call) {
print_prop("has_call", "true");
}
if (flags & Node::Flag_has_swapped_edges) {
print_prop("has_swapped_edges", "true");
}
if (C->matcher() != nullptr) {
if (C->matcher()->is_shared(node)) {
print_prop("is_shared", "true");
} else {
print_prop("is_shared", "false");
}
if (C->matcher()->is_dontcare(node)) {
print_prop("is_dontcare", "true");
} else {
print_prop("is_dontcare", "false");
}
Node* old = C->matcher()->find_old_node(node);
if (old != nullptr) {
print_prop("old_node_idx", old->_idx);
}
}
PrintProperties print_node(this);
print_node.print_node_properties(node);
if (node->is_Proj()) {
print_prop("con", (int)node->as_Proj()->_con);
@ -1145,73 +1190,10 @@ void IdealGraphPrinter::print(const char* name, Node* node, GrowableArray<const
buffer[0] = 0;
stringStream lrg_mask_stream(buffer, sizeof(buffer) - 1);
lrg.mask().dump(&lrg_mask_stream);
print_prop("mask", buffer);
print_prop("mask_size", lrg.mask_size());
if (lrg._degree_valid) {
print_prop("degree", lrg.degree());
}
print_prop("num_regs", lrg.num_regs());
print_prop("reg_pressure", lrg.reg_pressure());
print_prop("cost", lrg._cost);
print_prop("area", lrg._area);
print_prop("score", lrg.score());
if (lrg._risk_bias != 0) {
print_prop("risk_bias", lrg._risk_bias);
}
if (lrg._copy_bias != 0) {
print_prop("copy_bias", lrg._copy_bias);
}
if (lrg.is_singledef()) {
print_prop("is_singledef", TRUE_VALUE);
}
if (lrg.is_multidef()) {
print_prop("is_multidef", TRUE_VALUE);
}
if (lrg._is_oop) {
print_prop("is_oop", TRUE_VALUE);
}
if (lrg._is_float) {
print_prop("is_float", TRUE_VALUE);
}
if (lrg._is_vector) {
print_prop("is_vector", TRUE_VALUE);
}
if (lrg._is_predicate) {
print_prop("is_predicate", TRUE_VALUE);
}
if (lrg._is_scalable) {
print_prop("is_scalable", TRUE_VALUE);
}
if (lrg._was_spilled1) {
print_prop("was_spilled1", TRUE_VALUE);
}
if (lrg._was_spilled2) {
print_prop("was_spilled2", TRUE_VALUE);
}
if (lrg._direct_conflict) {
print_prop("direct_conflict", TRUE_VALUE);
}
if (lrg._fat_proj) {
print_prop("fat_proj", TRUE_VALUE);
}
if (lrg._was_lo) {
print_prop("_was_lo", TRUE_VALUE);
}
if (lrg._has_copy) {
print_prop("has_copy", TRUE_VALUE);
}
if (lrg._at_risk) {
print_prop("at_risk", TRUE_VALUE);
}
if (lrg._must_spill) {
print_prop("must_spill", TRUE_VALUE);
}
if (lrg._is_bound) {
print_prop("is_bound", TRUE_VALUE);
}
if (lrg._msize_valid && lrg._degree_valid && lrg.lo_degree()) {
print_prop("trivial", TRUE_VALUE);
}
PrintProperties print_node(this);
print_node.print_lrg_properties(lrg, buffer);
tail(PROPERTIES_ELEMENT);
tail(LIVE_RANGE_ELEMENT);
}

View File

@ -46,8 +46,9 @@ class ConnectionGraph;
class Parse;
class IdealGraphPrinter : public CHeapObj<mtCompiler> {
private:
friend class PrintProperties;
private:
static const char *INDENT;
static const char *TOP_ELEMENT;
static const char *GROUP_ELEMENT;

View File

@ -460,6 +460,13 @@ int MachNode::operand_index(Node* def) const {
return -1;
}
int MachNode::operand_num_edges(uint oper_index) const {
if (num_opnds() > oper_index) {
return _opnds[oper_index]->num_edges();
}
return 0;
}
//------------------------------peephole---------------------------------------
// Apply peephole rule(s) to this instruction
int MachNode::peephole(Block *block, int block_index, PhaseCFG* cfg_, PhaseRegAlloc *ra_) {

View File

@ -266,6 +266,7 @@ public:
int operand_index(uint operand) const;
int operand_index(const MachOper *oper) const;
int operand_index(Node* m) const;
int operand_num_edges(uint operand) const;
// Register class input is expected in
virtual const RegMask &in_RegMask(uint) const;

View File

@ -512,6 +512,8 @@ public:
DEBUG_ONLY( bool verify_after_postselect_cleanup(); )
public:
static bool is_register_biasing_candidate(const MachNode* mdef, int oper_index);
// This routine is run whenever a graph fails to match.
// If it returns, the compiler should bailout to interpreter without error.
// In non-product mode, SoftMatchFailure is false to detect non-canonical

View File

@ -828,26 +828,26 @@ public:
#undef DEFINE_CLASS_ID
// Flags are sorted by usage frequency.
enum NodeFlags {
Flag_is_Copy = 1 << 0, // should be first bit to avoid shift
Flag_rematerialize = 1 << 1,
Flag_needs_anti_dependence_check = 1 << 2,
Flag_is_macro = 1 << 3,
Flag_is_Con = 1 << 4,
Flag_is_cisc_alternate = 1 << 5,
Flag_is_dead_loop_safe = 1 << 6,
Flag_may_be_short_branch = 1 << 7,
Flag_avoid_back_to_back_before = 1 << 8,
Flag_avoid_back_to_back_after = 1 << 9,
Flag_has_call = 1 << 10,
Flag_has_swapped_edges = 1 << 11,
Flag_is_scheduled = 1 << 12,
Flag_is_expensive = 1 << 13,
Flag_is_predicated_vector = 1 << 14,
Flag_for_post_loop_opts_igvn = 1 << 15,
Flag_for_merge_stores_igvn = 1 << 16,
Flag_is_removed_by_peephole = 1 << 17,
Flag_is_predicated_using_blend = 1 << 18,
enum NodeFlags : uint64_t {
Flag_is_Copy = 1ULL << 0, // should be first bit to avoid shift
Flag_rematerialize = 1ULL << 1,
Flag_needs_anti_dependence_check = 1ULL << 2,
Flag_is_macro = 1ULL << 3,
Flag_is_Con = 1ULL << 4,
Flag_is_cisc_alternate = 1ULL << 5,
Flag_is_dead_loop_safe = 1ULL << 6,
Flag_may_be_short_branch = 1ULL << 7,
Flag_avoid_back_to_back_before = 1ULL << 8,
Flag_avoid_back_to_back_after = 1ULL << 9,
Flag_has_call = 1ULL << 10,
Flag_has_swapped_edges = 1ULL << 11,
Flag_is_scheduled = 1ULL << 12,
Flag_is_expensive = 1ULL << 13,
Flag_is_predicated_vector = 1ULL << 14,
Flag_for_post_loop_opts_igvn = 1ULL << 15,
Flag_for_merge_stores_igvn = 1ULL << 16,
Flag_is_removed_by_peephole = 1ULL << 17,
Flag_is_predicated_using_blend = 1ULL << 18,
_last_flag = Flag_is_predicated_using_blend
};

View File

@ -1022,27 +1022,39 @@ bool VPointer::can_make_speculative_aliasing_check_with(const VPointer& other) c
// or at the multiversion_if. That is before the pre-loop. From the construction of
// VPointer, we already know that all its variables (except iv) are pre-loop invariant.
//
// For the computation of main_init, we also need the pre_limit, and so we need
// to check that this value is pre-loop invariant. In the case of non-equal iv_scales,
// we also need the main_limit in the aliasing check, and so this value must then
// also be pre-loop invariant.
// In VPointer::make_speculative_aliasing_check_with we compute main_init in all
// cases. For this, we require pre_init and pre_limit. These values must be available
// for the speculative check, i.e. their control must dominate the speculative check.
// Further, "if vp1.iv_scale() != vp2.iv_scale()" we additionally need to have
// main_limit available for the speculative check.
// Note: no matter if the speculative check is inserted as a predicate or at the
// multiversion if, the speculative check happens before (dominates) the
// pre-loop.
Node* pre_init = _vloop.pre_loop_end()->init_trip();
Opaque1Node* pre_limit_opaq = _vloop.pre_loop_end()->limit()->as_Opaque1();
Node* pre_limit = pre_limit_opaq->in(1);
Node* main_limit = _vloop.cl()->limit();
if (!_vloop.is_pre_loop_invariant(pre_limit)) {
if (!_vloop.is_available_for_speculative_check(pre_init)) {
#ifdef ASSERT
if (_vloop.is_trace_speculative_aliasing_analysis()) {
tty->print_cr("VPointer::can_make_speculative_aliasing_check_with: pre_limit is not pre-loop independent!");
tty->print_cr("VPointer::can_make_speculative_aliasing_check_with: pre_limit is not available at speculative check!");
}
#endif
return false;
}
if (!_vloop.is_available_for_speculative_check(pre_limit)) {
#ifdef ASSERT
if (_vloop.is_trace_speculative_aliasing_analysis()) {
tty->print_cr("VPointer::can_make_speculative_aliasing_check_with: pre_limit is not available at speculative check!");
}
#endif
return false;
}
if (vp1.iv_scale() != vp2.iv_scale() && !_vloop.is_pre_loop_invariant(main_limit)) {
if (vp1.iv_scale() != vp2.iv_scale() && !_vloop.is_available_for_speculative_check(main_limit)) {
#ifdef ASSERT
if (_vloop.is_trace_speculative_aliasing_analysis()) {
tty->print_cr("VPointer::can_make_speculative_aliasing_check_with: main_limit is not pre-loop independent!");
tty->print_cr("VPointer::can_make_speculative_aliasing_check_with: main_limit is not available at speculative check!");
}
#endif
return false;
@ -1119,6 +1131,8 @@ BoolNode* VPointer::make_speculative_aliasing_check_with(const VPointer& other,
Node* pre_limit = pre_limit_opaq->in(1);
assert(_vloop.is_pre_loop_invariant(pre_init), "needed for aliasing check before pre-loop");
assert(_vloop.is_pre_loop_invariant(pre_limit), "needed for aliasing check before pre-loop");
assert(_vloop.is_available_for_speculative_check(pre_init), "ctrl must be early enough to avoid cycles");
assert(_vloop.is_available_for_speculative_check(pre_limit), "ctrl must be early enough to avoid cycles");
Node* pre_initL = new ConvI2LNode(pre_init);
Node* pre_limitL = new ConvI2LNode(pre_limit);
@ -1180,6 +1194,7 @@ BoolNode* VPointer::make_speculative_aliasing_check_with(const VPointer& other,
jint main_iv_stride = _vloop.iv_stride();
Node* main_limit = _vloop.cl()->limit();
assert(_vloop.is_pre_loop_invariant(main_limit), "needed for aliasing check before pre-loop");
assert(_vloop.is_available_for_speculative_check(main_limit), "ctrl must be early enough to avoid cycles");
Node* main_limitL = new ConvI2LNode(main_limit);
phase->register_new_node_with_ctrl_of(main_limitL, pre_init);

View File

@ -236,6 +236,8 @@ public:
// Some nodes must be pre-loop invariant, so that they can be used for conditions
// before or inside the pre-loop. For example, alignment of main-loop vector
// memops must be achieved in the pre-loop, via the exit check in the pre-loop.
// Note: this condition is NOT strong enough for speculative checks; those happen
// before the pre-loop. See is_available_for_speculative_check.
bool is_pre_loop_invariant(Node* n) const {
// Must be in the main-loop, otherwise we can't access the pre-loop.
// This fails during SuperWord::unrolling_analysis, but that is ok.
@ -257,6 +259,28 @@ public:
return is_before_pre_loop(early);
}
// Nodes that are to be used in speculative checks must be available early enough.
// Note: the speculative check happens before the pre-loop, either at the auto
// vectorization predicate or at the multiversion if. The condition here is
// therefore stronger than the one from is_pre_loop_invariant.
bool is_available_for_speculative_check(Node* n) const {
assert(are_speculative_checks_possible(), "meaningless without speculative check");
ParsePredicateSuccessProj* parse_predicate_proj = auto_vectorization_parse_predicate_proj();
// Find the control of the predicate:
ProjNode* proj = (parse_predicate_proj != nullptr) ? parse_predicate_proj : multiversioning_fast_proj();
Node* check_ctrl = proj->in(0)->as_If()->in(0);
// Often, the control of n already dominates that of the predicate.
Node* n_ctrl = phase()->get_ctrl(n);
if (phase()->is_dominator(n_ctrl, check_ctrl)) { return true; }
// But in some cases, the ctrl of n is after that of the predicate,
// but the early ctrl is before the predicate.
Node* n_early = phase()->compute_early_ctrl(n, n_ctrl);
return phase()->is_dominator(n_early, check_ctrl);
}
// Check if the loop passes some basic preconditions for vectorization.
// Return indicates if analysis succeeded.
bool check_preconditions();

View File

@ -40,38 +40,76 @@ void VTransformGraph::add_vtnode(VTransformNode* vtnode) {
} \
)
// This is similar to IGVN optimization. But we are a bit lazy, and don't care about
// notification / worklist, since the list of nodes is rather small, and we don't
// expect optimizations that trickle over the whole graph.
void VTransformGraph::optimize(VTransform& vtransform) {
TRACE_OPTIMIZE( tty->print_cr("\nVTransformGraph::optimize"); )
bool progress = true;
DEBUG_ONLY(int pass_count = 0;)
while (progress) {
progress = false;
assert(++pass_count < 10, "ensure we do not have endless loops");
for (int i = 0; i < _vtnodes.length(); i++) {
VTransformNode* vtn = _vtnodes.at(i);
if (!vtn->is_alive()) { continue; }
progress |= vtn->optimize(_vloop_analyzer, vtransform);
// Nodes that have no use any more are dead.
if (vtn->out_strong_edges() == 0 &&
// There are some exceptions:
// 1. Memory phi uses are not modeled, so they appear to have no use here, but must be kept alive.
// 2. Similarly, some stores may not have their memory uses modeled, but need to be kept alive.
// 3. Outer node with strong inputs: is a use after the loop that we must keep alive.
!(vtn->isa_PhiScalar() != nullptr ||
vtn->is_load_or_store_in_loop() ||
(vtn->isa_Outer() != nullptr && vtn->has_strong_in_edge()))) {
vtn->mark_dead();
progress = true;
}
}
void VTransformOptimize::worklist_push(VTransformNode* vtn) {
if (!_worklist_set.test_set(vtn->_idx)) {
_worklist.push(vtn);
}
}
VTransformNode* VTransformOptimize::worklist_pop() {
VTransformNode* vtn = _worklist.pop();
_worklist_set.remove(vtn->_idx);
return vtn;
}
void VTransform::optimize() {
NOT_PRODUCT( if (vloop().is_trace_optimization()) { tty->print_cr("\nVTransform::optimize"); } )
ResourceMark rm;
VTransformOptimize vtoptimize(_vloop_analyzer, *this);
vtoptimize.optimize();
}
void VTransformOptimize::optimize() {
// Initialize: push all nodes to worklist.
for (int i = 0; i < _vtransform.graph().vtnodes().length(); i++) {
VTransformNode* vtn = _vtransform.graph().vtnodes().at(i);
worklist_push(vtn);
}
// We don't want to iterate too many times. We set some arbitrary limit,
// just to catch infinite loops.
DEBUG_ONLY( int allowed_steps = 100 * _worklist.length(); )
// Optimize iteratively.
while (_worklist.is_nonempty()) {
VTransformNode* vtn = worklist_pop();
optimize_step(vtn);
assert(--allowed_steps > 0, "no endless loop");
}
DEBUG_ONLY( verify(); )
}
#ifdef ASSERT
void VTransformOptimize::verify() {
for (int i = 0; i < _vtransform.graph().vtnodes().length(); i++) {
VTransformNode* vtn = _vtransform.graph().vtnodes().at(i);
assert(!optimize_step(vtn), "Missed optimization during VTransform::optimize for %s", vtn->name());
assert(_worklist.is_empty(), "vtnode on worklist despite no progress for %s", vtn->name());
}
}
#endif
// Return true if (and only if) we made progress.
bool VTransformOptimize::optimize_step(VTransformNode* vtn) {
if (!vtn->is_alive()) { return false; }
bool progress = vtn->optimize(*this);
// Nodes that have no use any more are dead.
if (vtn->out_strong_edges() == 0 &&
// There are some exceptions:
// 1. Memory phi uses are not modeled, so they appear to have no use here, but must be kept alive.
// 2. Similarly, some stores may not have their memory uses modeled, but need to be kept alive.
// 3. Outer node with strong inputs: is a use after the loop that we must keep alive.
!(vtn->isa_PhiScalar() != nullptr ||
vtn->is_load_or_store_in_loop() ||
(vtn->isa_Outer() != nullptr && vtn->has_strong_in_edge()))) {
vtn->mark_dead(*this);
return true;
}
return progress;
}
// Compute a linearization of the graph. We do this with a reverse-post-order of a DFS.
// This only works if the graph is a directed acyclic graph (DAG). The C2 graph, and
// the VLoopDependencyGraph are both DAGs, but after introduction of vectors/packs, the
@ -1141,8 +1179,8 @@ VTransformApplyResult VTransformBoolVectorNode::apply(VTransformApplyState& appl
return VTransformApplyResult::make_vector(vn);
}
bool VTransformReductionVectorNode::optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) {
return optimize_move_non_strict_order_reductions_out_of_loop(vloop_analyzer, vtransform);
bool VTransformReductionVectorNode::optimize(VTransformOptimize& vtoptimize) {
return optimize_move_non_strict_order_reductions_out_of_loop(vtoptimize);
}
int VTransformReductionVectorNode::vector_reduction_opcode() const {
@ -1213,7 +1251,7 @@ bool VTransformReductionVectorNode::requires_strict_order() const {
// become profitable, since the expensive reduction node is moved
// outside the loop, and instead cheaper element-wise vector accumulations
// are performed inside the loop.
bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop_preconditions(VTransform& vtransform) {
bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop_preconditions(const VTransform& vtransform) {
// We have a phi with a single use.
VTransformPhiScalarNode* phi = in_req(1)->isa_PhiScalar();
if (phi == nullptr) {
@ -1260,13 +1298,13 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou
current_red->element_basic_type() != bt ||
current_red->vector_length() != vlen) {
TRACE_OPTIMIZE(
tty->print(" Cannot move out of loop, other reduction node does not match:");
tty->print(" Cannot move out of loop, other reduction node does not match: ");
print();
tty->print(" other: ");
if (current_red != nullptr) {
current_red->print();
} else {
tty->print("nullptr");
tty->print_cr("nullptr");
}
)
return false; // not compatible
@ -1314,7 +1352,8 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou
return true; // success
}
bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) {
bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop(VTransformOptimize& vtoptimize) {
VTransform& vtransform = vtoptimize.vtransform();
if (!optimize_move_non_strict_order_reductions_out_of_loop_preconditions(vtransform)) {
return false;
}
@ -1328,7 +1367,7 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou
const uint vlen = vector_length();
const BasicType bt = element_basic_type();
const int vopc = VectorNode::opcode(sopc, bt);
PhaseIdealLoop* phase = vloop_analyzer.vloop().phase();
PhaseIdealLoop* phase = vtoptimize.vloop_analyzer().vloop().phase();
// Create a vector of identity values.
Node* identity = ReductionNode::make_identity_con_scalar(phase->igvn(), sopc, bt);
@ -1341,6 +1380,7 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou
// Look at old scalar phi.
VTransformPhiScalarNode* phi_scalar = in_req(1)->isa_PhiScalar();
PhiNode* old_phi = phi_scalar->node();
vtoptimize.worklist_push(phi_scalar);
VTransformNode* init = phi_scalar->in_req(1);
TRACE_OPTIMIZE(
@ -1354,6 +1394,7 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou
phi_vector->init_req(0, phi_scalar->in_req(0));
phi_vector->init_req(1, vtn_identity_vector);
// Note: backedge comes later
vtoptimize.worklist_push(phi_vector);
// Traverse down the chain of reductions, and replace them with vector_accumulators.
VTransformReductionVectorNode* first_red = this;
@ -1365,6 +1406,8 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou
VTransformVectorNode* vector_accumulator = new (vtransform.arena()) VTransformElementWiseVectorNode(vtransform, 3, current_red->properties(), vopc);
vector_accumulator->init_req(1, current_vector_accumulator);
vector_accumulator->init_req(2, vector_input);
vtoptimize.worklist_push(current_red);
vtoptimize.worklist_push(vector_accumulator);
TRACE_OPTIMIZE(
tty->print(" replace ");
current_red->print();

View File

@ -24,6 +24,7 @@
#ifndef SHARE_OPTO_VTRANSFORM_HPP
#define SHARE_OPTO_VTRANSFORM_HPP
#include "libadt/vectset.hpp"
#include "opto/node.hpp"
#include "opto/vectorization.hpp"
#include "opto/vectornode.hpp"
@ -192,7 +193,6 @@ public:
const GrowableArray<VTransformNode*>& vtnodes() const { return _vtnodes; }
const GrowableArray<VTransformNode*>& get_schedule() const { return _schedule; }
void optimize(VTransform& vtransform);
bool schedule();
bool has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const;
float cost_for_vector_loop() const;
@ -257,7 +257,7 @@ public:
DEBUG_ONLY( bool has_graph() const { return !_graph.is_empty(); } )
VTransformGraph& graph() { return _graph; }
void optimize() { return _graph.optimize(*this); }
void optimize();
bool schedule() { return _graph.schedule(); }
bool is_profitable() const;
float cost_for_vector_loop() const { return _graph.cost_for_vector_loop(); }
@ -291,6 +291,36 @@ private:
void apply_vectorization() const;
};
// Tracks the worklist during optimization. The concept is
// somewhat parallel to IGVN: we keep optimizing vtnodes on
// the worklist, which may in turn push more nodes onto the
// list, and stop once the worklist is empty.
class VTransformOptimize : public StackObj {
private:
const VLoopAnalyzer& _vloop_analyzer;
VTransform& _vtransform;
GrowableArray<VTransformNode*> _worklist;
VectorSet _worklist_set;
public:
VTransformOptimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) :
_vloop_analyzer(vloop_analyzer),
_vtransform(vtransform) {}
const VLoopAnalyzer& vloop_analyzer() const { return _vloop_analyzer; }
VTransform& vtransform() { return _vtransform; }
void worklist_push(VTransformNode* vtn);
void optimize();
private:
VTransformNode* worklist_pop();
bool optimize_step(VTransformNode* vtn);
DEBUG_ONLY( void verify(); )
};
// Keeps track of the state during "VTransform::apply"
// -> keep track of the already transformed nodes and the memory state.
class VTransformApplyState : public StackObj {
@ -531,10 +561,15 @@ public:
bool is_alive() const { return _is_alive; }
void mark_dead() {
void mark_dead(VTransformOptimize& vtoptimize) {
_is_alive = false;
// Remove all inputs
// Remove all inputs, and put them on the worklist in
// case they have become dead as well.
for (uint i = 0; i < req(); i++) {
VTransformNode* in = in_req(i);
if (in != nullptr) {
vtoptimize.worklist_push(in);
}
set_req(i, nullptr);
}
}
@ -558,7 +593,7 @@ public:
virtual const VPointer& vpointer() const { ShouldNotReachHere(); }
virtual bool is_loop_head_phi() const { return false; }
virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) { return false; }
virtual bool optimize(VTransformOptimize& vtoptimize) { return false; }
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const = 0;
@ -868,7 +903,7 @@ public:
VTransformReductionVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) :
VTransformVectorNode(vtransform, 3, properties) {}
virtual VTransformReductionVectorNode* isa_ReductionVector() override { return this; }
virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) override;
virtual bool optimize(VTransformOptimize& vtoptimize) override;
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "ReductionVector"; };)
@ -876,8 +911,8 @@ public:
private:
int vector_reduction_opcode() const;
bool requires_strict_order() const;
bool optimize_move_non_strict_order_reductions_out_of_loop_preconditions(VTransform& vtransform);
bool optimize_move_non_strict_order_reductions_out_of_loop(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform);
bool optimize_move_non_strict_order_reductions_out_of_loop_preconditions(const VTransform& vtransform);
bool optimize_move_non_strict_order_reductions_out_of_loop(VTransformOptimize& vtoptimize);
};
class VTransformPhiVectorNode : public VTransformVectorNode {
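
Editor's note: VTransformOptimize drives a simple fixpoint: every node starts on the worklist, worklist_push deduplicates through a bitset, and mark_dead re-enqueues the inputs of dead nodes. A generic Java sketch of that worklist discipline (not the HotSpot classes; the node type, id function, and optimize step are placeholders):

    import java.util.ArrayDeque;
    import java.util.BitSet;
    import java.util.function.Predicate;
    import java.util.function.ToIntFunction;

    public class WorklistSketch<N> {
        private final ArrayDeque<N> worklist = new ArrayDeque<>();
        private final BitSet onWorklist = new BitSet();
        private final ToIntFunction<N> idOf;

        WorklistSketch(ToIntFunction<N> idOf) { this.idOf = idOf; }

        void push(N node) {
            int id = idOf.applyAsInt(node);
            if (!onWorklist.get(id)) {     // test_set analogue: skip duplicates
                onWorklist.set(id);
                worklist.push(node);
            }
        }

        /** Initialize with all nodes, then run optimizeStep until the worklist drains. */
        void run(Iterable<N> allNodes, Predicate<N> optimizeStep) {
            allNodes.forEach(this::push);  // initialize: everything is on the worklist once
            while (!worklist.isEmpty()) {
                N node = worklist.pop();
                onWorklist.clear(idOf.applyAsInt(node));
                optimizeStep.test(node);   // the step may call push() for affected nodes
            }
        }
    }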

View File

@ -1478,10 +1478,10 @@ void Arguments::set_conservative_max_heap_alignment() {
// the alignments imposed by several sources: any requirements from the heap
// itself and the maximum page size we may run the VM with.
size_t heap_alignment = GCConfig::arguments()->conservative_max_heap_alignment();
_conservative_max_heap_alignment = MAX4(heap_alignment,
_conservative_max_heap_alignment = MAX3(heap_alignment,
os::vm_allocation_granularity(),
os::max_page_size(),
GCArguments::compute_heap_alignment());
os::max_page_size());
assert(is_power_of_2(_conservative_max_heap_alignment), "Expected to be a power-of-2");
}
jint Arguments::set_ergonomics_flags() {
@ -1589,8 +1589,8 @@ void Arguments::set_heap_size() {
}
if (UseCompressedOops) {
size_t heap_end = HeapBaseMinAddress + MaxHeapSize;
size_t max_coop_heap = max_heap_for_compressed_oops();
uintptr_t heap_end = HeapBaseMinAddress + MaxHeapSize;
uintptr_t max_coop_heap = max_heap_for_compressed_oops();
// Limit the heap size to the maximum possible when using compressed oops
if (heap_end < max_coop_heap) {
@ -1607,7 +1607,7 @@ void Arguments::set_heap_size() {
aot_log_info(aot)("UseCompressedOops disabled due to "
"max heap %zu > compressed oop heap %zu. "
"Please check the setting of MaxRAMPercentage %5.2f.",
reasonable_max, max_coop_heap, MaxRAMPercentage);
reasonable_max, (size_t)max_coop_heap, MaxRAMPercentage);
FLAG_SET_ERGO(UseCompressedOops, false);
} else {
reasonable_max = max_coop_heap;

View File

@ -75,6 +75,7 @@
// v.release_store(x) -> void
// v.release_store_fence(x) -> void
// v.compare_exchange(x, y [, o]) -> T
// v.exchange(x [, o]) -> T
//
// (2) All atomic types are default constructible.
//
@ -92,7 +93,6 @@
// (3) Atomic pointers and atomic integers additionally provide
//
// member functions:
// v.exchange(x [, o]) -> T
// v.add_then_fetch(i [, o]) -> T
// v.sub_then_fetch(i [, o]) -> T
// v.fetch_then_add(i [, o]) -> T
@ -102,10 +102,7 @@
// type of i must be signed, or both must be unsigned. Atomic pointers perform
// element arithmetic.
//
// (4) An atomic translated type additionally provides the exchange
// function if its associated atomic decayed type provides that function.
//
// (5) Atomic integers additionally provide
// (4) Atomic integers additionally provide
//
// member functions:
// v.and_then_fetch(x [, o]) -> T
@ -115,7 +112,7 @@
// v.fetch_then_or(x [, o]) -> T
// v.fetch_then_xor(x [, o]) -> T
//
// (6) Atomic pointers additionally provide
// (5) Atomic pointers additionally provide
//
// nested types:
// ElementType -> std::remove_pointer_t<T>
@ -127,9 +124,6 @@
// stand out a little more when used in surrounding non-atomic code. Without
// the "AtomicAccess::" qualifier, some of those names are easily overlooked.
//
// Atomic bytes don't provide exchange(). This is because that operation
// hasn't been implemented for 1 byte values. That could be changed if needed.
//
// Atomic for 2 byte integers is not supported. This is because atomic
// operations of that size have not been implemented. There haven't been
// required use-cases. Many platforms don't provide hardware support.
@ -184,15 +178,8 @@ private:
// Helper base classes, providing various parts of the APIs.
template<typename T> class CommonCore;
template<typename T> class SupportsExchange;
template<typename T> class SupportsArithmetic;
// Support conditional exchange() for atomic translated types.
template<typename T> class HasExchange;
template<typename T> class DecayedHasExchange;
template<typename Derived, typename T, bool = DecayedHasExchange<T>::value>
class TranslatedExchange;
public:
template<typename T, Category = category<T>()>
class Atomic;
@ -275,15 +262,7 @@ public:
atomic_memory_order order = memory_order_conservative) {
return AtomicAccess::cmpxchg(value_ptr(), compare_value, new_value, order);
}
};
template<typename T>
class AtomicImpl::SupportsExchange : public CommonCore<T> {
protected:
explicit SupportsExchange(T value) : CommonCore<T>(value) {}
~SupportsExchange() = default;
public:
T exchange(T new_value,
atomic_memory_order order = memory_order_conservative) {
return AtomicAccess::xchg(this->value_ptr(), new_value, order);
@ -291,7 +270,7 @@ public:
};
template<typename T>
class AtomicImpl::SupportsArithmetic : public SupportsExchange<T> {
class AtomicImpl::SupportsArithmetic : public CommonCore<T> {
// Guarding the AtomicAccess calls with constexpr checking of Offset produces
// better compile-time error messages.
template<typename Offset>
@ -311,7 +290,7 @@ class AtomicImpl::SupportsArithmetic : public SupportsExchange<T> {
}
protected:
explicit SupportsArithmetic(T value) : SupportsExchange<T>(value) {}
explicit SupportsArithmetic(T value) : CommonCore<T>(value) {}
~SupportsArithmetic() = default;
public:
@ -424,54 +403,8 @@ public:
// Atomic translated type
// Test whether Atomic<T> has exchange().
template<typename T>
class AtomicImpl::HasExchange {
template<typename Check> static void* test(decltype(&Check::exchange));
template<typename> static int test(...);
using test_type = decltype(test<Atomic<T>>(nullptr));
public:
static constexpr bool value = std::is_pointer_v<test_type>;
};
// Test whether the atomic decayed type associated with T has exchange().
template<typename T>
class AtomicImpl::DecayedHasExchange {
using Translator = PrimitiveConversions::Translate<T>;
using Decayed = typename Translator::Decayed;
// "Unit test" HasExchange<>.
static_assert(HasExchange<int>::value);
static_assert(HasExchange<int*>::value);
static_assert(!HasExchange<char>::value);
public:
static constexpr bool value = HasExchange<Decayed>::value;
};
// Base class for atomic translated type if atomic decayed type doesn't have
// exchange().
template<typename Derived, typename T, bool>
class AtomicImpl::TranslatedExchange {};
// Base class for atomic translated type if atomic decayed type does have
// exchange().
template<typename Derived, typename T>
class AtomicImpl::TranslatedExchange<Derived, T, true> {
public:
T exchange(T new_value,
atomic_memory_order order = memory_order_conservative) {
return static_cast<Derived*>(this)->exchange_impl(new_value, order);
}
};
template<typename T>
class AtomicImpl::Atomic<T, AtomicImpl::Category::Translated>
: public TranslatedExchange<Atomic<T>, T>
{
// Give TranslatedExchange<> access to exchange_impl() if needed.
friend class TranslatedExchange<Atomic<T>, T>;
class AtomicImpl::Atomic<T, AtomicImpl::Category::Translated> {
using Translator = PrimitiveConversions::Translate<T>;
using Decayed = typename Translator::Decayed;
@ -533,12 +466,7 @@ public:
order));
}
private:
// Implementation of exchange() if needed.
// Exclude when not needed, to prevent reference to non-existent function
// of atomic decayed type if someone explicitly instantiates Atomic<T>.
template<typename Dep = Decayed, ENABLE_IF(HasExchange<Dep>::value)>
T exchange_impl(T new_value, atomic_memory_order order) {
T exchange(T new_value, atomic_memory_order order = memory_order_conservative) {
return recover(_value.exchange(decay(new_value), order));
}
};

View File

@ -419,8 +419,8 @@ private:
struct XchgImpl;
// Platform-specific implementation of xchg. Support for sizes
// of 4, and sizeof(intptr_t) are required. The class is a function
// object that must be default constructable, with these requirements:
// of 1, 4, and 8 is required. The class is a function object
// that must be default constructable, with these requirements:
//
// - dest is of type T*.
// - exchange_value is of type T.
@ -635,7 +635,6 @@ inline void AtomicAccess::dec(D volatile* dest, atomic_memory_order order) {
STATIC_ASSERT(std::is_pointer<D>::value || std::is_integral<D>::value);
using I = std::conditional_t<std::is_pointer<D>::value, ptrdiff_t, D>;
// Assumes two's complement integer representation.
#pragma warning(suppress: 4146)
AtomicAccess::add(dest, I(-1), order);
}
@ -652,7 +651,6 @@ inline D AtomicAccess::sub(D volatile* dest, I sub_value, atomic_memory_order or
STATIC_ASSERT(sizeof(I) <= sizeof(AddendType));
AddendType addend = sub_value;
// Assumes two's complement integer representation.
#pragma warning(suppress: 4146) // In case AddendType is not signed.
return AtomicAccess::add(dest, -addend, order);
}

View File

@ -118,8 +118,5 @@ ThreadTotalCPUTimeClosure::~ThreadTotalCPUTimeClosure() {
}
void ThreadTotalCPUTimeClosure::do_thread(Thread* thread) {
// The default code path (fast_thread_cpu_time()) asserts that
// pthread_getcpuclockid() and clock_gettime() must return 0. Thus caller
// must ensure the thread exists and has not terminated.
_total += os::thread_cpu_time(thread);
}

View File

@ -162,7 +162,7 @@ void JVMFlag::print_on(outputStream* st, bool withComments, bool printRanges) co
// uintx ThresholdTolerance = 10 {product} {default}
// size_t TLABSize = 0 {product} {default}
// uintx SurvivorRatio = 8 {product} {default}
// double InitialRAMPercentage = 1.562500 {product} {default}
// double InitialRAMPercentage = 0.000000 {product} {default}
// ccstr CompileCommandFile = MyFile.cmd {product} {command line}
// ccstrlist CompileOnly = Method1
// CompileOnly += Method2 {product} {command line}

View File

@ -664,6 +664,7 @@ void VMError::report(outputStream* st, bool _verbose) {
BEGIN
if (MemTracker::enabled() &&
NmtVirtualMemory_lock != nullptr &&
_thread != nullptr &&
NmtVirtualMemory_lock->owned_by_self()) {
// Manually unlock to avoid reentrancy due to mallocs in detailed mode.
NmtVirtualMemory_lock->unlock();
@ -1305,7 +1306,7 @@ void VMError::report(outputStream* st, bool _verbose) {
os::print_signal_handlers(st, buf, sizeof(buf));
st->cr();
STEP_IF("Native Memory Tracking", _verbose)
STEP_IF("Native Memory Tracking", _verbose && _thread != nullptr)
MemTracker::error_report(st);
st->cr();

View File

@ -23,7 +23,6 @@
*
*/
#include "runtime/atomicAccess.hpp"
#include "runtime/orderAccess.hpp"
#include "runtime/os.hpp"
#include "utilities/spinYield.hpp"
@ -79,10 +78,10 @@
void GenericWaitBarrier::arm(int barrier_tag) {
assert(barrier_tag != 0, "Pre arm: Should be arming with armed value");
assert(AtomicAccess::load(&_barrier_tag) == 0,
assert(_barrier_tag.load_relaxed() == 0,
"Pre arm: Should not be already armed. Tag: %d",
AtomicAccess::load(&_barrier_tag));
AtomicAccess::release_store(&_barrier_tag, barrier_tag);
_barrier_tag.load_relaxed());
_barrier_tag.release_store(barrier_tag);
Cell &cell = tag_to_cell(barrier_tag);
cell.arm(barrier_tag);
@ -92,9 +91,9 @@ void GenericWaitBarrier::arm(int barrier_tag) {
}
void GenericWaitBarrier::disarm() {
int barrier_tag = AtomicAccess::load_acquire(&_barrier_tag);
int barrier_tag = _barrier_tag.load_acquire();
assert(barrier_tag != 0, "Pre disarm: Should be armed. Tag: %d", barrier_tag);
AtomicAccess::release_store(&_barrier_tag, 0);
_barrier_tag.release_store(0);
Cell &cell = tag_to_cell(barrier_tag);
cell.disarm(barrier_tag);
@ -121,7 +120,7 @@ void GenericWaitBarrier::Cell::arm(int32_t requested_tag) {
SpinYield sp;
while (true) {
state = AtomicAccess::load_acquire(&_state);
state = _state.load_acquire();
assert(decode_tag(state) == 0,
"Pre arm: Should not be armed. "
"Tag: " INT32_FORMAT "; Waiters: " INT32_FORMAT,
@ -134,7 +133,7 @@ void GenericWaitBarrier::Cell::arm(int32_t requested_tag) {
// Try to swing cell to armed. This should always succeed after the check above.
int64_t new_state = encode(requested_tag, 0);
int64_t prev_state = AtomicAccess::cmpxchg(&_state, state, new_state);
int64_t prev_state = _state.compare_exchange(state, new_state);
if (prev_state != state) {
fatal("Cannot arm the wait barrier. "
"Tag: " INT32_FORMAT "; Waiters: " INT32_FORMAT,
@ -145,14 +144,14 @@ void GenericWaitBarrier::Cell::arm(int32_t requested_tag) {
int GenericWaitBarrier::Cell::signal_if_needed(int max) {
int signals = 0;
while (true) {
int cur = AtomicAccess::load_acquire(&_outstanding_wakeups);
int cur = _outstanding_wakeups.load_acquire();
if (cur == 0) {
// All done, no more waiters.
return 0;
}
assert(cur > 0, "Sanity");
int prev = AtomicAccess::cmpxchg(&_outstanding_wakeups, cur, cur - 1);
int prev = _outstanding_wakeups.compare_exchange(cur, cur - 1);
if (prev != cur) {
// Contention, return to caller for early return or backoff.
return prev;
@ -172,7 +171,7 @@ void GenericWaitBarrier::Cell::disarm(int32_t expected_tag) {
int32_t waiters;
while (true) {
int64_t state = AtomicAccess::load_acquire(&_state);
int64_t state = _state.load_acquire();
int32_t tag = decode_tag(state);
waiters = decode_waiters(state);
@ -182,7 +181,7 @@ void GenericWaitBarrier::Cell::disarm(int32_t expected_tag) {
tag, waiters);
int64_t new_state = encode(0, waiters);
if (AtomicAccess::cmpxchg(&_state, state, new_state) == state) {
if (_state.compare_exchange(state, new_state) == state) {
// Successfully disarmed.
break;
}
@ -191,19 +190,19 @@ void GenericWaitBarrier::Cell::disarm(int32_t expected_tag) {
// Wake up waiters, if we have at least one.
// Allow other threads to assist with wakeups, if possible.
if (waiters > 0) {
AtomicAccess::release_store(&_outstanding_wakeups, waiters);
_outstanding_wakeups.release_store(waiters);
SpinYield sp;
while (signal_if_needed(INT_MAX) > 0) {
sp.wait();
}
}
assert(AtomicAccess::load(&_outstanding_wakeups) == 0, "Post disarm: Should not have outstanding wakeups");
assert(_outstanding_wakeups.load_relaxed() == 0, "Post disarm: Should not have outstanding wakeups");
}
void GenericWaitBarrier::Cell::wait(int32_t expected_tag) {
// Try to register ourselves as pending waiter.
while (true) {
int64_t state = AtomicAccess::load_acquire(&_state);
int64_t state = _state.load_acquire();
int32_t tag = decode_tag(state);
if (tag != expected_tag) {
// Cell tag had changed while waiting here. This means either the cell had
@ -219,7 +218,7 @@ void GenericWaitBarrier::Cell::wait(int32_t expected_tag) {
tag, waiters);
int64_t new_state = encode(tag, waiters + 1);
if (AtomicAccess::cmpxchg(&_state, state, new_state) == state) {
if (_state.compare_exchange(state, new_state) == state) {
// Success! Proceed to wait.
break;
}
@ -238,7 +237,7 @@ void GenericWaitBarrier::Cell::wait(int32_t expected_tag) {
// Register ourselves as completed waiter before leaving.
while (true) {
int64_t state = AtomicAccess::load_acquire(&_state);
int64_t state = _state.load_acquire();
int32_t tag = decode_tag(state);
int32_t waiters = decode_waiters(state);
@ -248,7 +247,7 @@ void GenericWaitBarrier::Cell::wait(int32_t expected_tag) {
tag, waiters);
int64_t new_state = encode(tag, waiters - 1);
if (AtomicAccess::cmpxchg(&_state, state, new_state) == state) {
if (_state.compare_exchange(state, new_state) == state) {
// Success!
break;
}
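
Editor's note: the cell state above packs a tag and a waiter count into one 64-bit word and updates it with compare_exchange loops; the diff only switches the accesses from AtomicAccess free functions to Atomic<int64_t> members. A standalone Java analogy of the same encode/CAS pattern using AtomicLong (the bit layout, class name, and method names are illustrative, not the HotSpot ones):

    import java.util.concurrent.atomic.AtomicLong;

    public class EncodedStateSketch {
        private final AtomicLong state = new AtomicLong(); // high 32 bits: tag, low 32 bits: waiters

        private static long encode(int tag, int waiters) { return ((long) tag << 32) | (waiters & 0xFFFFFFFFL); }
        private static int decodeTag(long s)     { return (int) (s >>> 32); }
        private static int decodeWaiters(long s) { return (int) s; }

        /** Register one more waiter, but only while the expected tag is still armed. */
        boolean addWaiter(int expectedTag) {
            while (true) {
                long s = state.get();
                if (decodeTag(s) != expectedTag) {
                    return false;                       // tag changed; do not wait
                }
                long next = encode(expectedTag, decodeWaiters(s) + 1);
                if (state.compareAndExchange(s, next) == s) {
                    return true;                        // CAS succeeded, we are registered
                }
                // else: lost the race; reload the state and retry
            }
        }
    }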

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2019, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -27,6 +27,7 @@
#include "memory/allocation.hpp"
#include "memory/padded.hpp"
#include "runtime/atomic.hpp"
#include "runtime/semaphore.hpp"
#include "utilities/globalDefinitions.hpp"
@ -43,10 +44,10 @@ private:
Semaphore _sem;
// Cell state, tracks the arming + waiters status
volatile int64_t _state;
Atomic<int64_t> _state;
// Wakeups to deliver for current waiters
volatile int _outstanding_wakeups;
Atomic<int> _outstanding_wakeups;
int signal_if_needed(int max);
@ -83,7 +84,7 @@ private:
// Trailing padding to protect the last cell.
DEFINE_PAD_MINUS_SIZE(0, DEFAULT_PADDING_SIZE, 0);
volatile int _barrier_tag;
Atomic<int> _barrier_tag;
// Trailing padding to insulate the rest of the barrier from adjacent
// data structures. The leading padding is not needed, as cell padding

View File

@ -315,6 +315,18 @@ final class VirtualThread extends BaseVirtualThread {
}
}
/**
* Submits the given task to the given executor. If the scheduler is a
* ForkJoinPool then the task is first adapted to a ForkJoinTask.
*/
private void submit(Executor executor, Runnable task) {
if (executor instanceof ForkJoinPool pool) {
pool.submit(ForkJoinTask.adapt(task));
} else {
executor.execute(task);
}
}
/**
* Submits the runContinuation task to the scheduler. For the default scheduler,
* and calling it on a worker thread, the task will be pushed to the local queue,
@ -335,12 +347,12 @@ final class VirtualThread extends BaseVirtualThread {
if (currentThread().isVirtual()) {
Continuation.pin();
try {
scheduler.execute(runContinuation);
submit(scheduler, runContinuation);
} finally {
Continuation.unpin();
}
} else {
scheduler.execute(runContinuation);
submit(scheduler, runContinuation);
}
done = true;
} catch (RejectedExecutionException ree) {
@ -1536,4 +1548,4 @@ final class VirtualThread extends BaseVirtualThread {
unblocker.setDaemon(true);
unblocker.start();
}
}
}
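
Editor's note: the new submit helper in VirtualThread wraps the Runnable in a ForkJoinTask when the scheduler is a ForkJoinPool, so submission goes through the pool's own queues instead of the generic Executor path. A small standalone illustration of that adaptation with an ordinary ForkJoinPool (the demo class and pool here are made up for illustration; the internal virtual-thread scheduler is not used):

    import java.util.concurrent.Executor;
    import java.util.concurrent.ForkJoinPool;
    import java.util.concurrent.ForkJoinTask;
    import java.util.concurrent.TimeUnit;

    public class SubmitAdaptDemo {
        // Mirrors the shape of the new helper: prefer ForkJoinPool.submit for FJ pools.
        static void submit(Executor executor, Runnable task) {
            if (executor instanceof ForkJoinPool pool) {
                pool.submit(ForkJoinTask.adapt(task));
            } else {
                executor.execute(task);
            }
        }

        public static void main(String[] args) throws InterruptedException {
            ForkJoinPool pool = new ForkJoinPool(2);
            submit(pool, () -> System.out.println("ran on " + Thread.currentThread().getName()));
            pool.shutdown();
            pool.awaitTermination(5, TimeUnit.SECONDS);
        }
    }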

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -72,7 +72,7 @@ public interface TypeVariable<D extends GenericDeclaration> extends Type, Annota
Type[] getBounds();
/**
* Returns the {@code GenericDeclaration} object representing the
* Returns a {@code GenericDeclaration} object representing the
* generic declaration declared for this type variable.
*
* @return the generic declaration declared for this type variable.
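
Editor's note: the reworded javadoc belongs to TypeVariable.getGenericDeclaration(). For reference, a tiny example that retrieves the generic declaration of a class's type variable via core reflection (the demo class name is made up; the reflection calls are standard java.lang.reflect API):

    import java.lang.reflect.GenericDeclaration;
    import java.lang.reflect.TypeVariable;
    import java.util.List;

    public class TypeVariableDemo {
        public static void main(String[] args) {
            // java.util.List<E> declares one type variable, E.
            TypeVariable<?> e = List.class.getTypeParameters()[0];
            GenericDeclaration decl = e.getGenericDeclaration();
            // Prints something like: E is declared by interface java.util.List
            System.out.println(e.getName() + " is declared by " + decl);
        }
    }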

View File

@ -203,7 +203,7 @@ public final class Files {
* @throws UnsupportedOperationException
* if an unsupported option is specified
* @throws FileAlreadyExistsException
* If a file of that name already exists and the {@link
* If the path locates an existing file and the {@link
* StandardOpenOption#CREATE_NEW CREATE_NEW} option is specified
* <i>(optional specific exception)</i>
* @throws IOException
@ -340,7 +340,7 @@ public final class Files {
* if an unsupported open option is specified or the array contains
* attributes that cannot be set atomically when creating the file
* @throws FileAlreadyExistsException
* If a file of that name already exists and the {@link
* If the path locates an existing file and the {@link
* StandardOpenOption#CREATE_NEW CREATE_NEW} option is specified
* and the file is being opened for writing <i>(optional specific
* exception)</i>
@ -377,7 +377,7 @@ public final class Files {
* @throws UnsupportedOperationException
* if an unsupported open option is specified
* @throws FileAlreadyExistsException
* If a file of that name already exists and the {@link
* If the path locates an existing file and the {@link
* StandardOpenOption#CREATE_NEW CREATE_NEW} option is specified
* and the file is being opened for writing <i>(optional specific
* exception)</i>
@ -575,10 +575,11 @@ public final class Files {
Set.of(StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE);
/**
* Creates a new and empty file, failing if the file already exists. The
* check for the existence of the file and the creation of the new file if
* it does not exist are a single operation that is atomic with respect to
* all other filesystem activities that might affect the directory.
* Creates a new and empty file, failing if {@code path} locates an existing
* file. The check for the existence of the file and the creation of the new
* file if it does not exist are a single operation that is atomic with
* respect to all other filesystem activities that might affect the
* directory.
*
* <p> The {@code attrs} parameter is optional {@link FileAttribute
* file-attributes} to set atomically when creating the file. Each attribute
@ -598,7 +599,7 @@ public final class Files {
* if the array contains an attribute that cannot be set atomically
* when creating the file
* @throws FileAlreadyExistsException
* If a file of that name already exists
* if {@code path} locates an existing file
* <i>(optional specific exception)</i>
* @throws IOException
* if an I/O error occurs or the parent directory does not exist
@ -611,7 +612,8 @@ public final class Files {
}
/**
* Creates a new directory. The check for the existence of the file and the
* Creates a new directory, failing if {@code dir} locates an existing
* file. The check for the existence of the file and the
* creation of the directory if it does not exist are a single operation
* that is atomic with respect to all other filesystem activities that might
* affect the directory. The {@link #createDirectories createDirectories}
@ -636,8 +638,8 @@ public final class Files {
* if the array contains an attribute that cannot be set atomically
* when creating the directory
* @throws FileAlreadyExistsException
* if a directory could not otherwise be created because a file of
* that name already exists <i>(optional specific exception)</i>
* if {@code dir} locates an existing file
* <i>(optional specific exception)</i>
* @throws IOException
* if an I/O error occurs or the parent directory does not exist
*/
@ -676,8 +678,8 @@ public final class Files {
* if the array contains an attribute that cannot be set atomically
* when creating the directory
* @throws FileAlreadyExistsException
* if {@code dir} exists but is not a directory <i>(optional specific
* exception)</i>
* if {@code dir} locates an existing file that is not a directory
* <i>(optional specific exception)</i>
* @throws IOException
* if an I/O error occurs
*/
@ -930,7 +932,8 @@ public final class Files {
}
/**
* Creates a symbolic link to a target <i>(optional operation)</i>.
* Creates a symbolic link to a target, failing if {@code link} locates an
* existing file <i>(optional operation)</i>.
*
* <p> The {@code target} parameter is the target of the link. It may be an
* {@link Path#isAbsolute absolute} or relative path and may not exist. When
@ -964,8 +967,8 @@ public final class Files {
* array contains an attribute that cannot be set atomically when
* creating the symbolic link
* @throws FileAlreadyExistsException
* if a file with the name already exists <i>(optional specific
* exception)</i>
* if {@code link} locates an existing file
* <i>(optional specific exception)</i>
* @throws IOException
* if an I/O error occurs
*/
@ -978,7 +981,8 @@ public final class Files {
}
/**
* Creates a new link (directory entry) for an existing file <i>(optional
* Creates a new link (directory entry) for an existing file,
* failing if {@code link} locates an existing file <i>(optional
* operation)</i>.
*
* <p> The {@code link} parameter locates the directory entry to create.
@ -1007,8 +1011,8 @@ public final class Files {
* if the implementation does not support adding an existing file
* to a directory
* @throws FileAlreadyExistsException
* if the entry could not otherwise be created because a file of
* that name already exists <i>(optional specific exception)</i>
* if {@code link} locates an existing file
* <i>(optional specific exception)</i>
* @throws IOException
* if an I/O error occurs
*/
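A brief usage sketch of the link-creation methods whose wording changes above (names are illustrative; both are optional operations, and symbolic links may require OS privileges):

import java.nio.file.Files;
import java.nio.file.Path;

public class LinkDemo {
    public static void main(String[] args) throws Exception {
        Path target = Files.createTempFile("target", ".txt");  // an existing file
        Path hard = target.resolveSibling("hard-link.txt");    // hypothetical names
        Path sym  = target.resolveSibling("sym-link.txt");
        Files.createLink(hard, target);         // FileAlreadyExistsException if hard already exists
        Files.createSymbolicLink(sym, target);  // likewise fails if sym locates an existing file
        System.out.println(Files.isSymbolicLink(sym));
    }
}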
@ -2711,7 +2715,7 @@ public final class Files {
* @throws UnsupportedOperationException
* if an unsupported option is specified
* @throws FileAlreadyExistsException
* If a file of that name already exists and the {@link
* If the path locates an existing file and the {@link
* StandardOpenOption#CREATE_NEW CREATE_NEW} option is specified
* <i>(optional specific exception)</i>
*
@ -2754,7 +2758,7 @@ public final class Files {
* @throws UnsupportedOperationException
* if an unsupported option is specified
* @throws FileAlreadyExistsException
* If a file of that name already exists and the {@link
* If the path locates an existing file and the {@link
* StandardOpenOption#CREATE_NEW CREATE_NEW} option is specified
* <i>(optional specific exception)</i>
*
@ -3161,7 +3165,7 @@ public final class Files {
* @throws UnsupportedOperationException
* if an unsupported option is specified
* @throws FileAlreadyExistsException
* If a file of that name already exists and the {@link
* If the path locates an existing file and the {@link
* StandardOpenOption#CREATE_NEW CREATE_NEW} option is specified
* <i>(optional specific exception)</i>
*/
@ -3222,7 +3226,7 @@ public final class Files {
* @throws UnsupportedOperationException
* if an unsupported option is specified
* @throws FileAlreadyExistsException
* If a file of that name already exists and the {@link
* If the path locates an existing file and the {@link
* StandardOpenOption#CREATE_NEW CREATE_NEW} option is specified
* <i>(optional specific exception)</i>
*/
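As a usage sketch of the CREATE_NEW behavior documented above (path and contents are hypothetical):

import java.nio.charset.StandardCharsets;
import java.nio.file.FileAlreadyExistsException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

public class WriteOnceDemo {
    public static void main(String[] args) throws Exception {
        Path p = Path.of("report.txt");                          // hypothetical path
        byte[] data = "done\n".getBytes(StandardCharsets.UTF_8);
        try {
            Files.write(p, data,
                        StandardOpenOption.CREATE_NEW,
                        StandardOpenOption.WRITE);               // refuses to touch an existing file
        } catch (FileAlreadyExistsException e) {
            System.out.println(p + " locates an existing file; nothing was overwritten");
        }
    }
}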

View File

@ -560,89 +560,70 @@ public class ForkJoinPool extends AbstractExecutorService
* access (which is usually needed anyway).
*
* Signalling. Signals (in signalWork) cause new or reactivated
* workers to scan for tasks. Method signalWork and its callers
* try to approximate the unattainable goal of having the right
* number of workers activated for the tasks at hand, but must err
* on the side of too many workers vs too few to avoid stalls:
* workers to scan for tasks. SignalWork is invoked in two cases:
* (1) When a task is pushed onto an empty queue, and (2) When a
* worker takes a top-level task from a queue that has additional
* tasks. Together, these suffice in O(log(#threads)) steps to
* fully activate with at least enough workers, and ideally no
* more than required. This ideal is unobtainable: Callers do not
* know whether another worker will finish its current task and
* poll for others without need of a signal (which is otherwise an
* advantage of work-stealing vs other schemes), and also must
* conservatively estimate the triggering conditions of emptiness
* or non-emptiness; all of which usually cause more activations
* than necessary (see below). (Method signalWork is also used as
* a failsafe in case of Thread failures in deregisterWorker, to
* activate or create a new worker to replace them).
*
* * If computations are purely tree structured, it suffices for
* every worker to activate another when it pushes a task into
* an empty queue, resulting in O(log(#threads)) steps to full
* activation. Emptiness must be conservatively approximated,
* which may result in unnecessary signals. Also, to reduce
* resource usages in some cases, at the expense of slower
* startup in others, activation of an idle thread is preferred
* over creating a new one, here and elsewhere.
*
* * At the other extreme, if "flat" tasks (those that do not in
* turn generate others) come in serially from only a single
* producer, each worker taking a task from a queue should
* propagate a signal if there are more tasks in that
* queue. This is equivalent to, but generally faster than,
* arranging for the stealer to take multiple tasks, re-pushing one or
* more on its own queue, and signalling (because its queue is
* empty), also resulting in logarithmic full activation
* time. If tasks do not engage in unbounded loops based on
* the actions of other workers with unknown dependencies,
* this form of propagation can be limited to one signal per
* activation (phase change). We distinguish the cases by
* further signalling only if the task is an InterruptibleTask
* (see below), which are the only supported forms of task that
* may do so.
*
* * Because we don't know about usage patterns (or most commonly,
* mixtures), we use both approaches, which present even more
* opportunities to over-signal. (Failure to distinguish these
* cases in terms of submission methods was arguably an early
* design mistake.) Note that in either of these contexts,
* signals may be (and often are) unnecessary because active
* workers continue scanning after running tasks without the
* need to be signalled (which is one reason work stealing is
* often faster than alternatives), so additional workers
* aren't needed.
*
* * For rapidly branching tasks that require full pool resources,
* oversignalling is OK, because signalWork will soon have no
* more workers to create or reactivate. But for others (mainly
* externally submitted tasks), overprovisioning may cause very
* noticeable slowdowns due to contention and resource
* wastage. We reduce impact by deactivating workers when
* queues don't have accessible tasks, but reactivating and
* rescanning if other tasks remain.
*
* * Despite these, signal contention and overhead effects still
* occur during ramp-up and ramp-down of small computations.
* Top-Level scheduling
* ====================
*
* Scanning. Method runWorker performs top-level scanning for (and
* execution of) tasks by polling a pseudo-random permutation of
* the array (by starting at a given index, and using a constant
* cyclically exhaustive stride.) It uses the same basic polling
* method as WorkQueue.poll(), but restarts with a different
* permutation on each invocation. The pseudorandom generator
* need not have high-quality statistical properties in the long
* permutation on each rescan. The pseudorandom generator need
* not have high-quality statistical properties in the long
* term. We use Marsaglia XorShifts, seeded with the Weyl sequence
* from ThreadLocalRandom probes, which are cheap and
* suffice. Each queue's polling attempts to avoid becoming stuck
* when other scanners/pollers stall. Scans do not otherwise
* explicitly take into account core affinities, loads, cache
* localities, etc. However, they do exploit temporal locality
* (which usually approximates these) by preferring to re-poll
* from the same queue after a successful poll before trying
* others, which also reduces bookkeeping, cache traffic, and
* scanning overhead. But it also reduces fairness, which is
* partially counteracted by giving up on detected interference
* (which also reduces contention when too many workers try to
* take small tasks from the same queue).
* from ThreadLocalRandom probes, which are cheap and suffice.
*
* Deactivation. When no tasks are found by a worker in runWorker,
* it tries to deactivate(), giving up (and rescanning) on "ctl"
* contention. To avoid missed signals during deactivation, the
* method rescans and reactivates if there may have been a missed
* signal during deactivation. To reduce false-alarm reactivations
* while doing so, we scan multiple times (analogously to method
* quiescent()) before trying to reactivate. Because idle workers
* are often not yet blocked (parked), we use a WorkQueue field to
* advertise that a waiter actually needs unparking upon signal.
* it invokes deactivate, which first deactivates (to an IDLE
* phase). Avoiding missed signals during deactivation requires a
* (conservative) rescan, reactivating if there may be tasks to
* poll. Because idle workers are often not yet blocked (parked),
* we use a WorkQueue field to advertise that a waiter actually
* needs unparking upon signal.
*
* When tasks are constructed as (recursive) DAGs, top-level
* scanning is usually infrequent, and doesn't encounter most
* of the following problems addressed by runWorker and awaitWork:
*
* Locality. Polls are organized into "runs", continuing until
* empty or contended, while also minimizing interference by
* postponing bookkeeping to the ends of runs. This may reduce
* fairness.
*
* Contention. When many workers try to poll few queues, they
* often collide, generating CAS failures and disrupting locality
* of workers already running their tasks. This also leads to
* stalls when tasks cannot be taken because other workers have
* not finished poll operations, which is detected by reading
* ahead in queue arrays. In both cases, workers restart scans in a
* way that approximates randomized backoff.
*
* Oversignalling. When many short top-level tasks are present in
* a small number of queues, the above signalling strategy may
* activate many more workers than needed, worsening locality and
* contention problems, while also generating more global
* contention (field ctl is CASed on every activation and
* deactivation). We filter out (both in runWorker and
* signalWork) attempted signals that are surely not needed
* because the signalled tasks are already taken.
*
* Shutdown and Quiescence
* =======================
*
* Quiescence. Workers scan looking for work, giving up when they
* don't find any, without being sure that none are available.
@ -892,9 +873,7 @@ public class ForkJoinPool extends AbstractExecutorService
* shutdown, runners are interrupted so they can cancel. Since
* external joining callers never run these tasks, they must await
* cancellation by others, which can occur along several different
* paths. The inability to rely on caller-runs may also require
* extra signalling (resulting in scanning and contention), so is
* done only conditionally in methods push and runWorker.
* paths.
*
* Across these APIs, rules for reporting exceptions for tasks
* with results accessed via join() differ from those via get(),
@ -961,9 +940,13 @@ public class ForkJoinPool extends AbstractExecutorService
* less-contended applications. To help arrange this, some
* non-reference fields are declared as "long" even when ints or
* shorts would suffice. For class WorkQueue, an
* embedded @Contended region segregates fields most heavily
* updated by owners from those most commonly read by stealers or
* other management.
* embedded @Contended isolates the very busy top index, along
* with status and bookkeeping fields written (mostly) by owners,
* that otherwise interfere with reading array and base
* fields. There are other variables commonly contributing to
* false-sharing-related performance issues (including fields of
* class Thread), but we can't do much about this except try to
* minimize access.
*
* Initial sizing and resizing of WorkQueue arrays is an even more
* delicate tradeoff because the best strategy systematically
@ -972,13 +955,11 @@ public class ForkJoinPool extends AbstractExecutorService
* direct false-sharing and indirect cases due to GC bookkeeping
* (cardmarks etc), and reduce the number of resizes, which are
* not especially fast because they require atomic transfers.
* Currently, arrays for workers are initialized to be just large
* enough to avoid resizing in most tree-structured tasks, but
* larger for external queues where both false-sharing problems
* and the need for resizing are more common. (Maintenance note:
* any changes in fields, queues, or their uses, or JVM layout
* policies, must be accompanied by re-evaluation of these
* placement and sizing decisions.)
* Currently, arrays are initialized to be just large enough to
* avoid resizing in most tree-structured tasks, but grow rapidly
* until large. (Maintenance note: any changes in fields, queues,
* or their uses, or JVM layout policies, must be accompanied by
* re-evaluation of these placement and sizing decisions.)
*
* Style notes
* ===========
@ -1061,17 +1042,11 @@ public class ForkJoinPool extends AbstractExecutorService
static final int DEFAULT_COMMON_MAX_SPARES = 256;
/**
* Initial capacity of work-stealing queue array for workers.
* Initial capacity of work-stealing queue array.
* Must be a power of two, at least 2. See above.
*/
static final int INITIAL_QUEUE_CAPACITY = 1 << 6;
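For context, the tree-structured (recursive) workload that the signalling and sizing notes above are tuned for can be sketched with the public API; the class name, fields, and threshold below are illustrative and not part of this change:

import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.RecursiveTask;

public class SumTask extends RecursiveTask<Long> {
    final long[] a; final int lo, hi;            // illustrative fields
    SumTask(long[] a, int lo, int hi) { this.a = a; this.lo = lo; this.hi = hi; }
    protected Long compute() {
        if (hi - lo <= 1_000) {                  // arbitrary sequential threshold
            long s = 0; for (int i = lo; i < hi; i++) s += a[i]; return s;
        }
        int mid = (lo + hi) >>> 1;
        SumTask left = new SumTask(a, lo, mid);
        left.fork();                             // pushed onto this worker's queue; may signal
        long right = new SumTask(a, mid, hi).compute();
        return right + left.join();
    }
    public static void main(String[] args) {
        long[] data = new long[1 << 20];
        System.out.println(ForkJoinPool.commonPool().invoke(new SumTask(data, 0, data.length)));
    }
}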
/**
* Initial capacity of work-stealing queue array for external queues.
* Must be a power of two, at least 2. See above.
*/
static final int INITIAL_EXTERNAL_QUEUE_CAPACITY = 1 << 9;
// conversions among short, int, long
static final int SMASK = 0xffff; // (unsigned) short bits
static final long LMASK = 0xffffffffL; // lower 32 bits of long
@ -1211,11 +1186,11 @@ public class ForkJoinPool extends AbstractExecutorService
@jdk.internal.vm.annotation.Contended("w")
int stackPred; // pool stack (ctl) predecessor link
@jdk.internal.vm.annotation.Contended("w")
volatile int parking; // nonzero if parked in awaitWork
@jdk.internal.vm.annotation.Contended("w")
volatile int source; // source queue id (or DROPPED)
@jdk.internal.vm.annotation.Contended("w")
int nsteals; // number of steals from other queues
@jdk.internal.vm.annotation.Contended("w")
volatile int parking; // nonzero if parked in awaitWork
// Support for atomic operations
private static final Unsafe U;
@ -1248,11 +1223,11 @@ public class ForkJoinPool extends AbstractExecutorService
*/
WorkQueue(ForkJoinWorkerThread owner, int id, int cfg,
boolean clearThreadLocals) {
array = new ForkJoinTask<?>[owner == null ?
INITIAL_EXTERNAL_QUEUE_CAPACITY :
INITIAL_QUEUE_CAPACITY];
this.owner = owner;
this.config = (clearThreadLocals) ? cfg | CLEAR_TLS : cfg;
if ((this.owner = owner) == null) {
array = new ForkJoinTask<?>[INITIAL_QUEUE_CAPACITY];
phase = id | IDLE;
}
}
/**
@ -1279,27 +1254,27 @@ public class ForkJoinPool extends AbstractExecutorService
* @throws RejectedExecutionException if array could not be resized
*/
final void push(ForkJoinTask<?> task, ForkJoinPool pool, boolean internal) {
int s = top, b = base, m, cap, room; ForkJoinTask<?>[] a;
if ((a = array) != null && (cap = a.length) > 0 && // else disabled
task != null) {
int pk = task.noUserHelp() + 1; // prev slot offset
if ((room = (m = cap - 1) - (s - b)) >= 0) {
int s = top, b = base, m, cap, room; ForkJoinTask<?>[] a, na;
if ((a = array) != null && (cap = a.length) > 0) { // else disabled
int k = (m = cap - 1) & s;
if ((room = m - (s - b)) >= 0) {
top = s + 1;
long pos = slotOffset(m & s);
long pos = slotOffset(k);
if (!internal)
U.putReference(a, pos, task); // inside lock
else
U.getAndSetReference(a, pos, task); // fully fenced
if (room == 0) // resize
growArray(a, cap, s);
if (room == 0 && (na = growArray(a, cap, s)) != null)
k = ((a = na).length - 1) & s; // resize
}
if (!internal)
unlockPhase();
if (room < 0)
throw new RejectedExecutionException("Queue capacity exceeded");
if ((room == 0 || a[m & (s - pk)] == null) &&
pool != null)
pool.signalWork(); // may have appeared empty
if (pool != null &&
(room == 0 ||
U.getReferenceAcquire(a, slotOffset(m & (s - 1))) == null))
pool.signalWork(a, k); // may have appeared empty
}
}
@ -1308,11 +1283,12 @@ public class ForkJoinPool extends AbstractExecutorService
* @param a old array
* @param cap old array capacity
* @param s current top
* @return new array, or null on failure
*/
private void growArray(ForkJoinTask<?>[] a, int cap, int s) {
int newCap = cap << 1;
private ForkJoinTask<?>[] growArray(ForkJoinTask<?>[] a, int cap, int s) {
int newCap = (cap >= 1 << 16) ? cap << 1 : cap << 2;
ForkJoinTask<?>[] newArray = null;
if (a != null && a.length == cap && cap > 0 && newCap > 0) {
ForkJoinTask<?>[] newArray = null;
try {
newArray = new ForkJoinTask<?>[newCap];
} catch (OutOfMemoryError ex) {
@ -1329,34 +1305,45 @@ public class ForkJoinPool extends AbstractExecutorService
updateArray(newArray); // fully fenced
}
}
return newArray;
}
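A standalone sketch of the capacity schedule implied by the new growth rule above (quadruple while below 1 << 16, then double), assuming the initial capacity of 1 << 6 shown elsewhere in this change:

public class GrowthSchedule {
    public static void main(String[] args) {
        int cap = 1 << 6;                        // INITIAL_QUEUE_CAPACITY in this change
        for (int i = 0; i < 10; i++) {
            System.out.println(cap);
            cap = (cap >= 1 << 16) ? cap << 1 : cap << 2;  // same rule as growArray above
        }
    }
}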
/**
* Takes next task, if one exists, in order specified by mode,
* so acts as either local-pop or local-poll. Called only by owner.
* @param fifo nonzero if FIFO mode
* Takes next task, if one exists, in LIFO order.
*/
private ForkJoinTask<?> nextLocalTask(int fifo) {
private ForkJoinTask<?> localPop() {
ForkJoinTask<?> t = null;
ForkJoinTask<?>[] a = array;
int b = base, p = top, cap;
if (p - b > 0 && a != null && (cap = a.length) > 0) {
for (int m = cap - 1, s, nb;;) {
if (fifo == 0 || (nb = b + 1) == p) {
if ((t = (ForkJoinTask<?>)U.getAndSetReference(
a, slotOffset(m & (s = p - 1)), null)) != null)
updateTop(s); // else lost race for only task
break;
int s = top - 1, cap; long k; ForkJoinTask<?>[] a;
if ((a = array) != null && (cap = a.length) > 0 &&
U.getReference(a, k = slotOffset((cap - 1) & s)) != null &&
(t = (ForkJoinTask<?>)U.getAndSetReference(a, k, null)) != null)
updateTop(s);
return t;
}
/**
* Takes next task, if one exists, in FIFO order.
*/
private ForkJoinTask<?> localPoll() {
ForkJoinTask<?> t = null;
int p = top, cap; ForkJoinTask<?>[] a;
if ((a = array) != null && (cap = a.length) > 0) {
for (int b = base; p - b > 0; ) {
int nb = b + 1;
long k = slotOffset((cap - 1) & b);
if (U.getReference(a, k) == null) {
if (nb == p)
break; // else base is lagging
while (b == (b = U.getIntAcquire(this, BASE)))
Thread.onSpinWait(); // spin to reduce memory traffic
}
if ((t = (ForkJoinTask<?>)U.getAndSetReference(
a, slotOffset(m & b), null)) != null) {
else if ((t = (ForkJoinTask<?>)
U.getAndSetReference(a, k, null)) != null) {
updateBase(nb);
break;
}
while (b == (b = U.getIntAcquire(this, BASE)))
Thread.onSpinWait(); // spin to reduce memory traffic
if (p - b <= 0)
break;
else
b = base;
}
}
return t;
@ -1364,10 +1351,9 @@ public class ForkJoinPool extends AbstractExecutorService
/**
* Takes next task, if one exists, using configured mode.
* (Always internal, never called for Common pool.)
*/
final ForkJoinTask<?> nextLocalTask() {
return nextLocalTask(config & FIFO);
return (config & FIFO) == 0 ? localPop() : localPoll();
}
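The FIFO/LIFO split above is selected by the public asyncMode constructor flag; a minimal sketch (pool size is arbitrary):

import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;

public class AsyncModeDemo {
    public static void main(String[] args) throws InterruptedException {
        // asyncMode = true gives FIFO local ordering (localPoll); the default gives LIFO (localPop)
        ForkJoinPool fifoPool = new ForkJoinPool(
                4, ForkJoinPool.defaultForkJoinWorkerThreadFactory, null, true);
        fifoPool.execute(() -> System.out.println("ran with FIFO local queues"));
        fifoPool.shutdown();
        fifoPool.awaitTermination(10, TimeUnit.SECONDS);
    }
}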
/**
@ -1443,12 +1429,12 @@ public class ForkJoinPool extends AbstractExecutorService
// specialized execution methods
/**
* Runs the given task, as well as remaining local tasks.
* Runs the given task, as well as remaining local tasks
*/
final void topLevelExec(ForkJoinTask<?> task, int fifo) {
while (task != null) {
task.doExec();
task = nextLocalTask(fifo);
task = (fifo != 0) ? localPoll() : localPop();
}
}
@ -1578,7 +1564,7 @@ public class ForkJoinPool extends AbstractExecutorService
* Cancels all local tasks. Called only by owner.
*/
final void cancelTasks() {
for (ForkJoinTask<?> t; (t = nextLocalTask(0)) != null; ) {
for (ForkJoinTask<?> t; (t = localPop()) != null; ) {
try {
t.cancel(false);
} catch (Throwable ignore) {
@ -1780,7 +1766,8 @@ public class ForkJoinPool extends AbstractExecutorService
* @param w caller's WorkQueue
*/
final void registerWorker(WorkQueue w) {
if (w != null && (runState & STOP) == 0L) {
if (w != null) {
w.array = new ForkJoinTask<?>[INITIAL_QUEUE_CAPACITY];
ThreadLocalRandom.localInit();
int seed = w.stackPred = ThreadLocalRandom.getProbe();
int phaseSeq = seed & ~((IDLE << 1) - 1); // initial phase tag
@ -1858,17 +1845,18 @@ public class ForkJoinPool extends AbstractExecutorService
}
if ((tryTerminate(false, false) & STOP) == 0L &&
phase != 0 && w != null && w.source != DROPPED) {
signalWork(); // possibly replace
w.cancelTasks(); // clean queue
signalWork(null, 0); // possibly replace
}
if (ex != null)
ForkJoinTask.rethrow(ex);
}
/**
* Releases an idle worker, or creates one if not enough exist.
* Releases an idle worker, or creates one if not enough exist,
* giving up if array a is non-null and the task at a[k] has already been taken.
*/
final void signalWork() {
final void signalWork(ForkJoinTask<?>[] a, int k) {
int pc = parallelism;
for (long c = ctl;;) {
WorkQueue[] qs = queues;
@ -1884,13 +1872,15 @@ public class ForkJoinPool extends AbstractExecutorService
if (sp == 0) {
if ((short)(c >>> TC_SHIFT) >= pc)
break;
nc = ((c + TC_UNIT) & TC_MASK);
nc = ((c + TC_UNIT) & TC_MASK) | ac;
}
else if ((v = w) == null)
break;
else
nc = (v.stackPred & LMASK) | (c & TC_MASK);
if (c == (c = compareAndExchangeCtl(c, nc | ac))) {
nc = (v.stackPred & LMASK) | (c & TC_MASK) | ac;
if (a != null && k < a.length && k >= 0 && a[k] == null)
break;
if (c == (c = ctl) && c == (c = compareAndExchangeCtl(c, nc))) {
if (v == null)
createWorker();
else {
@ -1973,178 +1963,196 @@ public class ForkJoinPool extends AbstractExecutorService
* @param w caller's WorkQueue (may be null on failed initialization)
*/
final void runWorker(WorkQueue w) {
if (w != null) {
int phase = w.phase, r = w.stackPred; // seed from registerWorker
int fifo = w.config & FIFO, nsteals = 0, src = -1;
for (;;) {
WorkQueue[] qs;
if (w != null && w.phase != 0) { // else unregistered
WorkQueue[] qs;
int r = w.stackPred; // seed from registerWorker
int fifo = (int)config & FIFO, rescans = 0, inactive = 0, taken = 0, n;
while ((runState & STOP) == 0L && (qs = queues) != null &&
(n = qs.length) > 0) {
int i = r, step = (r >>> 16) | 1;
r ^= r << 13; r ^= r >>> 17; r ^= r << 5; // xorshift
if ((runState & STOP) != 0L || (qs = queues) == null)
break;
int n = qs.length, i = r, step = (r >>> 16) | 1;
boolean rescan = false;
scan: for (int l = n; l > 0; --l, i += step) { // scan queues
int j, cap; WorkQueue q; ForkJoinTask<?>[] a;
if ((q = qs[j = i & (n - 1)]) != null &&
(a = q.array) != null && (cap = a.length) > 0) {
for (int m = cap - 1, pb = -1, b = q.base;;) {
ForkJoinTask<?> t; long k;
scan: for (int j = n; j != 0; --j, i += step) {
WorkQueue q; int qid;
if ((q = qs[qid = i & (n - 1)]) != null) {
ForkJoinTask<?>[] a; int cap; // poll queue
while ((a = q.array) != null && (cap = a.length) > 0) {
int b, nb, nk; long bp; ForkJoinTask<?> t;
t = (ForkJoinTask<?>)U.getReferenceAcquire(
a, k = slotOffset(m & b));
if (b != (b = q.base) || t == null ||
!U.compareAndSetReference(a, k, t, null)) {
if (a[b & m] == null) {
if (rescan) // end of run
break scan;
if (a[(b + 1) & m] == null &&
a[(b + 2) & m] == null) {
break; // probably empty
a, bp = slotOffset((cap - 1) & (b = q.base)));
long np = slotOffset(nk = (nb = b + 1) & (cap - 1));
if (q.base == b) { // else inconsistent
if (t == null) {
if (q.array == a) { // else resized
if (rescans > 0) // ran or stalled
break scan;
if (U.getReference(a, np) == null &&
(rescans >= 0 ||
(U.getReferenceAcquire(a, bp) == null &&
q.top == q.base)))
break;
rescans = 1; // may be stalled
}
if (pb == (pb = b)) { // track progress
rescan = true; // stalled; reorder scan
}
else if (inactive != 0) {
if ((inactive = tryReactivate(w)) != 0) {
rescans = 1; // can't take yet
break scan;
}
}
}
else {
boolean propagate;
int nb = q.base = b + 1, prevSrc = src;
w.nsteals = ++nsteals;
w.source = src = j; // volatile
rescan = true;
int nh = t.noUserHelp();
if (propagate =
(prevSrc != src || nh != 0) && a[nb & m] != null)
signalWork();
w.topLevelExec(t, fifo);
if ((b = q.base) != nb && !propagate)
break scan; // reduce interference
else if (U.compareAndSetReference(a, bp, t, null)) {
q.base = nb;
Object nt = U.getReferenceAcquire(a, np);
w.source = qid;
rescans = 1;
++taken;
if (nt != null && // confirm a[nk]
U.getReferenceAcquire(a, np) == nt)
signalWork(a, nk); // propagate
w.topLevelExec(t, fifo);
}
}
}
}
}
if (!rescan) {
if (((phase = deactivate(w, phase)) & IDLE) != 0)
break;
src = -1; // re-enable propagation
if (rescans >= 0)
--rescans;
else if (inactive == 0) {
if ((inactive = deactivate(w, taken)) != 0)
taken = 0;
}
else if (awaitWork(w) == 0)
inactive = rescans = 0;
else
break;
}
}
}
/**
* Deactivates and if necessary awaits signal or termination.
* Tries to deactivate the worker, keeping it active on contention.
*
* @param w the worker
* @param phase current phase
* @return current phase, with IDLE set if worker should exit
* @param w the work queue
* @param taken number of stolen tasks since last deactivation
* @return nonzero if inactive
*/
private int deactivate(WorkQueue w, int phase) {
if (w == null) // currently impossible
return IDLE;
int p = phase | IDLE, activePhase = phase + (IDLE << 1);
long pc = ctl, qc = (activePhase & LMASK) | ((pc - RC_UNIT) & UMASK);
int sp = w.stackPred = (int)pc; // set ctl stack link
w.phase = p;
if (!compareAndSetCtl(pc, qc)) // try to enqueue
return w.phase = phase; // back out on possible signal
int ac = (short)(qc >>> RC_SHIFT), n; long e; WorkQueue[] qs;
if (((e = runState) & STOP) != 0L ||
((e & SHUTDOWN) != 0L && ac == 0 && quiescent() > 0) ||
(qs = queues) == null || (n = qs.length) <= 0)
return IDLE; // terminating
for (int prechecks = Math.min(ac, 2), // reactivation threshold
k = Math.max(n + (n << 1), SPIN_WAITS << 1);;) {
WorkQueue q; int cap; ForkJoinTask<?>[] a; long c;
if (w.phase == activePhase)
return activePhase;
if (--k < 0)
return awaitWork(w, p); // block, drop, or exit
if ((q = qs[k & (n - 1)]) == null)
Thread.onSpinWait();
else if ((a = q.array) != null && (cap = a.length) > 0 &&
a[q.base & (cap - 1)] != null && --prechecks < 0 &&
(int)(c = ctl) == activePhase &&
compareAndSetCtl(c, (sp & LMASK) | ((c + RC_UNIT) & UMASK)))
return w.phase = activePhase; // reactivate
private int deactivate(WorkQueue w, int taken) {
int inactive = 0, phase;
if (w != null && (inactive = (phase = w.phase) & IDLE) == 0) {
long sp = (phase + (IDLE << 1)) & LMASK, pc, c;
w.phase = phase | IDLE;
w.stackPred = (int)(pc = ctl); // set ctl stack link
if (!compareAndSetCtl( // try to enqueue
pc, c = ((pc - RC_UNIT) & UMASK) | sp))
w.phase = phase; // back out on contention
else {
if (taken != 0) {
w.nsteals += taken;
if ((w.config & CLEAR_TLS) != 0 &&
(Thread.currentThread() instanceof ForkJoinWorkerThread f))
f.resetThreadLocals(); // (instanceof check always true)
}
if (((c & RC_MASK) == 0L && quiescent() > 0) || taken == 0)
inactive = w.phase & IDLE; // check quiescent termination
else { // spin for approx 1 scan cost
int tc = (short)(c >>> TC_SHIFT);
int spins = Math.max((tc << 1) + tc, SPIN_WAITS);
while ((inactive = w.phase & IDLE) != 0 && --spins != 0)
Thread.onSpinWait();
}
}
}
return inactive;
}
/**
* Reactivates worker w if it is currently the top of the ctl stack.
*
* @param w the work queue
* @return 0 if now active
*/
private int tryReactivate(WorkQueue w) {
int inactive = 0;
if (w != null) { // always true; hoist checks
int sp = w.stackPred, phase, activePhase; long c;
if ((inactive = (phase = w.phase) & IDLE) != 0 &&
(int)(c = ctl) == (activePhase = phase + IDLE) &&
compareAndSetCtl(c, (sp & LMASK) | ((c + RC_UNIT) & UMASK))) {
w.phase = activePhase;
inactive = 0;
}
}
return inactive;
}
/**
* Awaits signal or termination.
*
* @param w the work queue
* @param p current phase (known to be idle)
* @return current phase, with IDLE set if worker should exit
* @return 0 if now active
*/
private int awaitWork(WorkQueue w, int p) {
if (w != null) {
ForkJoinWorkerThread t; long deadline;
if ((w.config & CLEAR_TLS) != 0 && (t = w.owner) != null)
t.resetThreadLocals(); // clear before reactivate
if ((ctl & RC_MASK) > 0L)
deadline = 0L;
else if ((deadline =
(((w.source != INVALID_ID) ? keepAlive : TIMEOUT_SLOP)) +
System.currentTimeMillis()) == 0L)
deadline = 1L; // avoid zero
int activePhase = p + IDLE;
if ((p = w.phase) != activePhase && (runState & STOP) == 0L) {
private int awaitWork(WorkQueue w) {
int inactive = 0, phase;
if (w != null) { // always true; hoist checks
long waitTime = (w.source == INVALID_ID) ? 0L : keepAlive;
if ((inactive = (phase = w.phase) & IDLE) != 0) {
LockSupport.setCurrentBlocker(this);
w.parking = 1; // enable unpark
while ((p = w.phase) != activePhase) {
boolean trimmable = false; int trim;
Thread.interrupted(); // clear status
int activePhase = phase + IDLE;
for (long deadline = 0L;;) {
Thread.interrupted(); // clear status
if ((runState & STOP) != 0L)
break;
if (deadline != 0L) {
if ((trim = tryTrim(w, p, deadline)) > 0)
break;
else if (trim < 0)
deadline = 0L;
else
trimmable = true;
boolean trimmable = false; // use timed wait if trimmable
long d = 0L, c;
if (((c = ctl) & RC_MASK) == 0L && (int)c == activePhase) {
long now = System.currentTimeMillis();
if (deadline == 0L)
deadline = waitTime + now;
if (deadline - now <= TIMEOUT_SLOP) {
if (tryTrim(w, c, activePhase))
break;
continue; // lost race to trim
}
d = deadline;
trimmable = true;
}
U.park(trimmable, deadline);
w.parking = 1; // enable unpark and recheck
if ((inactive = w.phase & IDLE) != 0)
U.park(trimmable, d);
w.parking = 0; // close unpark window
if (inactive == 0 || (inactive = w.phase & IDLE) == 0)
break;
}
w.parking = 0;
LockSupport.setCurrentBlocker(null);
}
}
return p;
return inactive;
}
/**
* Tries to remove and deregister worker after timeout, and release
* another to do the same.
* @return > 0: trimmed, < 0 : not trimmable, else 0
* another to do the same unless new tasks are found.
*/
private int tryTrim(WorkQueue w, int phase, long deadline) {
long c, nc; int stat, activePhase, vp, i; WorkQueue[] vs; WorkQueue v;
if ((activePhase = phase + IDLE) != (int)(c = ctl) || w == null)
stat = -1; // no longer ctl top
else if (deadline - System.currentTimeMillis() >= TIMEOUT_SLOP)
stat = 0; // spurious wakeup
else if (!compareAndSetCtl(
c, nc = ((w.stackPred & LMASK) | (RC_MASK & c) |
(TC_MASK & (c - TC_UNIT)))))
stat = -1; // lost race to signaller
else {
stat = 1;
w.source = DROPPED;
w.phase = activePhase;
if ((vp = (int)nc) != 0 && (vs = queues) != null &&
vs.length > (i = vp & SMASK) && (v = vs[i]) != null &&
compareAndSetCtl( // try to wake up next waiter
nc, ((UMASK & (nc + RC_UNIT)) |
(nc & TC_MASK) | (v.stackPred & LMASK)))) {
v.source = INVALID_ID; // enable cascaded timeouts
v.phase = vp;
U.unpark(v.owner);
private boolean tryTrim(WorkQueue w, long c, int activePhase) {
if (w != null) {
int vp, i; WorkQueue[] vs; WorkQueue v;
long nc = ((w.stackPred & LMASK) |
((RC_MASK & c) | (TC_MASK & (c - TC_UNIT))));
if (compareAndSetCtl(c, nc)) {
w.source = DROPPED;
w.phase = activePhase;
if ((vp = (int)nc) != 0 && (vs = queues) != null &&
vs.length > (i = vp & SMASK) && (v = vs[i]) != null &&
compareAndSetCtl( // try to wake up next waiter
nc, ((v.stackPred & LMASK) |
((UMASK & (nc + RC_UNIT)) | (nc & TC_MASK))))) {
v.source = INVALID_ID; // enable cascaded timeouts
v.phase = vp;
U.unpark(v.owner);
}
return true;
}
}
return stat;
return false;
}
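The keep-alive that drives this timed trimming is configurable through the long-form public constructor; a hedged sketch with illustrative sizing values:

import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;

public class KeepAliveDemo {
    public static void main(String[] args) {
        ForkJoinPool pool = new ForkJoinPool(
                8,                                                 // parallelism
                ForkJoinPool.defaultForkJoinWorkerThreadFactory,
                null,                                              // no uncaught-exception handler
                false,                                             // LIFO local queues
                8, 256, 1,                                         // core, max, minimum runnable
                null,                                              // default saturation policy
                30, TimeUnit.SECONDS);                             // keepAliveTime used by timed trims
        pool.shutdown();
    }
}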
/**
@ -2561,52 +2569,35 @@ public class ForkJoinPool extends AbstractExecutorService
/**
* Finds and locks a WorkQueue for an external submitter, or
* throws RejectedExecutionException if shutdown or terminating.
* @param r current ThreadLocalRandom.getProbe() value
* throws RejectedExecutionException if shutdown
* @param rejectOnShutdown true if RejectedExecutionException
* should be thrown when shutdown (else only if terminating)
* should be thrown when shutdown
*/
private WorkQueue submissionQueue(int r, boolean rejectOnShutdown) {
int reuse; // nonzero if prefer create
if ((reuse = r) == 0) {
ThreadLocalRandom.localInit(); // initialize caller's probe
final WorkQueue externalSubmissionQueue(boolean rejectOnShutdown) {
int r;
if ((r = ThreadLocalRandom.getProbe()) == 0) {
ThreadLocalRandom.localInit(); // initialize caller's probe
r = ThreadLocalRandom.getProbe();
}
for (int probes = 0; ; ++probes) {
int n, i, id; WorkQueue[] qs; WorkQueue q;
if ((qs = queues) == null)
break;
if ((n = qs.length) <= 0)
for (;;) {
WorkQueue q; WorkQueue[] qs; int n, id, i;
if ((qs = queues) == null || (n = qs.length) <= 0)
break;
if ((q = qs[i = (id = r & EXTERNAL_ID_MASK) & (n - 1)]) == null) {
WorkQueue w = new WorkQueue(null, id, 0, false);
w.phase = id;
boolean reject = ((lockRunState() & SHUTDOWN) != 0 &&
rejectOnShutdown);
if (!reject && queues == qs && qs[i] == null)
q = qs[i] = w; // else lost race to install
WorkQueue newq = new WorkQueue(null, id, 0, false);
lockRunState();
if (qs[i] == null && queues == qs)
q = qs[i] = newq; // else lost race to install
unlockRunState();
if (q != null)
return q;
if (reject)
}
if (q != null && q.tryLockPhase()) {
if (rejectOnShutdown && (runState & SHUTDOWN) != 0L) {
q.unlockPhase(); // check while q lock held
break;
reuse = 0;
}
if (reuse == 0 || !q.tryLockPhase()) { // move index
if (reuse == 0) {
if (probes >= n >> 1)
reuse = r; // stop prefering free slot
}
else if (q != null)
reuse = 0; // probe on collision
r = ThreadLocalRandom.advanceProbe(r);
}
else if (rejectOnShutdown && (runState & SHUTDOWN) != 0L) {
q.unlockPhase(); // check while q lock held
break;
}
else
return q;
}
r = ThreadLocalRandom.advanceProbe(r); // move
}
throw new RejectedExecutionException();
}
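External submission and the shutdown rejection checked above can be exercised through the public API; a small, illustrative sketch:

import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.TimeUnit;

public class ExternalSubmitDemo {
    public static void main(String[] args) throws InterruptedException {
        ForkJoinPool pool = new ForkJoinPool(2);
        pool.submit(() -> System.out.println("submitted from a non-worker thread"));
        pool.shutdown();                          // previously submitted tasks still run
        try {
            pool.submit(() -> System.out.println("never runs"));
        } catch (RejectedExecutionException expected) {
            System.out.println("rejected after shutdown");
        }
        pool.awaitTermination(10, TimeUnit.SECONDS);
    }
}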
@ -2620,24 +2611,12 @@ public class ForkJoinPool extends AbstractExecutorService
}
else { // find and lock queue
internal = false;
q = submissionQueue(ThreadLocalRandom.getProbe(), true);
q = externalSubmissionQueue(true);
}
q.push(task, signalIfEmpty ? this : null, internal);
return task;
}
/**
* Returns queue for an external submission, bypassing call to
* submissionQueue if already established and unlocked.
*/
final WorkQueue externalSubmissionQueue(boolean rejectOnShutdown) {
WorkQueue[] qs; WorkQueue q; int n;
int r = ThreadLocalRandom.getProbe();
return (((qs = queues) != null && (n = qs.length) > 0 &&
(q = qs[r & EXTERNAL_ID_MASK & (n - 1)]) != null && r != 0 &&
q.tryLockPhase()) ? q : submissionQueue(r, rejectOnShutdown));
}
/**
* Returns queue for an external thread, if one exists that has
* possibly ever submitted to the given pool (nonzero probe), or
@ -3310,11 +3289,14 @@ public class ForkJoinPool extends AbstractExecutorService
* @since 19
*/
public int setParallelism(int size) {
int prevSize;
if (size < 1 || size > MAX_CAP)
throw new IllegalArgumentException();
if ((config & PRESET_SIZE) != 0)
throw new UnsupportedOperationException("Cannot override System property");
return getAndSetParallelism(size);
if ((prevSize = getAndSetParallelism(size)) < size)
signalWork(null, 0); // trigger worker activation
return prevSize;
}
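A minimal usage sketch of setParallelism with the new activation signal on growth (sizes are arbitrary):

import java.util.concurrent.ForkJoinPool;

public class ResizeDemo {
    public static void main(String[] args) {
        ForkJoinPool pool = new ForkJoinPool(2);
        int previous = pool.setParallelism(6);   // growing the target now also signals worker activation
        System.out.println(previous + " -> " + pool.getParallelism());
        pool.shutdown();
    }
}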
/**

View File

@ -588,13 +588,15 @@ public class LinkedTransferQueue<E> extends AbstractQueue<E>
do {
m = p.item;
q = p.next;
if (p.isData != haveData && haveData != (m != null) &&
p.cmpExItem(m, e) == m) {
Thread w = p.waiter; // matched complementary node
if (p != h && h == cmpExHead(h, (q == null) ? p : q))
h.next = h; // advance head; self-link old
LockSupport.unpark(w);
return m;
if (p.isData != haveData && haveData != (m != null)) {
if (p.cmpExItem(m, e) == m) {
Thread w = p.waiter; // matched complementary node
if (p != h && h == cmpExHead(h, (q == null) ? p : q))
h.next = h; // advance head; self-link old
LockSupport.unpark(w);
return m;
}
continue restart;
} else if (q == null) {
if (ns == 0L) // try to append unless immediate
break restart;
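The matching of complementary (data vs. request) nodes that this retry change touches can be exercised with the public transfer API; a minimal sketch:

import java.util.concurrent.LinkedTransferQueue;

public class TransferDemo {
    public static void main(String[] args) throws InterruptedException {
        LinkedTransferQueue<String> q = new LinkedTransferQueue<>();
        Thread consumer = new Thread(() -> {
            try {
                System.out.println("took " + q.take());  // waits as a complementary (request) node
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        });
        consumer.start();
        q.transfer("hello");         // returns only once matched with the waiting taker
        consumer.join();
    }
}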

Some files were not shown because too many files have changed in this diff