Merge branch 'master' into 8044609-ssl

Seán Coffey 2025-09-24 13:33:25 +00:00
commit 4d4af4301f
1007 changed files with 122482 additions and 15240 deletions

View File

@ -1859,8 +1859,6 @@ difference.</p>
<h3 id="additional-undecided-features">Additional Undecided
Features</h3>
<ul>
<li><p>Trailing return type syntax for functions (<a
href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2541.htm">n2541</a>)</p></li>
<li><p>Member initializers and aggregates (<a
href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2013/n3653.html">n3653</a>)</p></li>
<li><p>Rvalue references and move semantics</p></li>

View File

@ -1853,9 +1853,6 @@ See Object Lifetime: C++17 6.8/8, C++20 6.7.3/8
### Additional Undecided Features
* Trailing return type syntax for functions
([n2541](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2541.htm))
* Member initializers and aggregates
([n3653](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2013/n3653.html))

View File

@ -37,56 +37,25 @@ AC_DEFUN([FLAGS_SETUP_SHARED_LIBS],
if test "x$TOOLCHAIN_TYPE" = xgcc; then
# Default works for linux, might work on other platforms as well.
SHARED_LIBRARY_FLAGS='-shared'
# --disable-new-dtags forces use of RPATH instead of RUNPATH for rpaths.
# This protects internal library dependencies within the JDK from being
# overridden using LD_LIBRARY_PATH. See JDK-8326891 for more information.
SET_EXECUTABLE_ORIGIN='-Wl,-rpath,\$$ORIGIN[$]1 -Wl,--disable-new-dtags'
SET_SHARED_LIBRARY_ORIGIN="-Wl,-z,origin $SET_EXECUTABLE_ORIGIN"
SET_SHARED_LIBRARY_NAME='-Wl,-soname=[$]1'
elif test "x$TOOLCHAIN_TYPE" = xclang; then
if test "x$OPENJDK_TARGET_OS" = xmacosx; then
# Linking is different on MacOSX
SHARED_LIBRARY_FLAGS="-dynamiclib -compatibility_version 1.0.0 -current_version 1.0.0"
SET_EXECUTABLE_ORIGIN='-Wl,-rpath,@loader_path$(or [$]1,/.)'
SET_SHARED_LIBRARY_ORIGIN="$SET_EXECUTABLE_ORIGIN"
SET_SHARED_LIBRARY_NAME='-Wl,-install_name,@rpath/[$]1'
elif test "x$OPENJDK_TARGET_OS" = xaix; then
# Linking is different on aix
SHARED_LIBRARY_FLAGS="-shared -Wl,-bM:SRE -Wl,-bnoentry"
SET_EXECUTABLE_ORIGIN=""
SET_SHARED_LIBRARY_ORIGIN=''
SET_SHARED_LIBRARY_NAME=''
else
# Default works for linux, might work on other platforms as well.
SHARED_LIBRARY_FLAGS='-shared'
SET_EXECUTABLE_ORIGIN='-Wl,-rpath,\$$ORIGIN[$]1'
if test "x$OPENJDK_TARGET_OS" = xlinux; then
SET_EXECUTABLE_ORIGIN="$SET_EXECUTABLE_ORIGIN -Wl,--disable-new-dtags"
fi
SET_SHARED_LIBRARY_NAME='-Wl,-soname=[$]1'
# arm specific settings
if test "x$OPENJDK_TARGET_CPU" = "xarm"; then
# '-Wl,-z,origin' isn't used on arm.
SET_SHARED_LIBRARY_ORIGIN='-Wl,-rpath,\$$$$ORIGIN[$]1'
else
SET_SHARED_LIBRARY_ORIGIN="-Wl,-z,origin $SET_EXECUTABLE_ORIGIN"
fi
fi
elif test "x$TOOLCHAIN_TYPE" = xmicrosoft; then
SHARED_LIBRARY_FLAGS="-dll"
SET_EXECUTABLE_ORIGIN=''
SET_SHARED_LIBRARY_ORIGIN=''
SET_SHARED_LIBRARY_NAME=''
fi
AC_SUBST(SET_EXECUTABLE_ORIGIN)
AC_SUBST(SET_SHARED_LIBRARY_ORIGIN)
AC_SUBST(SET_SHARED_LIBRARY_NAME)
AC_SUBST(SHARED_LIBRARY_FLAGS)
])
@ -934,48 +903,6 @@ AC_DEFUN([FLAGS_SETUP_CFLAGS_CPU_DEP],
IF_FALSE: [$2FDLIBM_CFLAGS=""])
fi
AC_SUBST($2FDLIBM_CFLAGS)
# Check whether the compiler supports the Arm C Language Extensions (ACLE)
# for SVE. Set SVE_CFLAGS to -march=armv8-a+sve if it does.
# ACLE and this flag are required to build the aarch64 SVE related functions in
# libvectormath. Apple Silicon does not support SVE; use macOS as a proxy for
# that check.
if test "x$OPENJDK_TARGET_CPU" = "xaarch64" && test "x$OPENJDK_TARGET_OS" = "xlinux"; then
if test "x$TOOLCHAIN_TYPE" = xgcc || test "x$TOOLCHAIN_TYPE" = xclang; then
AC_LANG_PUSH(C)
OLD_CFLAGS="$CFLAGS"
CFLAGS="$CFLAGS -march=armv8-a+sve"
AC_MSG_CHECKING([if Arm SVE ACLE is supported])
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([#include <arm_sve.h>],
[
svint32_t r = svdup_n_s32(1);
return 0;
])],
[
AC_MSG_RESULT([yes])
$2SVE_CFLAGS="-march=armv8-a+sve"
# Switching the initialization mode with gcc from 'pattern' to 'zero'
# avoids the use of unsupported `__builtin_clear_padding` for variable
# length aggregates
if test "x$DEBUG_LEVEL" != xrelease && test "x$TOOLCHAIN_TYPE" = xgcc ; then
INIT_ZERO_FLAG="-ftrivial-auto-var-init=zero"
FLAGS_COMPILER_CHECK_ARGUMENTS(ARGUMENT: [$INIT_ZERO_FLAG],
IF_TRUE: [
$2SVE_CFLAGS="${$2SVE_CFLAGS} $INIT_ZERO_FLAG"
]
)
fi
],
[
AC_MSG_RESULT([no])
$2SVE_CFLAGS=""
]
)
CFLAGS="$OLD_CFLAGS"
AC_LANG_POP(C)
fi
fi
AC_SUBST($2SVE_CFLAGS)
])
AC_DEFUN_ONCE([FLAGS_SETUP_BRANCH_PROTECTION],

View File

@ -98,7 +98,7 @@ AC_DEFUN([FLAGS_SETUP_LDFLAGS_HELPER],
# Setup OS-dependent LDFLAGS
if test "x$OPENJDK_TARGET_OS" = xmacosx && test "x$TOOLCHAIN_TYPE" = xclang; then
# FIXME: We should really generalize SET_SHARED_LIBRARY_ORIGIN instead.
# FIXME: We should really generalize SetSharedLibraryOrigin instead.
OS_LDFLAGS_JVM_ONLY="-Wl,-rpath,@loader_path/. -Wl,-rpath,@loader_path/.."
OS_LDFLAGS="-mmacosx-version-min=$MACOSX_VERSION_MIN -Wl,-reproducible"
fi

View File

@ -107,6 +107,62 @@ AC_DEFUN([FLAGS_SETUP_NMFLAGS],
AC_SUBST(NMFLAGS)
])
# Check whether the compiler supports the Arm C Language Extensions (ACLE)
# for SVE. Set SVE_CFLAGS to -march=armv8-a+sve if it does.
# ACLE and this flag are required to build the aarch64 SVE related functions
# in libvectormath.
AC_DEFUN([FLAGS_SETUP_SVE],
[
AARCH64_SVE_AVAILABLE=false
# Apple Silicon does not support SVE; use macOS as a proxy for that check.
if test "x$OPENJDK_TARGET_CPU" = "xaarch64" && test "x$OPENJDK_TARGET_OS" = "xlinux"; then
if test "x$TOOLCHAIN_TYPE" = xgcc || test "x$TOOLCHAIN_TYPE" = xclang; then
# Check whether the compiler and binutils support SVE
AC_MSG_CHECKING([if Arm SVE ACLE is supported])
AC_LANG_PUSH([C])
saved_cflags="$CFLAGS"
CFLAGS="$CFLAGS -march=armv8-a+sve $CFLAGS_WARNINGS_ARE_ERRORS ARG_ARGUMENT"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
[
#include <arm_sve.h>
svfloat64_t a() {}
],
[
svint32_t r = svdup_n_s32(1)
])],
[
AARCH64_SVE_AVAILABLE=true
]
)
CFLAGS="$saved_cflags"
AC_LANG_POP([C])
AC_MSG_RESULT([$AARCH64_SVE_AVAILABLE])
fi
fi
UTIL_ARG_ENABLE(NAME: aarch64-sve, DEFAULT: auto,
RESULT: AARCH64_SVE_ENABLED,
DESC: [Use SVE when compiling libsleef],
AVAILABLE: $AARCH64_SVE_AVAILABLE)
SVE_CFLAGS=""
if test "x$AARCH64_SVE_ENABLED" = xtrue; then
SVE_CFLAGS="-march=armv8-a+sve"
# Switching the initialization mode with gcc from 'pattern' to 'zero'
# avoids the use of unsupported `__builtin_clear_padding` for variable
# length aggregates
if test "x$DEBUG_LEVEL" != xrelease && test "x$TOOLCHAIN_TYPE" = xgcc ; then
AC_MSG_CHECKING([Switching the initialization mode with gcc from pattern to zero])
INIT_ZERO_FLAG="-ftrivial-auto-var-init=zero"
FLAGS_COMPILER_CHECK_ARGUMENTS(ARGUMENT: [$INIT_ZERO_FLAG],
IF_TRUE: [
SVE_CFLAGS="${SVE_CFLAGS} $INIT_ZERO_FLAG"
]
)
fi
fi
AC_SUBST(SVE_CFLAGS)
])
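As an aside, the probe that this macro compiles boils down to a translation unit along the lines of the sketch below. This is an editor's illustration rather than the autoconf-generated conftest: it assumes an aarch64 Linux toolchain invoked with -march=armv8-a+sve and uses only standard ACLE intrinsics from arm_sve.h.

// Editor's sketch of an SVE ACLE probe (illustration only, not the generated conftest).
// Build on an aarch64 Linux toolchain with:
//   g++ -march=armv8-a+sve -o sve_probe sve_probe.cpp
#include <arm_sve.h>

int main() {
  // svdup_n_s32 broadcasts the scalar 1 into every 32-bit lane of an SVE vector.
  svint32_t ones = svdup_n_s32(1);
  // Reduce the active lanes (all of them, via the all-true predicate); the sum
  // must equal the number of 32-bit lanes reported by svcntw().
  return svaddv_s32(svptrue_b32(), ones) == (int64_t) svcntw() ? 0 : 1;
}

If such a probe compiles, the macro records AARCH64_SVE_AVAILABLE=true and, subject to the --enable-aarch64-sve setting, adds -march=armv8-a+sve to SVE_CFLAGS.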
################################################################################
# platform independent
AC_DEFUN([FLAGS_SETUP_ASFLAGS],

View File

@ -374,6 +374,7 @@ AC_DEFUN([FLAGS_SETUP_FLAGS],
FLAGS_SETUP_RCFLAGS
FLAGS_SETUP_NMFLAGS
FLAGS_SETUP_SVE
FLAGS_SETUP_ASFLAGS
FLAGS_SETUP_ASFLAGS_CPU_DEP([TARGET])
FLAGS_SETUP_ASFLAGS_CPU_DEP([BUILD], [OPENJDK_BUILD_])

View File

@ -28,7 +28,7 @@
################################################################################
# Minimum supported versions
JTREG_MINIMUM_VERSION=7.5.2
JTREG_MINIMUM_VERSION=8
GTEST_MINIMUM_VERSION=1.14.0
################################################################################

View File

@ -624,17 +624,8 @@ ASFLAGS_DEBUG_SYMBOLS := @ASFLAGS_DEBUG_SYMBOLS@
# Compress (or not) jars
COMPRESS_JARS := @COMPRESS_JARS@
# Options to linker to specify the library name.
# (Note absence of := assignment, because we do not want to evaluate the macro body here)
SET_SHARED_LIBRARY_NAME = @SET_SHARED_LIBRARY_NAME@
SHARED_LIBRARY_FLAGS := @SHARED_LIBRARY_FLAGS@
# Set origin using the linker, ie use the relative path to the dependent library to find the dependencies.
# (Note absence of := assignment, because we do not want to evaluate the macro body here)
SET_SHARED_LIBRARY_ORIGIN = @SET_SHARED_LIBRARY_ORIGIN@
SET_EXECUTABLE_ORIGIN = @SET_EXECUTABLE_ORIGIN@
LIBRARY_PREFIX := @LIBRARY_PREFIX@
SHARED_LIBRARY_SUFFIX := @SHARED_LIBRARY_SUFFIX@
STATIC_LIBRARY_SUFFIX := @STATIC_LIBRARY_SUFFIX@

View File

@ -148,9 +148,12 @@ define SetupExecuteBody
$1_INFO := Running commands for $1
endif
$1_VARDEPS := $$($1_COMMAND) $$($1_PRE_COMMAND) $$($1_POST_COMMAND)
$1_VARDEPS_FILE := $$(call DependOnVariable, $1_VARDEPS)
ifneq ($$($1_PRE_COMMAND), )
$$($1_PRE_MARKER): $$($1_DEPS)
$$($1_PRE_MARKER): $$($1_DEPS) $$($1_VARDEPS_FILE)
ifneq ($$($1_WARN), )
$$(call LogWarn, $$($1_WARN))
endif
@ -176,7 +179,7 @@ define SetupExecuteBody
$1 := $$($1_PRE_MARKER) $$($1_EXEC_RESULT)
else
$$($1_EXEC_RESULT): $$($1_DEPS)
$$($1_EXEC_RESULT): $$($1_DEPS) $$($1_VARDEPS_FILE)
ifneq ($$($1_WARN), )
$$(call LogWarn, $$($1_WARN))
endif

View File

@ -30,6 +30,47 @@ ifeq ($(INCLUDE), true)
include NativeCompilation.gmk
ifeq ($(call isCompiler, gcc), true)
# --disable-new-dtags forces use of RPATH instead of RUNPATH for rpaths.
# This protects internal library dependencies within the JDK from being
# overridden using LD_LIBRARY_PATH. See JDK-8326891 for more information.
SetExecutableOrigin = \
-Wl,-rpath,\$(DOLLAR)ORIGIN$1 -Wl,--disable-new-dtags
SetSharedLibraryOrigin = \
-Wl,-z,origin -Wl,-rpath,\$(DOLLAR)ORIGIN$1 -Wl,--disable-new-dtags
else ifeq ($(call isCompiler, clang), true)
ifeq ($(call isTargetOs, macosx), true)
SetExecutableOrigin = \
-Wl,-rpath,@loader_path$(or $1,/.)
SetSharedLibraryOrigin = \
-Wl,-rpath,@loader_path$(or $1,/.)
else ifeq ($(call isTargetOs, aix), true)
SetExecutableOrigin =
SetSharedLibraryOrigin =
else
ifeq ($(call isTargetOs, linux), true)
SetExecutableOrigin = \
-Wl,-rpath,\$(DOLLAR)ORIGIN$1 -Wl,--disable-new-dtags
else
SetExecutableOrigin = \
-Wl,-rpath,\$(DOLLAR)ORIGIN$1
endif
ifeq ($(call isTargetOs, arm), true)
SetSharedLibraryOrigin = \
-Wl,-rpath,\$(DOLLAR)ORIGIN$1
else
SetSharedLibraryOrigin = \
-Wl,-z,origin -Wl,-rpath,\$(DOLLAR)ORIGIN$1
endif
endif
else ifeq ($(call isCompiler, microsoft), true)
SetExecutableOrigin =
SetSharedLibraryOrigin =
else
$(error Unknown toolchain)
endif
FindSrcDirsForComponent += \
$(call uniq, $(wildcard \
$(TOPDIR)/src/$(strip $1)/$(OPENJDK_TARGET_OS)/native/$(strip $2) \
@ -444,9 +485,9 @@ define SetupJdkNativeCompilationBody
ifneq ($$($1_LD_SET_ORIGIN), false)
ifeq ($$($1_TYPE), EXECUTABLE)
$1_LDFLAGS += $$(call SET_EXECUTABLE_ORIGIN)
$1_LDFLAGS += $$(call SetExecutableOrigin)
else
$1_LDFLAGS += $$(call SET_SHARED_LIBRARY_ORIGIN)
$1_LDFLAGS += $$(call SetSharedLibraryOrigin)
endif
endif
# APPEND_LDFLAGS, if it exists, must be set after the origin flags

View File

@ -156,8 +156,8 @@ define SetupBuildLauncherBody
DISABLED_WARNINGS_gcc := unused-function unused-variable, \
DISABLED_WARNINGS_clang := unused-function, \
LDFLAGS := $$($1_LDFLAGS), \
LDFLAGS_linux := $$(call SET_EXECUTABLE_ORIGIN,/../lib), \
LDFLAGS_macosx := $$(call SET_EXECUTABLE_ORIGIN,/../lib), \
LDFLAGS_linux := $$(call SetExecutableOrigin,/../lib), \
LDFLAGS_macosx := $$(call SetExecutableOrigin,/../lib), \
LDFLAGS_FILTER_OUT := $$($1_LDFLAGS_FILTER_OUT), \
JDK_LIBS := $$($1_JDK_LIBS), \
JDK_LIBS_windows := $$($1_JDK_LIBS_windows), \

View File

@ -50,6 +50,26 @@ GetEntitlementsFile = \
$(if $(wildcard $f), $f, $(DEFAULT_ENTITLEMENTS_FILE)) \
)
ifeq ($(call isCompiler, gcc), true)
SetSharedLibraryName = \
-Wl,-soname=$1
else ifeq ($(call isCompiler, clang), true)
ifeq ($(call isTargetOs, macosx), true)
SetSharedLibraryName = \
-Wl,-install_name,@rpath/$1
else ifeq ($(call isTargetOs, aix), true)
SetSharedLibraryName =
else
# Default works for linux, might work on other platforms as well.
SetSharedLibraryName = \
-Wl,-soname=$1
endif
else ifeq ($(call isCompiler, microsoft), true)
SetSharedLibraryName =
else
$(error Unknown toolchain)
endif
################################################################################
define SetupLinking
# Unless specifically set, stripping should only happen if symbols are also
@ -131,7 +151,7 @@ define CreateDynamicLibraryOrExecutable
# A shared dynamic library or an executable binary has been specified
ifeq ($$($1_TYPE), LIBRARY)
# Generating a dynamic library.
$1_EXTRA_LDFLAGS += $$(call SET_SHARED_LIBRARY_NAME,$$($1_BASENAME))
$1_EXTRA_LDFLAGS += $$(call SetSharedLibraryName,$$($1_BASENAME))
endif
ifeq ($(MACOSX_CODESIGN_MODE), hardened)

View File

@ -26,7 +26,7 @@
# Versions and download locations for dependencies used by GitHub Actions (GHA)
GTEST_VERSION=1.14.0
JTREG_VERSION=7.5.2+1
JTREG_VERSION=8+2
LINUX_X64_BOOT_JDK_EXT=tar.gz
LINUX_X64_BOOT_JDK_URL=https://download.java.net/java/GA/jdk24/1f9ff9062db4449d8ca828c504ffae90/36/GPL/openjdk-24_linux-x64_bin.tar.gz

View File

@ -1174,9 +1174,9 @@ var getJibProfilesDependencies = function (input, common) {
jtreg: {
server: "jpg",
product: "jtreg",
version: "7.5.2",
build_number: "1",
file: "bundles/jtreg-7.5.2+1.zip",
version: "8",
build_number: "2",
file: "bundles/jtreg-8+2.zip",
environment_name: "JT_HOME",
environment_path: input.get("jtreg", "home_path") + "/bin",
configure_args: "--with-jtreg=" + input.get("jtreg", "home_path"),

View File

@ -152,7 +152,7 @@ $(eval $(call SetupJdkExecutable, BUILD_GTEST_LAUNCHER, \
-I$(GTEST_FRAMEWORK_SRC)/googlemock \
-I$(GTEST_FRAMEWORK_SRC)/googlemock/include, \
LD_SET_ORIGIN := false, \
LDFLAGS_unix := $(call SET_SHARED_LIBRARY_ORIGIN), \
LDFLAGS_unix := $(call SetSharedLibraryOrigin), \
JDK_LIBS := gtest:libjvm, \
COPY_DEBUG_SYMBOLS := $(GTEST_COPY_DEBUG_SYMBOLS), \
ZIP_EXTERNAL_DEBUG_SYMBOLS := false, \

View File

@ -57,7 +57,7 @@ ifeq ($(call check-jvm-feature, zero), true)
-DZERO_LIBARCH='"$(OPENJDK_TARGET_CPU_LEGACY_LIB)"' $(LIBFFI_CFLAGS)
JVM_LIBS_FEATURES += $(LIBFFI_LIBS)
ifeq ($(ENABLE_LIBFFI_BUNDLING), true)
JVM_LDFLAGS_FEATURES += $(call SET_EXECUTABLE_ORIGIN,/..)
JVM_LDFLAGS_FEATURES += $(call SetExecutableOrigin,/..)
endif
else
JVM_EXCLUDE_PATTERNS += /zero/

View File

@ -33,7 +33,6 @@ include gensrc/GensrcBuffer.gmk
include gensrc/GensrcCharacterData.gmk
include gensrc/GensrcCharsetCoder.gmk
include gensrc/GensrcCharsetMapping.gmk
include gensrc/GensrcExceptions.gmk
include gensrc/GensrcMisc.gmk
include gensrc/GensrcModuleLoaderMap.gmk
include gensrc/GensrcRegex.gmk

View File

@ -1,57 +0,0 @@
#
# Copyright (c) 2011, 2025, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 only, as
# published by the Free Software Foundation. Oracle designates this
# particular file as subject to the "Classpath" exception as provided
# by Oracle in the LICENSE file that accompanied this code.
#
# This code is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# version 2 for more details (a copy is included in the LICENSE file that
# accompanied this code).
#
# You should have received a copy of the GNU General Public License version
# 2 along with this work; if not, write to the Free Software Foundation,
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
# or visit www.oracle.com if you need additional information or have any
# questions.
#
include MakeIncludeStart.gmk
ifeq ($(INCLUDE), true)
################################################################################
GENSRC_EXCEPTIONS :=
GENSRC_EXCEPTIONS_DST := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/java/nio
GENSRC_EXCEPTIONS_SRC := $(MODULE_SRC)/share/classes/java/nio
GENSRC_EXCEPTIONS_CMD := $(TOPDIR)/make/scripts/genExceptions.sh
GENSRC_EXCEPTIONS_SRC_DIRS := . charset channels
$(GENSRC_EXCEPTIONS_DST)/_the.%.marker: $(GENSRC_EXCEPTIONS_SRC)/%/exceptions \
$(GENSRC_EXCEPTIONS_CMD)
$(call LogInfo, Generating exceptions java.nio $*)
$(call MakeDir, $(@D)/$*)
SCRIPTS="$(TOPDIR)/make/scripts" AWK="$(AWK)" SH="$(SH)" $(SH) \
$(GENSRC_EXCEPTIONS_CMD) $< $(@D)/$* $(LOG_DEBUG)
$(TOUCH) $@
GENSRC_EXCEPTIONS += $(foreach D, $(GENSRC_EXCEPTIONS_SRC_DIRS), $(GENSRC_EXCEPTIONS_DST)/_the.$(D).marker)
$(GENSRC_EXCEPTIONS): $(BUILD_TOOLS_JDK)
TARGETS += $(GENSRC_EXCEPTIONS)
################################################################################
endif # include guard
include MakeIncludeEnd.gmk

View File

@ -1,45 +0,0 @@
#! /bin/sh
#
# Copyright (c) 2007, 2020, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 only, as
# published by the Free Software Foundation. Oracle designates this
# particular file as subject to the "Classpath" exception as provided
# by Oracle in the LICENSE file that accompanied this code.
#
# This code is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# version 2 for more details (a copy is included in the LICENSE file that
# accompanied this code).
#
# You should have received a copy of the GNU General Public License version
# 2 along with this work; if not, write to the Free Software Foundation,
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
# or visit www.oracle.com if you need additional information or have any
# questions.
#
# Parse the first contiguous comment block in this script and generate
# a java comment block. If this script is invoked with a copyright
# year/year range, the java comment block will contain a Sun copyright.
COPYRIGHT_YEARS="$1"
cat <<__END__
/*
__END__
if [ "x$COPYRIGHT_YEARS" != x ]; then
cat <<__END__
* Copyright (c) $COPYRIGHT_YEARS Oracle and/or its affiliates. All rights reserved.
__END__
fi
$AWK ' /^#.*Copyright.*Oracle/ { next }
/^#([^!]|$)/ { sub(/^#/, " *"); print }
/^$/ { print " */"; exit } ' $0

View File

@ -1,116 +0,0 @@
#! /bin/sh
#
# Copyright (c) 2000, 2025, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 only, as
# published by the Free Software Foundation. Oracle designates this
# particular file as subject to the "Classpath" exception as provided
# by Oracle in the LICENSE file that accompanied this code.
#
# This code is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# version 2 for more details (a copy is included in the LICENSE file that
# accompanied this code).
#
# You should have received a copy of the GNU General Public License version
# 2 along with this work; if not, write to the Free Software Foundation,
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
# or visit www.oracle.com if you need additional information or have any
# questions.
#
# Generate exception classes
SPEC=$1
DST=$2
gen() {
ID=$1
WHAT=$2
SVUID=$3
ARG_TYPE=$4
ARG_ID=$5
ARG_PROP=$6
ARG_PHRASE=$7
ARG_PARAM="$ARG_TYPE$ $ARG_ID"
echo '-->' $DST/$ID.java
out=$DST/${ID}.java
$SH ${SCRIPTS}/addNotices.sh "$COPYRIGHT_YEARS" > $out
cat >>$out <<__END__
// -- This file was mechanically generated: Do not edit! -- //
package $PACKAGE;
/**$WHAT
*
* @since $SINCE
*/
public `if [ ${ABSTRACT:-0} = 1 ];
then echo 'abstract '; fi`class $ID
extends ${SUPER}
{
@java.io.Serial
private static final long serialVersionUID = $SVUID;
__END__
if [ $ARG_ID ]; then
cat >>$out <<__END__
/**
* The $ARG_PHRASE.
*
* @serial
*/
private $ARG_TYPE $ARG_ID;
/**
* Constructs an instance of this class.
*
* @param $ARG_ID
* The $ARG_PHRASE
*/
public $ID($ARG_TYPE $ARG_ID) {
super(String.valueOf($ARG_ID));
this.$ARG_ID = $ARG_ID;
}
/**
* Retrieves the $ARG_PHRASE.
*
* @return The $ARG_PHRASE
*/
public $ARG_TYPE get$ARG_PROP() {
return $ARG_ID;
}
}
__END__
else
cat >>$out <<__END__
/**
* Constructs an instance of this class.
*/
public $ID() { }
}
__END__
fi
}
. $SPEC

View File

@ -216,11 +216,6 @@ source %{
return false;
}
break;
case Op_ExpandV:
if (UseSVE < 2 || is_subword_type(bt)) {
return false;
}
break;
case Op_VectorMaskToLong:
if (UseSVE > 0 && vlen > 64) {
return false;
@ -7113,10 +7108,39 @@ instruct vcompressS(vReg dst, vReg src, pReg pg,
ins_pipe(pipe_slow);
%}
instruct vexpand(vReg dst, vReg src, pRegGov pg) %{
instruct vexpand_neon(vReg dst, vReg src, vReg mask, vReg tmp1, vReg tmp2) %{
predicate(UseSVE == 0);
match(Set dst (ExpandV src mask));
effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
format %{ "vexpand_neon $dst, $src, $mask\t# KILL $tmp1, $tmp2" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
int length_in_bytes = (int) Matcher::vector_length_in_bytes(this);
__ vector_expand_neon($dst$$FloatRegister, $src$$FloatRegister, $mask$$FloatRegister,
$tmp1$$FloatRegister, $tmp2$$FloatRegister, bt, length_in_bytes);
%}
ins_pipe(pipe_slow);
%}
instruct vexpand_sve(vReg dst, vReg src, pRegGov pg, vReg tmp1, vReg tmp2) %{
predicate(UseSVE == 1 || (UseSVE == 2 && type2aelembytes(Matcher::vector_element_basic_type(n)) < 4));
match(Set dst (ExpandV src pg));
effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
format %{ "vexpand_sve $dst, $src, $pg\t# KILL $tmp1, $tmp2" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
int length_in_bytes = (int) Matcher::vector_length_in_bytes(this);
__ vector_expand_sve($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister,
$tmp1$$FloatRegister, $tmp2$$FloatRegister, bt, length_in_bytes);
%}
ins_pipe(pipe_slow);
%}
instruct vexpand_sve2_SD(vReg dst, vReg src, pRegGov pg) %{
predicate(UseSVE == 2 && type2aelembytes(Matcher::vector_element_basic_type(n)) >= 4);
match(Set dst (ExpandV src pg));
effect(TEMP_DEF dst);
format %{ "vexpand $dst, $pg, $src" %}
format %{ "vexpand_sve2_SD $dst, $src, $pg" %}
ins_encode %{
// Example input: src = 1 2 3 4 5 6 7 8
// pg = 1 0 0 1 1 0 1 1
@ -7127,7 +7151,6 @@ instruct vexpand(vReg dst, vReg src, pRegGov pg) %{
// for TBL whose value is used to select the indexed element from src vector.
BasicType bt = Matcher::vector_element_basic_type(this);
assert(UseSVE == 2 && !is_subword_type(bt), "unsupported");
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
// dst = 0 0 0 0 0 0 0 0
__ sve_dup($dst$$FloatRegister, size, 0);

View File

@ -206,11 +206,6 @@ source %{
return false;
}
break;
case Op_ExpandV:
if (UseSVE < 2 || is_subword_type(bt)) {
return false;
}
break;
case Op_VectorMaskToLong:
if (UseSVE > 0 && vlen > 64) {
return false;
@ -5101,10 +5096,39 @@ instruct vcompressS(vReg dst, vReg src, pReg pg,
ins_pipe(pipe_slow);
%}
instruct vexpand(vReg dst, vReg src, pRegGov pg) %{
instruct vexpand_neon(vReg dst, vReg src, vReg mask, vReg tmp1, vReg tmp2) %{
predicate(UseSVE == 0);
match(Set dst (ExpandV src mask));
effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
format %{ "vexpand_neon $dst, $src, $mask\t# KILL $tmp1, $tmp2" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
int length_in_bytes = (int) Matcher::vector_length_in_bytes(this);
__ vector_expand_neon($dst$$FloatRegister, $src$$FloatRegister, $mask$$FloatRegister,
$tmp1$$FloatRegister, $tmp2$$FloatRegister, bt, length_in_bytes);
%}
ins_pipe(pipe_slow);
%}
instruct vexpand_sve(vReg dst, vReg src, pRegGov pg, vReg tmp1, vReg tmp2) %{
predicate(UseSVE == 1 || (UseSVE == 2 && type2aelembytes(Matcher::vector_element_basic_type(n)) < 4));
match(Set dst (ExpandV src pg));
effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
format %{ "vexpand_sve $dst, $src, $pg\t# KILL $tmp1, $tmp2" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
int length_in_bytes = (int) Matcher::vector_length_in_bytes(this);
__ vector_expand_sve($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister,
$tmp1$$FloatRegister, $tmp2$$FloatRegister, bt, length_in_bytes);
%}
ins_pipe(pipe_slow);
%}
instruct vexpand_sve2_SD(vReg dst, vReg src, pRegGov pg) %{
predicate(UseSVE == 2 && type2aelembytes(Matcher::vector_element_basic_type(n)) >= 4);
match(Set dst (ExpandV src pg));
effect(TEMP_DEF dst);
format %{ "vexpand $dst, $pg, $src" %}
format %{ "vexpand_sve2_SD $dst, $src, $pg" %}
ins_encode %{
// Example input: src = 1 2 3 4 5 6 7 8
// pg = 1 0 0 1 1 0 1 1
@ -5115,7 +5139,6 @@ instruct vexpand(vReg dst, vReg src, pRegGov pg) %{
// for TBL whose value is used to select the indexed element from src vector.
BasicType bt = Matcher::vector_element_basic_type(this);
assert(UseSVE == 2 && !is_subword_type(bt), "unsupported");
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
// dst = 0 0 0 0 0 0 0 0
__ sve_dup($dst$$FloatRegister, size, 0);

View File

@ -4068,6 +4068,13 @@ public:
INSN(sve_brkb, 0b10); // Break before first true condition
#undef INSN
// SVE move prefix (unpredicated)
void sve_movprfx(FloatRegister Zd, FloatRegister Zn) {
starti;
f(0b00000100, 31, 24), f(0b00, 23, 22), f(0b1, 21), f(0b00000, 20, 16);
f(0b101111, 15, 10), rf(Zn, 5), rf(Zd, 0);
}
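Spelled out as plain bit arithmetic, the f()/rf() calls above compose the following 32-bit encoding. The sketch below is an editor's restatement for readability, not JDK code.

// Editor's sketch: the unpredicated MOVPRFX encoding assembled above, written as
// one expression. zd and zn are the 5-bit Z register numbers.
#include <cstdint>

static inline uint32_t encode_sve_movprfx(unsigned zd, unsigned zn) {
  //        31..24           23..22        21            20..16
  return (0b00000100u << 24) | (0b00u << 22) | (0b1u << 21) | (0b00000u << 16)
  //        15..10              9..5               4..0
       | (0b101111u << 10) | ((zn & 0x1f) << 5) | (zd & 0x1f);
}

int main() {
  // MOVPRFX z0, z1 should encode as 0x0420bc20 given the fields above.
  return encode_sve_movprfx(0, 1) == 0x0420bc20 ? 0 : 1;
}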
// Element count and increment scalar (SVE)
#define INSN(NAME, TYPE) \
void NAME(Register Xdn, unsigned imm4 = 1, int pattern = 0b11111) { \

View File

@ -26,7 +26,7 @@
#ifndef CPU_AARCH64_ASSEMBLER_AARCH64_INLINE_HPP
#define CPU_AARCH64_ASSEMBLER_AARCH64_INLINE_HPP
#include "asm/assembler.inline.hpp"
#include "asm/assembler.hpp"
#include "asm/codeBuffer.hpp"
#include "code/codeCache.hpp"

View File

@ -2771,3 +2771,90 @@ void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister
select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
}
}
// Vector expand implementation. Elements from the src vector are expanded into
// the dst vector under the control of the vector mask.
// Since there are no native instructions directly corresponding to expand before
// SVE2p2, the following implementations mainly leverage the TBL instruction to
// implement expand. To compute the index input for TBL, the prefix sum algorithm
// (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
// for NEON and SVE, but with different instructions where appropriate.
// Vector expand implementation for NEON.
//
// An example of 128-bit Byte vector:
// Data direction: high <== low
// Input:
// src = g f e d c b a 9 8 7 6 5 4 3 2 1
// mask = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
// Expected result:
// dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
int vector_length_in_bytes) {
assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
assert_different_registers(dst, src, mask, tmp1, tmp2);
// Since the TBL instruction only supports byte table, we need to
// compute indices in byte type for all types.
SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
// tmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
dup(tmp1, size, zr);
// dst = 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
negr(dst, size, mask);
// Calculate vector index for TBL with prefix sum algorithm.
// dst = 8 8 8 7 6 6 6 5 4 4 4 3 2 2 2 1
for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
addv(dst, size, tmp2, dst);
}
// tmp2 = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
orr(tmp2, size, mask, mask);
// tmp2 = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
bsl(tmp2, size, dst, tmp1);
// tmp1 = 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
movi(tmp1, size, 1);
// dst = -1 -1 7 6 -1 -1 5 4 -1 -1 3 2 -1 -1 1 0
subv(dst, size, tmp2, tmp1);
// dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
tbl(dst, size, src, 1, dst);
}
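To make the prefix-sum indexing easier to follow, here is a scalar reference model of the operation. It is an editor's sketch using the same lane semantics as the comments above, not HotSpot code; the same indexing underlies the SVE variant that follows.

// Editor's sketch: scalar model of vector expand and of the prefix-sum index
// construction used by the TBL-based sequences above.
#include <cstdint>
#include <cstdio>

// Reference semantics: dst[i] = mask[i] ? src[number of set mask lanes below i] : 0
static void expand_ref(const uint8_t* src, const bool* mask, uint8_t* dst, int lanes) {
  int k = 0;                              // next unconsumed source lane
  for (int i = 0; i < lanes; i++) {
    dst[i] = mask[i] ? src[k++] : 0;
  }
}

// The vector code computes the same thing by building a TBL index vector: an
// inclusive prefix sum of the 0/1 mask, masked back to the set lanes, then
// decremented so cleared lanes become an out-of-range index (which TBL maps to 0).
static void expand_via_prefix_sum(const uint8_t* src, const bool* mask, uint8_t* dst, int lanes) {
  int idx[64];                            // up to 64 lanes is enough for this sketch
  int sum = 0;
  for (int i = 0; i < lanes; i++) {
    sum += mask[i] ? 1 : 0;               // inclusive prefix sum (the EXT/ADD loop)
    idx[i] = (mask[i] ? sum : 0) - 1;     // BSL against the zero vector, then subtract 1
  }
  for (int i = 0; i < lanes; i++) {
    dst[i] = (idx[i] >= 0 && idx[i] < lanes) ? src[idx[i]] : 0;  // TBL semantics
  }
}

int main() {
  const int lanes = 16;
  uint8_t src[lanes];
  bool mask[lanes];
  for (int i = 0; i < lanes; i++) {
    src[i] = (uint8_t)(i + 1);
    mask[i] = (i % 4) < 2;                // mask pattern from the comment example above
  }
  uint8_t a[lanes], b[lanes];
  expand_ref(src, mask, a, lanes);
  expand_via_prefix_sum(src, mask, b, lanes);
  for (int i = 0; i < lanes; i++) {
    printf("%2d: %2d %2d\n", i, a[i], b[i]);  // the two columns should match
  }
  return 0;
}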
// Vector expand implementation for SVE.
//
// An example of 128-bit Short vector:
// Data direction: high <== low
// Input:
// src = gf ed cb a9 87 65 43 21
// pg = 00 01 00 01 00 01 00 01
// Expected result:
// dst = 00 87 00 65 00 43 00 21
void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
int vector_length_in_bytes) {
assert(UseSVE > 0, "expand implementation only for SVE");
assert_different_registers(dst, src, tmp1, tmp2);
SIMD_RegVariant size = elemType_to_regVariant(bt);
// tmp1 = 00 00 00 00 00 00 00 00
sve_dup(tmp1, size, 0);
sve_movprfx(tmp2, tmp1);
// tmp2 = 00 01 00 01 00 01 00 01
sve_cpy(tmp2, size, pg, 1, true);
// Calculate vector index for TBL with prefix sum algorithm.
// tmp2 = 04 04 03 03 02 02 01 01
for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
sve_movprfx(dst, tmp1);
// The EXT instruction operates on the full-width sve register. The correct
// index calculation method is:
// vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
// MaxVectorSize - i.
sve_ext(dst, tmp2, MaxVectorSize - i);
sve_add(tmp2, size, dst, tmp2);
}
// dst = 00 04 00 03 00 02 00 01
sve_sel(dst, size, pg, tmp2, tmp1);
// dst = -1 03 -1 02 -1 01 -1 00
sve_sub(dst, size, 1);
// dst = 00 87 00 65 00 43 00 21
sve_tbl(dst, size, src, dst);
}

View File

@ -204,4 +204,10 @@
FloatRegister index, FloatRegister tmp, BasicType bt,
unsigned vector_length_in_bytes);
void vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
int vector_length_in_bytes);
void vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
int vector_length_in_bytes);
#endif // CPU_AARCH64_C2_MACROASSEMBLER_AARCH64_HPP

View File

@ -86,15 +86,48 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm
}
}
void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators,
Register start, Register count, Register scratch, RegSet saved_regs) {
__ push(saved_regs, sp);
assert_different_registers(start, count, scratch);
assert_different_registers(c_rarg0, count);
__ mov(c_rarg0, start);
__ mov(c_rarg1, count);
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_array_post_entry), 2);
__ pop(saved_regs, sp);
void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm,
DecoratorSet decorators,
Register start,
Register count,
Register scratch,
RegSet saved_regs) {
Label done;
Label loop;
Label next;
__ cbz(count, done);
// Calculate the number of card marks to set. Since the object might start and
// end within a card, we need to calculate this via the card table indexes of
// the actual start and last addresses covered by the object.
// Temporarily use the count register for the last element address.
__ lea(count, Address(start, count, Address::lsl(LogBytesPerHeapOop))); // end = start + count << LogBytesPerHeapOop
__ sub(count, count, BytesPerHeapOop); // Use last element address for end.
__ lsr(start, start, CardTable::card_shift());
__ lsr(count, count, CardTable::card_shift());
__ sub(count, count, start); // Number of bytes to mark - 1.
// Add card table base offset to start.
__ ldr(scratch, Address(rthread, in_bytes(G1ThreadLocalData::card_table_base_offset())));
__ add(start, start, scratch);
__ bind(loop);
if (UseCondCardMark) {
__ ldrb(scratch, Address(start, count));
// Instead of loading clean_card_val and comparing, we exploit the fact that
// the LSB of non-clean cards is always 0, and the LSB of clean cards 1.
__ tbz(scratch, 0, next);
}
static_assert(G1CardTable::dirty_card_val() == 0, "must be to use zr");
__ strb(zr, Address(start, count));
__ bind(next);
__ subs(count, count, 1);
__ br(Assembler::GE, loop);
__ bind(done);
}
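A scalar restatement of the card-range arithmetic above may help. The constants in the sketch are assumptions for illustration (HotSpot takes them from CardTable::card_shift() and BytesPerHeapOop); it is an editor's sketch, not JDK code.

// Editor's sketch of the card-range computation performed by the assembly above.
// The constants are illustrative assumptions; HotSpot derives them at runtime.
#include <cstdint>
#include <cstddef>
#include <cstdio>

static const unsigned kCardShift = 9;        // assumed: 512-byte cards
static const unsigned kBytesPerHeapOop = 8;  // assumed: uncompressed oops

// Number of card-table bytes covering an oop array of 'count' elements starting
// at 'start'. Both the first and the last element address are shifted, because
// the array may begin and end in the middle of a card.
static size_t cards_to_mark(uintptr_t start, size_t count) {
  uintptr_t last = start + count * kBytesPerHeapOop - kBytesPerHeapOop;
  return (last >> kCardShift) - (start >> kCardShift) + 1;
  // The assembly keeps this value minus 1 in 'count' and loops while GE,
  // writing zr (dirty_card_val() == 0) at card_table_base + card_index.
}

int main() {
  // Example: a 10-element array starting 8 bytes before a card boundary spans
  // two cards even though it is only 80 bytes long. Prints 2 with the assumed constants.
  printf("%zu\n", cards_to_mark(0x1000 - 8, 10));
  return 0;
}

The UseCondCardMark fast exit relies on the property stated in the comment above: dirty cards have value 0 (LSB clear) while clean cards have their LSB set, so a single tbz on bit 0 skips the store for cards that are already non-clean.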
static void generate_queue_test_and_insertion(MacroAssembler* masm, ByteSize index_offset, ByteSize buffer_offset, Label& runtime,
@ -202,10 +235,14 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm,
static void generate_post_barrier_fast_path(MacroAssembler* masm,
const Register store_addr,
const Register new_val,
const Register thread,
const Register tmp1,
const Register tmp2,
Label& done,
bool new_val_may_be_null) {
assert(thread == rthread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg, rscratch1);
// Does store cross heap regions?
__ eor(tmp1, store_addr, new_val); // tmp1 := store address ^ new value
__ lsr(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes)
@ -214,33 +251,19 @@ static void generate_post_barrier_fast_path(MacroAssembler* masm,
if (new_val_may_be_null) {
__ cbz(new_val, done);
}
// Storing region crossing non-null, is card young?
// Storing region crossing non-null.
__ lsr(tmp1, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base
__ load_byte_map_base(tmp2); // tmp2 := card table base address
__ add(tmp1, tmp1, tmp2); // tmp1 := card address
__ ldrb(tmp2, Address(tmp1)); // tmp2 := card
__ cmpw(tmp2, (int)G1CardTable::g1_young_card_val()); // tmp2 := card == young_card_val?
}
static void generate_post_barrier_slow_path(MacroAssembler* masm,
const Register thread,
const Register tmp1,
const Register tmp2,
Label& done,
Label& runtime) {
__ membar(Assembler::StoreLoad); // StoreLoad membar
__ ldrb(tmp2, Address(tmp1)); // tmp2 := card
__ cbzw(tmp2, done);
// Storing a region crossing, non-null oop, card is clean.
// Dirty card and log.
STATIC_ASSERT(CardTable::dirty_card_val() == 0);
__ strb(zr, Address(tmp1)); // *(card address) := dirty_card_val
generate_queue_test_and_insertion(masm,
G1ThreadLocalData::dirty_card_queue_index_offset(),
G1ThreadLocalData::dirty_card_queue_buffer_offset(),
runtime,
thread, tmp1, tmp2, rscratch1);
__ b(done);
Address card_table_addr(thread, in_bytes(G1ThreadLocalData::card_table_base_offset()));
__ ldr(tmp2, card_table_addr); // tmp2 := card table base address
if (UseCondCardMark) {
__ ldrb(rscratch1, Address(tmp1, tmp2)); // rscratch1 := card
// Instead of loading clean_card_val and comparing, we exploit the fact that
// the LSB of non-clean cards is always 0, and the LSB of clean cards 1.
__ tbz(rscratch1, 0, done);
}
static_assert(G1CardTable::dirty_card_val() == 0, "must be to use zr");
__ strb(zr, Address(tmp1, tmp2)); // *(card address) := dirty_card_val
}
void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm,
@ -249,27 +272,8 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm,
Register thread,
Register tmp1,
Register tmp2) {
assert(thread == rthread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2,
rscratch1);
assert(store_addr != noreg && new_val != noreg && tmp1 != noreg
&& tmp2 != noreg, "expecting a register");
Label done;
Label runtime;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, done, true /* new_val_may_be_null */);
// If card is young, jump to done
__ br(Assembler::EQ, done);
generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, done, runtime);
__ bind(runtime);
// save the live input values
RegSet saved = RegSet::of(store_addr);
__ push(saved, sp);
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp1, thread);
__ pop(saved, sp);
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, false /* new_val_may_be_null */);
__ bind(done);
}
@ -329,38 +333,10 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm,
Register thread,
Register tmp1,
Register tmp2,
G1PostBarrierStubC2* stub) {
assert(thread == rthread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2,
rscratch1);
assert(store_addr != noreg && new_val != noreg && tmp1 != noreg
&& tmp2 != noreg, "expecting a register");
stub->initialize_registers(thread, tmp1, tmp2);
bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, *stub->continuation(), new_val_may_be_null);
// If card is not young, jump to stub (slow path)
__ br(Assembler::NE, *stub->entry());
__ bind(*stub->continuation());
}
void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const {
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
Label runtime;
Register thread = stub->thread();
Register tmp1 = stub->tmp1(); // tmp1 holds the card address.
Register tmp2 = stub->tmp2();
assert(stub->tmp3() == noreg, "not needed in this platform");
__ bind(*stub->entry());
generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, *stub->continuation(), runtime);
__ bind(runtime);
generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry));
__ b(*stub->continuation());
bool new_val_may_be_null) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, new_val_may_be_null);
__ bind(done);
}
#endif // COMPILER2
@ -456,20 +432,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier
__ b(*stub->continuation());
}
void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) {
G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1();
__ bind(*stub->entry());
assert(stub->addr()->is_register(), "Precondition.");
assert(stub->new_val()->is_register(), "Precondition.");
Register new_val_reg = stub->new_val()->as_register();
__ cbz(new_val_reg, *stub->continuation());
ce->store_parameter(stub->addr()->as_pointer_register(), 0);
__ far_call(RuntimeAddress(bs->post_barrier_c1_runtime_code_blob()->code_begin()));
__ b(*stub->continuation());
}
#undef __
void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */);
masm->bind(done);
}
#define __ sasm->
void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) {
@ -521,74 +496,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler*
__ epilogue();
}
void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) {
__ prologue("g1_post_barrier", false);
// arg0: store_address
Address store_addr(rfp, 2*BytesPerWord);
BarrierSet* bs = BarrierSet::barrier_set();
CardTableBarrierSet* ctbs = barrier_set_cast<CardTableBarrierSet>(bs);
CardTable* ct = ctbs->card_table();
Label done;
Label runtime;
// At this point we know new_value is non-null and the new_value crosses regions.
// Must check to see if card is already dirty
const Register thread = rthread;
Address queue_index(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()));
Address buffer(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset()));
const Register card_offset = rscratch2;
// LR is free here, so we can use it to hold the byte_map_base.
const Register byte_map_base = lr;
assert_different_registers(card_offset, byte_map_base, rscratch1);
__ load_parameter(0, card_offset);
__ lsr(card_offset, card_offset, CardTable::card_shift());
__ load_byte_map_base(byte_map_base);
__ ldrb(rscratch1, Address(byte_map_base, card_offset));
__ cmpw(rscratch1, (int)G1CardTable::g1_young_card_val());
__ br(Assembler::EQ, done);
assert((int)CardTable::dirty_card_val() == 0, "must be 0");
__ membar(Assembler::StoreLoad);
__ ldrb(rscratch1, Address(byte_map_base, card_offset));
__ cbzw(rscratch1, done);
// storing region crossing non-null, card is clean.
// dirty card and log.
__ strb(zr, Address(byte_map_base, card_offset));
// Convert card offset into an address in card_addr
Register card_addr = card_offset;
__ add(card_addr, byte_map_base, card_addr);
__ ldr(rscratch1, queue_index);
__ cbz(rscratch1, runtime);
__ sub(rscratch1, rscratch1, wordSize);
__ str(rscratch1, queue_index);
// Reuse LR to hold buffer_addr
const Register buffer_addr = lr;
__ ldr(buffer_addr, buffer);
__ str(card_addr, Address(buffer_addr, rscratch1));
__ b(done);
__ bind(runtime);
__ push_call_clobbered_registers();
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), card_addr, thread);
__ pop_call_clobbered_registers();
__ bind(done);
__ epilogue();
}
#undef __
#endif // COMPILER1

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -32,9 +32,7 @@
class LIR_Assembler;
class StubAssembler;
class G1PreBarrierStub;
class G1PostBarrierStub;
class G1PreBarrierStubC2;
class G1PostBarrierStubC2;
class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
protected:
@ -65,10 +63,15 @@ protected:
public:
#ifdef COMPILER1
void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub);
void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub);
void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm);
void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm);
void g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2);
#endif
#ifdef COMPILER2
@ -87,9 +90,7 @@ public:
Register thread,
Register tmp1,
Register tmp2,
G1PostBarrierStubC2* c2_stub);
void generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const;
bool new_val_may_be_null);
#endif
void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
@ -62,13 +62,13 @@ static void write_barrier_post(MacroAssembler* masm,
Register new_val,
Register tmp1,
Register tmp2) {
if (!G1PostBarrierStubC2::needs_barrier(node)) {
if (!G1BarrierStubC2::needs_post_barrier(node)) {
return;
}
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
G1BarrierSetAssembler* g1_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler());
G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, rthread, tmp1, tmp2, stub);
bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, rthread, tmp1, tmp2, new_val_may_be_null);
}
%}

View File

@ -1623,7 +1623,7 @@ public:
FloatRegister p, FloatRegister z, FloatRegister t1);
void ghash_reduce_wide(int index, FloatRegister result, FloatRegister lo, FloatRegister hi,
FloatRegister p, FloatRegister z, FloatRegister t1);
void ghash_processBlocks_wide(address p, Register state, Register subkeyH,
void ghash_processBlocks_wide(Label& p, Register state, Register subkeyH,
Register data, Register blocks, int unrolls);

View File

@ -507,7 +507,7 @@ void MacroAssembler::ghash_modmul(FloatRegister result,
//
// Clobbers all vector registers.
//
void MacroAssembler::ghash_processBlocks_wide(address field_polynomial, Register state,
void MacroAssembler::ghash_processBlocks_wide(Label& field_polynomial, Register state,
Register subkeyH,
Register data, Register blocks, int unrolls) {
int register_stride = 7;
@ -531,7 +531,10 @@ void MacroAssembler::ghash_processBlocks_wide(address field_polynomial, Register
FloatRegister p = v31;
eor(vzr, T16B, vzr, vzr); // zero register
ldrq(p, field_polynomial); // The field polynomial
// Load the polynomial via a label, which must identify local data in the
// same code stub.
adr(rscratch1, field_polynomial);
ldrq(p, rscratch1); // The field polynomial
ldrq(v0, Address(state));
ldrq(Hprime, Address(subkeyH));

View File

@ -802,7 +802,7 @@ class StubGenerator: public StubCodeGenerator {
//
// s and d are adjusted to point to the remaining words to copy
//
void generate_copy_longs(StubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
BasicType type;
copy_direction direction;
@ -854,7 +854,7 @@ class StubGenerator: public StubCodeGenerator {
StubCodeMark mark(this, stub_id);
__ bind(start);
address start = __ pc();
Label unaligned_copy_long;
if (AvoidUnalignedAccesses) {
@ -894,9 +894,9 @@ class StubGenerator: public StubCodeGenerator {
int prefetch = PrefetchCopyIntervalInBytes;
bool use_stride = false;
if (direction == copy_backwards) {
use_stride = prefetch > 256;
prefetch = -prefetch;
if (use_stride) __ mov(stride, prefetch);
use_stride = prefetch > 256;
prefetch = -prefetch;
if (use_stride) __ mov(stride, prefetch);
}
__ bind(again);
@ -1026,9 +1026,9 @@ class StubGenerator: public StubCodeGenerator {
int prefetch = PrefetchCopyIntervalInBytes;
bool use_stride = false;
if (direction == copy_backwards) {
use_stride = prefetch > 256;
prefetch = -prefetch;
if (use_stride) __ mov(stride, prefetch);
use_stride = prefetch > 256;
prefetch = -prefetch;
if (use_stride) __ mov(stride, prefetch);
}
__ bind(again);
@ -1037,15 +1037,15 @@ class StubGenerator: public StubCodeGenerator {
__ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
if (direction == copy_forwards) {
// allowing for the offset of -8 the store instructions place
// registers into the target 64 bit block at the following
// offsets
//
// t0 at offset 0
// t1 at offset 8, t2 at offset 16
// t3 at offset 24, t4 at offset 32
// t5 at offset 40, t6 at offset 48
// t7 at offset 56
// allowing for the offset of -8 the store instructions place
// registers into the target 64 bit block at the following
// offsets
//
// t0 at offset 0
// t1 at offset 8, t2 at offset 16
// t3 at offset 24, t4 at offset 32
// t5 at offset 40, t6 at offset 48
// t7 at offset 56
bs.copy_store_at_8(Address(d, 1 * unit), t0);
bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
@ -1057,18 +1057,18 @@ class StubGenerator: public StubCodeGenerator {
bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
} else {
// d was not offset when we started so the registers are
// written into the 64 bit block preceding d with the following
// offsets
//
// t1 at offset -8
// t3 at offset -24, t0 at offset -16
// t5 at offset -48, t2 at offset -32
// t7 at offset -56, t4 at offset -48
// t6 at offset -64
//
// note that this matches the offsets previously noted for the
// loads
// d was not offset when we started so the registers are
// written into the 64 bit block preceding d with the following
// offsets
//
// t1 at offset -8
// t3 at offset -24, t0 at offset -16
// t5 at offset -48, t2 at offset -32
// t7 at offset -56, t4 at offset -48
// t6 at offset -64
//
// note that this matches the offsets previously noted for the
// loads
bs.copy_store_at_8(Address(d, 1 * unit), t1);
bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
@ -1109,10 +1109,10 @@ class StubGenerator: public StubCodeGenerator {
{
Label L1, L2;
__ tbz(count, exact_log2(4), L1);
// this is the same as above but copying only 4 longs hence
// with only one intervening stp between the str instructions
// but note that the offsets and registers still follow the
// same pattern
// this is the same as above but copying only 4 longs hence
// with only one intervening stp between the str instructions
// but note that the offsets and registers still follow the
// same pattern
bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
if (direction == copy_forwards) {
@ -1127,10 +1127,10 @@ class StubGenerator: public StubCodeGenerator {
__ bind(L1);
__ tbz(count, 1, L2);
// this is the same as above but copying only 2 longs hence
// there is no intervening stp between the str instructions
// but note that the offset and register patterns are still
// the same
// this is the same as above but copying only 2 longs hence
// there is no intervening stp between the str instructions
// but note that the offset and register patterns are still
// the same
bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
if (direction == copy_forwards) {
bs.copy_store_at_8(Address(d, 1 * unit), t0);
@ -1141,18 +1141,20 @@ class StubGenerator: public StubCodeGenerator {
}
__ bind(L2);
// for forwards copy we need to re-adjust the offsets we
// applied so that s and d follow the last words written
// for forwards copy we need to re-adjust the offsets we
// applied so that s and d follow the last words written
if (direction == copy_forwards) {
__ add(s, s, 16);
__ add(d, d, 8);
}
if (direction == copy_forwards) {
__ add(s, s, 16);
__ add(d, d, 8);
}
}
__ ret(lr);
}
}
return start;
}
// Small copy: less than 16 bytes.
@ -1206,10 +1208,6 @@ class StubGenerator: public StubCodeGenerator {
}
}
Label copy_f, copy_b;
Label copy_obj_f, copy_obj_b;
Label copy_obj_uninit_f, copy_obj_uninit_b;
// All-singing all-dancing memory copy.
//
// Copy count units of memory from s to d. The size of a unit is
@ -1447,19 +1445,19 @@ class StubGenerator: public StubCodeGenerator {
}
if (direction == copy_forwards) {
if (type != T_OBJECT) {
__ bl(copy_f);
__ bl(StubRoutines::aarch64::copy_byte_f());
} else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
__ bl(copy_obj_uninit_f);
__ bl(StubRoutines::aarch64::copy_oop_uninit_f());
} else {
__ bl(copy_obj_f);
__ bl(StubRoutines::aarch64::copy_oop_f());
}
} else {
if (type != T_OBJECT) {
__ bl(copy_b);
__ bl(StubRoutines::aarch64::copy_byte_b());
} else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
__ bl(copy_obj_uninit_b);
__ bl(StubRoutines::aarch64::copy_oop_uninit_b());
} else {
__ bl(copy_obj_b);
__ bl(StubRoutines::aarch64::copy_oop_b());
}
}
@ -1522,11 +1520,11 @@ class StubGenerator: public StubCodeGenerator {
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects: entry is set to the (post push) entry point so it
// can be used by the corresponding conjoint copy
// method
// Side Effects: nopush_entry is set to the (post push) entry point
// so it can be used by the corresponding conjoint
// copy method
//
address generate_disjoint_copy(StubId stub_id, address *entry) {
address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
RegSet saved_reg = RegSet::of(s, d, count);
int size;
@ -1615,8 +1613,8 @@ class StubGenerator: public StubCodeGenerator {
address start = __ pc();
__ enter();
if (entry != nullptr) {
*entry = __ pc();
if (nopush_entry != nullptr) {
*nopush_entry = __ pc();
// caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
BLOCK_COMMENT("Entry:");
}
@ -1679,10 +1677,10 @@ class StubGenerator: public StubCodeGenerator {
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects:
// entry is set to the no-overlap entry point so it can be used by
// some other conjoint copy method
// nopush_entry is set to the no-overlap entry point so it can be
// used by some other conjoint copy method
//
address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *entry) {
address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
RegSet saved_regs = RegSet::of(s, d, count);
int size;
@ -1769,16 +1767,19 @@ class StubGenerator: public StubCodeGenerator {
address start = __ pc();
__ enter();
if (entry != nullptr) {
*entry = __ pc();
if (nopush_entry != nullptr) {
*nopush_entry = __ pc();
// caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
BLOCK_COMMENT("Entry:");
}
// use fwd copy when (d-s) above_equal (count*size)
Label L_overlapping;
__ sub(rscratch1, d, s);
__ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
__ br(Assembler::HS, nooverlap_target);
__ br(Assembler::LO, L_overlapping);
__ b(RuntimeAddress(nooverlap_target));
__ bind(L_overlapping);
DecoratorSet decorators = IN_HEAP | IS_ARRAY;
if (dest_uninitialized) {
@ -1850,7 +1851,7 @@ class StubGenerator: public StubCodeGenerator {
// r0 == 0 - success
// r0 == -1^K - failure, where K is partial transfer count
//
address generate_checkcast_copy(StubId stub_id, address *entry) {
address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
bool dest_uninitialized;
switch (stub_id) {
case StubId::stubgen_checkcast_arraycopy_id:
@ -1911,8 +1912,8 @@ class StubGenerator: public StubCodeGenerator {
#endif //ASSERT
// Caller of this entry point must set up the argument registers.
if (entry != nullptr) {
*entry = __ pc();
if (nopush_entry != nullptr) {
*nopush_entry = __ pc();
BLOCK_COMMENT("Entry:");
}
@ -2724,13 +2725,21 @@ class StubGenerator: public StubCodeGenerator {
}
void generate_arraycopy_stubs() {
address entry;
address entry_jbyte_arraycopy;
address entry_jshort_arraycopy;
address entry_jint_arraycopy;
address entry_oop_arraycopy;
address entry_jlong_arraycopy;
address entry_checkcast_arraycopy;
// Some copy stubs publish a normal entry and then a second 'fallback'
// entry immediately following their stack push. This can be used
// as a post-push branch target for compatible stubs when they
// identify a special case that can be handled by the fallback
// stub, e.g. a disjoint copy stub may be used as a special-case
// fallback for its compatible conjoint copy stub.
//
// A no-push entry is always returned in the local variable below and
// then published by assigning it to the appropriate entry field in
// class StubRoutines. The entry value is then passed to the
// generator for the compatible stub. That means the entry must be
// listed when saving to/restoring from the AOT cache, ensuring
// that the inter-stub jumps are noted at AOT cache save and
// relocated at AOT cache load.
address nopush_entry;
// generate the common exit first so later stubs can rely on it if
// they want an UnsafeMemoryAccess exit non-local to the stub
@ -2738,83 +2747,123 @@ class StubGenerator: public StubCodeGenerator {
// register the stub as the default exit with class UnsafeMemoryAccess
UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15);
generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15);
// Generate and publish aarch64-specific bulk copy routines first
// so we can call them from other copy stubs.
StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15);
generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15);
StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15);
generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15);
StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
//*** jbyte
// Always need aligned and unaligned versions
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &entry);
StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, entry, nullptr);
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
//*** jshort
// Always need aligned and unaligned versions
StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &entry);
StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, entry, nullptr);
StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is used by generic/unsafe copy
StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
//*** jint
// Aligned versions
StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
// In 64 bit we need both aligned and unaligned versions of jint arraycopy.
// entry_jint_arraycopy always points to the unaligned version
StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &entry);
StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, entry, &entry_jint_arraycopy);
// jint_arraycopy_nopush always points to the unaligned version
StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jint_arraycopy_nopush = nopush_entry;
//*** jlong
// It is always aligned
StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
// disjoint normal/nopush and conjoint normal entries are not
// generated since the arrayof versions are the same
StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
//*** oops
{
// With compressed oops we need unaligned versions; notice that
// we overwrite entry_oop_arraycopy.
bool aligned = !UseCompressedOops;
StubRoutines::_arrayof_oop_disjoint_arraycopy
= generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &entry);
= generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_oop_arraycopy
= generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
= generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint arrayof nopush entry is needed by generic/unsafe copy
StubRoutines::_oop_arraycopy_nopush = nopush_entry;
// Aligned versions without pre-barriers
StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
= generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
= generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
// disjoint arrayof+uninit nopush entry is needed by conjoint copy
StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
// note that we don't need a returned nopush entry because the
// generic/unsafe copy does not cater for uninit arrays.
StubRoutines::_arrayof_oop_arraycopy_uninit
= generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, entry, nullptr);
= generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
}
// for oop copies reuse arrayof entries for non-arrayof cases
StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &entry_checkcast_arraycopy);
StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
// checkcast nopush entry is needed by generic copy
StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
// note that we don't need a returned nopush entry because the
// generic copy does not cater for uninit arrays.
StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy,
entry_jshort_arraycopy,
entry_jint_arraycopy,
entry_jlong_arraycopy);
// unsafe arraycopy may fall back on conjoint stubs
StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
StubRoutines::_jshort_arraycopy_nopush,
StubRoutines::_jint_arraycopy_nopush,
StubRoutines::_jlong_arraycopy_nopush);
StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy,
entry_jshort_arraycopy,
entry_jint_arraycopy,
entry_oop_arraycopy,
entry_jlong_arraycopy,
entry_checkcast_arraycopy);
// generic arraycopy may fall back on conjoint stubs
StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
StubRoutines::_jshort_arraycopy_nopush,
StubRoutines::_jint_arraycopy_nopush,
StubRoutines::_oop_arraycopy_nopush,
StubRoutines::_jlong_arraycopy_nopush,
StubRoutines::_checkcast_arraycopy_nopush);
StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
@ -3402,14 +3451,9 @@ class StubGenerator: public StubCodeGenerator {
// counter = c_rarg7 - 16 bytes of CTR
// return - number of processed bytes
address generate_galoisCounterMode_AESCrypt() {
address ghash_polynomial = __ pc();
__ emit_int64(0x87); // The low-order bits of the field
// polynomial (i.e. p = z^7+z^2+z+1)
// repeated in the low and high parts of a
// 128-bit vector
__ emit_int64(0x87);
Label ghash_polynomial; // local data generated after code
__ align(CodeEntryAlignment);
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
@ -3514,7 +3558,17 @@ class StubGenerator: public StubCodeGenerator {
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(lr);
return start;
// bind label and generate polynomial data
__ align(wordSize * 2);
__ bind(ghash_polynomial);
__ emit_int64(0x87); // The low-order bits of the field
// polynomial (i.e. p = z^7+z^2+z+1)
// repeated in the low and high parts of a
// 128-bit vector
__ emit_int64(0x87);
return start;
}
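[Editorial sketch, not HotSpot code: this hunk (and the GHASH hunks below) moves the 128-bit field-polynomial constant from in front of the stub to a label bound after the final ret, loaded through a pc-relative adr — presumably so the published stub address points at its first instruction rather than at literal data. A rough standalone analogue with illustrative names: the data is forward-declared, used, and only defined after the code that consumes it.]

#include <cstdint>
#include <cstdio>

// Forward reference plays the role of "Label polynomial;".
extern const uint64_t ghash_poly[2];

static void use_polynomial() {
  // Stands in for "adr(rscratch1, polynomial); ldrq(v24, rscratch1);" —
  // the constant is reached through the reference, wherever it is emitted.
  std::printf("%llx %llx\n",
              (unsigned long long)ghash_poly[0],
              (unsigned long long)ghash_poly[1]);
}

// Definition after the code, like "__ bind(polynomial); __ emit_int64(0x87); ..."
// The low-order bits of the field polynomial (p = z^7+z^2+z+1), repeated in
// the low and high halves of a 128-bit vector.
const uint64_t ghash_poly[2] = { 0x87, 0x87 };

int main() { use_polynomial(); }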
class Cached64Bytes {
@ -4559,16 +4613,6 @@ class StubGenerator: public StubCodeGenerator {
// by the second lane from all vectors and so on.
address generate_chacha20Block_blockpar() {
Label L_twoRounds, L_cc20_const;
// The constant data is broken into two 128-bit segments to be loaded
// onto FloatRegisters. The first 128 bits are a counter add overlay
// that adds +0/+1/+2/+3 to the vector holding replicated state[12].
// The second 128-bits is a table constant used for 8-bit left rotations.
__ BIND(L_cc20_const);
__ emit_int64(0x0000000100000000UL);
__ emit_int64(0x0000000300000002UL);
__ emit_int64(0x0605040702010003UL);
__ emit_int64(0x0E0D0C0F0A09080BUL);
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_chacha20Block_id;
StubCodeMark mark(this, stub_id);
@ -4716,6 +4760,17 @@ class StubGenerator: public StubCodeGenerator {
__ leave();
__ ret(lr);
// bind label and generate local constant data used by this stub
// The constant data is broken into two 128-bit segments to be loaded
// onto FloatRegisters. The first 128 bits are a counter add overlay
// that adds +0/+1/+2/+3 to the vector holding replicated state[12].
// The second 128-bits is a table constant used for 8-bit left rotations.
__ BIND(L_cc20_const);
__ emit_int64(0x0000000100000000UL);
__ emit_int64(0x0000000300000002UL);
__ emit_int64(0x0605040702010003UL);
__ emit_int64(0x0E0D0C0F0A09080BUL);
return start;
}
@ -6036,10 +6091,6 @@ class StubGenerator: public StubCodeGenerator {
address generate_kyber12To16() {
Label L_F00, L_loop, L_end;
__ BIND(L_F00);
__ emit_int64(0x0f000f000f000f00);
__ emit_int64(0x0f000f000f000f00);
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_kyber12To16_id;
StubCodeMark mark(this, stub_id);
@ -6233,6 +6284,11 @@ class StubGenerator: public StubCodeGenerator {
__ mov(r0, zr); // return 0
__ ret(lr);
// bind label and generate constant data used by this stub
__ BIND(L_F00);
__ emit_int64(0x0f000f000f000f00);
__ emit_int64(0x0f000f000f000f00);
return start;
}
@ -9642,14 +9698,7 @@ class StubGenerator: public StubCodeGenerator {
StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
StubCodeMark mark(this, stub_id);
__ align(wordSize * 2);
address p = __ pc();
__ emit_int64(0x87); // The low-order bits of the field
// polynomial (i.e. p = z^7+z^2+z+1)
// repeated in the low and high parts of a
// 128-bit vector
__ emit_int64(0x87);
Label polynomial; // local data generated at end of stub
__ align(CodeEntryAlignment);
address start = __ pc();
@ -9661,7 +9710,8 @@ class StubGenerator: public StubCodeGenerator {
FloatRegister vzr = v30;
__ eor(vzr, __ T16B, vzr, vzr); // zero register
__ ldrq(v24, p); // The field polynomial
__ adr(rscratch1, polynomial);
__ ldrq(v24, rscratch1); // The field polynomial
__ ldrq(v0, Address(state));
__ ldrq(v1, Address(subkeyH));
@ -9701,6 +9751,15 @@ class StubGenerator: public StubCodeGenerator {
__ st1(v0, __ T16B, state);
__ ret(lr);
// bind label and generate local polynomial data
__ align(wordSize * 2);
__ bind(polynomial);
__ emit_int64(0x87); // The low-order bits of the field
// polynomial (i.e. p = z^7+z^2+z+1)
// repeated in the low and high parts of a
// 128-bit vector
__ emit_int64(0x87);
return start;
}
@ -9709,14 +9768,7 @@ class StubGenerator: public StubCodeGenerator {
StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
StubCodeMark mark(this, stub_id);
__ align(wordSize * 2);
address p = __ pc();
__ emit_int64(0x87); // The low-order bits of the field
// polynomial (i.e. p = z^7+z^2+z+1)
// repeated in the low and high parts of a
// 128-bit vector
__ emit_int64(0x87);
Label polynomial; // local data generated after stub
__ align(CodeEntryAlignment);
address start = __ pc();
@ -9738,7 +9790,7 @@ class StubGenerator: public StubCodeGenerator {
__ st1(v8, v9, v10, v11, __ T16B, Address(sp));
}
__ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
__ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
if (unroll > 1) {
// And restore state
@ -9751,7 +9803,17 @@ class StubGenerator: public StubCodeGenerator {
__ ret(lr);
// bind label and generate polynomial data
__ align(wordSize * 2);
__ bind(polynomial);
__ emit_int64(0x87); // The low-order bits of the field
// polynomial (i.e. p = z^7+z^2+z+1)
// repeated in the low and high parts of a
// 128-bit vector
__ emit_int64(0x87);
return start;
}
void generate_base64_encode_simdround(Register src, Register dst,

View File

@ -201,12 +201,15 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm,
static void generate_post_barrier_fast_path(MacroAssembler* masm,
const Register store_addr,
const Register new_val,
const Register thread,
const Register tmp1,
const Register tmp2,
Label& done,
bool new_val_may_be_null) {
// Does store cross heap regions?
assert(thread == Rthread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg);
// Does store cross heap regions?
__ eor(tmp1, store_addr, new_val);
__ movs(tmp1, AsmOperand(tmp1, lsr, G1HeapRegion::LogOfHRGrainBytes));
__ b(done, eq);
@ -215,76 +218,34 @@ static void generate_post_barrier_fast_path(MacroAssembler* masm,
if (new_val_may_be_null) {
__ cbz(new_val, done);
}
// storing region crossing non-null, is card already dirty?
const Register card_addr = tmp1;
CardTableBarrierSet* ct = barrier_set_cast<CardTableBarrierSet>(BarrierSet::barrier_set());
__ mov_address(tmp2, (address)ct->card_table()->byte_map_base());
__ add(card_addr, tmp2, AsmOperand(store_addr, lsr, CardTable::card_shift()));
// storing region crossing non-null, is card already non-clean?
Address card_table_addr(thread, in_bytes(G1ThreadLocalData::card_table_base_offset()));
__ ldr(tmp2, card_table_addr);
__ add(tmp1, tmp2, AsmOperand(store_addr, lsr, CardTable::card_shift()));
__ ldrb(tmp2, Address(card_addr));
__ cmp(tmp2, (int)G1CardTable::g1_young_card_val());
if (UseCondCardMark) {
__ ldrb(tmp2, Address(tmp1));
// Instead of loading clean_card_val and comparing, we exploit the fact that
// the LSB of non-clean cards is always 0, and the LSB of clean cards is 1.
__ tbz(tmp2, 0, done);
}
static_assert(G1CardTable::dirty_card_val() == 0, "must be to use zero_register()");
__ zero_register(tmp2);
__ strb(tmp2, Address(tmp1)); // *(card address) := dirty_card_val
}
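[Editorial sketch in plain C++, not the patch itself; the constants are assumptions consistent with the static_asserts in this change (clean_card_val() == 0xff, dirty_card_val() == 0), while card and region sizes are illustrative. It condenses the rewritten fast path: read the card table base from the thread instead of a global, and under UseCondCardMark test only the low bit of the card.]

#include <cstdint>
#include <cstddef>

constexpr uint8_t  kDirtyCard      = 0x00;  // G1CardTable::dirty_card_val()
constexpr unsigned kCardShift      = 9;     // CardTable::card_shift() (assumed)
constexpr unsigned kLogRegionBytes = 22;    // G1HeapRegion::LogOfHRGrainBytes (assumed)

struct ThreadLocalG1 {
  uint8_t* card_table_base;   // published at G1ThreadLocalData::card_table_base_offset()
};

inline void post_barrier_fast_path(const ThreadLocalG1* thread,
                                   uintptr_t store_addr, uintptr_t new_val,
                                   bool use_cond_card_mark) {
  // Does the store cross heap regions? Same region => nothing to do.
  if (((store_addr ^ new_val) >> kLogRegionBytes) == 0) return;
  // Crosses regions, but storing null needs no card mark.
  if (new_val == 0) return;
  uint8_t* card = thread->card_table_base + (store_addr >> kCardShift);
  if (use_cond_card_mark) {
    // Non-clean cards have LSB 0, clean cards (0xff) have LSB 1, so a single
    // bit test replaces the load-and-compare against clean_card_val.
    if ((*card & 1) == 0) return;     // already non-clean, skip the store
  }
  *card = kDirtyCard;                 // *(card address) := dirty_card_val
}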
static void generate_post_barrier_slow_path(MacroAssembler* masm,
const Register thread,
const Register tmp1,
const Register tmp2,
const Register tmp3,
Label& done,
Label& runtime) {
__ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad), tmp2);
assert(CardTable::dirty_card_val() == 0, "adjust this code");
// card_addr is loaded by generate_post_barrier_fast_path
const Register card_addr = tmp1;
__ ldrb(tmp2, Address(card_addr));
__ cbz(tmp2, done);
// storing a region crossing, non-null oop, card is clean.
// dirty card and log.
__ strb(__ zero_register(tmp2), Address(card_addr));
generate_queue_test_and_insertion(masm,
G1ThreadLocalData::dirty_card_queue_index_offset(),
G1ThreadLocalData::dirty_card_queue_buffer_offset(),
runtime,
thread, card_addr, tmp2, tmp3);
__ b(done);
}
// G1 post-barrier.
// Blows all volatile registers (R0-R3, LR).
void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register tmp1,
Register tmp2,
Register tmp3) {
Register store_addr,
Register new_val,
Register tmp1,
Register tmp2,
Register tmp3) {
Label done;
Label runtime;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, done, true /* new_val_may_be_null */);
// If card is young, jump to done
// card_addr and card are loaded by generate_post_barrier_fast_path
const Register card = tmp2;
const Register card_addr = tmp1;
__ b(done, eq);
generate_post_barrier_slow_path(masm, Rthread, card_addr, tmp2, tmp3, done, runtime);
__ bind(runtime);
RegisterSet set = RegisterSet(store_addr) | RegisterSet(R0, R3) | RegisterSet(R12);
__ push(set);
if (card_addr != R0) {
__ mov(R0, card_addr);
}
__ mov(R1, Rthread);
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), R0, R1);
__ pop(set);
generate_post_barrier_fast_path(masm, store_addr, new_val, Rthread, tmp1, tmp2, done, true /* new_val_may_be_null */);
__ bind(done);
}
@ -344,35 +305,10 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm,
Register tmp1,
Register tmp2,
Register tmp3,
G1PostBarrierStubC2* stub) {
assert(thread == Rthread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg);
stub->initialize_registers(thread, tmp1, tmp2, tmp3);
bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, *stub->continuation(), new_val_may_be_null);
// If card is not young, jump to stub (slow path)
__ b(*stub->entry(), ne);
__ bind(*stub->continuation());
}
void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const {
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
Label runtime;
Register thread = stub->thread();
Register tmp1 = stub->tmp1(); // tmp1 holds the card address.
Register tmp2 = stub->tmp2();
Register tmp3 = stub->tmp3();
__ bind(*stub->entry());
generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, tmp3, *stub->continuation(), runtime);
__ bind(runtime);
generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp2);
__ b(*stub->continuation());
bool new_val_may_be_null) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, new_val_may_be_null);
__ bind(done);
}
#endif // COMPILER2
@ -463,20 +399,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier
__ b(*stub->continuation());
}
void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) {
G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1();
__ bind(*stub->entry());
assert(stub->addr()->is_register(), "Precondition.");
assert(stub->new_val()->is_register(), "Precondition.");
Register new_val_reg = stub->new_val()->as_register();
__ cbz(new_val_reg, *stub->continuation());
ce->verify_reserved_argument_area_size(1);
__ str(stub->addr()->as_pointer_register(), Address(SP));
__ call(bs->post_barrier_c1_runtime_code_blob()->code_begin(), relocInfo::runtime_call_type);
__ b(*stub->continuation());
#undef __
void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */);
masm->bind(done);
}
#undef __
#define __ sasm->
void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) {
@ -536,102 +471,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler*
__ b(done);
}
void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) {
// Input:
// - store_addr, pushed on the stack
__ set_info("g1_post_barrier_slow_id", false);
Label done;
Label recheck;
Label runtime;
Address queue_index(Rthread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()));
Address buffer(Rthread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset()));
AddressLiteral cardtable(ci_card_table_address_as<address>(), relocInfo::none);
// save at least the registers that need saving if the runtime is called
const RegisterSet saved_regs = RegisterSet(R0,R3) | RegisterSet(R12) | RegisterSet(LR);
const int nb_saved_regs = 6;
assert(nb_saved_regs == saved_regs.size(), "fix nb_saved_regs");
__ push(saved_regs);
const Register r_card_addr_0 = R0; // must be R0 for the slow case
const Register r_obj_0 = R0;
const Register r_card_base_1 = R1;
const Register r_tmp2 = R2;
const Register r_index_2 = R2;
const Register r_buffer_3 = R3;
const Register tmp1 = Rtemp;
__ ldr(r_obj_0, Address(SP, nb_saved_regs*wordSize));
// Note: there is a comment in x86 code about not using
// ExternalAddress / lea, due to relocation not working
// properly for that address. Should be OK for arm, where we
// explicitly specify that 'cardtable' has a relocInfo::none
// type.
__ lea(r_card_base_1, cardtable);
__ add(r_card_addr_0, r_card_base_1, AsmOperand(r_obj_0, lsr, CardTable::card_shift()));
// first quick check without barrier
__ ldrb(r_tmp2, Address(r_card_addr_0));
__ cmp(r_tmp2, (int)G1CardTable::g1_young_card_val());
__ b(recheck, ne);
__ bind(done);
__ pop(saved_regs);
__ ret();
__ bind(recheck);
__ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad), tmp1);
// reload card state after the barrier that ensures the stored oop was visible
__ ldrb(r_tmp2, Address(r_card_addr_0));
assert(CardTable::dirty_card_val() == 0, "adjust this code");
__ cbz(r_tmp2, done);
// storing region crossing non-null, card is clean.
// dirty card and log.
assert(0 == (int)CardTable::dirty_card_val(), "adjust this code");
if ((ci_card_table_address_as<intptr_t>() & 0xff) == 0) {
// Card table is aligned so the lowest byte of the table address base is zero.
__ strb(r_card_base_1, Address(r_card_addr_0));
} else {
__ strb(__ zero_register(r_tmp2), Address(r_card_addr_0));
}
__ ldr(r_index_2, queue_index);
__ ldr(r_buffer_3, buffer);
__ subs(r_index_2, r_index_2, wordSize);
__ b(runtime, lt); // go to runtime if now negative
__ str(r_index_2, queue_index);
__ str(r_card_addr_0, Address(r_buffer_3, r_index_2));
__ b(done);
__ bind(runtime);
__ save_live_registers();
assert(r_card_addr_0 == c_rarg0, "card_addr should be in R0");
__ mov(c_rarg1, Rthread);
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), c_rarg0, c_rarg1);
__ restore_live_registers_without_return();
__ b(done);
}
#undef __
#endif // COMPILER1

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -32,9 +32,7 @@
class LIR_Assembler;
class StubAssembler;
class G1PreBarrierStub;
class G1PostBarrierStub;
class G1PreBarrierStubC2;
class G1PostBarrierStubC2;
class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
protected:
@ -66,10 +64,15 @@ public:
#ifdef COMPILER1
public:
void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub);
void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub);
void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm);
void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm);
void g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2);
#endif
#ifdef COMPILER2
@ -89,9 +92,7 @@ public:
Register tmp1,
Register tmp2,
Register tmp3,
G1PostBarrierStubC2* c2_stub);
void generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const;
bool new_val_may_be_null);
#endif
};

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
@ -63,13 +63,13 @@ static void write_barrier_post(MacroAssembler* masm,
Register tmp1,
Register tmp2,
Register tmp3) {
if (!G1PostBarrierStubC2::needs_barrier(node)) {
if (!G1BarrierStubC2::needs_post_barrier(node)) {
return;
}
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
G1BarrierSetAssembler* g1_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler());
G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, Rthread, tmp1, tmp2, tmp3, stub);
bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, Rthread, tmp1, tmp2, tmp3, new_val_may_be_null);
}
%}

View File

@ -3011,6 +3011,10 @@ class StubGenerator: public StubCodeGenerator {
// Note: the disjoint stubs must be generated first, some of
// the conjoint stubs use them.
// Note: chaining of stubs does not rely on branching to an
// auxiliary post-push entry because none of the stubs
// push/pop a frame.
// these always need status in case they are called from generic_arraycopy
StubRoutines::_jbyte_disjoint_arraycopy = generate_primitive_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id);
StubRoutines::_jshort_disjoint_arraycopy = generate_primitive_copy(StubId::stubgen_jshort_disjoint_arraycopy_id);
@ -3024,6 +3028,7 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_primitive_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id);
StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_oop_copy (StubId::stubgen_arrayof_oop_disjoint_arraycopy_id);
// disjoint copy entry is needed by conjoint copy
// these always need status in case they are called from generic_arraycopy
StubRoutines::_jbyte_arraycopy = generate_primitive_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy);
StubRoutines::_jshort_arraycopy = generate_primitive_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy);

View File

@ -26,7 +26,7 @@
#ifndef CPU_PPC_ASSEMBLER_PPC_INLINE_HPP
#define CPU_PPC_ASSEMBLER_PPC_INLINE_HPP
#include "asm/assembler.inline.hpp"
#include "asm/assembler.hpp"
#include "asm/codeBuffer.hpp"
#include "code/codeCache.hpp"
#include "runtime/vm_version.hpp"

View File

@ -28,7 +28,6 @@
#include "gc/g1/g1BarrierSetAssembler.hpp"
#include "gc/g1/g1BarrierSetRuntime.hpp"
#include "gc/g1/g1CardTable.hpp"
#include "gc/g1/g1DirtyCardQueue.hpp"
#include "gc/g1/g1HeapRegion.hpp"
#include "gc/g1/g1SATBMarkQueueSet.hpp"
#include "gc/g1/g1ThreadLocalData.hpp"
@ -230,78 +229,52 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, Decorator
__ bind(filtered);
}
static void generate_region_crossing_test(MacroAssembler* masm, const Register store_addr, const Register new_val) {
__ xorr(R0, store_addr, new_val); // tmp1 := store address ^ new value
__ srdi_(R0, R0, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes)
}
static void generate_post_barrier_fast_path(MacroAssembler* masm,
const Register store_addr,
const Register new_val,
const Register thread,
const Register tmp1,
const Register tmp2,
Label& done,
bool new_val_may_be_null) {
assert_different_registers(store_addr, new_val, tmp1, R0);
assert_different_registers(store_addr, tmp1, tmp2, R0);
static Address generate_card_young_test(MacroAssembler* masm, const Register store_addr, const Register tmp1, const Register tmp2) {
CardTableBarrierSet* ct = barrier_set_cast<CardTableBarrierSet>(BarrierSet::barrier_set());
__ load_const_optimized(tmp1, (address)(ct->card_table()->byte_map_base()), tmp2);
__ srdi(tmp2, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base
__ lbzx(R0, tmp1, tmp2); // tmp1 := card address
__ cmpwi(CR0, R0, (int)G1CardTable::g1_young_card_val());
return Address(tmp1, tmp2); // return card address
}
__ xorr(R0, store_addr, new_val); // R0 := store address ^ new value
__ srdi_(R0, R0, G1HeapRegion::LogOfHRGrainBytes); // R0 := ((store address ^ new value) >> LogOfHRGrainBytes)
__ beq(CR0, done);
static void generate_card_dirty_test(MacroAssembler* masm, Address card_addr) {
__ membar(Assembler::StoreLoad); // Must reload after StoreLoad membar due to concurrent refinement
__ lbzx(R0, card_addr.base(), card_addr.index()); // tmp2 := card
__ cmpwi(CR0, R0, (int)G1CardTable::dirty_card_val()); // tmp2 := card == dirty_card_val?
// Crosses regions, storing null?
if (!new_val_may_be_null) {
#ifdef ASSERT
__ cmpdi(CR0, new_val, 0);
__ asm_assert_ne("null oop not allowed (G1 post)"); // Checked by caller.
#endif
} else {
__ cmpdi(CR0, new_val, 0);
__ beq(CR0, done);
}
__ ld(tmp1, G1ThreadLocalData::card_table_base_offset(), thread);
__ srdi(tmp2, store_addr, CardTable::card_shift()); // tmp2 := card address relative to card table base
if (UseCondCardMark) {
__ lbzx(R0, tmp1, tmp2);
__ cmpwi(CR0, R0, (int)G1CardTable::clean_card_val());
__ bne(CR0, done);
}
__ li(R0, G1CardTable::dirty_card_val());
__ stbx(R0, tmp1, tmp2);
}
void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, DecoratorSet decorators,
Register store_addr, Register new_val,
Register tmp1, Register tmp2, Register tmp3,
MacroAssembler::PreservationLevel preservation_level) {
Register tmp1, Register tmp2) {
bool not_null = (decorators & IS_NOT_NULL) != 0;
Label runtime, filtered;
assert_different_registers(store_addr, new_val, tmp1, tmp2);
CardTableBarrierSet* ct = barrier_set_cast<CardTableBarrierSet>(BarrierSet::barrier_set());
generate_region_crossing_test(masm, store_addr, new_val);
__ beq(CR0, filtered);
// Crosses regions, storing null?
if (not_null) {
#ifdef ASSERT
__ cmpdi(CR0, new_val, 0);
__ asm_assert_ne("null oop not allowed (G1 post)"); // Checked by caller.
#endif
} else {
__ cmpdi(CR0, new_val, 0);
__ beq(CR0, filtered);
}
Address card_addr = generate_card_young_test(masm, store_addr, tmp1, tmp2);
__ beq(CR0, filtered);
generate_card_dirty_test(masm, card_addr);
__ beq(CR0, filtered);
__ li(R0, (int)G1CardTable::dirty_card_val());
__ stbx(R0, card_addr.base(), card_addr.index()); // *(card address) := dirty_card_val
Register Rcard_addr = tmp3;
__ add(Rcard_addr, card_addr.base(), card_addr.index()); // This is the address which needs to get enqueued.
generate_queue_insertion(masm,
G1ThreadLocalData::dirty_card_queue_index_offset(),
G1ThreadLocalData::dirty_card_queue_buffer_offset(),
runtime, Rcard_addr, tmp1);
__ b(filtered);
__ bind(runtime);
assert(preservation_level == MacroAssembler::PRESERVATION_NONE,
"g1_write_barrier_post doesn't support preservation levels higher than PRESERVATION_NONE");
// Save the live input values.
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), Rcard_addr, R16_thread);
__ bind(filtered);
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, R16_thread, tmp1, tmp2, done, !not_null);
__ bind(done);
}
void G1BarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
@ -333,8 +306,7 @@ void G1BarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet deco
}
g1_write_barrier_post(masm, decorators,
base, val,
tmp1, tmp2, tmp3,
preservation_level);
tmp1, tmp2);
}
}
@ -457,70 +429,29 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm,
Register new_val,
Register tmp1,
Register tmp2,
G1PostBarrierStubC2* stub,
bool new_val_may_be_null,
bool decode_new_val) {
assert_different_registers(store_addr, new_val, tmp1, R0);
assert_different_registers(store_addr, tmp1, tmp2, R0);
stub->initialize_registers(R16_thread, tmp1, tmp2);
Label done;
bool null_check_required = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0;
Register new_val_decoded = new_val;
if (decode_new_val) {
assert(UseCompressedOops, "or should not be here");
if (null_check_required && CompressedOops::base() != nullptr) {
if (new_val_may_be_null && CompressedOops::base() != nullptr) {
// We prefer doing the null check after the region crossing check.
// Only compressed oop modes with base != null require a null check here.
__ cmpwi(CR0, new_val, 0);
__ beq(CR0, *stub->continuation());
null_check_required = false;
__ beq(CR0, done);
new_val_may_be_null = false;
}
new_val_decoded = __ decode_heap_oop_not_null(tmp2, new_val);
}
generate_region_crossing_test(masm, store_addr, new_val_decoded);
__ beq(CR0, *stub->continuation());
// crosses regions, storing null?
if (null_check_required) {
__ cmpdi(CR0, new_val_decoded, 0);
__ beq(CR0, *stub->continuation());
}
Address card_addr = generate_card_young_test(masm, store_addr, tmp1, tmp2);
assert(card_addr.base() == tmp1 && card_addr.index() == tmp2, "needed by post barrier stub");
__ bc_far_optimized(Assembler::bcondCRbiIs0, __ bi0(CR0, Assembler::equal), *stub->entry());
__ bind(*stub->continuation());
}
void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const {
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
Label runtime;
Address card_addr(stub->tmp1(), stub->tmp2()); // See above.
__ bind(*stub->entry());
generate_card_dirty_test(masm, card_addr);
__ bc_far_optimized(Assembler::bcondCRbiIs1, __ bi0(CR0, Assembler::equal), *stub->continuation());
__ li(R0, (int)G1CardTable::dirty_card_val());
__ stbx(R0, card_addr.base(), card_addr.index()); // *(card address) := dirty_card_val
Register Rcard_addr = stub->tmp1();
__ add(Rcard_addr, card_addr.base(), card_addr.index()); // This is the address which needs to get enqueued.
generate_queue_insertion(masm,
G1ThreadLocalData::dirty_card_queue_index_offset(),
G1ThreadLocalData::dirty_card_queue_buffer_offset(),
runtime, Rcard_addr, stub->tmp2());
__ b(*stub->continuation());
__ bind(runtime);
generate_c2_barrier_runtime_call(masm, stub, Rcard_addr, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry));
__ b(*stub->continuation());
generate_post_barrier_fast_path(masm, store_addr, new_val_decoded, R16_thread, tmp1, tmp2, done, new_val_may_be_null);
__ bind(done);
}
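[Editorial sketch of why the null check is placed before decoding when the heap base is non-null (example base/shift values, not taken from the patch): decode_heap_oop_not_null has no null special case, so a narrow value of 0 would decode to a non-null address and must be filtered out first; with a null base, decoding 0 still yields 0 and the later checks on the decoded value suffice.]

#include <cstdint>

constexpr uintptr_t kHeapBase = 0x0000000800000000ull;  // example CompressedOops::base()
constexpr unsigned  kShift    = 3;                      // example CompressedOops::shift()

inline uintptr_t decode_not_null(uint32_t narrow) {
  // No null special case: 0 decodes to kHeapBase when the base is non-null.
  return kHeapBase + (uintptr_t(narrow) << kShift);
}

inline uintptr_t decode(uint32_t narrow) {
  // The explicit up-front check the barrier performs when the base is non-null.
  return narrow == 0 ? 0 : decode_not_null(narrow);
}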
#endif // COMPILER2
@ -558,28 +489,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier
__ b(*stub->continuation());
}
void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) {
G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1();
__ bind(*stub->entry());
#undef __
assert(stub->addr()->is_register(), "Precondition.");
assert(stub->new_val()->is_register(), "Precondition.");
Register addr_reg = stub->addr()->as_pointer_register();
Register new_val_reg = stub->new_val()->as_register();
__ cmpdi(CR0, new_val_reg, 0);
__ bc_far_optimized(Assembler::bcondCRbiIs1, __ bi0(CR0, Assembler::equal), *stub->continuation());
address c_code = bs->post_barrier_c1_runtime_code_blob()->code_begin();
//__ load_const_optimized(R0, c_code);
__ add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(c_code));
__ mtctr(R0);
__ mr(R0, addr_reg); // Pass addr in R0.
__ bctrl();
__ b(*stub->continuation());
void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */);
masm->bind(done);
}
#undef __
#define __ sasm->
void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) {
@ -642,86 +564,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler*
__ b(restart);
}
void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) {
G1BarrierSet* bs = barrier_set_cast<G1BarrierSet>(BarrierSet::barrier_set());
__ set_info("g1_post_barrier_slow_id", false);
// Using stack slots: spill addr, spill tmp2
const int stack_slots = 2;
Register tmp = R0;
Register addr = R14;
Register tmp2 = R15;
CardTable::CardValue* byte_map_base = bs->card_table()->byte_map_base();
Label restart, refill, ret;
// Spill
__ std(addr, -8, R1_SP);
__ std(tmp2, -16, R1_SP);
__ srdi(addr, R0, CardTable::card_shift()); // Addr is passed in R0.
__ load_const_optimized(/*cardtable*/ tmp2, byte_map_base, tmp);
__ add(addr, tmp2, addr);
__ lbz(tmp, 0, addr); // tmp := [addr + cardtable]
// Return if young card.
__ cmpwi(CR0, tmp, G1CardTable::g1_young_card_val());
__ beq(CR0, ret);
// Return if sequential consistent value is already dirty.
__ membar(Assembler::StoreLoad);
__ lbz(tmp, 0, addr); // tmp := [addr + cardtable]
__ cmpwi(CR0, tmp, G1CardTable::dirty_card_val());
__ beq(CR0, ret);
// Not dirty.
// First, dirty it.
__ li(tmp, G1CardTable::dirty_card_val());
__ stb(tmp, 0, addr);
int dirty_card_q_index_byte_offset = in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset());
int dirty_card_q_buf_byte_offset = in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset());
__ bind(restart);
// Get the index into the update buffer. G1DirtyCardQueue::_index is
// a size_t so ld_ptr is appropriate here.
__ ld(tmp2, dirty_card_q_index_byte_offset, R16_thread);
// index == 0?
__ cmpdi(CR0, tmp2, 0);
__ beq(CR0, refill);
__ ld(tmp, dirty_card_q_buf_byte_offset, R16_thread);
__ addi(tmp2, tmp2, -oopSize);
__ std(tmp2, dirty_card_q_index_byte_offset, R16_thread);
__ add(tmp2, tmp, tmp2);
__ std(addr, 0, tmp2); // [_buf + index] := <address_of_card>
// Restore temp registers and return-from-leaf.
__ bind(ret);
__ ld(tmp2, -16, R1_SP);
__ ld(addr, -8, R1_SP);
__ blr();
__ bind(refill);
const int nbytes_save = (MacroAssembler::num_volatile_regs + stack_slots) * BytesPerWord;
__ save_volatile_gprs(R1_SP, -nbytes_save); // except R0
__ mflr(R0);
__ std(R0, _abi0(lr), R1_SP);
__ push_frame_reg_args(nbytes_save, R0); // dummy frame for C call
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1DirtyCardQueueSet::handle_zero_index_for_thread), R16_thread);
__ pop_frame();
__ ld(R0, _abi0(lr), R1_SP);
__ mtlr(R0);
__ restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
__ b(restart);
}
#undef __
#endif // COMPILER1

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2021 SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -37,9 +37,7 @@
class LIR_Assembler;
class StubAssembler;
class G1PreBarrierStub;
class G1PostBarrierStub;
class G1PreBarrierStubC2;
class G1PostBarrierStubC2;
class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
protected:
@ -56,8 +54,7 @@ protected:
MacroAssembler::PreservationLevel preservation_level);
void g1_write_barrier_post(MacroAssembler* masm, DecoratorSet decorators,
Register store_addr, Register new_val,
Register tmp1, Register tmp2, Register tmp3,
MacroAssembler::PreservationLevel preservation_level);
Register tmp1, Register tmp2);
virtual void oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
Register base, RegisterOrConstant ind_or_offs, Register val,
@ -79,17 +76,21 @@ public:
Register new_val,
Register tmp1,
Register tmp2,
G1PostBarrierStubC2* c2_stub,
bool new_val_may_be_null,
bool decode_new_val);
void generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const;
#endif
#ifdef COMPILER1
void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub);
void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub);
void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm);
void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm);
void g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2);
#endif
virtual void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2025 SAP SE. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
@ -64,13 +64,13 @@ static void post_write_barrier(MacroAssembler* masm,
Register tmp1,
Register tmp2,
bool decode_new_val = false) {
if (!G1PostBarrierStubC2::needs_barrier(node)) {
if (!G1BarrierStubC2::needs_post_barrier(node)) {
return;
}
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
G1BarrierSetAssembler* g1_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler());
G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, tmp1, tmp2, stub, decode_new_val);
bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, tmp1, tmp2, new_val_may_be_null, decode_new_val);
}
%}

View File

@ -3277,8 +3277,12 @@ class StubGenerator: public StubCodeGenerator {
// register the stub as the default exit with class UnsafeMemoryAccess
UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
// Note: the disjoint stubs must be generated first, some of
// the conjoint stubs use them.
// Note: the disjoint stubs must be generated first, some of the
// conjoint stubs use them.
// Note: chaining of stubs does not rely on branching to an
// auxiliary post-push entry because none of the stubs
// push/pop a frame.
// non-aligned disjoint versions
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id);

View File

@ -27,7 +27,7 @@
#ifndef CPU_RISCV_ASSEMBLER_RISCV_INLINE_HPP
#define CPU_RISCV_ASSEMBLER_RISCV_INLINE_HPP
#include "asm/assembler.inline.hpp"
#include "asm/assembler.hpp"
#include "asm/codeBuffer.hpp"
#include "code/codeCache.hpp"

View File

@ -87,15 +87,54 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm
}
}
void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators,
Register start, Register count, Register tmp, RegSet saved_regs) {
__ push_reg(saved_regs, sp);
void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm,
DecoratorSet decorators,
Register start,
Register count,
Register tmp,
RegSet saved_regs) {
assert_different_registers(start, count, tmp);
assert_different_registers(c_rarg0, count);
__ mv(c_rarg0, start);
__ mv(c_rarg1, count);
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_array_post_entry), 2);
__ pop_reg(saved_regs, sp);
Label loop, next, done;
// Zero count? Nothing to do.
__ beqz(count, done);
// Calculate the number of card marks to set. Since the object might start and
// end within a card, we need to calculate this via the card table indexes of
// the actual start and last addresses covered by the object.
// Temporarily use the count register for the last element address.
__ shadd(count, count, start, tmp, LogBytesPerHeapOop); // end = start + count << LogBytesPerHeapOop
__ subi(count, count, BytesPerHeapOop); // Use last element address for end.
__ srli(start, start, CardTable::card_shift());
__ srli(count, count, CardTable::card_shift());
__ sub(count, count, start); // Number of card table bytes to mark - 1.
// Add card table base offset to start.
Address card_table_address(xthread, G1ThreadLocalData::card_table_base_offset());
__ ld(tmp, card_table_address);
__ add(start, start, tmp);
__ bind(loop);
if (UseCondCardMark) {
__ add(tmp, start, count);
__ lbu(tmp, Address(tmp, 0));
static_assert((uint)G1CardTable::clean_card_val() == 0xff, "must be");
__ subi(tmp, tmp, G1CardTable::clean_card_val()); // Convert the comparison with clean_card_val() into a
                                                  // comparison against zero to avoid use of an extra temp.
__ bnez(tmp, next);
}
__ add(tmp, start, count);
static_assert(G1CardTable::dirty_card_val() == 0, "must be to use zr");
__ sb(zr, Address(tmp, 0));
__ bind(next);
__ subi(count, count, 1);
__ bgez(count, loop);
__ bind(done);
}
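[Editorial C++ sketch of the card-range computation performed by the loop above; BytesPerHeapOop is taken as 8 and the card size as 512 bytes, both assumptions. The first and last covered addresses are converted to card indexes, and every card in that inclusive range is dirtied, optionally skipping cards that are already non-clean.]

#include <cstdint>
#include <cstddef>

constexpr unsigned kCardShift       = 9;     // CardTable::card_shift() (assumed)
constexpr size_t   kBytesPerHeapOop = 8;     // uncompressed oops (assumed)
constexpr uint8_t  kCleanCard       = 0xff;  // G1CardTable::clean_card_val()
constexpr uint8_t  kDirtyCard       = 0x00;  // G1CardTable::dirty_card_val()

inline void dirty_cards_for_array(uint8_t* card_table_base,
                                  uintptr_t start, size_t count,
                                  bool use_cond_card_mark) {
  if (count == 0) return;                                    // zero count: nothing to do
  // Index by the first and *last* covered addresses, because the array may
  // begin and end in the middle of a card.
  uintptr_t last = start + count * kBytesPerHeapOop - kBytesPerHeapOop;
  for (size_t card = (start >> kCardShift); card <= (last >> kCardShift); card++) {
    uint8_t* p = card_table_base + card;
    if (use_cond_card_mark && *p != kCleanCard) continue;    // already marked
    *p = kDirtyCard;
  }
}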
static void generate_queue_test_and_insertion(MacroAssembler* masm, ByteSize index_offset, ByteSize buffer_offset, Label& runtime,
@ -192,44 +231,37 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm,
static void generate_post_barrier_fast_path(MacroAssembler* masm,
const Register store_addr,
const Register new_val,
const Register tmp1,
const Register tmp2,
Label& done,
bool new_val_may_be_null) {
// Does store cross heap regions?
__ xorr(tmp1, store_addr, new_val); // tmp1 := store address ^ new value
__ srli(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes)
__ beqz(tmp1, done);
// Crosses regions, storing null?
if (new_val_may_be_null) {
__ beqz(new_val, done);
}
// Storing region crossing non-null, is card young?
__ srli(tmp1, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base
__ load_byte_map_base(tmp2); // tmp2 := card table base address
__ add(tmp1, tmp1, tmp2); // tmp1 := card address
__ lbu(tmp2, Address(tmp1)); // tmp2 := card
}
static void generate_post_barrier_slow_path(MacroAssembler* masm,
const Register thread,
const Register tmp1,
const Register tmp2,
Label& done,
Label& runtime) {
__ membar(MacroAssembler::StoreLoad); // StoreLoad membar
__ lbu(tmp2, Address(tmp1)); // tmp2 := card
__ beqz(tmp2, done, true);
// Storing a region crossing, non-null oop, card is clean.
// Dirty card and log.
STATIC_ASSERT(CardTable::dirty_card_val() == 0);
__ sb(zr, Address(tmp1)); // *(card address) := dirty_card_val
generate_queue_test_and_insertion(masm,
G1ThreadLocalData::dirty_card_queue_index_offset(),
G1ThreadLocalData::dirty_card_queue_buffer_offset(),
runtime,
thread, tmp1, tmp2, t0);
__ j(done);
bool new_val_may_be_null) {
assert(thread == xthread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg);
// Does store cross heap regions?
__ xorr(tmp1, store_addr, new_val); // tmp1 := store address ^ new value
__ srli(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes)
__ beqz(tmp1, done);
// Crosses regions, storing null?
if (new_val_may_be_null) {
__ beqz(new_val, done);
}
// Storing region crossing non-null, is card clean?
__ srli(tmp1, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base
Address card_table_address(xthread, G1ThreadLocalData::card_table_base_offset());
__ ld(tmp2, card_table_address); // tmp2 := card table base address
__ add(tmp1, tmp1, tmp2); // tmp1 := card address
if (UseCondCardMark) {
static_assert((uint)G1CardTable::clean_card_val() == 0xff, "must be");
__ lbu(tmp2, Address(tmp1, 0)); // tmp2 := card
__ subi(tmp2, tmp2, G1CardTable::clean_card_val()); // Convert the comparison with clean_card_val() into a
                                                    // comparison against zero to avoid use of an extra temp.
__ bnez(tmp2, done);
}
static_assert((uint)G1CardTable::dirty_card_val() == 0, "must be to use zr");
__ sb(zr, Address(tmp1, 0));
}
void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm,
@ -238,27 +270,8 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm,
Register thread,
Register tmp1,
Register tmp2) {
assert(thread == xthread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, t0);
assert(store_addr != noreg && new_val != noreg && tmp1 != noreg && tmp2 != noreg,
"expecting a register");
Label done;
Label runtime;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, done, true /* new_val_may_be_null */);
// If card is young, jump to done (tmp2 holds the card value)
__ mv(t0, (int)G1CardTable::g1_young_card_val());
__ beq(tmp2, t0, done); // card == young_card_val?
generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, done, runtime);
__ bind(runtime);
// save the live input values
RegSet saved = RegSet::of(store_addr);
__ push_reg(saved, sp);
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp1, thread);
__ pop_reg(saved, sp);
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */);
__ bind(done);
}
@ -318,37 +331,10 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm,
Register thread,
Register tmp1,
Register tmp2,
G1PostBarrierStubC2* stub) {
assert(thread == xthread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, t0);
assert(store_addr != noreg && new_val != noreg && tmp1 != noreg && tmp2 != noreg,
"expecting a register");
stub->initialize_registers(thread, tmp1, tmp2);
bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, *stub->continuation(), new_val_may_be_null);
// If card is not young, jump to stub (slow path) (tmp2 holds the card value)
__ mv(t0, (int)G1CardTable::g1_young_card_val());
__ bne(tmp2, t0, *stub->entry(), true);
__ bind(*stub->continuation());
}
void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const {
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
Label runtime;
Register thread = stub->thread();
Register tmp1 = stub->tmp1(); // tmp1 holds the card address.
Register tmp2 = stub->tmp2();
__ bind(*stub->entry());
generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, *stub->continuation(), runtime);
__ bind(runtime);
generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry));
__ j(*stub->continuation());
bool new_val_may_be_null) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, new_val_may_be_null);
__ bind(done);
}
#endif // COMPILER2
@ -443,20 +429,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier
__ j(*stub->continuation());
}
void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) {
G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1();
__ bind(*stub->entry());
assert(stub->addr()->is_register(), "Precondition");
assert(stub->new_val()->is_register(), "Precondition");
Register new_val_reg = stub->new_val()->as_register();
__ beqz(new_val_reg, *stub->continuation(), /* is_far */ true);
ce->store_parameter(stub->addr()->as_pointer_register(), 0);
__ far_call(RuntimeAddress(bs->post_barrier_c1_runtime_code_blob()->code_begin()));
__ j(*stub->continuation());
}
#undef __
void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */);
masm->bind(done);
}
#define __ sasm->
void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) {
@ -507,74 +492,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler*
__ epilogue();
}
void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) {
__ prologue("g1_post_barrier", false);
// arg0 : store_address
Address store_addr(fp, 2 * BytesPerWord); // 2 BytesPerWord from fp
BarrierSet* bs = BarrierSet::barrier_set();
CardTableBarrierSet* ctbs = barrier_set_cast<CardTableBarrierSet>(bs);
Label done;
Label runtime;
// At this point we know new_value is non-null and the new_value crosses regions.
// Must check to see if card is already dirty
const Register thread = xthread;
Address queue_index(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()));
Address buffer(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset()));
const Register card_offset = t1;
// RA is free here, so we can use it to hold the byte_map_base.
const Register byte_map_base = ra;
assert_different_registers(card_offset, byte_map_base, t0);
__ load_parameter(0, card_offset);
__ srli(card_offset, card_offset, CardTable::card_shift());
__ load_byte_map_base(byte_map_base);
// Convert card offset into an address in card_addr
Register card_addr = card_offset;
__ add(card_addr, byte_map_base, card_addr);
__ lbu(t0, Address(card_addr, 0));
__ sub(t0, t0, (int)G1CardTable::g1_young_card_val());
__ beqz(t0, done);
assert((int)CardTable::dirty_card_val() == 0, "must be 0");
__ membar(MacroAssembler::StoreLoad);
__ lbu(t0, Address(card_addr, 0));
__ beqz(t0, done);
// storing region crossing non-null, card is clean.
// dirty card and log.
__ sb(zr, Address(card_addr, 0));
__ ld(t0, queue_index);
__ beqz(t0, runtime);
__ subi(t0, t0, wordSize);
__ sd(t0, queue_index);
// Reuse RA to hold buffer_addr
const Register buffer_addr = ra;
__ ld(buffer_addr, buffer);
__ add(t0, buffer_addr, t0);
__ sd(card_addr, Address(t0, 0));
__ j(done);
__ bind(runtime);
__ push_call_clobbered_registers();
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), card_addr, thread);
__ pop_call_clobbered_registers();
__ bind(done);
__ epilogue();
}
#undef __
#endif // COMPILER1

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -35,9 +35,7 @@ class LIR_Assembler;
#endif
class StubAssembler;
class G1PreBarrierStub;
class G1PostBarrierStub;
class G1PreBarrierStubC2;
class G1PostBarrierStubC2;
class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
protected:
@ -68,10 +66,16 @@ protected:
public:
#ifdef COMPILER1
void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub);
void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub);
void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm);
void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm);
void g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2);
#endif
#ifdef COMPILER2
@ -90,9 +94,7 @@ public:
Register thread,
Register tmp1,
Register tmp2,
G1PostBarrierStubC2* c2_stub);
void generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const;
bool new_val_may_be_null);
#endif
void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
@ -63,13 +63,13 @@ static void write_barrier_post(MacroAssembler* masm,
Register new_val,
Register tmp1,
Register tmp2) {
if (!G1PostBarrierStubC2::needs_barrier(node)) {
if (!G1BarrierStubC2::needs_post_barrier(node)) {
return;
}
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
G1BarrierSetAssembler* g1_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler());
G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, xthread, tmp1, tmp2, stub);
bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, xthread, tmp1, tmp2, new_val_may_be_null);
}
%}

View File

@ -732,8 +732,7 @@ class StubGenerator: public StubCodeGenerator {
//
// s and d are adjusted to point to the remaining words to copy
//
void generate_copy_longs(StubId stub_id, Label &start,
Register s, Register d, Register count) {
address generate_copy_longs(StubId stub_id, Register s, Register d, Register count) {
BasicType type;
copy_direction direction;
switch (stub_id) {
@ -763,7 +762,7 @@ class StubGenerator: public StubCodeGenerator {
Label again, drain;
StubCodeMark mark(this, stub_id);
__ align(CodeEntryAlignment);
__ bind(start);
address start = __ pc();
if (direction == copy_forwards) {
__ sub(s, s, bias);
@ -879,9 +878,9 @@ class StubGenerator: public StubCodeGenerator {
}
__ ret();
}
Label copy_f, copy_b;
return start;
}
typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp);
@ -1099,8 +1098,8 @@ class StubGenerator: public StubCodeGenerator {
// stub_id - is used to name the stub and identify all details of
// how to perform the copy.
//
// entry - is assigned to the stub's post push entry point unless
// it is null
// nopush_entry - is assigned to the stub's post push entry point
// unless it is null
//
// Inputs:
// c_rarg0 - source array address
@ -1111,11 +1110,11 @@ class StubGenerator: public StubCodeGenerator {
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects: entry is set to the (post push) entry point so it
// can be used by the corresponding conjoint copy
// method
// Side Effects: nopush_entry is set to the (post push) entry point
// so it can be used by the corresponding conjoint
// copy method
//
address generate_disjoint_copy(StubId stub_id, address* entry) {
address generate_disjoint_copy(StubId stub_id, address* nopush_entry) {
size_t size;
bool aligned;
bool is_oop;
@ -1204,8 +1203,8 @@ class StubGenerator: public StubCodeGenerator {
address start = __ pc();
__ enter();
if (entry != nullptr) {
*entry = __ pc();
if (nopush_entry != nullptr) {
*nopush_entry = __ pc();
// caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
BLOCK_COMMENT("Entry:");
}
@ -1256,8 +1255,8 @@ class StubGenerator: public StubCodeGenerator {
// corresponding disjoint copy routine which can be
// jumped to if the ranges do not actually overlap
//
// entry - is assigned to the stub's post push entry point unless
// it is null
// nopush_entry - is assigned to the stub's post push entry point
// unless it is null
//
// Inputs:
// c_rarg0 - source array address
@ -1269,10 +1268,10 @@ class StubGenerator: public StubCodeGenerator {
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects:
// entry is set to the no-overlap entry point so it can be used by
// some other conjoint copy method
// nopush_entry is set to the no-overlap entry point so it can be
// used by some other conjoint copy method
//
address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *entry) {
address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
RegSet saved_regs = RegSet::of(s, d, count);
int size;
@ -1359,8 +1358,8 @@ class StubGenerator: public StubCodeGenerator {
address start = __ pc();
__ enter();
if (entry != nullptr) {
*entry = __ pc();
if (nopush_entry != nullptr) {
*nopush_entry = __ pc();
// caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
BLOCK_COMMENT("Entry:");
}
@ -1370,7 +1369,7 @@ class StubGenerator: public StubCodeGenerator {
__ slli(t1, count, exact_log2(size));
Label L_continue;
__ bltu(t0, t1, L_continue);
__ j(nooverlap_target);
__ j(RuntimeAddress(nooverlap_target));
__ bind(L_continue);
DecoratorSet decorators = IN_HEAP | IS_ARRAY;
@ -1445,7 +1444,7 @@ class StubGenerator: public StubCodeGenerator {
// x10 == 0 - success
// x10 == -1^K - failure, where K is partial transfer count
//
address generate_checkcast_copy(StubId stub_id, address* entry) {
address generate_checkcast_copy(StubId stub_id, address* nopush_entry) {
bool dest_uninitialized;
switch (stub_id) {
case StubId::stubgen_checkcast_arraycopy_id:
@ -1496,8 +1495,8 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// Caller of this entry point must set up the argument registers.
if (entry != nullptr) {
*entry = __ pc();
if (nopush_entry != nullptr) {
*nopush_entry = __ pc();
BLOCK_COMMENT("Entry:");
}
@ -2294,13 +2293,21 @@ class StubGenerator: public StubCodeGenerator {
}
void generate_arraycopy_stubs() {
address entry = nullptr;
address entry_jbyte_arraycopy = nullptr;
address entry_jshort_arraycopy = nullptr;
address entry_jint_arraycopy = nullptr;
address entry_oop_arraycopy = nullptr;
address entry_jlong_arraycopy = nullptr;
address entry_checkcast_arraycopy = nullptr;
// Some copy stubs publish a normal entry and then a 2nd 'fallback'
// entry immediately following their stack push. This can be used
// as a post-push branch target for compatible stubs when they
// identify a special case that can be handled by the fallback
// stub, e.g. a disjoint copy stub may be used as a special case
// fallback for its compatible conjoint copy stub.
//
// A no push entry is always returned in the following local and
// then published by assigning to the appropriate entry field in
// class StubRoutines. The entry value is then passed to the
// generator for the compatible stub. That means the entry must be
// listed when saving to/restoring from the AOT cache, ensuring
// that the inter-stub jumps are noted at AOT-cache save and
// relocated at AOT cache load.
address nopush_entry = nullptr;
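// The publish/consume pattern used repeatedly below, in miniature (illustrative
// only; the _X_ placeholders stand for a concrete element type such as jbyte):
//
//   StubRoutines::_X_disjoint_arraycopy        = generate_disjoint_copy(id, &nopush_entry);
//   StubRoutines::_X_disjoint_arraycopy_nopush = nopush_entry;   // publish for reuse
//   StubRoutines::_X_arraycopy                 = generate_conjoint_copy(id, StubRoutines::_X_disjoint_arraycopy_nopush, &nopush_entry);
//   StubRoutines::_X_arraycopy_nopush          = nopush_entry;   // consumed later by generic/unsafe copy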
// generate the common exit first so later stubs can rely on it if
// they want an UnsafeMemoryAccess exit non-local to the stub
@ -2308,72 +2315,117 @@ class StubGenerator: public StubCodeGenerator {
// register the stub as the default exit with class UnsafeMemoryAccess
UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
generate_copy_longs(StubId::stubgen_copy_byte_f_id, copy_f, c_rarg0, c_rarg1, t1);
generate_copy_longs(StubId::stubgen_copy_byte_b_id, copy_b, c_rarg0, c_rarg1, t1);
// generate and publish riscv-specific bulk copy routines first
// so we can call them from other copy stubs
StubRoutines::riscv::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, c_rarg0, c_rarg1, t1);
StubRoutines::riscv::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, c_rarg0, c_rarg1, t1);
StubRoutines::riscv::_zero_blocks = generate_zero_blocks();
//*** jbyte
// Always need aligned and unaligned versions
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &entry);
StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, entry, nullptr);
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
//*** jshort
// Always need aligned and unaligned versions
StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &entry);
StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, entry, nullptr);
StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is used by generic/unsafe copy
StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
//*** jint
// Aligned versions
StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
// In 64 bit we need both aligned and unaligned versions of jint arraycopy.
// entry_jint_arraycopy always points to the unaligned version
StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &entry);
StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, entry, &entry_jint_arraycopy);
StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jint_arraycopy_nopush = nopush_entry;
//*** jlong
// It is always aligned
StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
// disjoint normal/nopush and conjoint normal entries are not
// generated since the arrayof versions are the same
StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
//*** oops
StubRoutines::_arrayof_oop_disjoint_arraycopy
= generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &entry);
= generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_oop_arraycopy
= generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
= generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint arrayof nopush entry is needed by generic/unsafe copy
StubRoutines::_oop_arraycopy_nopush = nopush_entry;
// Aligned versions without pre-barriers
StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
= generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
StubRoutines::_arrayof_oop_arraycopy_uninit
= generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, entry, nullptr);
= generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
// disjoint arrayof+uninit nopush entry is needed by conjoint copy
StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
// note that we don't need a returned nopush entry because the
// generic/unsafe copy does not cater for uninit arrays.
StubRoutines::_arrayof_oop_arraycopy_uninit
= generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
// for oop copies reuse arrayof entries for non-arrayof cases
StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &entry_checkcast_arraycopy);
StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
// checkcast nopush entry is needed by generic copy
StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
// note that we don't need a returned nopush entry because the
// generic copy does not cater for uninit arrays.
StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy,
entry_jshort_arraycopy,
entry_jint_arraycopy,
entry_jlong_arraycopy);
// unsafe arraycopy may fall back on conjoint stubs
StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
StubRoutines::_jshort_arraycopy_nopush,
StubRoutines::_jint_arraycopy_nopush,
StubRoutines::_jlong_arraycopy_nopush);
StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy,
entry_jshort_arraycopy,
entry_jint_arraycopy,
entry_oop_arraycopy,
entry_jlong_arraycopy,
entry_checkcast_arraycopy);
// generic arraycopy may fall back on conjoint stubs
StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
StubRoutines::_jshort_arraycopy_nopush,
StubRoutines::_jint_arraycopy_nopush,
StubRoutines::_oop_arraycopy_nopush,
StubRoutines::_jlong_arraycopy_nopush,
StubRoutines::_checkcast_arraycopy_nopush);
StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);

View File

@ -26,7 +26,7 @@
#ifndef CPU_S390_ASSEMBLER_S390_INLINE_HPP
#define CPU_S390_ASSEMBLER_S390_INLINE_HPP
#include "asm/assembler.inline.hpp"
#include "asm/assembler.hpp"
#include "asm/codeBuffer.hpp"
#include "code/codeCache.hpp"

View File

@ -28,7 +28,6 @@
#include "gc/g1/g1BarrierSetAssembler.hpp"
#include "gc/g1/g1BarrierSetRuntime.hpp"
#include "gc/g1/g1CardTable.hpp"
#include "gc/g1/g1DirtyCardQueue.hpp"
#include "gc/g1/g1HeapRegion.hpp"
#include "gc/g1/g1SATBMarkQueueSet.hpp"
#include "gc/g1/g1ThreadLocalData.hpp"
@ -205,104 +204,71 @@ void G1BarrierSetAssembler::generate_c2_pre_barrier_stub(MacroAssembler* masm,
BLOCK_COMMENT("} generate_c2_pre_barrier_stub");
}
static void generate_post_barrier_fast_path(MacroAssembler* masm,
const Register store_addr,
const Register new_val,
const Register thread,
const Register tmp1,
const Register tmp2,
Label& done,
bool new_val_may_be_null) {
__ block_comment("generate_post_barrier_fast_path {");
assert(thread == Z_thread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg);
// Does store cross heap regions?
if (VM_Version::has_DistinctOpnds()) {
__ z_xgrk(tmp1, store_addr, new_val); // tmp1 := store address ^ new value
} else {
__ z_lgr(tmp1, store_addr);
__ z_xgr(tmp1, new_val);
}
__ z_srag(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes)
__ branch_optimized(Assembler::bcondEqual, done);
// Crosses regions, storing null?
if (new_val_may_be_null) {
__ z_ltgr(new_val, new_val);
__ z_bre(done);
} else {
#ifdef ASSERT
__ z_ltgr(new_val, new_val);
__ asm_assert(Assembler::bcondNotZero, "null oop not allowed (G1 post)", 0x322); // Checked by caller.
#endif
}
__ z_srag(tmp1, store_addr, CardTable::card_shift());
Address card_table_addr(thread, in_bytes(G1ThreadLocalData::card_table_base_offset()));
__ z_alg(tmp1, card_table_addr); // tmp1 := card address
if (UseCondCardMark) {
__ z_cli(0, tmp1, G1CardTable::clean_card_val());
__ branch_optimized(Assembler::bcondNotEqual, done);
}
static_assert(G1CardTable::dirty_card_val() == 0, "must be to use z_mvi");
__ z_mvi(0, tmp1, G1CardTable::dirty_card_val()); // *(card address) := dirty_card_val
__ block_comment("} generate_post_barrier_fast_path");
}
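// For orientation, a C-level model of the filtering emitted by the fast path above
// (an illustrative sketch only; the function name, parameters and the clean card
// value passed in are assumptions, while dirty == 0 is asserted by the code above):
#include <cstdint>

static void g1_post_barrier_model(uintptr_t store_addr, uintptr_t new_val,
                                  uint8_t* card_table_base, unsigned card_shift,
                                  unsigned region_shift, uint8_t clean_card_val,
                                  bool use_cond_card_mark, bool new_val_may_be_null) {
  const uint8_t dirty_card_val = 0;                           // matches the static_assert above
  if (((store_addr ^ new_val) >> region_shift) == 0) return;  // store stays within one region
  if (new_val_may_be_null && new_val == 0) return;            // storing null needs no card mark
  uint8_t* card = card_table_base + (store_addr >> card_shift);
  if (use_cond_card_mark && *card != clean_card_val) return;  // card already marked
  *card = dirty_card_val;                                     // dirty the card
}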
void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2,
G1PostBarrierStubC2* stub) {
bool new_val_may_be_null) {
BLOCK_COMMENT("g1_write_barrier_post_c2 {");
assert(thread == Z_thread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, Z_R1_scratch);
assert(store_addr != noreg && new_val != noreg && tmp1 != noreg && tmp2 != noreg, "expecting a register");
stub->initialize_registers(thread, tmp1, tmp2);
BLOCK_COMMENT("generate_region_crossing_test {");
if (VM_Version::has_DistinctOpnds()) {
__ z_xgrk(tmp1, store_addr, new_val);
} else {
__ z_lgr(tmp1, store_addr);
__ z_xgr(tmp1, new_val);
}
__ z_srag(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes);
__ branch_optimized(Assembler::bcondEqual, *stub->continuation());
BLOCK_COMMENT("} generate_region_crossing_test");
// crosses regions, storing null?
if ((stub->barrier_data() & G1C2BarrierPostNotNull) == 0) {
__ z_ltgr(new_val, new_val);
__ branch_optimized(Assembler::bcondEqual, *stub->continuation());
}
BLOCK_COMMENT("generate_card_young_test {");
CardTableBarrierSet* ct = barrier_set_cast<CardTableBarrierSet>(BarrierSet::barrier_set());
// calculate address of card
__ load_const_optimized(tmp2, (address)ct->card_table()->byte_map_base()); // Card table base.
__ z_srlg(tmp1, store_addr, CardTable::card_shift()); // Index into card table.
__ z_algr(tmp1, tmp2); // Explicit calculation needed for cli.
// Filter young.
__ z_cli(0, tmp1, G1CardTable::g1_young_card_val());
BLOCK_COMMENT("} generate_card_young_test");
// From here on, tmp1 holds the card address.
__ branch_optimized(Assembler::bcondNotEqual, *stub->entry());
__ bind(*stub->continuation());
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, new_val_may_be_null);
__ bind(done);
BLOCK_COMMENT("} g1_write_barrier_post_c2");
}
void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const {
BLOCK_COMMENT("generate_c2_post_barrier_stub {");
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
Label runtime;
Register thread = stub->thread();
Register tmp1 = stub->tmp1(); // tmp1 holds the card address.
Register tmp2 = stub->tmp2();
Register Rcard_addr = tmp1;
__ bind(*stub->entry());
BLOCK_COMMENT("generate_card_clean_test {");
__ z_sync(); // Required to support concurrent cleaning.
__ z_cli(0, Rcard_addr, 0); // Reload after membar.
__ branch_optimized(Assembler::bcondEqual, *stub->continuation());
BLOCK_COMMENT("} generate_card_clean_test");
BLOCK_COMMENT("generate_dirty_card {");
// Storing a region crossing, non-null oop, card is clean.
// Dirty card and log.
STATIC_ASSERT(CardTable::dirty_card_val() == 0);
__ z_mvi(0, Rcard_addr, CardTable::dirty_card_val());
BLOCK_COMMENT("} generate_dirty_card");
generate_queue_test_and_insertion(masm,
G1ThreadLocalData::dirty_card_queue_index_offset(),
G1ThreadLocalData::dirty_card_queue_buffer_offset(),
runtime,
Z_thread, tmp1, tmp2);
__ branch_optimized(Assembler::bcondAlways, *stub->continuation());
__ bind(runtime);
generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry));
__ branch_optimized(Assembler::bcondAlways, *stub->continuation());
BLOCK_COMMENT("} generate_c2_post_barrier_stub");
}
#endif //COMPILER2
void G1BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
@ -451,99 +417,9 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, Decorato
Register Rtmp1, Register Rtmp2, Register Rtmp3) {
bool not_null = (decorators & IS_NOT_NULL) != 0;
assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2); // Most probably, Rnew_val == Rtmp3.
Label callRuntime, filtered;
CardTableBarrierSet* ct = barrier_set_cast<CardTableBarrierSet>(BarrierSet::barrier_set());
BLOCK_COMMENT("g1_write_barrier_post {");
// Does store cross heap regions?
// It does if the two addresses specify different grain addresses.
if (VM_Version::has_DistinctOpnds()) {
__ z_xgrk(Rtmp1, Rstore_addr, Rnew_val);
} else {
__ z_lgr(Rtmp1, Rstore_addr);
__ z_xgr(Rtmp1, Rnew_val);
}
__ z_srag(Rtmp1, Rtmp1, G1HeapRegion::LogOfHRGrainBytes);
__ z_bre(filtered);
// Crosses regions, storing null?
if (not_null) {
#ifdef ASSERT
__ z_ltgr(Rnew_val, Rnew_val);
__ asm_assert(Assembler::bcondNotZero, "null oop not allowed (G1 post)", 0x322); // Checked by caller.
#endif
} else {
__ z_ltgr(Rnew_val, Rnew_val);
__ z_bre(filtered);
}
Rnew_val = noreg; // end of lifetime
// Storing region crossing non-null, is card already dirty?
assert_different_registers(Rtmp1, Rtmp2, Rtmp3);
// Make sure not to use Z_R0 for any of these registers.
Register Rcard_addr = (Rtmp1 != Z_R0_scratch) ? Rtmp1 : Rtmp3;
Register Rbase = (Rtmp2 != Z_R0_scratch) ? Rtmp2 : Rtmp3;
// calculate address of card
__ load_const_optimized(Rbase, (address)ct->card_table()->byte_map_base()); // Card table base.
__ z_srlg(Rcard_addr, Rstore_addr, CardTable::card_shift()); // Index into card table.
__ z_algr(Rcard_addr, Rbase); // Explicit calculation needed for cli.
Rbase = noreg; // end of lifetime
// Filter young.
__ z_cli(0, Rcard_addr, G1CardTable::g1_young_card_val());
__ z_bre(filtered);
// Check the card value. If dirty, we're done.
// This also avoids false sharing of the (already dirty) card.
__ z_sync(); // Required to support concurrent cleaning.
__ z_cli(0, Rcard_addr, G1CardTable::dirty_card_val()); // Reload after membar.
__ z_bre(filtered);
// Storing a region crossing, non-null oop, card is clean.
// Dirty card and log.
__ z_mvi(0, Rcard_addr, G1CardTable::dirty_card_val());
Register Rcard_addr_x = Rcard_addr;
Register Rqueue_index = (Rtmp2 != Z_R0_scratch) ? Rtmp2 : Rtmp1;
if (Rcard_addr == Rqueue_index) {
Rcard_addr_x = Z_R0_scratch; // Register shortage. We have to use Z_R0.
}
__ lgr_if_needed(Rcard_addr_x, Rcard_addr);
generate_queue_test_and_insertion(masm,
G1ThreadLocalData::dirty_card_queue_index_offset(),
G1ThreadLocalData::dirty_card_queue_buffer_offset(),
callRuntime,
Z_thread, Rcard_addr_x, Rqueue_index);
__ z_bru(filtered);
__ bind(callRuntime);
// TODO: do we need a frame? Introduced to be on the safe side.
bool needs_frame = true;
__ lgr_if_needed(Rcard_addr, Rcard_addr_x); // copy back asap. push_frame will destroy Z_R0_scratch!
// VM call need frame to access(write) O register.
if (needs_frame) {
__ save_return_pc();
__ push_frame_abi160(0); // Will use Z_R0 as tmp on old CPUs.
}
// Save the live input values.
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), Rcard_addr, Z_thread);
if (needs_frame) {
__ pop_frame();
__ restore_return_pc();
}
__ bind(filtered);
Label done;
generate_post_barrier_fast_path(masm, Rstore_addr, Rnew_val, Z_thread, Rtmp1, Rtmp2, done, !not_null);
__ bind(done);
BLOCK_COMMENT("} g1_write_barrier_post");
}
@ -615,22 +491,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier
__ branch_optimized(Assembler::bcondAlways, *stub->continuation());
}
void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) {
G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1();
__ bind(*stub->entry());
ce->check_reserved_argument_area(16); // RT stub needs 2 spill slots.
assert(stub->addr()->is_register(), "Precondition.");
assert(stub->new_val()->is_register(), "Precondition.");
Register new_val_reg = stub->new_val()->as_register();
__ z_ltgr(new_val_reg, new_val_reg);
__ branch_optimized(Assembler::bcondZero, *stub->continuation());
__ z_lgr(Z_R1_scratch, stub->addr()->as_pointer_register());
ce->emit_call_c(bs->post_barrier_c1_runtime_code_blob()->code_begin());
__ branch_optimized(Assembler::bcondAlways, *stub->continuation());
}
#undef __
void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */);
masm->bind(done);
}
#define __ sasm->
static OopMap* save_volatile_registers(StubAssembler* sasm, Register return_pc = Z_R14) {
@ -705,92 +578,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler*
__ z_bru(restart);
}
void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) {
// Z_R1_scratch: oop address, address of updated memory slot
BarrierSet* bs = BarrierSet::barrier_set();
__ set_info("g1_post_barrier_slow_id", false);
Register addr_oop = Z_R1_scratch;
Register addr_card = Z_R1_scratch;
Register r1 = Z_R6; // Must be saved/restored.
Register r2 = Z_R7; // Must be saved/restored.
Register cardtable = r1; // Must be non-volatile, because it is used to save addr_card.
CardTableBarrierSet* ctbs = barrier_set_cast<CardTableBarrierSet>(bs);
CardTable* ct = ctbs->card_table();
CardTable::CardValue* byte_map_base = ct->byte_map_base();
// Save registers used below (see assertion in G1PreBarrierStub::emit_code()).
__ z_stg(r1, 0*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP);
Label not_already_dirty, restart, refill, young_card;
// Calculate address of card corresponding to the updated oop slot.
AddressLiteral rs(byte_map_base);
__ z_srlg(addr_card, addr_oop, CardTable::card_shift());
addr_oop = noreg; // dead now
__ load_const_optimized(cardtable, rs); // cardtable := <card table base>
__ z_agr(addr_card, cardtable); // addr_card := addr_oop>>card_shift + cardtable
__ z_cli(0, addr_card, (int)G1CardTable::g1_young_card_val());
__ z_bre(young_card);
__ z_sync(); // Required to support concurrent cleaning.
__ z_cli(0, addr_card, (int)CardTable::dirty_card_val());
__ z_brne(not_already_dirty);
__ bind(young_card);
// We didn't take the branch, so we're already dirty: restore
// used registers and return.
__ z_lg(r1, 0*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP);
__ z_br(Z_R14);
// Not dirty.
__ bind(not_already_dirty);
// First, dirty it: [addr_card] := 0
__ z_mvi(0, addr_card, CardTable::dirty_card_val());
Register idx = cardtable; // Must be non-volatile, because it is used to save addr_card.
Register buf = r2;
cardtable = noreg; // now dead
// Save registers used below (see assertion in G1PreBarrierStub::emit_code()).
__ z_stg(r2, 1*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP);
ByteSize dirty_card_q_index_byte_offset = G1ThreadLocalData::dirty_card_queue_index_offset();
ByteSize dirty_card_q_buf_byte_offset = G1ThreadLocalData::dirty_card_queue_buffer_offset();
__ bind(restart);
// Get the index into the update buffer. G1DirtyCardQueue::_index is
// a size_t so z_ltg is appropriate here.
__ z_ltg(idx, Address(Z_thread, dirty_card_q_index_byte_offset));
// index == 0?
__ z_brz(refill);
__ z_lg(buf, Address(Z_thread, dirty_card_q_buf_byte_offset));
__ add2reg(idx, -oopSize);
__ z_stg(addr_card, 0, idx, buf); // [_buf + index] := <address_of_card>
__ z_stg(idx, Address(Z_thread, dirty_card_q_index_byte_offset));
// Restore killed registers and return.
__ z_lg(r1, 0*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP);
__ z_lg(r2, 1*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP);
__ z_br(Z_R14);
__ bind(refill);
save_volatile_registers(sasm);
__ z_lgr(idx, addr_card); // Save addr_card, tmp3 must be non-volatile.
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1DirtyCardQueueSet::handle_zero_index_for_thread),
Z_thread);
__ z_lgr(addr_card, idx);
restore_volatile_registers(sasm); // Restore addr_card.
__ z_bru(restart);
}
#undef __
#endif // COMPILER1

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2024 SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -33,9 +33,7 @@
class LIR_Assembler;
class StubAssembler;
class G1PreBarrierStub;
class G1PostBarrierStub;
class G1PreBarrierStubC2;
class G1PostBarrierStubC2;
class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
protected:
@ -60,10 +58,16 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
public:
#ifdef COMPILER1
void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub);
void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub);
void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm);
void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm);
void g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2);
#endif // COMPILER1
#ifdef COMPILER2
@ -81,9 +85,7 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
Register thread,
Register tmp1,
Register tmp2,
G1PostBarrierStubC2* c2_stub);
void generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const;
bool new_val_may_be_null);
#endif // COMPILER2
virtual void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
// Copyright 2024 IBM Corporation. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
@ -62,13 +62,13 @@ static void write_barrier_post(MacroAssembler* masm,
Register new_val,
Register tmp1,
Register tmp2) {
if (!G1PostBarrierStubC2::needs_barrier(node)) {
if (!G1BarrierStubC2::needs_post_barrier(node)) {
return;
}
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
G1BarrierSetAssembler* g1_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler());
G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, Z_thread, tmp1, tmp2, stub);
bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, Z_thread, tmp1, tmp2, new_val_may_be_null);
}
%} // source

View File

@ -171,6 +171,7 @@ void BarrierSetAssembler::try_resolve_jobject_in_native(MacroAssembler* masm, Re
void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm) {
BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
__ align(4, __ offset() + OFFSET_TO_PATCHABLE_DATA); // must align the following block which requires atomic updates
__ block_comment("nmethod_entry_barrier (nmethod_entry_barrier) {");
// Load jump addr:

View File

@ -66,6 +66,14 @@ public:
OptoReg::Name refine_register(const Node* node,
OptoReg::Name opto_reg) const;
#endif // COMPILER2
static const int OFFSET_TO_PATCHABLE_DATA_INSTRUCTION = 6 + 6 + 6; // iihf(6) + iilf(6) + lg(6)
static const int BARRIER_TOTAL_LENGTH = OFFSET_TO_PATCHABLE_DATA_INSTRUCTION + 6 + 6 + 2; // cfi(6) + larl(6) + bcr(2)
// The first 2 bytes are the cfi instruction opcode and the next 4 bytes are the value/data
// to be patched, so we skip the first 2 bytes and return the address of the value/data field.
static const int OFFSET_TO_PATCHABLE_DATA = 6 + 6 + 6 + 2; // iihf(6) + iilf(6) + lg(6) + CFI_OPCODE(2)
};
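// Worked byte layout implied by the constants above (offsets in bytes; illustrative):
//    0  iihf (6)  \  load_const (two instructions)
//    6  iilf (6)  /
//   12  lg   (6)
//   18  cfi  (6)   2-byte opcode, then the 4-byte patchable data at offset 20
//   24  larl (6)
//   30  bcr  (2)
//   32  = BARRIER_TOTAL_LENGTH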
#ifdef COMPILER2

View File

@ -26,26 +26,32 @@
#include "code/codeBlob.hpp"
#include "code/nativeInst.hpp"
#include "code/nmethod.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/barrierSetNMethod.hpp"
#include "utilities/debug.hpp"
class NativeMethodBarrier: public NativeInstruction {
private:
static const int PATCHABLE_INSTRUCTION_OFFSET = 3*6; // bytes
address get_barrier_start_address() const {
return NativeInstruction::addr_at(0);
}
address get_patchable_data_address() const {
address inst_addr = get_barrier_start_address() + PATCHABLE_INSTRUCTION_OFFSET;
address start_address = get_barrier_start_address();
#ifdef ASSERT
address inst_addr = start_address + BarrierSetAssembler::OFFSET_TO_PATCHABLE_DATA_INSTRUCTION;
DEBUG_ONLY(Assembler::is_z_cfi(*((long*)inst_addr)));
return inst_addr + 2;
unsigned long instr = 0;
Assembler::get_instruction(inst_addr, &instr);
assert(Assembler::is_z_cfi(instr), "sanity check");
#endif // ASSERT
return start_address + BarrierSetAssembler::OFFSET_TO_PATCHABLE_DATA;
}
public:
static const int BARRIER_TOTAL_LENGTH = PATCHABLE_INSTRUCTION_OFFSET + 2*6 + 2; // bytes
static const int BARRIER_TOTAL_LENGTH = BarrierSetAssembler::BARRIER_TOTAL_LENGTH;
int get_guard_value() const {
address data_addr = get_patchable_data_address();
@ -77,23 +83,30 @@ class NativeMethodBarrier: public NativeInstruction {
#ifdef ASSERT
void verify() const {
unsigned long instr = 0;
int offset = 0; // bytes
const address start = get_barrier_start_address();
MacroAssembler::is_load_const(/* address */ start + offset); // two instructions
assert(MacroAssembler::is_load_const(/* address */ start + offset), "sanity check"); // two instructions
offset += Assembler::instr_len(&start[offset]);
offset += Assembler::instr_len(&start[offset]);
Assembler::is_z_lg(*((long*)(start + offset)));
Assembler::get_instruction(start + offset, &instr);
assert(Assembler::is_z_lg(instr), "sanity check");
offset += Assembler::instr_len(&start[offset]);
Assembler::is_z_cfi(*((long*)(start + offset)));
// get_instruction simply assigns into instr, so whatever value is already present
// there does not matter; hence there is no need to zero it out first.
Assembler::get_instruction(start + offset, &instr);
assert(Assembler::is_z_cfi(instr), "sanity check");
offset += Assembler::instr_len(&start[offset]);
Assembler::is_z_larl(*((long*)(start + offset)));
Assembler::get_instruction(start + offset, &instr);
assert(Assembler::is_z_larl(instr), "sanity check");
offset += Assembler::instr_len(&start[offset]);
Assembler::is_z_bcr(*((long*)(start + offset)));
Assembler::get_instruction(start + offset, &instr);
assert(Assembler::is_z_bcr(instr), "sanity check");
offset += Assembler::instr_len(&start[offset]);
assert(offset == BARRIER_TOTAL_LENGTH, "check offset == barrier length constant");

View File

@ -1398,11 +1398,7 @@ void Assembler::addl(Address dst, Register src) {
void Assembler::eaddl(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_32bit);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_int8(0x01);
emit_operand(src2, src1, 0);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0x01, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void Assembler::addl(Register dst, int32_t imm32) {
@ -1432,11 +1428,7 @@ void Assembler::addl(Register dst, Register src) {
}
void Assembler::eaddl(Register dst, Register src1, Register src2, bool no_flags) {
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void)emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_arith(0x03, 0xC0, src1, src2);
emit_eevex_prefix_or_demote_arith_ndd(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0x03, 0xC0, no_flags, true /* is_commutative */);
}
void Assembler::addr_nop_4() {
@ -1657,17 +1649,18 @@ void Assembler::eandl(Register dst, Register src1, Address src2, bool no_flags)
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0x23, no_flags);
}
void Assembler::eandl(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0x21, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void Assembler::andl(Register dst, Register src) {
(void) prefix_and_encode(dst->encoding(), src->encoding());
emit_arith(0x23, 0xC0, dst, src);
}
void Assembler::eandl(Register dst, Register src1, Register src2, bool no_flags) {
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void) emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_arith(0x23, 0xC0, src1, src2);
emit_eevex_prefix_or_demote_arith_ndd(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0x23, 0xC0, no_flags, true /* is_commutative */);
}
void Assembler::andnl(Register dst, Register src1, Register src2) {
@ -2519,7 +2512,7 @@ void Assembler::imull(Register dst, Register src) {
}
void Assembler::eimull(Register dst, Register src1, Register src2, bool no_flags) {
emit_eevex_or_demote(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0xAF, no_flags, true /* is_map1 */, true /* swap */);
emit_eevex_or_demote(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0xAF, no_flags, true /* is_map1 */, true /* swap */, true /* is_commutative */);
}
void Assembler::imull(Register dst, Address src, int32_t value) {
@ -4419,11 +4412,7 @@ void Assembler::enotl(Register dst, Register src) {
}
void Assembler::eorw(Register dst, Register src1, Register src2, bool no_flags) {
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void) emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_arith(0x0B, 0xC0, src1, src2);
emit_eevex_prefix_or_demote_arith_ndd(dst, src1, src2, VEX_SIMD_66, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_16bit, 0x0B, 0xC0, no_flags, true /* is_commutative */);
}
void Assembler::orl(Address dst, int32_t imm32) {
@ -4467,11 +4456,7 @@ void Assembler::orl(Register dst, Register src) {
}
void Assembler::eorl(Register dst, Register src1, Register src2, bool no_flags) {
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void) emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_arith(0x0B, 0xC0, src1, src2);
emit_eevex_prefix_or_demote_arith_ndd(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0x0B, 0xC0, no_flags, true /* is_commutative */);
}
void Assembler::orl(Address dst, Register src) {
@ -4483,11 +4468,7 @@ void Assembler::orl(Address dst, Register src) {
void Assembler::eorl(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_32bit);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_int8(0x09);
emit_operand(src2, src1, 0);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0x09, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void Assembler::orb(Address dst, int imm8) {
@ -4517,11 +4498,7 @@ void Assembler::orb(Address dst, Register src) {
void Assembler::eorb(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_8bit);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_int8(0x08);
emit_operand(src2, src1, 0);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_8bit, 0x08, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void Assembler::packsswb(XMMRegister dst, XMMRegister src) {
@ -7323,11 +7300,7 @@ void Assembler::xorl(Register dst, Register src) {
}
void Assembler::exorl(Register dst, Register src1, Register src2, bool no_flags) {
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void) emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_arith(0x33, 0xC0, src1, src2);
emit_eevex_prefix_or_demote_arith_ndd(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0x33, 0xC0, no_flags, true /* is_commutative */);
}
void Assembler::xorl(Address dst, Register src) {
@ -7339,11 +7312,7 @@ void Assembler::xorl(Address dst, Register src) {
void Assembler::exorl(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_32bit);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_int8(0x31);
emit_operand(src2, src1, 0);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0x31, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void Assembler::xorb(Register dst, Address src) {
@ -7367,11 +7336,7 @@ void Assembler::xorb(Address dst, Register src) {
void Assembler::exorb(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_8bit);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_int8(0x30);
emit_operand(src2, src1, 0);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_8bit, 0x30, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void Assembler::xorw(Register dst, Address src) {
@ -12955,6 +12920,31 @@ void Assembler::eevex_prefix_ndd(Address adr, int ndd_enc, int xreg_enc, VexSimd
vex_prefix(adr, ndd_enc, xreg_enc, pre, opc, attributes, /* nds_is_ndd */ true, no_flags);
}
void Assembler::emit_eevex_or_demote(Register dst, Address src1, Register src2, VexSimdPrefix pre, VexOpcode opc,
int size, int opcode_byte, bool no_flags, bool is_map1, bool is_commutative) {
if (is_commutative && is_demotable(no_flags, dst->encoding(), src2->encoding())) {
// Opcode byte adjustment due to mismatch between NDD and equivalent demotable variant
opcode_byte += 2;
if (size == EVEX_64bit) {
emit_prefix_and_int8(get_prefixq(src1, dst, is_map1), opcode_byte);
} else {
// For 32-bit, 16-bit and 8-bit
if (size == EVEX_16bit) {
emit_int8(0x66);
}
prefix(src1, dst, false, is_map1);
emit_int8(opcode_byte);
}
} else {
bool vex_w = (size == EVEX_64bit) ? true : false;
InstructionAttr attributes(AVX_128bit, vex_w, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, size);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), pre, opc, &attributes, no_flags);
emit_int8(opcode_byte);
}
emit_operand(src2, src1, 0);
}
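// The "+2" adjustment above relies on a fixed property of the x86 ALU opcodes used
// with this path: the legacy "reg <- reg op mem" opcode is always the corresponding
// "mem <- mem op reg" opcode plus 2. A standalone check (illustrative, not part of
// the assembler):
//
//   static_assert(0x01 + 2 == 0x03, "add: eaddl mem form 0x01 -> 0x03");
//   static_assert(0x09 + 2 == 0x0B, "or:  eorl  mem form 0x09 -> 0x0B");
//   static_assert(0x21 + 2 == 0x23, "and: eandl mem form 0x21 -> 0x23");
//   static_assert(0x31 + 2 == 0x33, "xor: exorl mem form 0x31 -> 0x33");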
void Assembler::emit_eevex_or_demote(Register dst, Register src1, Address src2, VexSimdPrefix pre, VexOpcode opc,
int size, int opcode_byte, bool no_flags, bool is_map1) {
if (is_demotable(no_flags, dst->encoding(), src1->encoding())) {
@ -13055,18 +13045,20 @@ void Assembler::emit_eevex_or_demote(int dst_enc, int nds_enc, int src_enc, int8
}
void Assembler::emit_eevex_or_demote(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc,
int size, int opcode_byte, bool no_flags, bool is_map1, bool swap) {
int size, int opcode_byte, bool no_flags, bool is_map1, bool swap, bool is_commutative) {
int encode;
bool is_prefixq = (size == EVEX_64bit) ? true : false;
if (is_demotable(no_flags, dst_enc, nds_enc)) {
bool first_operand_demotable = is_demotable(no_flags, dst_enc, nds_enc);
bool second_operand_demotable = is_commutative && is_demotable(no_flags, dst_enc, src_enc);
if (first_operand_demotable || second_operand_demotable) {
if (size == EVEX_16bit) {
emit_int8(0x66);
}
int src = first_operand_demotable ? src_enc : nds_enc;
if (swap) {
encode = is_prefixq ? prefixq_and_encode(dst_enc, src_enc, is_map1) : prefix_and_encode(dst_enc, src_enc, is_map1);
encode = is_prefixq ? prefixq_and_encode(dst_enc, src, is_map1) : prefix_and_encode(dst_enc, src, is_map1);
} else {
encode = is_prefixq ? prefixq_and_encode(src_enc, dst_enc, is_map1) : prefix_and_encode(src_enc, dst_enc, is_map1);
encode = is_prefixq ? prefixq_and_encode(src, dst_enc, is_map1) : prefix_and_encode(src, dst_enc, is_map1);
}
emit_opcode_prefix_and_encoding((unsigned char)opcode_byte, 0xC0, encode);
} else {
@ -13114,6 +13106,26 @@ int Assembler::eevex_prefix_and_encode_nf(int dst_enc, int nds_enc, int src_enc,
return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, attributes, /* src_is_gpr */ true, /* nds_is_ndd */ false, no_flags);
}
void Assembler::emit_eevex_prefix_or_demote_arith_ndd(Register dst, Register src1, Register src2, VexSimdPrefix pre, VexOpcode opc,
int size, int op1, int op2, bool no_flags, bool is_commutative) {
bool demotable = is_demotable(no_flags, dst->encoding(), src1->encoding());
if (!demotable && is_commutative) {
if (is_demotable(no_flags, dst->encoding(), src2->encoding())) {
// swap src1 and src2
Register tmp = src1;
src1 = src2;
src2 = tmp;
}
}
bool vex_w = (size == EVEX_64bit) ? true : false;
bool use_prefixq = vex_w;
InstructionAttr attributes(AVX_128bit, vex_w, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void)emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), pre, opc, &attributes, no_flags, use_prefixq);
emit_arith(op1, op2, src1, src2);
}
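// Example of the commutative swap above (illustrative): for eorl(rax, rbx, rax) we
// have dst == src2, so the operands are swapped and, when demotion is otherwise
// permitted, the legacy two-operand form orl(rax, rbx) is emitted; rax |= rbx
// produces the same value as rbx | rax because OR is commutative.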
void Assembler::emit_eevex_prefix_or_demote_arith_ndd(Register dst, Register nds, int32_t imm32, VexSimdPrefix pre, VexOpcode opc,
int size, int op1, int op2, bool no_flags) {
int dst_enc = dst->encoding();
@ -13124,7 +13136,6 @@ void Assembler::emit_eevex_prefix_or_demote_arith_ndd(Register dst, Register nds
} else {
bool vex_w = (size == EVEX_64bit) ? true : false;
InstructionAttr attributes(AVX_128bit, vex_w, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
//attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, size);
attributes.set_is_evex_instruction();
vex_prefix_and_encode(0, dst_enc, nds_enc, pre, opc, &attributes, /* src_is_gpr */ true, /* nds_is_ndd */ true, no_flags);
@ -13769,7 +13780,7 @@ void Assembler::pdepq(Register dst, Register src1, Address src2) {
void Assembler::sarxl(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes, true);
emit_int16((unsigned char)0xF7, (0xC0 | encode));
}
@ -13777,7 +13788,7 @@ void Assembler::sarxl(Register dst, Register src1, Register src2) {
void Assembler::sarxl(Register dst, Address src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_32bit);
vex_prefix(src1, src2->encoding(), dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes);
emit_int8((unsigned char)0xF7);
@ -13786,7 +13797,7 @@ void Assembler::sarxl(Register dst, Address src1, Register src2) {
void Assembler::sarxq(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes, true);
emit_int16((unsigned char)0xF7, (0xC0 | encode));
}
@ -13794,7 +13805,7 @@ void Assembler::sarxq(Register dst, Register src1, Register src2) {
void Assembler::sarxq(Register dst, Address src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_64bit);
vex_prefix(src1, src2->encoding(), dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes);
emit_int8((unsigned char)0xF7);
@ -13803,7 +13814,7 @@ void Assembler::sarxq(Register dst, Address src1, Register src2) {
void Assembler::shlxl(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes, true);
emit_int16((unsigned char)0xF7, (0xC0 | encode));
}
@ -13811,7 +13822,7 @@ void Assembler::shlxl(Register dst, Register src1, Register src2) {
void Assembler::shlxl(Register dst, Address src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_32bit);
vex_prefix(src1, src2->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8((unsigned char)0xF7);
@ -13820,7 +13831,7 @@ void Assembler::shlxl(Register dst, Address src1, Register src2) {
void Assembler::shlxq(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes, true);
emit_int16((unsigned char)0xF7, (0xC0 | encode));
}
@ -13828,7 +13839,7 @@ void Assembler::shlxq(Register dst, Register src1, Register src2) {
void Assembler::shlxq(Register dst, Address src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_64bit);
vex_prefix(src1, src2->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8((unsigned char)0xF7);
@ -13837,7 +13848,7 @@ void Assembler::shlxq(Register dst, Address src1, Register src2) {
void Assembler::shrxl(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, &attributes, true);
emit_int16((unsigned char)0xF7, (0xC0 | encode));
}
@ -13845,7 +13856,7 @@ void Assembler::shrxl(Register dst, Register src1, Register src2) {
void Assembler::shrxl(Register dst, Address src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_32bit);
vex_prefix(src1, src2->encoding(), dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, &attributes);
emit_int8((unsigned char)0xF7);
@ -13854,7 +13865,7 @@ void Assembler::shrxl(Register dst, Address src1, Register src2) {
void Assembler::shrxq(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, &attributes, true);
emit_int16((unsigned char)0xF7, (0xC0 | encode));
}
@ -13862,7 +13873,7 @@ void Assembler::shrxq(Register dst, Register src1, Register src2) {
void Assembler::shrxq(Register dst, Address src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_64bit);
vex_prefix(src1, src2->encoding(), dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, &attributes);
emit_int8((unsigned char)0xF7);
@ -14623,11 +14634,7 @@ void Assembler::addq(Address dst, Register src) {
void Assembler::eaddq(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_64bit);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_int8(0x01);
emit_operand(src2, src1, 0);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0x01, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void Assembler::addq(Register dst, int32_t imm32) {
@ -14656,11 +14663,7 @@ void Assembler::addq(Register dst, Register src) {
}
void Assembler::eaddq(Register dst, Register src1, Register src2, bool no_flags) {
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void) emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags, true /* use_prefixq */);
emit_arith(0x03, 0xC0, src1, src2);
emit_eevex_prefix_or_demote_arith_ndd(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0x03, 0xC0, no_flags, true /* is_commutative */);
}
void Assembler::adcxq(Register dst, Register src) {
@ -14753,11 +14756,7 @@ void Assembler::andq(Register dst, Register src) {
}
void Assembler::eandq(Register dst, Register src1, Register src2, bool no_flags) {
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void) emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags, true /* use_prefixq */);
emit_arith(0x23, 0xC0, src1, src2);
emit_eevex_prefix_or_demote_arith_ndd(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0x23, 0xC0, no_flags, true /* is_commutative */);
}
void Assembler::andq(Address dst, Register src) {
@ -14768,11 +14767,7 @@ void Assembler::andq(Address dst, Register src) {
void Assembler::eandq(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_64bit);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_int8(0x21);
emit_operand(src2, src1, 0);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0x21, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void Assembler::andnq(Register dst, Register src1, Register src2) {
@ -15118,7 +15113,7 @@ void Assembler::eimulq(Register dst, Register src, bool no_flags) {
}
void Assembler::eimulq(Register dst, Register src1, Register src2, bool no_flags) {
emit_eevex_or_demote(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0xAF, no_flags, true /* is_map1 */, true /* swap */);
emit_eevex_or_demote(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0xAF, no_flags, true /* is_map1 */, true /* swap */, true /* is_commutative */);
}
void Assembler::imulq(Register src) {
@ -15580,11 +15575,7 @@ void Assembler::orq(Address dst, Register src) {
void Assembler::eorq(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_64bit);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_int8(0x09);
emit_operand(src2, src1, 0);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0x09, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void Assembler::orq(Register dst, int32_t imm32) {
@ -15624,13 +15615,8 @@ void Assembler::orq(Register dst, Register src) {
}
void Assembler::eorq(Register dst, Register src1, Register src2, bool no_flags) {
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void) emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags, true /* use_prefixq */);
emit_arith(0x0B, 0xC0, src1, src2);
emit_eevex_prefix_or_demote_arith_ndd(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0x0B, 0xC0, no_flags, true /* is_commutative */);
}
void Assembler::popcntq(Register dst, Address src) {
assert(VM_Version::supports_popcnt(), "must support");
InstructionMark im(this);
@ -16372,11 +16358,7 @@ void Assembler::xorq(Register dst, Register src) {
}
void Assembler::exorq(Register dst, Register src1, Register src2, bool no_flags) {
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void) emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags, true /* use_prefixq */);
emit_arith(0x33, 0xC0, src1, src2);
emit_eevex_prefix_or_demote_arith_ndd(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0x33, 0xC0, no_flags, true /* is_commutative */);
}
void Assembler::xorq(Register dst, Address src) {
@ -16430,11 +16412,7 @@ void Assembler::esetzucc(Condition cc, Register dst) {
void Assembler::exorq(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_64bit);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_int8(0x31);
emit_operand(src2, src1, 0);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0x31, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void InstructionAttr::set_address_attributes(int tuple_type, int input_size_in_bits) {

View File

@ -807,14 +807,20 @@ private:
int emit_eevex_prefix_or_demote_ndd(int dst_enc, int nds_enc, VexSimdPrefix pre, VexOpcode opc,
InstructionAttr *attributes, bool no_flags = false, bool use_prefixq = false);
void emit_eevex_prefix_or_demote_arith_ndd(Register dst, Register src1, Register src2, VexSimdPrefix pre, VexOpcode opc,
int size, int op1, int op2, bool no_flags = false, bool is_commutative = false);
void emit_eevex_prefix_or_demote_arith_ndd(Register dst, Register nds, int32_t imm32, VexSimdPrefix pre, VexOpcode opc,
int size, int op1, int op2, bool no_flags);
void emit_eevex_or_demote(Register dst, Register src1, Address src2, VexSimdPrefix pre, VexOpcode opc,
int size, int opcode_byte, bool no_flags = false, bool is_map1 = false);
void emit_eevex_or_demote(Register dst, Address src1, Register src2, VexSimdPrefix pre, VexOpcode opc,
int size, int opcode_byte, bool no_flags = false, bool is_map1 = false, bool is_commutative = false);
void emit_eevex_or_demote(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc,
int size, int opcode_byte, bool no_flags, bool is_map1 = false, bool swap = false);
int size, int opcode_byte, bool no_flags, bool is_map1 = false, bool swap = false, bool is_commutative = false);
void emit_eevex_or_demote(int dst_enc, int nds_enc, int src_enc, int8_t imm8, VexSimdPrefix pre, VexOpcode opc,
int size, int opcode_byte, bool no_flags, bool is_map1 = false);
@ -1149,6 +1155,7 @@ private:
void eandl(Register dst, Register src, int32_t imm32, bool no_flags);
void andl(Register dst, Address src);
void eandl(Register dst, Register src1, Address src2, bool no_flags);
void eandl(Register dst, Address src1, Register src2, bool no_flags);
void andl(Register dst, Register src);
void eandl(Register dst, Register src1, Register src2, bool no_flags);
void andl(Address dst, Register src);

View File

@ -89,19 +89,53 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm
void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators,
Register addr, Register count, Register tmp) {
__ push_call_clobbered_registers(false /* save_fpu */);
if (c_rarg0 == count) { // On win64 c_rarg0 == rcx
assert_different_registers(c_rarg1, addr);
__ mov(c_rarg1, count);
__ mov(c_rarg0, addr);
} else {
assert_different_registers(c_rarg0, count);
__ mov(c_rarg0, addr);
__ mov(c_rarg1, count);
}
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_array_post_entry), 2);
__ pop_call_clobbered_registers(false /* save_fpu */);
Label done;
__ testptr(count, count);
__ jcc(Assembler::zero, done);
// Calculate end address in "count".
Address::ScaleFactor scale = UseCompressedOops ? Address::times_4 : Address::times_8;
__ leaq(count, Address(addr, count, scale));
// Calculate start card address in "addr".
__ shrptr(addr, CardTable::card_shift());
Register thread = r15_thread;
__ movptr(tmp, Address(thread, in_bytes(G1ThreadLocalData::card_table_base_offset())));
__ addptr(addr, tmp);
// Calculate address of card of last word in the array.
__ subptr(count, 1);
__ shrptr(count, CardTable::card_shift());
__ addptr(count, tmp);
Label loop;
// Iterate from start card to end card (inclusive).
__ bind(loop);
Label is_clean_card;
if (UseCondCardMark) {
__ cmpb(Address(addr, 0), G1CardTable::clean_card_val());
__ jcc(Assembler::equal, is_clean_card);
} else {
__ movb(Address(addr, 0), G1CardTable::dirty_card_val());
}
Label next_card;
__ bind(next_card);
__ addptr(addr, sizeof(CardTable::CardValue));
__ cmpptr(addr, count);
__ jcc(Assembler::belowEqual, loop);
__ jmp(done);
__ bind(is_clean_card);
// Card was clean. Dirty the card and go to the next one.
__ movb(Address(addr, 0), G1CardTable::dirty_card_val());
__ jmp(next_card);
__ bind(done);
}
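The rewritten array post barrier above marks every card spanned by the copied oop range inline, instead of calling into G1BarrierSetRuntime. Its address arithmetic (first card from the start address, last card from the last byte of the copied range) can be written out directly; the sketch below assumes a 512-byte card size and omits the UseCondCardMark re-check, so it is an illustration rather than HotSpot code.

    #include <cstddef>
    #include <cstdint>

    static const int CARD_SHIFT = 9;   // assume 512-byte cards, as in CardTable

    // Dirty every card covered by [start, start + count * elem_size).
    // biased_card_table plays the role of the thread-local card table base
    // that the assembly above adds to the shifted address.
    static void dirty_cards_for_range(volatile uint8_t* biased_card_table,
                                      uintptr_t start, size_t count, size_t elem_size) {
      if (count == 0) return;                                  // matches the early-out above
      uintptr_t end        = start + count * elem_size;        // exclusive end address
      uintptr_t first_card = start >> CARD_SHIFT;
      uintptr_t last_card  = (end - 1) >> CARD_SHIFT;          // card of the last word
      for (uintptr_t c = first_card; c <= last_card; c++) {
        biased_card_table[c] = 0;                              // 0 is CardTable's dirty value
      }
    }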
void G1BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
@ -182,7 +216,6 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm,
// If expand_call is true then we expand the call_VM_leaf macro
// directly to skip generating the check by
// InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
const Register thread = r15_thread;
Label done;
@ -238,73 +271,46 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm,
static void generate_post_barrier_fast_path(MacroAssembler* masm,
const Register store_addr,
const Register new_val,
const Register tmp,
const Register tmp2,
const Register tmp1,
Label& done,
bool new_val_may_be_null) {
CardTableBarrierSet* ct = barrier_set_cast<CardTableBarrierSet>(BarrierSet::barrier_set());
assert_different_registers(store_addr, new_val, tmp1, noreg);
Register thread = r15_thread;
// Does store cross heap regions?
__ movptr(tmp, store_addr); // tmp := store address
__ xorptr(tmp, new_val); // tmp := store address ^ new value
__ shrptr(tmp, G1HeapRegion::LogOfHRGrainBytes); // ((store address ^ new value) >> LogOfHRGrainBytes) == 0?
__ movptr(tmp1, store_addr); // tmp1 := store address
__ xorptr(tmp1, new_val); // tmp1 := store address ^ new value
__ shrptr(tmp1, G1HeapRegion::LogOfHRGrainBytes); // ((store address ^ new value) >> LogOfHRGrainBytes) == 0?
__ jcc(Assembler::equal, done);
// Crosses regions, storing null?
if (new_val_may_be_null) {
__ cmpptr(new_val, NULL_WORD); // new value == null?
__ cmpptr(new_val, NULL_WORD); // new value == null?
__ jcc(Assembler::equal, done);
}
// Storing region crossing non-null, is card young?
__ movptr(tmp, store_addr); // tmp := store address
__ shrptr(tmp, CardTable::card_shift()); // tmp := card address relative to card table base
// Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT
// a valid address and therefore is not properly handled by the relocation code.
__ movptr(tmp2, (intptr_t)ct->card_table()->byte_map_base()); // tmp2 := card table base address
__ addptr(tmp, tmp2); // tmp := card address
__ cmpb(Address(tmp, 0), G1CardTable::g1_young_card_val()); // *(card address) == young_card_val?
}
static void generate_post_barrier_slow_path(MacroAssembler* masm,
const Register thread,
const Register tmp,
const Register tmp2,
Label& done,
Label& runtime) {
__ membar(Assembler::Membar_mask_bits(Assembler::StoreLoad)); // StoreLoad membar
__ cmpb(Address(tmp, 0), G1CardTable::dirty_card_val()); // *(card address) == dirty_card_val?
__ jcc(Assembler::equal, done);
__ movptr(tmp1, store_addr); // tmp1 := store address
__ shrptr(tmp1, CardTable::card_shift()); // tmp1 := card address relative to card table base
Address card_table_addr(thread, in_bytes(G1ThreadLocalData::card_table_base_offset()));
__ addptr(tmp1, card_table_addr); // tmp1 := card address
if (UseCondCardMark) {
__ cmpb(Address(tmp1, 0), G1CardTable::clean_card_val()); // *(card address) == clean_card_val?
__ jcc(Assembler::notEqual, done);
}
// Storing a region crossing, non-null oop, card is clean.
// Dirty card and log.
__ movb(Address(tmp, 0), G1CardTable::dirty_card_val()); // *(card address) := dirty_card_val
generate_queue_insertion(masm,
G1ThreadLocalData::dirty_card_queue_index_offset(),
G1ThreadLocalData::dirty_card_queue_buffer_offset(),
runtime,
thread, tmp, tmp2);
__ jmp(done);
// Dirty card.
__ movb(Address(tmp1, 0), G1CardTable::dirty_card_val()); // *(card address) := dirty_card_val
}
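The trimmed fast path above keeps only three checks before dirtying a card: the store stays within one region, the new value is null, or (under UseCondCardMark) the card is already non-clean; the young-card test, StoreLoad fence and refinement-queue insertion are gone. A C++ rendering of that predicate, with region and card parameters as assumed stand-ins for the real constants:

    #include <cstdint>

    // Sketch of the post-barrier fast path above. LOG_REGION_BYTES, CARD_SHIFT
    // and the card values stand in for G1HeapRegion::LogOfHRGrainBytes,
    // CardTable::card_shift(), clean_card_val() and dirty_card_val().
    static const int     LOG_REGION_BYTES = 22;   // assume 4 MB regions
    static const int     CARD_SHIFT       = 9;
    static const uint8_t CLEAN_CARD       = 0xff;
    static const uint8_t DIRTY_CARD       = 0;

    static void post_barrier(uint8_t* biased_card_table, uintptr_t store_addr,
                             uintptr_t new_val, bool use_cond_card_mark) {
      if (((store_addr ^ new_val) >> LOG_REGION_BYTES) == 0) return; // same region
      if (new_val == 0) return;                                      // storing null
      uint8_t* card = biased_card_table + (store_addr >> CARD_SHIFT);
      if (use_cond_card_mark && *card != CLEAN_CARD) return;         // already marked
      *card = DIRTY_CARD;
    }

Compared with the queue-based scheme being removed, the whole slow path reduces to this single conditional byte store.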
void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register tmp,
Register tmp2) {
const Register thread = r15_thread;
Register tmp) {
Label done;
Label runtime;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp, tmp2, done, true /* new_val_may_be_null */);
// If card is young, jump to done
__ jcc(Assembler::equal, done);
generate_post_barrier_slow_path(masm, thread, tmp, tmp2, done, runtime);
__ bind(runtime);
// save the live input values
RegSet saved = RegSet::of(store_addr);
__ push_set(saved);
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp, thread);
__ pop_set(saved);
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp, done, true /* new_val_may_be_null */);
__ bind(done);
}
@ -367,34 +373,10 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register tmp,
Register tmp2,
G1PostBarrierStubC2* stub) {
const Register thread = r15_thread;
stub->initialize_registers(thread, tmp, tmp2);
bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp, tmp2, *stub->continuation(), new_val_may_be_null);
// If card is not young, jump to stub (slow path)
__ jcc(Assembler::notEqual, *stub->entry());
__ bind(*stub->continuation());
}
void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const {
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
Label runtime;
Register thread = stub->thread();
Register tmp = stub->tmp1(); // tmp holds the card address.
Register tmp2 = stub->tmp2();
assert(stub->tmp3() == noreg, "not needed in this platform");
__ bind(*stub->entry());
generate_post_barrier_slow_path(masm, thread, tmp, tmp2, *stub->continuation(), runtime);
__ bind(runtime);
generate_c2_barrier_runtime_call(masm, stub, tmp, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry));
__ jmp(*stub->continuation());
bool new_val_may_be_null) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp, done, new_val_may_be_null);
__ bind(done);
}
#endif // COMPILER2
@ -441,8 +423,7 @@ void G1BarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet deco
g1_write_barrier_post(masm /*masm*/,
tmp1 /* store_adr */,
new_val /* new_val */,
tmp3 /* tmp */,
tmp2 /* tmp2 */);
tmp3 /* tmp */);
}
}
}
@ -476,21 +457,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier
}
void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) {
G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1();
__ bind(*stub->entry());
assert(stub->addr()->is_register(), "Precondition.");
assert(stub->new_val()->is_register(), "Precondition.");
Register new_val_reg = stub->new_val()->as_register();
__ cmpptr(new_val_reg, NULL_WORD);
__ jcc(Assembler::equal, *stub->continuation());
ce->store_parameter(stub->addr()->as_pointer_register(), 0);
__ call(RuntimeAddress(bs->post_barrier_c1_runtime_code_blob()->code_begin()));
__ jmp(*stub->continuation());
}
#undef __
void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2 /* unused on x86 */) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, done, true /* new_val_may_be_null */);
masm->bind(done);
}
#define __ sasm->
void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) {
@ -555,78 +534,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler*
__ epilogue();
}
void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) {
__ prologue("g1_post_barrier", false);
CardTableBarrierSet* ct =
barrier_set_cast<CardTableBarrierSet>(BarrierSet::barrier_set());
Label done;
Label enqueued;
Label runtime;
// At this point we know new_value is non-null and the new_value crosses regions.
// Must check to see if card is already dirty
const Register thread = r15_thread;
Address queue_index(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()));
Address buffer(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset()));
__ push_ppx(rax);
__ push_ppx(rcx);
const Register cardtable = rax;
const Register card_addr = rcx;
__ load_parameter(0, card_addr);
__ shrptr(card_addr, CardTable::card_shift());
// Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT
// a valid address and therefore is not properly handled by the relocation code.
__ movptr(cardtable, (intptr_t)ct->card_table()->byte_map_base());
__ addptr(card_addr, cardtable);
__ cmpb(Address(card_addr, 0), G1CardTable::g1_young_card_val());
__ jcc(Assembler::equal, done);
__ membar(Assembler::Membar_mask_bits(Assembler::StoreLoad));
__ cmpb(Address(card_addr, 0), CardTable::dirty_card_val());
__ jcc(Assembler::equal, done);
// storing region crossing non-null, card is clean.
// dirty card and log.
__ movb(Address(card_addr, 0), CardTable::dirty_card_val());
const Register tmp = rdx;
__ push_ppx(rdx);
__ movptr(tmp, queue_index);
__ testptr(tmp, tmp);
__ jcc(Assembler::zero, runtime);
__ subptr(tmp, wordSize);
__ movptr(queue_index, tmp);
__ addptr(tmp, buffer);
__ movptr(Address(tmp, 0), card_addr);
__ jmp(enqueued);
__ bind(runtime);
__ push_call_clobbered_registers();
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), card_addr, thread);
__ pop_call_clobbered_registers();
__ bind(enqueued);
__ pop_ppx(rdx);
__ bind(done);
__ pop_ppx(rcx);
__ pop_ppx(rax);
__ epilogue();
}
#undef __
#endif // COMPILER1

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -31,10 +31,8 @@
class LIR_Assembler;
class StubAssembler;
class G1PreBarrierStub;
class G1PostBarrierStub;
class G1BarrierStubC2;
class G1PreBarrierStubC2;
class G1PostBarrierStubC2;
class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
protected:
@ -51,22 +49,28 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
void g1_write_barrier_post(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register tmp,
Register tmp2);
Register tmp);
virtual void oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
Address dst, Register val, Register tmp1, Register tmp2, Register tmp3);
public:
void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub);
void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub);
void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm);
void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm);
virtual void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
Register dst, Address src, Register tmp1);
#ifdef COMPILER1
void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub);
void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm);
void g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2);
#endif
#ifdef COMPILER2
void g1_write_barrier_pre_c2(MacroAssembler* masm,
Register obj,
@ -79,10 +83,7 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
Register store_addr,
Register new_val,
Register tmp,
Register tmp2,
G1PostBarrierStubC2* c2_stub);
void generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const;
bool new_val_may_be_null);
#endif // COMPILER2
};

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
@ -59,15 +59,14 @@ static void write_barrier_post(MacroAssembler* masm,
const MachNode* node,
Register store_addr,
Register new_val,
Register tmp1,
Register tmp2) {
if (!G1PostBarrierStubC2::needs_barrier(node)) {
Register tmp1) {
if (!G1BarrierStubC2::needs_post_barrier(node)) {
return;
}
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
G1BarrierSetAssembler* g1_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler());
G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, tmp1, tmp2, stub);
bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, tmp1, new_val_may_be_null);
}
%}
@ -95,8 +94,7 @@ instruct g1StoreP(memory mem, any_RegP src, rRegP tmp1, rRegP tmp2, rRegP tmp3,
write_barrier_post(masm, this,
$tmp1$$Register /* store_addr */,
$src$$Register /* new_val */,
$tmp3$$Register /* tmp1 */,
$tmp2$$Register /* tmp2 */);
$tmp3$$Register /* tmp1 */);
%}
ins_pipe(ialu_mem_reg);
%}
@ -127,8 +125,7 @@ instruct g1StoreN(memory mem, rRegN src, rRegP tmp1, rRegP tmp2, rRegP tmp3, rFl
write_barrier_post(masm, this,
$tmp1$$Register /* store_addr */,
$tmp2$$Register /* new_val */,
$tmp3$$Register /* tmp1 */,
$tmp2$$Register /* tmp2 */);
$tmp3$$Register /* tmp1 */);
%}
ins_pipe(ialu_mem_reg);
%}
@ -158,8 +155,7 @@ instruct g1EncodePAndStoreN(memory mem, any_RegP src, rRegP tmp1, rRegP tmp2, rR
write_barrier_post(masm, this,
$tmp1$$Register /* store_addr */,
$src$$Register /* new_val */,
$tmp3$$Register /* tmp1 */,
$tmp2$$Register /* tmp2 */);
$tmp3$$Register /* tmp1 */);
%}
ins_pipe(ialu_mem_reg);
%}
@ -187,8 +183,7 @@ instruct g1CompareAndExchangeP(indirect mem, rRegP newval, rRegP tmp1, rRegP tmp
write_barrier_post(masm, this,
$mem$$Register /* store_addr */,
$tmp1$$Register /* new_val */,
$tmp2$$Register /* tmp1 */,
$tmp3$$Register /* tmp2 */);
$tmp2$$Register /* tmp1 */);
%}
ins_pipe(pipe_cmpxchg);
%}
@ -214,8 +209,7 @@ instruct g1CompareAndExchangeN(indirect mem, rRegN newval, rRegP tmp1, rRegP tmp
write_barrier_post(masm, this,
$mem$$Register /* store_addr */,
$tmp1$$Register /* new_val */,
$tmp2$$Register /* tmp1 */,
$tmp3$$Register /* tmp2 */);
$tmp2$$Register /* tmp1 */);
%}
ins_pipe(pipe_cmpxchg);
%}
@ -246,8 +240,7 @@ instruct g1CompareAndSwapP(rRegI res, indirect mem, rRegP newval, rRegP tmp1, rR
write_barrier_post(masm, this,
$mem$$Register /* store_addr */,
$tmp1$$Register /* new_val */,
$tmp2$$Register /* tmp1 */,
$tmp3$$Register /* tmp2 */);
$tmp2$$Register /* tmp1 */);
%}
ins_pipe(pipe_cmpxchg);
%}
@ -279,8 +272,7 @@ instruct g1CompareAndSwapN(rRegI res, indirect mem, rRegN newval, rRegP tmp1, rR
write_barrier_post(masm, this,
$mem$$Register /* store_addr */,
$tmp1$$Register /* new_val */,
$tmp2$$Register /* tmp1 */,
$tmp3$$Register /* tmp2 */);
$tmp2$$Register /* tmp1 */);
%}
ins_pipe(pipe_cmpxchg);
%}
@ -303,8 +295,7 @@ instruct g1GetAndSetP(indirect mem, rRegP newval, rRegP tmp1, rRegP tmp2, rRegP
write_barrier_post(masm, this,
$mem$$Register /* store_addr */,
$tmp1$$Register /* new_val */,
$tmp2$$Register /* tmp1 */,
$tmp3$$Register /* tmp2 */);
$tmp2$$Register /* tmp1 */);
%}
ins_pipe(pipe_cmpxchg);
%}
@ -328,8 +319,7 @@ instruct g1GetAndSetN(indirect mem, rRegN newval, rRegP tmp1, rRegP tmp2, rRegP
write_barrier_post(masm, this,
$mem$$Register /* store_addr */,
$tmp1$$Register /* new_val */,
$tmp2$$Register /* tmp1 */,
$tmp3$$Register /* tmp2 */);
$tmp2$$Register /* tmp1 */);
%}
ins_pipe(pipe_cmpxchg);
%}

View File

@ -76,50 +76,95 @@ static uint& get_profile_ctr(int shift) {
#endif // !PRODUCT
void StubGenerator::generate_arraycopy_stubs() {
address entry;
address entry_jbyte_arraycopy;
address entry_jshort_arraycopy;
address entry_jint_arraycopy;
address entry_oop_arraycopy;
address entry_jlong_arraycopy;
address entry_checkcast_arraycopy;
// Some copy stubs publish a normal entry and then a 2nd 'fallback'
// entry immediately following their stack push. This can be used
// as a post-push branch target for compatible stubs when they
// identify a special case that can be handled by the fallback
// stub, e.g. a disjoint copy stub may be used as a special-case
// fallback for its compatible conjoint copy stub.
//
// A nopush entry is always returned in the following local and
// then published by assigning to the appropriate entry field in
// class StubRoutines. The entry value is then passed to the
// generator for the compatible stub. That means the entry must be
// listed when saving to/restoring from the AOT cache, ensuring
// that the inter-stub jumps are noted at AOT-cache save and
// relocated at AOT cache load.
address nopush_entry;
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(&entry);
StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(entry, &entry_jbyte_arraycopy);
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(&nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(&entry);
StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(entry, &entry_jshort_arraycopy);
StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(&nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &entry);
StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(StubId::stubgen_jint_arraycopy_id, entry, &entry_jint_arraycopy);
StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jint_arraycopy_nopush = nopush_entry;
StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_oop_copy(StubId::stubgen_jlong_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jlong_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jlong_arraycopy = generate_conjoint_long_oop_copy(StubId::stubgen_jlong_arraycopy_id, StubRoutines::_jlong_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_oop_copy(StubId::stubgen_jlong_disjoint_arraycopy_id, &entry);
StubRoutines::_jlong_arraycopy = generate_conjoint_long_oop_copy(StubId::stubgen_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
if (UseCompressedOops) {
StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &entry);
StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_id, entry, &entry_oop_arraycopy);
StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &entry);
StubRoutines::_oop_arraycopy_uninit = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, entry, nullptr);
StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_oop_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_oop_arraycopy_nopush = nopush_entry;
StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
// note that we don't need a returned nopush entry because the
// generic/unsafe copy does not cater for uninit arrays.
StubRoutines::_oop_arraycopy_uninit = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
} else {
StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &entry);
StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_id, entry, &entry_oop_arraycopy);
StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &entry);
StubRoutines::_oop_arraycopy_uninit = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, entry, nullptr);
StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_oop_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_oop_arraycopy_nopush = nopush_entry;
StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
// note that we don't need a returned nopush entry because the
// generic/unsafe copy does not cater for uninit arrays.
StubRoutines::_oop_arraycopy_uninit = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
}
StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &entry_checkcast_arraycopy);
StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
// checkcast nopush entry is needed by generic copy
StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
// note that we don't need a returned nopush entry because the
// generic copy does not cater for uninit arrays.
StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy,
entry_jshort_arraycopy,
entry_jint_arraycopy,
entry_jlong_arraycopy);
StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy,
entry_jshort_arraycopy,
entry_jint_arraycopy,
entry_oop_arraycopy,
entry_jlong_arraycopy,
entry_checkcast_arraycopy);
StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
StubRoutines::_jshort_arraycopy_nopush,
StubRoutines::_jint_arraycopy_nopush,
StubRoutines::_jlong_arraycopy_nopush);
StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
StubRoutines::_jshort_arraycopy_nopush,
StubRoutines::_jint_arraycopy_nopush,
StubRoutines::_oop_arraycopy_nopush,
StubRoutines::_jlong_arraycopy_nopush,
StubRoutines::_checkcast_arraycopy_nopush);
StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);

View File

@ -139,7 +139,7 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
const uint32_t CPU_FAMILY_486 = (4 << CPU_FAMILY_SHIFT);
bool use_evex = FLAG_IS_DEFAULT(UseAVX) || (UseAVX > 2);
Label detect_486, cpu486, detect_586, std_cpuid1, std_cpuid4, std_cpuid24;
Label detect_486, cpu486, detect_586, std_cpuid1, std_cpuid4, std_cpuid24, std_cpuid29;
Label sef_cpuid, sefsl1_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7;
Label ext_cpuid8, done, wrapup, vector_save_restore, apx_save_restore_warning;
Label legacy_setup, save_restore_except, legacy_save_restore, start_simd_check;
@ -338,6 +338,16 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
__ movl(Address(rsi, 0), rax);
__ movl(Address(rsi, 4), rdx);
//
// cpuid(0x29) APX NCI NDD NF (EAX = 29H, ECX = 0).
//
__ bind(std_cpuid29);
__ movl(rax, 0x29);
__ movl(rcx, 0);
__ cpuid();
__ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid29_offset())));
__ movl(Address(rsi, 0), rbx);
//
// cpuid(0x24) Converged Vector ISA Main Leaf (EAX = 24H, ECX = 0).
//
@ -1016,16 +1026,6 @@ void VM_Version::get_processor_features() {
_features.clear_feature(CPU_AVX10_2);
}
// Currently APX support is only enabled for targets supporting AVX512VL feature.
bool apx_supported = os_supports_apx_egprs() && supports_apx_f() && supports_avx512vl();
if (UseAPX && !apx_supported) {
warning("UseAPX is not supported on this CPU, setting it to false");
FLAG_SET_DEFAULT(UseAPX, false);
}
if (!UseAPX) {
_features.clear_feature(CPU_APX_F);
}
if (UseAVX < 2) {
_features.clear_feature(CPU_AVX2);
@ -1049,6 +1049,7 @@ void VM_Version::get_processor_features() {
_features.clear_feature(CPU_VZEROUPPER);
_features.clear_feature(CPU_AVX512BW);
_features.clear_feature(CPU_AVX512VL);
_features.clear_feature(CPU_APX_F);
_features.clear_feature(CPU_AVX512DQ);
_features.clear_feature(CPU_AVX512_VNNI);
_features.clear_feature(CPU_AVX512_VAES);
@ -1068,6 +1069,17 @@ void VM_Version::get_processor_features() {
}
}
// Currently APX support is only enabled for targets supporting AVX512VL feature.
bool apx_supported = os_supports_apx_egprs() && supports_apx_f() && supports_avx512vl();
if (UseAPX && !apx_supported) {
warning("UseAPX is not supported on this CPU, setting it to false");
FLAG_SET_DEFAULT(UseAPX, false);
}
if (!UseAPX) {
_features.clear_feature(CPU_APX_F);
}
if (FLAG_IS_DEFAULT(IntelJccErratumMitigation)) {
_has_intel_jcc_erratum = compute_has_intel_jcc_erratum();
FLAG_SET_ERGO(IntelJccErratumMitigation, _has_intel_jcc_erratum);
@ -2912,7 +2924,8 @@ VM_Version::VM_Features VM_Version::CpuidInfo::feature_flags() const {
if (std_cpuid1_ecx.bits.popcnt != 0)
vm_features.set_feature(CPU_POPCNT);
if (sefsl1_cpuid7_edx.bits.apx_f != 0 &&
xem_xcr0_eax.bits.apx_f != 0) {
xem_xcr0_eax.bits.apx_f != 0 &&
std_cpuid29_ebx.bits.apx_nci_ndd_nf != 0) {
vm_features.set_feature(CPU_APX_F);
}
if (std_cpuid1_ecx.bits.avx != 0 &&

View File

@ -306,6 +306,14 @@ class VM_Version : public Abstract_VM_Version {
} bits;
};
union StdCpuidEax29Ecx0 {
uint32_t value;
struct {
uint32_t apx_nci_ndd_nf : 1,
: 31;
} bits;
};
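The StdCpuidEax29Ecx0 union above records EBX of CPUID leaf 0x29, subleaf 0, and this change treats bit 0 as the APX NCI/NDD/NF indicator. The same bit can be probed outside the VM with the compiler's cpuid helpers; a sketch assuming GCC/Clang's <cpuid.h> on x86:

    #include <cpuid.h>
    #include <cstdio>

    // Probe CPUID.(EAX=0x29, ECX=0):EBX bit 0, which this change uses as the
    // APX NCI/NDD/NF indicator. Check the maximum standard leaf first.
    static bool cpu_reports_apx_ndd() {
      unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
      unsigned int max_leaf = __get_cpuid_max(0, nullptr);
      if (max_leaf < 0x29) return false;
      __cpuid_count(0x29, 0, eax, ebx, ecx, edx);
      return (ebx & 1u) != 0;
    }

    int main() {
      std::printf("APX NCI/NDD/NF: %s\n", cpu_reports_apx_ndd() ? "yes" : "no");
      return 0;
    }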
union StdCpuid24MainLeafEax {
uint32_t value;
struct {
@ -591,6 +599,10 @@ protected:
StdCpuid24MainLeafEax std_cpuid24_eax;
StdCpuid24MainLeafEbx std_cpuid24_ebx;
// cpuid function 0x29 APX Advanced Performance Extensions Leaf
// eax = 0x29, ecx = 0
StdCpuidEax29Ecx0 std_cpuid29_ebx;
// cpuid function 0xB (processor topology)
// ecx = 0
uint32_t tpl_cpuidB0_eax;
@ -711,6 +723,7 @@ public:
static ByteSize std_cpuid0_offset() { return byte_offset_of(CpuidInfo, std_max_function); }
static ByteSize std_cpuid1_offset() { return byte_offset_of(CpuidInfo, std_cpuid1_eax); }
static ByteSize std_cpuid24_offset() { return byte_offset_of(CpuidInfo, std_cpuid24_eax); }
static ByteSize std_cpuid29_offset() { return byte_offset_of(CpuidInfo, std_cpuid29_ebx); }
static ByteSize dcp_cpuid4_offset() { return byte_offset_of(CpuidInfo, dcp_cpuid4_eax); }
static ByteSize sef_cpuid7_offset() { return byte_offset_of(CpuidInfo, sef_cpuid7_eax); }
static ByteSize sefsl1_cpuid7_offset() { return byte_offset_of(CpuidInfo, sefsl1_cpuid7_eax); }
@ -760,7 +773,9 @@ public:
_features.set_feature(CPU_SSE2);
_features.set_feature(CPU_VZEROUPPER);
}
static void set_apx_cpuFeatures() { _features.set_feature(CPU_APX_F); }
static void set_apx_cpuFeatures() {
_features.set_feature(CPU_APX_F);
}
static void set_bmi_cpuFeatures() {
_features.set_feature(CPU_BMI1);
_features.set_feature(CPU_BMI2);

View File

@ -26,7 +26,7 @@
#ifndef CPU_ZERO_ASSEMBLER_ZERO_INLINE_HPP
#define CPU_ZERO_ASSEMBLER_ZERO_INLINE_HPP
#include "asm/assembler.inline.hpp"
#include "asm/assembler.hpp"
#include "asm/codeBuffer.hpp"
#include "code/codeCache.hpp"
#include "runtime/handles.inline.hpp"

View File

@ -46,6 +46,10 @@ class CompilerThreadTimeoutLinux : public CHeapObj<mtCompiler> {
bool init_timeout();
void arm();
void disarm();
void reset() {
disarm();
arm();
}
};
#endif //LINUX_COMPILER_THREAD_TIMEOUT_LINUX_HPP

View File

@ -106,6 +106,7 @@ public:
PRINTIF(info.swap > 0, "swap");
PRINTIF(info.ht, "huge");
PRINTIF(info.anonhugepages > 0, "thp");
PRINTIF(info.thpeligible, "thpel");
PRINTIF(info.hg, "thpad");
PRINTIF(info.nh, "nothp");
if (num_printed == 0) {
@ -135,6 +136,7 @@ public:
st->print_cr(" com: mapping committed (swap space reserved)");
st->print_cr(" swap: mapping partly or completely swapped out");
st->print_cr(" thp: mapping uses THP");
st->print_cr(" thpel: mapping is THP-eligible");
st->print_cr(" thpad: mapping is THP-madvised");
st->print_cr(" nothp: mapping is forbidden to use THP");
st->print_cr(" huge: mapping uses hugetlb pages");

View File

@ -76,8 +76,16 @@ void ProcSmapsParser::scan_additional_line(ProcSmapsInfo& out) {
SCAN("Private_Hugetlb", out.private_hugetlb);
SCAN("Shared_Hugetlb", out.shared_hugetlb);
SCAN("Swap", out.swap);
int i = 0;
#undef SCAN
// scan THPeligible into a bool
int thpel = 0;
if (::sscanf(_line, "THPeligible: %d", &thpel) == 1) {
assert(thpel == 1 || thpel == 0, "Unexpected value %d", thpel);
out.thpeligible = (thpel == 1);
return;
}
// scan some flags too
if (strncmp(_line, "VmFlags:", 8) == 0) {
#define SCAN(flag) { out.flag = (::strstr(_line + 8, " " #flag) != nullptr); }
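THPeligible is a per-mapping field in /proc/<pid>/smaps, reported as 0 or 1, which the parser now folds into a bool. A standalone sketch that reads the same field the same way, counting THP-eligible mappings of the current process:

    #include <cstdio>

    // Count mappings whose smaps entry reports "THPeligible: 1", mirroring the
    // sscanf-based parsing added above. Standalone sketch, not HotSpot code.
    int main() {
      FILE* f = std::fopen("/proc/self/smaps", "r");
      if (f == nullptr) return 1;
      char line[512];
      int eligible = 0, total = 0;
      while (std::fgets(line, sizeof(line), f) != nullptr) {
        int v = 0;
        if (std::sscanf(line, "THPeligible: %d", &v) == 1) {
          total++;
          if (v == 1) eligible++;
        }
      }
      std::fclose(f);
      std::printf("%d of %d mappings are THP-eligible\n", eligible, total);
      return 0;
    }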

View File

@ -49,6 +49,7 @@ struct ProcSmapsInfo {
size_t shared_hugetlb;
size_t anonhugepages;
size_t swap;
bool thpeligible;
bool rd, wr, ex;
bool sh; // shared
bool nr; // no reserve
@ -64,7 +65,7 @@ struct ProcSmapsInfo {
from = to = nullptr;
prot[0] = filename[0] = '\0';
kernelpagesize = rss = private_hugetlb = shared_hugetlb = anonhugepages = swap = 0;
rd = wr = ex = sh = nr = hg = ht = nh = false;
thpeligible = rd = wr = ex = sh = nr = hg = ht = nh = false;
}
};

View File

@ -23,8 +23,8 @@
*
*/
#ifndef OS_CPU_AIX_PPC_ATOMIC_AIX_PPC_HPP
#define OS_CPU_AIX_PPC_ATOMIC_AIX_PPC_HPP
#ifndef OS_CPU_AIX_PPC_ATOMICACCESS_AIX_PPC_HPP
#define OS_CPU_AIX_PPC_ATOMICACCESS_AIX_PPC_HPP
#ifndef PPC64
#error "Atomic currently only implemented for PPC64"
@ -33,7 +33,7 @@
#include "orderAccess_aix_ppc.hpp"
#include "utilities/debug.hpp"
// Implementation of class atomic
// Implementation of class AtomicAccess
//
// machine barrier instructions:
@ -414,4 +414,4 @@ struct AtomicAccess::PlatformOrderedLoad<byte_size, X_ACQUIRE> {
}
};
#endif // OS_CPU_AIX_PPC_ATOMIC_AIX_PPC_HPP
#endif // OS_CPU_AIX_PPC_ATOMICACCESS_AIX_PPC_HPP

View File

@ -24,12 +24,12 @@
*
*/
#ifndef OS_CPU_BSD_AARCH64_ATOMIC_BSD_AARCH64_HPP
#define OS_CPU_BSD_AARCH64_ATOMIC_BSD_AARCH64_HPP
#ifndef OS_CPU_BSD_AARCH64_ATOMICACCESS_BSD_AARCH64_HPP
#define OS_CPU_BSD_AARCH64_ATOMICACCESS_BSD_AARCH64_HPP
#include "utilities/debug.hpp"
// Implementation of class atomic
// Implementation of class AtomicAccess
// Note that memory_order_conservative requires a full barrier after atomic stores.
// See https://patchwork.kernel.org/patch/3575821/
@ -129,5 +129,4 @@ struct AtomicAccess::PlatformOrderedStore<byte_size, RELEASE_X_FENCE>
void operator()(volatile T* p, T v) const { release_store(p, v); OrderAccess::fence(); }
};
#endif // OS_CPU_BSD_AARCH64_ATOMIC_BSD_AARCH64_HPP
#endif // OS_CPU_BSD_AARCH64_ATOMICACCESS_BSD_AARCH64_HPP

View File

@ -22,10 +22,10 @@
*
*/
#ifndef OS_CPU_BSD_X86_ATOMIC_BSD_X86_HPP
#define OS_CPU_BSD_X86_ATOMIC_BSD_X86_HPP
#ifndef OS_CPU_BSD_X86_ATOMICACCESS_BSD_X86_HPP
#define OS_CPU_BSD_X86_ATOMICACCESS_BSD_X86_HPP
// Implementation of class atomic
// Implementation of class AtomicAccess
template<size_t byte_size>
struct AtomicAccess::PlatformAdd {
@ -230,4 +230,4 @@ struct AtomicAccess::PlatformOrderedStore<8, RELEASE_X_FENCE>
};
#endif // AMD64
#endif // OS_CPU_BSD_X86_ATOMIC_BSD_X86_HPP
#endif // OS_CPU_BSD_X86_ATOMICACCESS_BSD_X86_HPP

View File

@ -23,13 +23,13 @@
*
*/
#ifndef OS_CPU_BSD_ZERO_ATOMIC_BSD_ZERO_HPP
#define OS_CPU_BSD_ZERO_ATOMIC_BSD_ZERO_HPP
#ifndef OS_CPU_BSD_ZERO_ATOMICACCESS_BSD_ZERO_HPP
#define OS_CPU_BSD_ZERO_ATOMICACCESS_BSD_ZERO_HPP
#include "orderAccess_bsd_zero.hpp"
#include "runtime/os.hpp"
// Implementation of class atomic
// Implementation of class AtomicAccess
template<size_t byte_size>
struct AtomicAccess::PlatformAdd {
@ -149,4 +149,4 @@ inline void AtomicAccess::PlatformStore<8>::operator()(T volatile* dest,
__atomic_store(dest, &store_value, __ATOMIC_RELAXED);
}
#endif // OS_CPU_BSD_ZERO_ATOMIC_BSD_ZERO_HPP
#endif // OS_CPU_BSD_ZERO_ATOMICACCESS_BSD_ZERO_HPP

View File

@ -24,7 +24,6 @@
*/
#include "asm/assembler.inline.hpp"
#include "atomic_bsd_zero.hpp"
#include "classfile/vmSymbols.hpp"
#include "code/vtableStubs.hpp"
#include "interpreter/interpreter.hpp"
@ -36,6 +35,7 @@
#include "prims/jniFastGetField.hpp"
#include "prims/jvm_misc.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomicAccess.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/java.hpp"

View File

@ -23,13 +23,13 @@
*
*/
#ifndef OS_CPU_LINUX_AARCH64_ATOMIC_LINUX_AARCH64_HPP
#define OS_CPU_LINUX_AARCH64_ATOMIC_LINUX_AARCH64_HPP
#ifndef OS_CPU_LINUX_AARCH64_ATOMICACCESS_LINUX_AARCH64_HPP
#define OS_CPU_LINUX_AARCH64_ATOMICACCESS_LINUX_AARCH64_HPP
#include "atomic_aarch64.hpp"
#include "runtime/vm_version.hpp"
// Implementation of class atomic
// Implementation of class AtomicAccess
// Note that memory_order_conservative requires a full barrier after atomic stores.
// See https://patchwork.kernel.org/patch/3575821/
@ -217,4 +217,4 @@ struct AtomicAccess::PlatformOrderedStore<byte_size, RELEASE_X_FENCE>
void operator()(volatile T* p, T v) const { release_store(p, v); OrderAccess::fence(); }
};
#endif // OS_CPU_LINUX_AARCH64_ATOMIC_LINUX_AARCH64_HPP
#endif // OS_CPU_LINUX_AARCH64_ATOMICACCESS_LINUX_AARCH64_HPP

View File

@ -22,14 +22,14 @@
*
*/
#ifndef OS_CPU_LINUX_ARM_ATOMIC_LINUX_ARM_HPP
#define OS_CPU_LINUX_ARM_ATOMIC_LINUX_ARM_HPP
#ifndef OS_CPU_LINUX_ARM_ATOMICACCESS_LINUX_ARM_HPP
#define OS_CPU_LINUX_ARM_ATOMICACCESS_LINUX_ARM_HPP
#include "memory/allStatic.hpp"
#include "runtime/os.hpp"
#include "runtime/vm_version.hpp"
// Implementation of class atomic
// Implementation of class AtomicAccess
class ARMAtomicFuncs : AllStatic {
public:
@ -178,4 +178,4 @@ inline T AtomicAccess::PlatformCmpxchg<8>::operator()(T volatile* dest,
return cmpxchg_using_helper<int64_t>(reorder_cmpxchg_long_func, dest, compare_value, exchange_value);
}
#endif // OS_CPU_LINUX_ARM_ATOMIC_LINUX_ARM_HPP
#endif // OS_CPU_LINUX_ARM_ATOMICACCESS_LINUX_ARM_HPP

View File

@ -23,8 +23,8 @@
*
*/
#ifndef OS_CPU_LINUX_PPC_ATOMIC_LINUX_PPC_HPP
#define OS_CPU_LINUX_PPC_ATOMIC_LINUX_PPC_HPP
#ifndef OS_CPU_LINUX_PPC_ATOMICACCESS_LINUX_PPC_HPP
#define OS_CPU_LINUX_PPC_ATOMICACCESS_LINUX_PPC_HPP
#ifndef PPC64
#error "Atomic currently only implemented for PPC64"
@ -33,7 +33,7 @@
#include "orderAccess_linux_ppc.hpp"
#include "utilities/debug.hpp"
// Implementation of class atomic
// Implementation of class AtomicAccess
//
// machine barrier instructions:
@ -392,4 +392,4 @@ struct AtomicAccess::PlatformOrderedLoad<byte_size, X_ACQUIRE>
}
};
#endif // OS_CPU_LINUX_PPC_ATOMIC_LINUX_PPC_HPP
#endif // OS_CPU_LINUX_PPC_ATOMICACCESS_LINUX_PPC_HPP

View File

@ -23,12 +23,12 @@
*
*/
#ifndef OS_CPU_LINUX_RISCV_ATOMIC_LINUX_RISCV_HPP
#define OS_CPU_LINUX_RISCV_ATOMIC_LINUX_RISCV_HPP
#ifndef OS_CPU_LINUX_RISCV_ATOMICACCESS_LINUX_RISCV_HPP
#define OS_CPU_LINUX_RISCV_ATOMICACCESS_LINUX_RISCV_HPP
#include "runtime/vm_version.hpp"
// Implementation of class atomic
// Implementation of class AtomicAccess
// Note that memory_order_conservative requires a full barrier after atomic stores.
// See https://patchwork.kernel.org/patch/3575821/
@ -226,4 +226,4 @@ struct AtomicAccess::PlatformOrderedStore<byte_size, RELEASE_X_FENCE>
#undef FULL_COMPILER_ATOMIC_SUPPORT
#endif // OS_CPU_LINUX_RISCV_ATOMIC_LINUX_RISCV_HPP
#endif // OS_CPU_LINUX_RISCV_ATOMICACCESS_LINUX_RISCV_HPP

View File

@ -23,8 +23,8 @@
*
*/
#ifndef OS_CPU_LINUX_S390_ATOMIC_LINUX_S390_HPP
#define OS_CPU_LINUX_S390_ATOMIC_LINUX_S390_HPP
#ifndef OS_CPU_LINUX_S390_ATOMICACCESS_LINUX_S390_HPP
#define OS_CPU_LINUX_S390_ATOMICACCESS_LINUX_S390_HPP
#include "runtime/atomicAccess.hpp"
#include "runtime/os.hpp"
@ -345,4 +345,4 @@ struct AtomicAccess::PlatformOrderedLoad<byte_size, X_ACQUIRE>
T operator()(const volatile T* p) const { T t = *p; OrderAccess::acquire(); return t; }
};
#endif // OS_CPU_LINUX_S390_ATOMIC_LINUX_S390_HPP
#endif // OS_CPU_LINUX_S390_ATOMICACCESS_LINUX_S390_HPP

View File

@ -22,10 +22,10 @@
*
*/
#ifndef OS_CPU_LINUX_X86_ATOMIC_LINUX_X86_HPP
#define OS_CPU_LINUX_X86_ATOMIC_LINUX_X86_HPP
#ifndef OS_CPU_LINUX_X86_ATOMICACCESS_LINUX_X86_HPP
#define OS_CPU_LINUX_X86_ATOMICACCESS_LINUX_X86_HPP
// Implementation of class atomic
// Implementation of class AtomicAccess
template<size_t byte_size>
struct AtomicAccess::PlatformAdd {
@ -230,4 +230,4 @@ struct AtomicAccess::PlatformOrderedStore<8, RELEASE_X_FENCE>
};
#endif // AMD64
#endif // OS_CPU_LINUX_X86_ATOMIC_LINUX_X86_HPP
#endif // OS_CPU_LINUX_X86_ATOMICACCESS_LINUX_X86_HPP

View File

@ -23,12 +23,12 @@
*
*/
#ifndef OS_CPU_LINUX_ZERO_ATOMIC_LINUX_ZERO_HPP
#define OS_CPU_LINUX_ZERO_ATOMIC_LINUX_ZERO_HPP
#ifndef OS_CPU_LINUX_ZERO_ATOMICACCESS_LINUX_ZERO_HPP
#define OS_CPU_LINUX_ZERO_ATOMICACCESS_LINUX_ZERO_HPP
#include "orderAccess_linux_zero.hpp"
// Implementation of class atomic
// Implementation of class AtomicAccess
template<size_t byte_size>
struct AtomicAccess::PlatformAdd {
@ -149,4 +149,4 @@ inline void AtomicAccess::PlatformStore<8>::operator()(T volatile* dest,
__atomic_store(dest, &store_value, __ATOMIC_RELAXED);
}
#endif // OS_CPU_LINUX_ZERO_ATOMIC_LINUX_ZERO_HPP
#endif // OS_CPU_LINUX_ZERO_ATOMICACCESS_LINUX_ZERO_HPP

View File

@ -24,7 +24,6 @@
*/
#include "asm/assembler.inline.hpp"
#include "atomic_linux_zero.hpp"
#include "classfile/vmSymbols.hpp"
#include "code/vtableStubs.hpp"
#include "interpreter/interpreter.hpp"
@ -36,6 +35,7 @@
#include "prims/jniFastGetField.hpp"
#include "prims/jvm_misc.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomicAccess.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/java.hpp"

View File

@ -23,8 +23,8 @@
*
*/
#ifndef OS_CPU_WINDOWS_AARCH64_ATOMIC_WINDOWS_AARCH64_HPP
#define OS_CPU_WINDOWS_AARCH64_ATOMIC_WINDOWS_AARCH64_HPP
#ifndef OS_CPU_WINDOWS_AARCH64_ATOMICACCESS_WINDOWS_AARCH64_HPP
#define OS_CPU_WINDOWS_AARCH64_ATOMICACCESS_WINDOWS_AARCH64_HPP
#include <intrin.h>
#include "runtime/os.hpp"
@ -109,4 +109,4 @@ DEFINE_INTRINSIC_CMPXCHG(InterlockedCompareExchange64, __int64)
#undef DEFINE_INTRINSIC_CMPXCHG
#endif // OS_CPU_WINDOWS_AARCH64_ATOMIC_WINDOWS_AARCH64_HPP
#endif // OS_CPU_WINDOWS_AARCH64_ATOMICACCESS_WINDOWS_AARCH64_HPP

View File

@ -22,8 +22,8 @@
*
*/
#ifndef OS_CPU_WINDOWS_X86_ATOMIC_WINDOWS_X86_HPP
#define OS_CPU_WINDOWS_X86_ATOMIC_WINDOWS_X86_HPP
#ifndef OS_CPU_WINDOWS_X86_ATOMICACCESS_WINDOWS_X86_HPP
#define OS_CPU_WINDOWS_X86_ATOMICACCESS_WINDOWS_X86_HPP
#include <intrin.h>
#include "runtime/os.hpp"
@ -111,4 +111,4 @@ DEFINE_INTRINSIC_CMPXCHG(InterlockedCompareExchange64, __int64)
#undef DEFINE_INTRINSIC_CMPXCHG
#endif // OS_CPU_WINDOWS_X86_ATOMIC_WINDOWS_X86_HPP
#endif // OS_CPU_WINDOWS_X86_ATOMICACCESS_WINDOWS_X86_HPP

View File

@ -191,7 +191,7 @@ void AOTClassLinker::write_to_archive() {
assert_at_safepoint();
if (CDSConfig::is_dumping_aot_linked_classes()) {
AOTLinkedClassTable* table = AOTLinkedClassTable::get(CDSConfig::is_dumping_static_archive());
AOTLinkedClassTable* table = AOTLinkedClassTable::get();
table->set_boot(write_classes(nullptr, true));
table->set_boot2(write_classes(nullptr, false));
table->set_platform(write_classes(SystemDictionary::java_platform_loader(), false));
@ -212,16 +212,7 @@ Array<InstanceKlass*>* AOTClassLinker::write_classes(oop class_loader, bool is_j
continue;
}
if (ik->in_aot_cache() && CDSConfig::is_dumping_dynamic_archive()) {
if (CDSConfig::is_using_aot_linked_classes()) {
// This class was recorded as AOT-linked for the base archive,
// so there's no need to do so again for the dynamic archive.
} else {
list.append(ik);
}
} else {
list.append(ArchiveBuilder::current()->get_buffered_addr(ik));
}
list.append(ArchiveBuilder::current()->get_buffered_addr(ik));
}
if (list.length() == 0) {

View File

@ -46,8 +46,8 @@ bool AOTLinkedClassBulkLoader::_platform_completed = false;
bool AOTLinkedClassBulkLoader::_app_completed = false;
bool AOTLinkedClassBulkLoader::_all_completed = false;
void AOTLinkedClassBulkLoader::serialize(SerializeClosure* soc, bool is_static_archive) {
AOTLinkedClassTable::get(is_static_archive)->serialize(soc);
void AOTLinkedClassBulkLoader::serialize(SerializeClosure* soc) {
AOTLinkedClassTable::get()->serialize(soc);
}
bool AOTLinkedClassBulkLoader::class_preloading_finished() {
@ -117,27 +117,24 @@ void AOTLinkedClassBulkLoader::exit_on_exception(JavaThread* current) {
void AOTLinkedClassBulkLoader::load_classes_in_loader_impl(AOTLinkedClassCategory class_category, oop class_loader_oop, TRAPS) {
Handle h_loader(THREAD, class_loader_oop);
load_table(AOTLinkedClassTable::for_static_archive(), class_category, h_loader, CHECK);
load_table(AOTLinkedClassTable::for_dynamic_archive(), class_category, h_loader, CHECK);
AOTLinkedClassTable* table = AOTLinkedClassTable::get();
load_table(table, class_category, h_loader, CHECK);
// Initialize the InstanceKlasses of all archived heap objects that are reachable from the
// archived java class mirrors.
//
// Only the classes in the static archive can have archived mirrors.
AOTLinkedClassTable* static_table = AOTLinkedClassTable::for_static_archive();
switch (class_category) {
case AOTLinkedClassCategory::BOOT1:
// Delayed until finish_loading_javabase_classes(), as the VM is not ready to
// execute some of the <clinit> methods.
break;
case AOTLinkedClassCategory::BOOT2:
init_required_classes_for_loader(h_loader, static_table->boot2(), CHECK);
init_required_classes_for_loader(h_loader, table->boot2(), CHECK);
break;
case AOTLinkedClassCategory::PLATFORM:
init_required_classes_for_loader(h_loader, static_table->platform(), CHECK);
init_required_classes_for_loader(h_loader, table->platform(), CHECK);
break;
case AOTLinkedClassCategory::APP:
init_required_classes_for_loader(h_loader, static_table->app(), CHECK);
init_required_classes_for_loader(h_loader, table->app(), CHECK);
break;
case AOTLinkedClassCategory::UNREGISTERED:
ShouldNotReachHere();
@ -333,7 +330,7 @@ void AOTLinkedClassBulkLoader::load_hidden_class(ClassLoaderData* loader_data, I
}
void AOTLinkedClassBulkLoader::finish_loading_javabase_classes(TRAPS) {
init_required_classes_for_loader(Handle(), AOTLinkedClassTable::for_static_archive()->boot(), CHECK);
init_required_classes_for_loader(Handle(), AOTLinkedClassTable::get()->boot(), CHECK);
}
// Some AOT-linked classes for <class_loader> must be initialized early. This includes
@ -427,8 +424,7 @@ void AOTLinkedClassBulkLoader::replay_training_at_init(Array<InstanceKlass*>* cl
void AOTLinkedClassBulkLoader::replay_training_at_init_for_preloaded_classes(TRAPS) {
if (CDSConfig::is_using_aot_linked_classes() && TrainingData::have_data()) {
// Only static archive can have training data.
AOTLinkedClassTable* table = AOTLinkedClassTable::for_static_archive();
AOTLinkedClassTable* table = AOTLinkedClassTable::get();
replay_training_at_init(table->boot(), CHECK);
replay_training_at_init(table->boot2(), CHECK);
replay_training_at_init(table->platform(), CHECK);

View File

@ -57,7 +57,7 @@ class AOTLinkedClassBulkLoader : AllStatic {
static void init_required_classes_for_loader(Handle class_loader, Array<InstanceKlass*>* classes, TRAPS);
static void replay_training_at_init(Array<InstanceKlass*>* classes, TRAPS) NOT_CDS_RETURN;
public:
static void serialize(SerializeClosure* soc, bool is_static_archive) NOT_CDS_RETURN;
static void serialize(SerializeClosure* soc) NOT_CDS_RETURN;
static void load_javabase_classes(JavaThread* current) NOT_CDS_RETURN;
static void load_non_javabase_classes(JavaThread* current) NOT_CDS_RETURN;

View File

@ -27,8 +27,7 @@
#include "cds/serializeClosure.hpp"
#include "oops/array.hpp"
AOTLinkedClassTable AOTLinkedClassTable::_for_static_archive;
AOTLinkedClassTable AOTLinkedClassTable::_for_dynamic_archive;
AOTLinkedClassTable AOTLinkedClassTable::_instance;
void AOTLinkedClassTable::serialize(SerializeClosure* soc) {
soc->do_ptr((void**)&_boot);

View File

@ -39,10 +39,7 @@ class SerializeClosure;
// in a production run.
//
class AOTLinkedClassTable {
// The VM may load up to 2 CDS archives -- static and dynamic. Each
// archive can have its own AOTLinkedClassTable.
static AOTLinkedClassTable _for_static_archive;
static AOTLinkedClassTable _for_dynamic_archive;
static AOTLinkedClassTable _instance;
Array<InstanceKlass*>* _boot; // only java.base classes
Array<InstanceKlass*>* _boot2; // boot classes in other modules
@ -54,11 +51,8 @@ public:
_boot(nullptr), _boot2(nullptr),
_platform(nullptr), _app(nullptr) {}
static AOTLinkedClassTable* for_static_archive() { return &_for_static_archive; }
static AOTLinkedClassTable* for_dynamic_archive() { return &_for_dynamic_archive; }
static AOTLinkedClassTable* get(bool is_static_archive) {
return is_static_archive ? for_static_archive() : for_dynamic_archive();
static AOTLinkedClassTable* get() {
return &_instance;
}
Array<InstanceKlass*>* boot() const { return _boot; }
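A minimal sketch (not part of the diff; placeholder types instead of HotSpot's Array<InstanceKlass*> and InstanceKlass) of the accessor shape these hunks arrive at, where the static/dynamic table pair collapses into a single instance and the bool-selecting get(is_static_archive) disappears:

```cpp
// Simplified stand-in for AOTLinkedClassTable after the change above; the real
// class stores Array<InstanceKlass*>* lists and lives inside HotSpot.
#include <cstdio>
#include <vector>

class ToyLinkedClassTable {
  static ToyLinkedClassTable _instance;   // one table instead of a static/dynamic pair
  std::vector<const char*> _boot;
public:
  // Old shape: get(bool is_static_archive) picked one of two tables.
  // New shape: a single instance, so the parameter is gone.
  static ToyLinkedClassTable* get() { return &_instance; }
  void set_boot(const char* name)  { _boot.push_back(name); }
  size_t boot_count() const        { return _boot.size(); }
};

ToyLinkedClassTable ToyLinkedClassTable::_instance;

int main() {
  // Call sites no longer need to know which archive they are working on.
  ToyLinkedClassTable::get()->set_boot("java/lang/Object");
  std::printf("boot entries: %zu\n", ToyLinkedClassTable::get()->boot_count());
  return 0;
}
```

The same shape is what lets AOTLinkedClassBulkLoader::serialize and load_classes_in_loader_impl drop their is_static_archive handling in the earlier hunks.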

View File

@ -501,7 +501,7 @@ void AOTMetaspace::serialize(SerializeClosure* soc) {
StringTable::serialize_shared_table_header(soc);
HeapShared::serialize_tables(soc);
SystemDictionaryShared::serialize_dictionary_headers(soc);
AOTLinkedClassBulkLoader::serialize(soc, true);
AOTLinkedClassBulkLoader::serialize(soc);
FinalImageRecipes::serialize(soc);
TrainingData::serialize(soc);
InstanceMirrorKlass::serialize_offsets(soc);
@ -720,6 +720,7 @@ void VM_PopulateDumpSharedSpace::doit() {
_map_info->set_cloned_vtables(CppVtables::vtables_serialized_base());
_map_info->header()->set_class_location_config(cl_config);
HeapShared::delete_tables_with_raw_oops();
CDSConfig::set_is_at_aot_safepoint(false);
}
@ -1076,11 +1077,6 @@ bool AOTMetaspace::write_static_archive(ArchiveBuilder* builder, FileMapInfo* ma
return false;
}
builder->write_archive(map_info, heap_info);
if (AllowArchivingWithJavaAgent) {
aot_log_warning(aot)("This %s was created with AllowArchivingWithJavaAgent. It should be used "
"for testing purposes only and should not be used in a production environment", CDSConfig::type_of_archive_being_loaded());
}
return true;
}
@ -2001,7 +1997,7 @@ void AOTMetaspace::initialize_shared_spaces() {
if (dynamic_mapinfo != nullptr) {
intptr_t* buffer = (intptr_t*)dynamic_mapinfo->serialized_data();
ReadClosure rc(&buffer, (intptr_t)SharedBaseAddress);
ArchiveBuilder::serialize_dynamic_archivable_items(&rc);
DynamicArchive::serialize(&rc);
DynamicArchive::setup_array_klasses();
}

View File

@ -24,7 +24,6 @@
#include "cds/aotArtifactFinder.hpp"
#include "cds/aotClassLinker.hpp"
#include "cds/aotLinkedClassBulkLoader.hpp"
#include "cds/aotLogging.hpp"
#include "cds/aotMapLogger.hpp"
#include "cds/aotMetaspace.hpp"
@ -1015,13 +1014,6 @@ void ArchiveBuilder::make_training_data_shareable() {
_src_obj_table.iterate_all(clean_td);
}
void ArchiveBuilder::serialize_dynamic_archivable_items(SerializeClosure* soc) {
SymbolTable::serialize_shared_table_header(soc, false);
SystemDictionaryShared::serialize_dictionary_headers(soc, false);
DynamicArchive::serialize_array_klasses(soc);
AOTLinkedClassBulkLoader::serialize(soc, false);
}
uintx ArchiveBuilder::buffer_to_offset(address p) const {
address requested_p = to_requested(p);
assert(requested_p >= _requested_static_archive_bottom, "must be");

View File

@ -382,7 +382,6 @@ public:
bool gather_klass_and_symbol(MetaspaceClosure::Ref* ref, bool read_only);
bool gather_one_source_obj(MetaspaceClosure::Ref* ref, bool read_only);
void remember_embedded_pointer_in_enclosing_obj(MetaspaceClosure::Ref* ref);
static void serialize_dynamic_archivable_items(SerializeClosure* soc);
DumpRegion* pz_region() { return &_pz_region; }
DumpRegion* rw_region() { return &_rw_region; }

View File

@ -95,6 +95,11 @@ void ArchiveHeapWriter::init() {
}
}
void ArchiveHeapWriter::delete_tables_with_raw_oops() {
delete _source_objs;
_source_objs = nullptr;
}
void ArchiveHeapWriter::add_source_obj(oop src_obj) {
_source_objs->append(src_obj);
}
@ -145,7 +150,7 @@ oop ArchiveHeapWriter::requested_obj_from_buffer_offset(size_t offset) {
oop ArchiveHeapWriter::source_obj_to_requested_obj(oop src_obj) {
assert(CDSConfig::is_dumping_heap(), "dump-time only");
HeapShared::CachedOopInfo* p = HeapShared::archived_object_cache()->get(src_obj);
HeapShared::CachedOopInfo* p = HeapShared::get_cached_oop_info(src_obj);
if (p != nullptr) {
return requested_obj_from_buffer_offset(p->buffer_offset());
} else {
@ -154,9 +159,9 @@ oop ArchiveHeapWriter::source_obj_to_requested_obj(oop src_obj) {
}
oop ArchiveHeapWriter::buffered_addr_to_source_obj(address buffered_addr) {
oop* p = _buffer_offset_to_source_obj_table->get(buffered_address_to_offset(buffered_addr));
if (p != nullptr) {
return *p;
OopHandle* oh = _buffer_offset_to_source_obj_table->get(buffered_address_to_offset(buffered_addr));
if (oh != nullptr) {
return oh->resolve();
} else {
return nullptr;
}
@ -356,12 +361,13 @@ void ArchiveHeapWriter::copy_source_objs_to_buffer(GrowableArrayCHeap<oop, mtCla
for (int i = 0; i < _source_objs_order->length(); i++) {
int src_obj_index = _source_objs_order->at(i)._index;
oop src_obj = _source_objs->at(src_obj_index);
HeapShared::CachedOopInfo* info = HeapShared::archived_object_cache()->get(src_obj);
HeapShared::CachedOopInfo* info = HeapShared::get_cached_oop_info(src_obj);
assert(info != nullptr, "must be");
size_t buffer_offset = copy_one_source_obj_to_buffer(src_obj);
info->set_buffer_offset(buffer_offset);
_buffer_offset_to_source_obj_table->put_when_absent(buffer_offset, src_obj);
OopHandle handle(Universe::vm_global(), src_obj);
_buffer_offset_to_source_obj_table->put_when_absent(buffer_offset, handle);
_buffer_offset_to_source_obj_table->maybe_grow();
if (java_lang_Module::is_instance(src_obj)) {
@ -696,7 +702,7 @@ void ArchiveHeapWriter::relocate_embedded_oops(GrowableArrayCHeap<oop, mtClassSh
for (int i = 0; i < _source_objs_order->length(); i++) {
int src_obj_index = _source_objs_order->at(i)._index;
oop src_obj = _source_objs->at(src_obj_index);
HeapShared::CachedOopInfo* info = HeapShared::archived_object_cache()->get(src_obj);
HeapShared::CachedOopInfo* info = HeapShared::get_cached_oop_info(src_obj);
assert(info != nullptr, "must be");
oop requested_obj = requested_obj_from_buffer_offset(info->buffer_offset());
update_header_for_requested_obj(requested_obj, src_obj, src_obj->klass());
@ -758,7 +764,7 @@ void ArchiveHeapWriter::compute_ptrmap(ArchiveHeapInfo* heap_info) {
NativePointerInfo info = _native_pointers->at(i);
oop src_obj = info._src_obj;
int field_offset = info._field_offset;
HeapShared::CachedOopInfo* p = HeapShared::archived_object_cache()->get(src_obj);
HeapShared::CachedOopInfo* p = HeapShared::get_cached_oop_info(src_obj);
// requested_field_addr = the address of this field in the requested space
oop requested_obj = requested_obj_from_buffer_offset(p->buffer_offset());
Metadata** requested_field_addr = (Metadata**)(cast_from_oop<address>(requested_obj) + field_offset);

View File

@ -152,7 +152,7 @@ private:
};
static GrowableArrayCHeap<HeapObjOrder, mtClassShared>* _source_objs_order;
typedef ResizeableHashTable<size_t, oop,
typedef ResizeableHashTable<size_t, OopHandle,
AnyObj::C_HEAP,
mtClassShared> BufferOffsetToSourceObjectTable;
static BufferOffsetToSourceObjectTable* _buffer_offset_to_source_obj_table;
@ -227,6 +227,7 @@ private:
public:
static void init() NOT_CDS_JAVA_HEAP_RETURN;
static void delete_tables_with_raw_oops();
static void add_source_obj(oop src_obj);
static bool is_too_large_to_archive(size_t size);
static bool is_too_large_to_archive(oop obj);

View File

@ -470,10 +470,6 @@ void CDSConfig::check_aot_flags() {
assert(strcmp(AOTMode, "create") == 0, "checked by AOTModeConstraintFunc");
check_aotmode_create();
}
// This is an old flag used by CDS regression testing only. It doesn't apply
// to the AOT workflow.
FLAG_SET_ERGO(AllowArchivingWithJavaAgent, false);
}
void CDSConfig::check_aotmode_off() {
@ -716,13 +712,6 @@ bool CDSConfig::check_vm_args_consistency(bool patch_mod_javabase, bool mode_fla
}
}
if (is_dumping_classic_static_archive() && AOTClassLinking) {
if (JvmtiAgentList::disable_agent_list()) {
FLAG_SET_ERGO(AllowArchivingWithJavaAgent, false);
log_warning(cds)("Disabled all JVMTI agents with -Xshare:dump -XX:+AOTClassLinking");
}
}
return true;
}
@ -756,6 +745,13 @@ void CDSConfig::setup_compiler_args() {
void CDSConfig::prepare_for_dumping() {
assert(CDSConfig::is_dumping_archive(), "sanity");
if (is_dumping_dynamic_archive() && AOTClassLinking) {
if (FLAG_IS_CMDLINE(AOTClassLinking)) {
log_warning(cds)("AOTClassLinking is not supported for dynamic CDS archive");
}
FLAG_SET_ERGO(AOTClassLinking, false);
}
if (is_dumping_dynamic_archive() && !is_using_archive()) {
assert(!is_dumping_static_archive(), "cannot be dumping both static and dynamic archives");
@ -1014,11 +1010,10 @@ void CDSConfig::stop_using_full_module_graph(const char* reason) {
}
bool CDSConfig::is_dumping_aot_linked_classes() {
if (is_dumping_preimage_static_archive()) {
return false;
} else if (is_dumping_dynamic_archive()) {
return is_using_full_module_graph() && AOTClassLinking;
} else if (is_dumping_static_archive()) {
if (is_dumping_classic_static_archive() || is_dumping_final_static_archive()) {
// FMG is required to guarantee that all cached boot/platform/app classes
// are visible in the production run, so they can be unconditionally
// loaded during VM bootstrap.
return is_dumping_full_module_graph() && AOTClassLinking;
} else {
return false;

View File

@ -36,6 +36,7 @@
#include "oops/fieldStreams.inline.hpp"
#include "oops/klass.inline.hpp"
#include "oops/oop.inline.hpp"
#include "oops/oopHandle.inline.hpp"
#include "runtime/fieldDescriptor.inline.hpp"
#if INCLUDE_CDS_JAVA_HEAP
@ -273,7 +274,8 @@ void CDSHeapVerifier::add_static_obj_field(InstanceKlass* ik, oop field, Symbol*
// This function is called once for every archived heap object. Warn if this object is referenced by
// a static field of a class that's not aot-initialized.
inline bool CDSHeapVerifier::do_entry(oop& orig_obj, HeapShared::CachedOopInfo& value) {
inline bool CDSHeapVerifier::do_entry(OopHandle& orig_obj_handle, HeapShared::CachedOopInfo& value) {
oop orig_obj = orig_obj_handle.resolve();
_archived_objs++;
if (java_lang_String::is_instance(orig_obj) && HeapShared::is_dumped_interned_string(orig_obj)) {
@ -323,7 +325,7 @@ public:
// Call this function (from gdb, etc) if you want to know why an object is archived.
void CDSHeapVerifier::trace_to_root(outputStream* st, oop orig_obj) {
HeapShared::CachedOopInfo* info = HeapShared::archived_object_cache()->get(orig_obj);
HeapShared::CachedOopInfo* info = HeapShared::get_cached_oop_info(orig_obj);
if (info != nullptr) {
trace_to_root(st, orig_obj, nullptr, info);
} else {
@ -357,7 +359,7 @@ const char* static_field_name(oop mirror, oop field) {
int CDSHeapVerifier::trace_to_root(outputStream* st, oop orig_obj, oop orig_field, HeapShared::CachedOopInfo* info) {
int level = 0;
if (info->orig_referrer() != nullptr) {
HeapShared::CachedOopInfo* ref = HeapShared::archived_object_cache()->get(info->orig_referrer());
HeapShared::CachedOopInfo* ref = HeapShared::get_cached_oop_info(info->orig_referrer());
assert(ref != nullptr, "sanity");
level = trace_to_root(st, info->orig_referrer(), orig_obj, ref) + 1;
} else if (java_lang_String::is_instance(orig_obj)) {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2022, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -27,6 +27,7 @@
#include "cds/heapShared.hpp"
#include "memory/iterator.hpp"
#include "oops/oopHandle.hpp"
#include "utilities/growableArray.hpp"
#include "utilities/hashTable.hpp"
@ -80,7 +81,7 @@ public:
virtual void do_klass(Klass* k);
// For HashTable::iterate()
inline bool do_entry(oop& orig_obj, HeapShared::CachedOopInfo& value);
inline bool do_entry(OopHandle& orig_obj, HeapShared::CachedOopInfo& value);
static void verify();

View File

@ -63,10 +63,6 @@
"Average number of symbols per bucket in shared table") \
range(2, 246) \
\
product(bool, AllowArchivingWithJavaAgent, false, DIAGNOSTIC, \
"Allow Java agent to be run with CDS dumping (not applicable" \
" to AOT") \
\
develop(ccstr, ArchiveHeapTestClass, nullptr, \
"For JVM internal testing only. The static field named " \
"\"archivedObjects\" of the specified class is stored in the " \

View File

@ -160,11 +160,10 @@ public:
SystemDictionaryShared::write_to_archive(false);
cl_config = AOTClassLocationConfig::dumptime()->write_to_archive();
DynamicArchive::dump_array_klasses();
AOTClassLinker::write_to_archive();
serialized_data = ro_region()->top();
WriteClosure wc(ro_region());
ArchiveBuilder::serialize_dynamic_archivable_items(&wc);
DynamicArchive::serialize(&wc);
}
if (CDSConfig::is_dumping_lambdas_in_legacy_mode()) {
@ -396,11 +395,6 @@ public:
VMOp_Type type() const { return VMOp_PopulateDumpSharedSpace; }
void doit() {
ResourceMark rm;
if (AllowArchivingWithJavaAgent) {
aot_log_warning(aot)("This %s was created with AllowArchivingWithJavaAgent. It should be used "
"for testing purposes only and should not be used in a production environment",
CDSConfig::type_of_archive_being_loaded());
}
AOTClassLocationConfig::dumptime_check_nonempty_dirs();
_builder.doit();
}
@ -414,6 +408,12 @@ public:
GrowableArray<ObjArrayKlass*>* DynamicArchive::_array_klasses = nullptr;
Array<ObjArrayKlass*>* DynamicArchive::_dynamic_archive_array_klasses = nullptr;
void DynamicArchive::serialize(SerializeClosure* soc) {
SymbolTable::serialize_shared_table_header(soc, false);
SystemDictionaryShared::serialize_dictionary_headers(soc, false);
soc->do_ptr(&_dynamic_archive_array_klasses);
}
void DynamicArchive::append_array_klass(ObjArrayKlass* ak) {
if (_array_klasses == nullptr) {
_array_klasses = new (mtClassShared) GrowableArray<ObjArrayKlass*>(50, mtClassShared);
@ -456,10 +456,6 @@ void DynamicArchive::setup_array_klasses() {
}
}
void DynamicArchive::serialize_array_klasses(SerializeClosure* soc) {
soc->do_ptr(&_dynamic_archive_array_klasses);
}
void DynamicArchive::make_array_klasses_shareable() {
if (_array_klasses != nullptr) {
int num_array_klasses = _array_klasses->length();

View File

@ -71,7 +71,7 @@ public:
static void dump_array_klasses();
static void setup_array_klasses();
static void append_array_klass(ObjArrayKlass* oak);
static void serialize_array_klasses(SerializeClosure* soc);
static void serialize(SerializeClosure* soc);
static void make_array_klasses_shareable();
static void post_dump();
static int num_array_klasses();
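A minimal sketch (not part of the diff; the closure type below is a simplified stand-in for HotSpot's SerializeClosure) of the write/read symmetry the consolidated DynamicArchive::serialize relies on: the same routine is driven by a WriteClosure over the ro region at dump time and by a ReadClosure over the mapped archive at startup, as the dynamicArchive and aotMetaspace hunks above show.

```cpp
// Toy closure-driven serialize pattern: one routine covers both directions,
// so the write and read sides cannot drift apart.
#include <cassert>
#include <cstdio>
#include <vector>

struct SimpleClosure {
  bool writing;
  std::vector<void*> buffer;   // stands in for the archived ro region
  size_t cursor = 0;
  void do_ptr(void** p) {
    if (writing) {
      buffer.push_back(*p);    // dump time: record the pointer
    } else {
      *p = buffer[cursor++];   // run time: restore it in the same order
    }
  }
};

static int g_payload = 42;     // placeholder for _dynamic_archive_array_klasses

static void serialize(SimpleClosure* soc, void** array_klasses) {
  soc->do_ptr(array_klasses);  // same visit order in both directions
}

int main() {
  void* dump_side = &g_payload;
  SimpleClosure wc{true};
  serialize(&wc, &dump_side);              // analogous to the WriteClosure at dump time

  void* run_side = nullptr;
  SimpleClosure rc{false, wc.buffer};
  serialize(&rc, &run_side);               // analogous to the ReadClosure at startup
  assert(run_side == dump_side);
  std::printf("restored pointer matches: %s\n", run_side == dump_side ? "yes" : "no");
  return 0;
}
```

Folding the symbol-table and dictionary headers plus the array-klass pointer into one DynamicArchive::serialize keeps that ordering in a single place instead of split across ArchiveBuilder and DynamicArchive.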

View File

@ -259,7 +259,6 @@ void FileMapHeader::populate(FileMapInfo *info, size_t core_region_alignment,
_has_platform_or_app_classes = AOTClassLocationConfig::dumptime()->has_platform_or_app_classes();
_requested_base_address = (char*)SharedBaseAddress;
_mapped_base_address = (char*)SharedBaseAddress;
_allow_archiving_with_java_agent = AllowArchivingWithJavaAgent;
}
void FileMapHeader::copy_base_archive_name(const char* archive) {
@ -316,7 +315,6 @@ void FileMapHeader::print(outputStream* st) {
st->print_cr("- _heap_ptrmap_start_pos: %zu", _heap_ptrmap_start_pos);
st->print_cr("- _rw_ptrmap_start_pos: %zu", _rw_ptrmap_start_pos);
st->print_cr("- _ro_ptrmap_start_pos: %zu", _ro_ptrmap_start_pos);
st->print_cr("- allow_archiving_with_java_agent:%d", _allow_archiving_with_java_agent);
st->print_cr("- use_optimized_module_handling: %d", _use_optimized_module_handling);
st->print_cr("- has_full_module_graph %d", _has_full_module_graph);
st->print_cr("- has_aot_linked_classes %d", _has_aot_linked_classes);
@ -2051,21 +2049,6 @@ bool FileMapHeader::validate() {
_has_platform_or_app_classes = false;
}
// Java agents are allowed during run time. Therefore, the following condition is not
// checked: (!_allow_archiving_with_java_agent && AllowArchivingWithJavaAgent)
// Note: _allow_archiving_with_java_agent is set in the shared archive during dump time
// while AllowArchivingWithJavaAgent is set during the current run.
if (_allow_archiving_with_java_agent && !AllowArchivingWithJavaAgent) {
AOTMetaspace::report_loading_error("The setting of the AllowArchivingWithJavaAgent is different "
"from the setting in the %s.", file_type);
return false;
}
if (_allow_archiving_with_java_agent) {
aot_log_warning(aot)("This %s was created with AllowArchivingWithJavaAgent. It should be used "
"for testing purposes only and should not be used in a production environment", file_type);
}
aot_log_info(aot)("The %s was created with UseCompressedOops = %d, UseCompressedClassPointers = %d, UseCompactObjectHeaders = %d",
file_type, compressed_oops(), compressed_class_pointers(), compact_headers());
if (compressed_oops() != UseCompressedOops || compressed_class_pointers() != UseCompressedClassPointers) {

View File

@ -135,7 +135,6 @@ private:
char* _requested_base_address; // Archive relocation is not necessary if we map with this base address.
char* _mapped_base_address; // Actual base address where archive is mapped.
bool _allow_archiving_with_java_agent; // setting of the AllowArchivingWithJavaAgent option
bool _use_optimized_module_handling;// No module-relation VM options were specified, so we can skip
// some expensive operations.
bool _has_aot_linked_classes; // Was the CDS archive created with -XX:+AOTClassLinking

View File

@ -58,6 +58,7 @@
#include "oops/fieldStreams.inline.hpp"
#include "oops/objArrayOop.inline.hpp"
#include "oops/oop.inline.hpp"
#include "oops/oopHandle.inline.hpp"
#include "oops/typeArrayOop.inline.hpp"
#include "prims/jvmtiExport.hpp"
#include "runtime/arguments.hpp"
@ -159,12 +160,35 @@ bool HeapShared::is_subgraph_root_class(InstanceKlass* ik) {
is_subgraph_root_class_of(fmg_archive_subgraph_entry_fields, ik);
}
oop HeapShared::CachedOopInfo::orig_referrer() const {
return _orig_referrer.resolve();
}
unsigned HeapShared::oop_hash(oop const& p) {
assert(SafepointSynchronize::is_at_safepoint() ||
JavaThread::current()->is_in_no_safepoint_scope(), "sanity");
// Do not call p->identity_hash() as that will update the
// object header.
return primitive_hash(cast_from_oop<intptr_t>(p));
}
unsigned int HeapShared::oop_handle_hash_raw(const OopHandle& oh) {
return oop_hash(oh.resolve());
}
unsigned int HeapShared::oop_handle_hash(const OopHandle& oh) {
oop o = oh.resolve();
if (o == nullptr) {
return 0;
} else {
return o->identity_hash();
}
}
bool HeapShared::oop_handle_equals(const OopHandle& a, const OopHandle& b) {
return a.resolve() == b.resolve();
}
static void reset_states(oop obj, TRAPS) {
Handle h_obj(THREAD, obj);
InstanceKlass* klass = InstanceKlass::cast(obj->klass());
@ -216,7 +240,8 @@ HeapShared::ArchivedObjectCache* HeapShared::_archived_object_cache = nullptr;
bool HeapShared::has_been_archived(oop obj) {
assert(CDSConfig::is_dumping_heap(), "dump-time only");
return archived_object_cache()->get(obj) != nullptr;
OopHandle oh(&obj);
return archived_object_cache()->get(oh) != nullptr;
}
int HeapShared::append_root(oop obj) {
@ -303,7 +328,9 @@ bool HeapShared::archive_object(oop obj, oop referrer, KlassSubGraphInfo* subgra
count_allocation(obj->size());
ArchiveHeapWriter::add_source_obj(obj);
CachedOopInfo info = make_cached_oop_info(obj, referrer);
archived_object_cache()->put_when_absent(obj, info);
OopHandle oh(Universe::vm_global(), obj);
archived_object_cache()->put_when_absent(oh, info);
archived_object_cache()->maybe_grow();
mark_native_pointers(obj);
@ -636,14 +663,16 @@ void HeapShared::mark_native_pointers(oop orig_obj) {
}
void HeapShared::get_pointer_info(oop src_obj, bool& has_oop_pointers, bool& has_native_pointers) {
CachedOopInfo* info = archived_object_cache()->get(src_obj);
OopHandle oh(&src_obj);
CachedOopInfo* info = archived_object_cache()->get(oh);
assert(info != nullptr, "must be");
has_oop_pointers = info->has_oop_pointers();
has_native_pointers = info->has_native_pointers();
}
void HeapShared::set_has_native_pointers(oop src_obj) {
CachedOopInfo* info = archived_object_cache()->get(src_obj);
OopHandle oh(&src_obj);
CachedOopInfo* info = archived_object_cache()->get(oh);
assert(info != nullptr, "must be");
info->set_has_native_pointers();
}
@ -1453,7 +1482,7 @@ public:
HeapShared::CachedOopInfo HeapShared::make_cached_oop_info(oop obj, oop referrer) {
PointsToOopsChecker points_to_oops_checker;
obj->oop_iterate(&points_to_oops_checker);
return CachedOopInfo(referrer, points_to_oops_checker.result());
return CachedOopInfo(OopHandle(Universe::vm_global(), referrer), points_to_oops_checker.result());
}
void HeapShared::init_box_classes(TRAPS) {
@ -2096,6 +2125,18 @@ bool HeapShared::is_dumped_interned_string(oop o) {
return _dumped_interned_strings->get(o) != nullptr;
}
// These tables should be used only within the CDS safepoint, so
// delete them before we exit the safepoint. Otherwise the table will
// contain bad oops after a GC.
void HeapShared::delete_tables_with_raw_oops() {
assert(_seen_objects_table == nullptr, "should have been deleted");
delete _dumped_interned_strings;
_dumped_interned_strings = nullptr;
ArchiveHeapWriter::delete_tables_with_raw_oops();
}
void HeapShared::debug_trace() {
ResourceMark rm;
oop referrer = _object_being_archived.referrer();
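A minimal sketch (not part of the diff; the handle, object, and table types are toy placeholders, not HotSpot's OopHandle or ResizeableHashTable) of why these hunks stop keying dump-time tables on raw oops: a raw address stops matching once the collector moves the object, while a handle registered as a GC root keeps resolving to the current location, and any table that still holds raw oops is deleted before the CDS safepoint is left.

```cpp
// Illustrative only: a "heap" where an object can be relocated, a handle whose
// root slot the collector updates, and a map keyed by the raw address.
#include <cstdio>
#include <unordered_map>

struct Object { int id; };

struct ToyHandle {
  Object** slot;                          // root slot the collector knows about
  Object* resolve() const { return *slot; }
};

int main() {
  Object original{1};
  Object relocated = original;            // pretend the GC copies the object here
  Object* root = &original;               // the "vm_global" root slot
  ToyHandle h{&root};

  // Keying on the raw address breaks once the object moves:
  std::unordered_map<Object*, const char*> raw_keyed;
  raw_keyed[&original] = "cached info";
  root = &relocated;                      // simulated relocation updates the root
  std::printf("raw key still found: %s\n",
              raw_keyed.count(&relocated) ? "yes" : "no");   // prints "no"

  // Resolving through the handle always yields the current location:
  std::printf("handle resolves to id %d at %p\n",
              h.resolve()->id, static_cast<void*>(h.resolve()));
  return 0;
}
```

This is also why the hunk above adds oop_handle_hash based on identity_hash(), which stays stable across relocation, while the address-based oop_hash is asserted to be used only at a safepoint, and why delete_tables_with_raw_oops tears the remaining raw-oop tables down before the safepoint ends.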

Some files were not shown because too many files have changed in this diff.