Merge branch 'master' into cipher-ffm

Matthew Donovan 2025-09-17 08:30:20 -04:00
commit 376a37565e
773 changed files with 17862 additions and 10191 deletions


@ -1,6 +1,6 @@
#!/bin/sh
#
# Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2019, 2025, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
@ -52,12 +52,39 @@
# include the SCM state that was used to build it, which can be found in ${JDK_N_INSTALL}/release,
# in property "SOURCE".
source_path="$(dirname ${0})"
this_script_dir="$(cd -- "${source_path}" > /dev/null && pwd)"
if test -z "${this_script_dir}"; then
echo "Error: Could not determine location of this script"
exit 1
fi
symbols_dir="$(dirname $this_script_dir)/src/jdk.compiler/share/data/symbols"
if [ ! -d $symbols_dir ] ; then
echo "Cannot locate symbols directory: $symbols_dir" >&2
exit 1
fi
generator_dir="$(dirname $this_script_dir)/make/langtools/src/classes/build/tools/symbolgenerator"
if [ "$1x" = "x" ] ; then
echo "Must provide the target JDK as a parameter:" >&2
echo "$0 <target-jdk>" >&2
exit 1
fi;
if [ ! -d $1 ] ; then
echo "Target JDK argument is not a directory:" $1 >&2
exit 1
fi;
if [ ! -x $1/bin/java ] ; then
echo "Target JDK argument is not a valid JDK: $1" >&2
exit 1
fi;
cd $symbols_dir
if [ ! -f symbols ] ; then
echo "Must run inside the src/jdk.compiler/share/data/symbols directory" >&2
exit 1
@ -72,5 +99,5 @@ $1/bin/java --add-exports jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED \
--add-exports jdk.compiler/com.sun.tools.javac.jvm=ALL-UNNAMED \
--add-exports jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED \
--add-modules jdk.jdeps \
../../../../../make/langtools/src/classes/build/tools/symbolgenerator/CreateSymbols.java \
$generator_dir/CreateSymbols.java \
build-description-incremental symbols include.list


@ -1,6 +1,6 @@
#! /bin/sh -f
#
# Copyright (c) 2012, 2020, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2012, 2025, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
@ -62,7 +62,7 @@ B=`basename "${script_directory}"`
script_dir="`cd \"${D}\" 2>/dev/null && pwd || echo \"${D}\"`/${B}"
# set up a variable for the template directory
template_dir=${script_dir}/../data/license-templates
template_dir=${script_dir}/../make/data/license-templates
# Check existence of the template directory.
if [ ! -d ${template_dir} ] ; then


@ -1,191 +0,0 @@
#
# Copyright (c) 2014, 2025, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 only, as
# published by the Free Software Foundation.
#
# This code is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# version 2 for more details (a copy is included in the LICENSE file that
# accompanied this code).
#
# You should have received a copy of the GNU General Public License version
# 2 along with this work; if not, write to the Free Software Foundation,
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
# or visit www.oracle.com if you need additional information or have any
# questions.
#
src/bsd : jdk/src/bsd
src/demo : jdk/src/demo
src/java.activation : jaxws/src/java.activation
src/java.base : jdk/src/java.base
src/java.compiler : langtools/src/java.compiler
src/java.corba : corba/src/java.corba
src/java.datatransfer : jdk/src/java.datatransfer
src/java.desktop : jdk/src/java.desktop
src/java.instrument : jdk/src/java.instrument
src/java.logging : jdk/src/java.logging
src/java.management : jdk/src/java.management
src/java.management.rmi : jdk/src/java.management.rmi
src/java.naming : jdk/src/java.naming
src/java.prefs : jdk/src/java.prefs
src/java.rmi : jdk/src/java.rmi
src/java.scripting : jdk/src/java.scripting
src/java.se : jdk/src/java.se
src/java.security.jgss : jdk/src/java.security.jgss
src/java.security.sasl : jdk/src/java.security.sasl
src/java.se.ee : jdk/src/java.se.ee
src/java.smartcardio : jdk/src/java.smartcardio
src/java.sql : jdk/src/java.sql
src/java.sql.rowset : jdk/src/java.sql.rowset
src/java.transaction : jdk/src/java.transaction
src/java.xml : jaxp/src/java.xml
src/java.xml.bind : jaxws/src/java.xml.bind
src/java.xml.crypto : jdk/src/java.xml.crypto
src/java.xml.ws : jaxws/src/java.xml.ws
src/java.xml.ws.annotation : jaxws/src/java.xml.ws.annotation
src/jdk.accessibility : jdk/src/jdk.accessibility
src/jdk.aot : hotspot/src/jdk.aot
src/jdk.attach : jdk/src/jdk.attach
src/jdk.charsets : jdk/src/jdk.charsets
src/jdk.compiler : jdk/src/jdk.compiler langtools/src/jdk.compiler
src/jdk.crypto.cryptoki : jdk/src/jdk.crypto.cryptoki
src/jdk.crypto.ec : jdk/src/jdk.crypto.ec
src/jdk.crypto.mscapi : jdk/src/jdk.crypto.mscapi
src/jdk.dynalink : nashorn/src/jdk.dynalink
src/jdk.editpad : jdk/src/jdk.editpad
src/jdk.hotspot.agent : hotspot/src/jdk.hotspot.agent
src/jdk.httpserver : jdk/src/jdk.httpserver
src/jdk.incubator.httpclient : jdk/src/jdk.incubator.httpclient
src/jdk.internal.ed : jdk/src/jdk.internal.ed
src/jdk.internal.jvmstat : jdk/src/jdk.internal.jvmstat
src/jdk.internal.le : jdk/src/jdk.internal.le
src/jdk.internal.opt : jdk/src/jdk.internal.opt
src/jdk.internal.vm.ci : hotspot/src/jdk.internal.vm.ci
src/jdk.internal.vm.compiler : hotspot/src/jdk.internal.vm.compiler
src/jdk.jartool : jdk/src/jdk.jartool
src/jdk.javadoc : langtools/src/jdk.javadoc
src/jdk.jcmd : jdk/src/jdk.jcmd
src/jdk.jconsole : jdk/src/jdk.jconsole
src/jdk.jdeps : langtools/src/jdk.jdeps
src/jdk.jdi : jdk/src/jdk.jdi
src/jdk.jdwp.agent : jdk/src/jdk.jdwp.agent
src/jdk.jlink : jdk/src/jdk.jlink
src/jdk.jshell : langtools/src/jdk.jshell
src/jdk.jstatd : jdk/src/jdk.jstatd
src/jdk.localedata : jdk/src/jdk.localedata
src/jdk.management : jdk/src/jdk.management
src/jdk.management.agent : jdk/src/jdk.management.agent
src/jdk.naming.dns : jdk/src/jdk.naming.dns
src/jdk.naming.rmi : jdk/src/jdk.naming.rmi
src/jdk.net : jdk/src/jdk.net
src/jdk.pack : jdk/src/jdk.pack
src/jdk.scripting.nashorn : nashorn/src/jdk.scripting.nashorn
src/jdk.scripting.nashorn.shell : nashorn/src/jdk.scripting.nashorn.shell
src/jdk.sctp : jdk/src/jdk.sctp
src/jdk.security.auth : jdk/src/jdk.security.auth
src/jdk.security.jgss : jdk/src/jdk.security.jgss
src/jdk.unsupported : jdk/src/jdk.unsupported
src/jdk.xml.bind : jaxws/src/jdk.xml.bind
src/jdk.xml.dom : jaxp/src/jdk.xml.dom
src/jdk.xml.ws : jaxws/src/jdk.xml.ws
src/jdk.zipfs : jdk/src/jdk.zipfs
src/langtools/sample : langtools/src/sample
src/linux : jdk/src/linux
src/sample : jdk/src/sample
src/hotspot/share : hotspot/src/share/vm
src/hotspot/cpu/aarch64 : hotspot/src/cpu/aarch64/vm
src/hotspot/cpu/arm : hotspot/src/cpu/arm/vm
src/hotspot/cpu/ppc : hotspot/src/cpu/ppc/vm
src/hotspot/cpu/s390 : hotspot/src/cpu/s390/vm
src/hotspot/cpu/x86 : hotspot/src/cpu/x86/vm
src/hotspot/cpu/zero : hotspot/src/cpu/zero/vm
src/hotspot/os/aix : hotspot/src/os/aix/vm
src/hotspot/os/bsd : hotspot/src/os/bsd/vm
src/hotspot/os/linux : hotspot/src/os/linux/vm
src/hotspot/os/posix/dtrace : hotspot/src/os/posix/dtrace
src/hotspot/os/posix : hotspot/src/os/posix/vm
src/hotspot/os/windows : hotspot/src/os/windows/vm
src/hotspot/os_cpu/aix_ppc : hotspot/src/os_cpu/aix_ppc/vm
src/hotspot/os_cpu/bsd_x86 : hotspot/src/os_cpu/bsd_x86/vm
src/hotspot/os_cpu/bsd_zero : hotspot/src/os_cpu/bsd_zero/vm
src/hotspot/os_cpu/linux_aarch64 : hotspot/src/os_cpu/linux_aarch64/vm
src/hotspot/os_cpu/linux_arm : hotspot/src/os_cpu/linux_arm/vm
src/hotspot/os_cpu/linux_ppc : hotspot/src/os_cpu/linux_ppc/vm
src/hotspot/os_cpu/linux_s390 : hotspot/src/os_cpu/linux_s390/vm
src/hotspot/os_cpu/linux_x86 : hotspot/src/os_cpu/linux_x86/vm
src/hotspot/os_cpu/linux_zero : hotspot/src/os_cpu/linux_zero/vm
src/hotspot/os_cpu/windows_x86 : hotspot/src/os_cpu/windows_x86/vm
src/hotspot : hotspot/src
src/utils/IdealGraphVisualizer : hotspot/src/share/tools/IdealGraphVisualizer
src/utils/LogCompilation : hotspot/src/share/tools/LogCompilation
src/utils/hsdis : hotspot/src/share/tools/hsdis
src/utils/reorder : jdk/make/non-build-utils/reorder
src/utils/src/build : jdk/make/non-build-utils/src/build
make/BuildNashorn.gmk : nashorn/make/BuildNashorn.gmk
make/CompileDemos.gmk : jdk/make/CompileDemos.gmk
make/CompileInterimLangtools.gmk : langtools/make/CompileInterim.gmk
make/CompileModuleTools.gmk : jdk/make/CompileModuleTools.gmk
make/CompileToolsHotspot.gmk : hotspot/make/CompileTools.gmk
make/CompileToolsJdk.gmk : jdk/make/CompileTools.gmk
make/CopyInterimCLDRConverter.gmk : jdk/make/CopyInterimCLDRConverter.gmk
make/GenerateModuleSummary.gmk : jdk/make/GenerateModuleSummary.gmk
make/ModuleTools.gmk : jdk/make/ModuleTools.gmk
make/ToolsJdk.gmk : jdk/make/Tools.gmk
make/ToolsLangtools.gmk : langtools/make/Tools.gmk
make/UnpackSecurity.gmk : jdk/make/UnpackSecurity.gmk
make/autoconf : common/autoconf
make/conf : common/conf
make/copy : jdk/make/copy
make/copy/Copy-java.corba.gmk : corba/make/copy/Copy-java.corba.gmk
make/corba : corba/make
make/data : jdk/make/data
make/gendata : jdk/make/gendata
make/gendata/Gendata-jdk.compiler.gmk : langtools/make/gendata/Gendata-jdk.compiler.gmk
make/gensrc : jdk/make/gensrc
make/gensrc/Gensrc-java.corba.gmk : corba/make/gensrc/Gensrc-java.corba.gmk
make/gensrc/Gensrc-jdk.compiler.gmk : langtools/make/gensrc/Gensrc-jdk.compiler.gmk
make/gensrc/Gensrc-jdk.hotspot.agent.gmk : hotspot/make/gensrc/Gensrc-jdk.hotspot.agent.gmk
make/gensrc/Gensrc-jdk.internal.vm.compiler.gmk : hotspot/make/gensrc/Gensrc-jdk.internal.vm.compiler.gmk
make/gensrc/Gensrc-jdk.javadoc.gmk : langtools/make/gensrc/Gensrc-jdk.javadoc.gmk
make/gensrc/Gensrc-jdk.jdeps.gmk : langtools/make/gensrc/Gensrc-jdk.jdeps.gmk
make/gensrc/Gensrc-jdk.jshell.gmk : langtools/make/gensrc/Gensrc-jdk.jshell.gmk
make/gensrc/GensrcCommonLangtools.gmk : langtools/make/gensrc/GensrcCommon.gmk
make/hotspot : hotspot/make
make/jdk : jdk/make
make/langtools : langtools/make
make/launcher : jdk/make/launcher
make/lib : jdk/make/lib
make/lib/Lib-jdk.hotspot.agent.gmk : hotspot/make/lib/Lib-jdk.hotspot.agent.gmk
make/mapfiles : jdk/make/mapfiles
make/mapfiles/libjsig : hotspot/make/mapfiles/libjsig
make/mapfiles/libjvm_db : hotspot/make/mapfiles/libjvm_db
make/mapfiles/libjvm_dtrace : hotspot/make/mapfiles/libjvm_dtrace
make/mapfiles/libsaproc : hotspot/make/mapfiles/libsaproc
make/nashorn : nashorn/make
make/nb_native : common/nb_native
make/scripts/addNotices.sh : jdk/make/scripts/addNotices.sh
make/scripts/compare.sh : common/bin/compare.sh
make/scripts/compare_exceptions.sh.incl : common/bin/compare_exceptions.sh.incl
make/scripts/genExceptions.sh : jdk/make/scripts/genExceptions.sh
make/scripts/hide_important_warnings_from_javac.sh : common/bin/hide_important_warnings_from_javac.sh
make/scripts/logger.sh : common/bin/logger.sh
make/src/native/fixpath.c : common/src/fixpath.c
make/test/JtregNativeHotspot.gmk : hotspot/make/test/JtregNative.gmk
make/test/JtregNativeJdk.gmk : jdk/make/test/JtregNative.gmk
test/jdk : jdk/test
test/langtools : langtools/test
test/nashorn : nashorn/test
test/jaxp : jaxp/test
test/hotspot/gtest : hotspot/test/native
test/hotspot/jtreg : hotspot/test
bin : common/bin
bin/nashorn : nashorn/bin
doc : common/doc
doc/nashorn : nashorn/docs


@ -1,237 +0,0 @@
#!/bin/bash
#
# Copyright (c) 2014, 2017, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 only, as
# published by the Free Software Foundation.
#
# This code is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# version 2 for more details (a copy is included in the LICENSE file that
# accompanied this code).
#
# You should have received a copy of the GNU General Public License version
# 2 along with this work; if not, write to the Free Software Foundation,
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
# or visit www.oracle.com if you need additional information or have any
# questions.
#
# Script for updating a patch file as per the shuffled/unshuffled source location.
usage() {
echo "Usage: $0 [-h|--help] [-v|--verbose] [-to9|-to10] [-r <repo>] <input_patch> <output_patch>"
echo "where:"
echo " -to9 create patches appropriate for a JDK 9 source tree"
echo " When going to 9, the output patches will be suffixed with the"
echo " repo name"
echo " -to10 create patches appropriate for a JDK 10 source tree"
echo " -r <repo> specify repo for source patch, set to 'top' for top repo"
echo " <input_patch> is the input patch file, that needs shuffling/unshuffling"
echo " <output_patch> is the updated patch file "
echo " "
exit 1
}
SCRIPT_DIR=`dirname $0`
UNSHUFFLE_LIST=$SCRIPT_DIR"/unshuffle_list.txt"
if [ ! -f "$UNSHUFFLE_LIST" ] ; then
echo "FATAL: cannot find $UNSHUFFLE_LIST" >&2
exit 1
fi
vflag="false"
while [ $# -gt 0 ]
do
case $1 in
-h | --help )
usage
;;
-v | --verbose )
vflag="true"
;;
-r)
repo="$2"
shift
;;
-to9)
shuffle_to=9
;;
-to10)
shuffle_to=10
;;
-*) # bad option
usage
;;
* ) # non option
break
;;
esac
shift
done
# Make sure we have the right number of arguments
if [ ! $# -eq 2 ] ; then
echo "ERROR: Invalid number of arguments." >&2
usage
fi
# Check the given repo
repos="top corba jaxp jaxws jdk langtools nashorn hotspot"
found="false"
if [ -n "$repo" ]; then
for r in $repos ; do
if [ $repo = "$r" ] ; then
found="true"
break;
fi
done
if [ $found = "false" ] ; then
echo "ERROR: Unknown repo: $repo. Should be one of [$repos]." >&2
usage
fi
fi
if [ "$shuffle_to" != "9" -a "$shuffle_to" != "10" ]; then
echo "ERROR: Must pick either -to9 or -to10"
exit 1
fi
# When going to 10, a repo must be specified for the source patch
if [ "$shuffle_to" = "10" -a -z "$repo" ]; then
echo "ERROR: Must specify src repo for JDK 9 patch"
exit 1
fi
# Check given input/output files
input="$1"
if [ "x$input" = "x-" ] ; then
input="/dev/stdin"
fi
if [ ! -f $input -a "x$input" != "x/dev/stdin" ] ; then
echo "ERROR: Cannot find input patch file: $input" >&2
exit 1
fi
output="$2"
if [ "x$output" = "x-" ] ; then
output="/dev/stdout"
fi
base_output="$output"
if [ "$shuffle_to" = "10" ]; then
if [ -f $output -a "x$output" != "x/dev/stdout" ] ; then
echo "ERROR: Output patch already exists: $output" >&2
exit 1
fi
else
for r in $repos; do
if [ -f "$output.$r" ]; then
echo "ERROR: Output patch already exists: $output.$r" >&2
exit 1
fi
done
fi
verbose() {
if [ ${vflag} = "true" ] ; then
echo "$@" >&2
fi
}
unshuffle() {
line=$@
verbose "Attempting to rewrite: \"$line\""
# Retrieve the file name
path=
if echo "$line" | egrep '^diff' > /dev/null ; then
if ! echo "$line" | egrep '\-\-git' > /dev/null ; then
echo "ERROR: Only git patches supported. Please use 'hg export --git ...'." >&2
exit 1
fi
path="`echo "$line" | sed -e s@'diff --git a/'@@ -e s@' b/.*$'@@`"
elif echo "$line" | egrep '^\-\-\-' > /dev/null ; then
path="`echo "$line" | sed -e s@'--- a/'@@`"
elif echo "$line" | egrep '^\+\+\+' > /dev/null ; then
path="`echo "$line" | sed s@'+++ b/'@@`"
fi
verbose "Extracted path: \"$path\""
# Find the most specific matches in the shuffle list
matches=
if [ -n "$repo" -a "$repo" != "top" ]; then
matchpath="$repo"/"$path"/x
else
matchpath="$path"/x
fi
while [ "$matchpath" != "" ] ; do
matchpath="`echo $matchpath | sed s@'\(.*\)/.*$'@'\1'@`"
if [ "$shuffle_to" = "10" ] ; then
pattern=": $matchpath$"
else
pattern="^$matchpath :"
fi
verbose "Attempting to find \"$matchpath\""
matches=`egrep "$pattern" "$UNSHUFFLE_LIST"`
if ! [ "x${matches}" = "x" ] ; then
verbose "Got matches: [$matches]"
break;
fi
if ! echo "$matchpath" | egrep '.*/.*' > /dev/null ; then
break;
fi
done
# Rewrite the line, if we have a match
if ! [ "x${matches}" = "x" ] ; then
shuffled="${matches%% : *}"
unshuffled="${matches#* : }"
patch_suffix_9=""
for r in $repos; do
if [ "$unshuffled" != "${unshuffled#$r}" ]; then
unshuffled="${unshuffled#$r\/}"
patch_suffix_9=".$r"
fi
done
verbose "shuffled: $shuffled"
verbose "unshuffled: $unshuffled"
verbose "patch_suffix_9: $patch_suffix_9"
if [ "$shuffle_to" = "10" ] ; then
newline="`echo "$line" | sed -e s@"$unshuffled"@"$shuffled"@g`"
else
newline="`echo "$line" | sed -e s@"$shuffled"@"$unshuffled"@g`"
output=$base_output$patch_suffix_9
verbose "Writing to $output"
fi
verbose "Rewriting to \"$newline\""
echo "$newline" >> $output
else
echo "WARNING: no match found for $path"
echo "$line" >> $output
fi
}
while IFS= read -r line
do
if echo "$line" | egrep '^diff|^\-\-\-|^\+\+\+' > /dev/null ; then
unshuffle "$line"
else
printf "%s\n" "$line" >> $output
fi
done < "$input"


@ -23,9 +23,19 @@
# The output of this script may require some degree of human curation:
# - Redundant headers, e.g. both x.hpp, x.inline.hpp are included;
# - Headers relative to a non-default feature should be protected by an
# appropriate 'if' clause to make sure all variants can build without
# appropriate 'if' clause to make sure all variants can build without
# errors.
source_path="$(dirname ${0})"
this_script_dir="$(cd -- "${source_path}" > /dev/null && pwd)"
if test -z "${this_script_dir}"; then
echo "Error: Could not determine location of this script"
exit 1
fi
# Work in top directory
cd $this_script_dir/..
# Time threshold for header compilation; if the time exceeds the
# threshold, the header will be precompiled.
if [ -z "$MIN_MS" ]; then


@ -75,6 +75,9 @@ Standard Library</a></li>
Deduction</a></li>
<li><a href="#expression-sfinae" id="toc-expression-sfinae">Expression
SFINAE</a></li>
<li><a href="#trailing-return-type-syntax-for-functions"
id="toc-trailing-return-type-syntax-for-functions">Trailing return type
syntax for functions</a></li>
<li><a href="#non-type-template-parameter-values"
id="toc-non-type-template-parameter-values">Non-type template parameter
values</a></li>
@ -83,8 +86,9 @@ values</a></li>
<li><a href="#thread_local" id="toc-thread_local">thread_local</a></li>
<li><a href="#nullptr" id="toc-nullptr">nullptr</a></li>
<li><a href="#atomic" id="toc-atomic">&lt;atomic&gt;</a></li>
<li><a href="#inline-variables" id="toc-inline-variables">Inline
Variables</a></li>
<li><a href="#variable-templates-and-inline-variables"
id="toc-variable-templates-and-inline-variables">Variable Templates and
Inline Variables</a></li>
<li><a href="#initializing-variables-with-static-storage-duration"
id="toc-initializing-variables-with-static-storage-duration">Initializing
variables with static storage duration</a></li>
@ -719,11 +723,14 @@ href="http://wg21.link/p0127r2">p0127r2</a>)<br> <code>auto</code> may
be used as a placeholder for the type of a non-type template parameter.
The type is deduced from the value provided in a template
instantiation.</p></li>
<li><p>Function return type deduction (<a
</ul>
<p><a name="function-return-type-deduction"></a> * Function return type
deduction (<a
href="https://isocpp.org/files/papers/N3638.html">n3638</a>)<br> Only
use if the function body has a very small number of <code>return</code>
statements, and generally relatively little other code.</p></li>
<li><p>Class template argument deduction (<a
statements, and generally relatively little other code.</p>
<ul>
<li>Class template argument deduction (<a
href="http://wg21.link/n3602">n3602</a>, <a
href="http://wg21.link/p0091r3">p0091r3</a>)<br> The template arguments
of a class template may be deduced from the arguments to a constructor.
@ -736,7 +743,7 @@ harder to understand, because explicit type information is lacking. But
it can also remove the need to be explicit about types that are either
obvious, or that are very hard to write. For example, these allow the
addition of a scope-guard mechanism with nice syntax; something like
this</p></li>
this</li>
</ul>
<pre><code> ScopeGuard guard{[&amp;]{ ... cleanup code ... }};</code></pre>
<ul>
@ -771,6 +778,44 @@ class="uri">https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95468</a><br>
<a
href="https://developercommunity.visualstudio.com/content/problem/396562/sizeof-deduced-type-is-sometimes-not-a-constant-ex.html"
class="uri">https://developercommunity.visualstudio.com/content/problem/396562/sizeof-deduced-type-is-sometimes-not-a-constant-ex.html</a></p>
<h3 id="trailing-return-type-syntax-for-functions">Trailing return type
syntax for functions</h3>
<p>A function's return type may be specified after the parameters and
qualifiers (<a
href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2541.htm">n2541</a>).
In such a declaration the normal return type is <code>auto</code> and
the return type is indicated by <code>-&gt;</code> followed by the type.
Although both use <code>auto</code> in the "normal" leading return type
position, this differs from <a
href="#function-return-type-deduction">function return type
deduction</a>, in that the return type is explicit rather than deduced,
but specified in a trailing position.</p>
<p>Use of trailing return types is permitted. However, the normal,
leading position for the return type is preferred. A trailing return
type should only be used where it provides some benefit. Such benefits
usually arise because a trailing return type is in a different scope
than a leading return type.</p>
<ul>
<li><p>If the function identifier is a nested name specifier, then the
trailing return type occurs in the nested scope. This may permit simpler
naming in the return type because of the different name lookup
context.</p></li>
<li><p>The trailing return type is in the scope of the parameters,
making their types accessible via <code>decltype</code>. For
example</p></li>
</ul>
<pre><code>template&lt;typename T, typename U&gt; auto add(T t, U u) -&gt; decltype(t + u);</code></pre>
<p>rather than</p>
<pre><code>template&lt;typename T, typename U&gt; decltype((*(T*)0) + (*(U*)0)) add(T t, U u);</code></pre>
<ul>
<li>Complex calculated leading return types may obscure the normal
syntactic boundaries, making it more difficult for a reader to find the
function name and parameters. This is particularly common in cases where
the return type is being used for <a
href="https://en.cppreference.com/w/cpp/language/sfinae"
title="Substitution Failure Is Not An Error">SFINAE</a>. A trailing
return type may be preferable in such situations.</li>
</ul>
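<p>The nested-scope benefit mentioned in the first item above can be
sketched as follows (hypothetical code, not taken from HotSpot sources):
with a leading return type the nested name must be qualified, while a
trailing return type is looked up in the scope named by the nested name
specifier.</p>
<pre><code>struct Buffer {
  using size_type = unsigned long;
  size_type size() const;
  size_type capacity() const;
};

// Leading return type: the nested name must be qualified, because name
// lookup has not yet entered the scope of Buffer.
Buffer::size_type Buffer::size() const { return 0; }

// Trailing return type: after Buffer::capacity, lookup is in the scope of
// Buffer, so the unqualified nested name is found.
auto Buffer::capacity() const -&gt; size_type { return 0; }</code></pre>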
<h3 id="non-type-template-parameter-values">Non-type template parameter
values</h3>
<p>C++17 extended the arguments permitted for non-type template
@ -893,12 +938,18 @@ differ from what the Java compilers implement.</p>
"conservative" memory ordering, which may differ from (may be stronger
than) sequentially consistent. There are algorithms in HotSpot that are
believed to rely on that ordering.</p>
<h3 id="inline-variables">Inline Variables</h3>
<p>Variables with static storage duration may be declared
<code>inline</code> (<a href="https://wg21.link/p0386r2">p0386r2</a>).
This has similar effects as for declaring a function inline: it can be
defined, identically, in multiple translation units, must be defined in
every translation unit in which it is <a
<h3 id="variable-templates-and-inline-variables">Variable Templates and
Inline Variables</h3>
<p>The use of variable templates (including static data member
templates) (<a href="https://wg21.link/N3651">N3651</a>) is permitted.
They provide parameterized variables and constants in a simple and
direct form, instead of requiring the use of various workarounds.</p>
<p>Variables with static storage duration and variable templates may be
declared <code>inline</code> (<a
href="https://wg21.link/p0386r2">p0386r2</a>), and this usage is
permitted. This has similar effects as for declaring a function inline:
it can be defined, identically, in multiple translation units, must be
defined in every translation unit in which it is <a
href="https://en.cppreference.com/w/cpp/language/definition"
title="One Definition Rule">ODR used</a>, and the behavior of the
program is as if there is exactly one variable.</p>
@ -911,16 +962,17 @@ initializations can make initialization order problems worse. The few
ordering constraints that exist for non-inline variables don't apply, as
there isn't a single program-designated translation unit containing the
definition.</p>
<p>A <code>constexpr</code> static data member is implicitly
<code>inline</code>. As a consequence, an <a
<p>A <code>constexpr</code> static data member or static data member
template is implicitly <code>inline</code>. As a consequence, an <a
href="https://en.cppreference.com/w/cpp/language/definition"
title="One Definition Rule">ODR use</a> of such a variable doesn't
require a definition in some .cpp file. (This is a change from
pre-C++17. Beginning with C++17, such a definition is considered a
duplicate definition, and is deprecated.)</p>
<p>Declaring a <code>thread_local</code> variable <code>inline</code> is
forbidden for HotSpot code. <a href="#thread_local">The use of
<code>thread_local</code></a> is already heavily restricted.</p>
title="One Definition Rule">ODR use</a> of such a member doesn't require
a definition in some .cpp file. (This is a change from pre-C++17.
Beginning with C++17, such a definition is considered a duplicate
definition, and is deprecated.)</p>
<p>Declaring a <code>thread_local</code> variable template or
<code>inline</code> variable is forbidden in HotSpot code. <a
href="#thread_local">The use of <code>thread_local</code></a> is already
heavily restricted.</p>
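<p>A minimal sketch of the forms permitted above (all names here are
hypothetical, not taken from HotSpot sources):</p>
<pre><code>// Variable template: a parameterized constant, usable as pi&lt;double&gt; or pi&lt;float&gt;.
template &lt;typename T&gt;
constexpr T pi = T(3.1415926535897932385L);

// Inline variable with static storage duration: may be defined, identically,
// in every translation unit that includes this header.
inline int default_buffer_size = 4096;

// constexpr static data member: implicitly inline since C++17, so no
// out-of-class definition in a .cpp file is needed for ODR uses.
struct Limits {
  static constexpr int max_depth = 64;
};</code></pre>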
<h3
id="initializing-variables-with-static-storage-duration">Initializing
variables with static storage duration</h3>
@ -1807,11 +1859,6 @@ difference.</p>
<h3 id="additional-undecided-features">Additional Undecided
Features</h3>
<ul>
<li><p>Trailing return type syntax for functions (<a
href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2541.htm">n2541</a>)</p></li>
<li><p>Variable templates (<a
href="https://isocpp.org/files/papers/N3651.pdf">n3651</a>, <a
href="http://wg21.link/p0127r2">p0127r2</a>)</p></li>
<li><p>Member initializers and aggregates (<a
href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2013/n3653.html">n3653</a>)</p></li>
<li><p>Rvalue references and move semantics</p></li>


@ -642,6 +642,7 @@ use can make code much harder to understand.
parameter. The type is deduced from the value provided in a template
instantiation.
<a name="function-return-type-deduction"></a>
* Function return type deduction
([n3638](https://isocpp.org/files/papers/N3638.html))<br>
Only use if the function body has a very small number of `return`
@ -691,6 +692,42 @@ Here are a few closely related example bugs:<br>
<https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95468><br>
<https://developercommunity.visualstudio.com/content/problem/396562/sizeof-deduced-type-is-sometimes-not-a-constant-ex.html>
### Trailing return type syntax for functions
A function's return type may be specified after the parameters and qualifiers
([n2541](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2541.htm)).
In such a declaration the normal return type is `auto` and the return type is
indicated by `->` followed by the type. Although both use `auto` in the
"normal" leading return type position, this differs from
[function return type deduction](#function-return-type-deduction),
in that the return type is explicit rather than deduced, but specified in a
trailing position.
Use of trailing return types is permitted. However, the normal, leading
position for the return type is preferred. A trailing return type should only
be used where it provides some benefit. Such benefits usually arise because a
trailing return type is in a different scope than a leading return type.
* If the function identifier is a nested name specifier, then the trailing
return type occurs in the nested scope. This may permit simpler naming in the
return type because of the different name lookup context.
* The trailing return type is in the scope of the parameters, making their
types accessible via `decltype`. For example
```
template<typename T, typename U> auto add(T t, U u) -> decltype(t + u);
```
rather than
```
template<typename T, typename U> decltype((*(T*)0) + (*(U*)0)) add(T t, U u);
```
* Complex calculated leading return types may obscure the normal syntactic
boundaries, making it more difficult for a reader to find the function name and
parameters. This is particularly common in cases where the return type is
being used for [SFINAE]. A trailing return type may be preferable in such
situations.
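The nested-scope benefit mentioned in the first item above can be sketched
as follows (hypothetical code, not taken from HotSpot sources): with a
leading return type the nested name must be qualified, while a trailing
return type is looked up in the scope named by the nested name specifier.
```
struct Buffer {
  using size_type = unsigned long;
  size_type size() const;
  size_type capacity() const;
};

// Leading return type: the nested name must be qualified, because name
// lookup has not yet entered the scope of Buffer.
Buffer::size_type Buffer::size() const { return 0; }

// Trailing return type: after Buffer::capacity, lookup is in the scope of
// Buffer, so the unqualified nested name is found.
auto Buffer::capacity() const -> size_type { return 0; }
```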
### Non-type template parameter values
C++17 extended the arguments permitted for non-type template parameters
@ -819,14 +856,19 @@ ordering, which may differ from (may be stronger than) sequentially
consistent. There are algorithms in HotSpot that are believed to rely
on that ordering.
### Inline Variables
### Variable Templates and Inline Variables
Variables with static storage duration may be declared `inline`
([p0386r2](https://wg21.link/p0386r2)). This has similar effects as for
declaring a function inline: it can be defined, identically, in multiple
translation units, must be defined in every translation unit in which it is
[ODR used][ODR], and the behavior of the program is as if there is exactly one
variable.
The use of variable templates (including static data member templates)
([N3651](https://wg21.link/N3651)) is permitted. They provide parameterized
variables and constants in a simple and direct form, instead of requiring the
use of various workarounds.
Variables with static storage duration and variable templates may be declared
`inline` ([p0386r2](https://wg21.link/p0386r2)), and this usage is
permitted. This has similar effects as for declaring a function inline: it can
be defined, identically, in multiple translation units, must be defined in
every translation unit in which it is [ODR used][ODR], and the behavior of the
program is as if there is exactly one variable.
Declaring a variable inline allows the complete definition to be in a header
file, rather than having a declaration in a header and the definition in a
@ -837,13 +879,15 @@ make initialization order problems worse. The few ordering constraints
that exist for non-inline variables don't apply, as there isn't a single
program-designated translation unit containing the definition.
A `constexpr` static data member is implicitly `inline`. As a consequence, an
[ODR use][ODR] of such a variable doesn't require a definition in some .cpp
A `constexpr` static data member or static data member template
is implicitly `inline`. As a consequence, an
[ODR use][ODR] of such a member doesn't require a definition in some .cpp
file. (This is a change from pre-C++17. Beginning with C++17, such a
definition is considered a duplicate definition, and is deprecated.)
Declaring a `thread_local` variable `inline` is forbidden for HotSpot code.
[The use of `thread_local`](#thread_local) is already heavily restricted.
Declaring a `thread_local` variable template or `inline` variable is forbidden
in HotSpot code. [The use of `thread_local`](#thread_local) is already
heavily restricted.
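A minimal sketch of the forms permitted above (all names here are
hypothetical, not taken from HotSpot sources):
```
// Variable template: a parameterized constant, usable as pi<double> or pi<float>.
template <typename T>
constexpr T pi = T(3.1415926535897932385L);

// Inline variable with static storage duration: may be defined, identically,
// in every translation unit that includes this header.
inline int default_buffer_size = 4096;

// constexpr static data member: implicitly inline since C++17, so no
// out-of-class definition in a .cpp file is needed for ODR uses.
struct Limits {
  static constexpr int max_depth = 64;
};
```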
### Initializing variables with static storage duration
@ -1809,13 +1853,6 @@ See Object Lifetime: C++17 6.8/8, C++20 6.7.3/8
### Additional Undecided Features
* Trailing return type syntax for functions
([n2541](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2541.htm))
* Variable templates
([n3651](https://isocpp.org/files/papers/N3651.pdf),
[p0127r2](http://wg21.link/p0127r2))
* Member initializers and aggregates
([n3653](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2013/n3653.html))


@ -301,7 +301,7 @@ ifneq ($(filter product-bundles% legacy-bundles, $(MAKECMDGOALS)), )
$(call LogWarn, Signing $(JDK_BUNDLE_NAME))
$(CODESIGN) -s "$(MACOSX_CODESIGN_IDENTITY)" \
--timestamp --options runtime --deep --force \
$(JDK_MACOSX_BUNDLE_DIR_SIGNED)/$(JDK_MACOSX_BUNDLE_TOP_DIR) $(LOG_DEBUG)
$(JDK_MACOSX_BUNDLE_DIR_SIGNED)/$(JDK_MACOSX_BUNDLE_TOP_SUBDIR) $(LOG_DEBUG)
$(TOUCH) $@
$(eval $(call SetupBundleFile, BUILD_JDK_BUNDLE, \
@ -330,7 +330,7 @@ ifneq ($(filter product-bundles% legacy-bundles, $(MAKECMDGOALS)), )
$(call LogWarn, Signing $(JRE_BUNDLE_NAME))
$(CODESIGN) -s "$(MACOSX_CODESIGN_IDENTITY)" \
--timestamp --options runtime --deep --force \
$(JRE_MACOSX_BUNDLE_DIR_SIGNED)/$(JRE_MACOSX_BUNDLE_TOP_DIR) $(LOG_DEBUG)
$(JRE_MACOSX_BUNDLE_DIR_SIGNED)/$(JRE_MACOSX_BUNDLE_TOP_SUBDIR) $(LOG_DEBUG)
$(TOUCH) $@
$(eval $(call SetupBundleFile, BUILD_JRE_BUNDLE, \


@ -210,17 +210,8 @@ AC_DEFUN([BASIC_SETUP_XCODE_SYSROOT],
if test $? -ne 0; then
AC_MSG_ERROR([The xcodebuild tool in the devkit reports an error: $XCODEBUILD_OUTPUT])
fi
elif test "x$TOOLCHAIN_PATH" != x; then
UTIL_LOOKUP_PROGS(XCODEBUILD, xcodebuild, $TOOLCHAIN_PATH)
if test "x$XCODEBUILD" != x; then
XCODEBUILD_OUTPUT=`"$XCODEBUILD" -version 2>&1`
if test $? -ne 0; then
AC_MSG_WARN([Ignoring the located xcodebuild tool $XCODEBUILD due to an error: $XCODEBUILD_OUTPUT])
XCODEBUILD=
fi
fi
else
UTIL_LOOKUP_PROGS(XCODEBUILD, xcodebuild)
UTIL_LOOKUP_TOOLCHAIN_PROGS(XCODEBUILD, xcodebuild)
if test "x$XCODEBUILD" != x; then
XCODEBUILD_OUTPUT=`"$XCODEBUILD" -version 2>&1`
if test $? -ne 0; then
@ -348,21 +339,11 @@ AC_DEFUN_ONCE([BASIC_SETUP_DEVKIT],
# You can force the sysroot if the sysroot encoded into the compiler tools
# is not correct.
AC_ARG_WITH(sys-root, [AS_HELP_STRING([--with-sys-root],
[alias for --with-sysroot for backwards compatibility])],
[SYSROOT=$with_sys_root]
)
AC_ARG_WITH(sysroot, [AS_HELP_STRING([--with-sysroot],
[use this directory as sysroot])],
[SYSROOT=$with_sysroot]
)
AC_ARG_WITH([tools-dir], [AS_HELP_STRING([--with-tools-dir],
[alias for --with-toolchain-path for backwards compatibility])],
[UTIL_PREPEND_TO_PATH([TOOLCHAIN_PATH],$with_tools_dir)]
)
AC_ARG_WITH([toolchain-path], [AS_HELP_STRING([--with-toolchain-path],
[prepend these directories when searching for toolchain binaries (compilers etc)])],
[UTIL_PREPEND_TO_PATH([TOOLCHAIN_PATH],$with_toolchain_path)]
@ -371,6 +352,9 @@ AC_DEFUN_ONCE([BASIC_SETUP_DEVKIT],
AC_ARG_WITH([xcode-path], [AS_HELP_STRING([--with-xcode-path],
[set up toolchain on Mac OS using a path to an Xcode installation])])
UTIL_DEPRECATED_ARG_WITH(sys-root)
UTIL_DEPRECATED_ARG_WITH(tools-dir)
if test "x$with_xcode_path" != x; then
if test "x$OPENJDK_BUILD_OS" = "xmacosx"; then
UTIL_PREPEND_TO_PATH([TOOLCHAIN_PATH],


@ -207,29 +207,14 @@ AC_DEFUN([BASIC_CHECK_GNU_MAKE],
UTIL_SETUP_TOOL(MAKE,
[
# Try our hardest to locate a correct version of GNU make
UTIL_LOOKUP_PROGS(CHECK_GMAKE, gmake)
UTIL_LOOKUP_TOOLCHAIN_PROGS(CHECK_GMAKE, gmake)
BASIC_CHECK_MAKE_VERSION("$CHECK_GMAKE", [gmake in PATH])
if test "x$FOUND_MAKE" = x; then
UTIL_LOOKUP_PROGS(CHECK_MAKE, make)
UTIL_LOOKUP_TOOLCHAIN_PROGS(CHECK_MAKE, make)
BASIC_CHECK_MAKE_VERSION("$CHECK_MAKE", [make in PATH])
fi
if test "x$FOUND_MAKE" = x; then
if test "x$TOOLCHAIN_PATH" != x; then
# We have a toolchain path, check that as well before giving up.
OLD_PATH=$PATH
PATH=$TOOLCHAIN_PATH:$PATH
UTIL_LOOKUP_PROGS(CHECK_TOOLSDIR_GMAKE, gmake)
BASIC_CHECK_MAKE_VERSION("$CHECK_TOOLSDIR_GMAKE", [gmake in tools-dir])
if test "x$FOUND_MAKE" = x; then
UTIL_LOOKUP_PROGS(CHECK_TOOLSDIR_MAKE, make)
BASIC_CHECK_MAKE_VERSION("$CHECK_TOOLSDIR_MAKE", [make in tools-dir])
fi
PATH=$OLD_PATH
fi
fi
if test "x$FOUND_MAKE" = x; then
AC_MSG_ERROR([Cannot find GNU make $MAKE_REQUIRED_VERSION or newer! Please put it in the path, or add e.g. MAKE=/opt/gmake3.81/make as argument to configure.])
fi


@ -162,12 +162,7 @@ AC_DEFUN([BPERF_SETUP_CCACHE],
# Check if ccache is available
CCACHE_AVAILABLE=true
OLD_PATH="$PATH"
if test "x$TOOLCHAIN_PATH" != x; then
PATH=$TOOLCHAIN_PATH:$PATH
fi
UTIL_LOOKUP_PROGS(CCACHE, ccache)
PATH="$OLD_PATH"
UTIL_LOOKUP_TOOLCHAIN_PROGS(CCACHE, ccache)
AC_MSG_CHECKING([if ccache is available])
if test "x$TOOLCHAIN_TYPE" != "xgcc" && test "x$TOOLCHAIN_TYPE" != "xclang"; then


@ -110,4 +110,4 @@ $MV $OUTPUTDIR/compare.log $OUTPUTDIR/compare.log.old 2> /dev/null
export SCRIPT_DIR="$( cd "$( dirname "$0" )" > /dev/null && pwd )"
$BASH $TOPDIR/make/scripts/logger.sh $OUTPUTDIR/compare.log $BASH "$REAL_COMPARE_SCRIPT" "$@"
$BASH $TOPDIR/make/scripts/compare-logger.sh $OUTPUTDIR/compare.log $BASH "$REAL_COMPARE_SCRIPT" "$@"


@ -934,48 +934,6 @@ AC_DEFUN([FLAGS_SETUP_CFLAGS_CPU_DEP],
IF_FALSE: [$2FDLIBM_CFLAGS=""])
fi
AC_SUBST($2FDLIBM_CFLAGS)
# Check whether the compiler supports the Arm C Language Extensions (ACLE)
# for SVE. Set SVE_CFLAGS to -march=armv8-a+sve if it does.
# ACLE and this flag are required to build the aarch64 SVE related functions in
# libvectormath. Apple Silicon does not support SVE; use macOS as a proxy for
# that check.
if test "x$OPENJDK_TARGET_CPU" = "xaarch64" && test "x$OPENJDK_TARGET_OS" = "xlinux"; then
if test "x$TOOLCHAIN_TYPE" = xgcc || test "x$TOOLCHAIN_TYPE" = xclang; then
AC_LANG_PUSH(C)
OLD_CFLAGS="$CFLAGS"
CFLAGS="$CFLAGS -march=armv8-a+sve"
AC_MSG_CHECKING([if Arm SVE ACLE is supported])
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([#include <arm_sve.h>],
[
svint32_t r = svdup_n_s32(1);
return 0;
])],
[
AC_MSG_RESULT([yes])
$2SVE_CFLAGS="-march=armv8-a+sve"
# Switching the initialization mode with gcc from 'pattern' to 'zero'
# avoids the use of unsupported `__builtin_clear_padding` for variable
# length aggregates
if test "x$DEBUG_LEVEL" != xrelease && test "x$TOOLCHAIN_TYPE" = xgcc ; then
INIT_ZERO_FLAG="-ftrivial-auto-var-init=zero"
FLAGS_COMPILER_CHECK_ARGUMENTS(ARGUMENT: [$INIT_ZERO_FLAG],
IF_TRUE: [
$2SVE_CFLAGS="${$2SVE_CFLAGS} $INIT_ZERO_FLAG"
]
)
fi
],
[
AC_MSG_RESULT([no])
$2SVE_CFLAGS=""
]
)
CFLAGS="$OLD_CFLAGS"
AC_LANG_POP(C)
fi
fi
AC_SUBST($2SVE_CFLAGS)
])
AC_DEFUN_ONCE([FLAGS_SETUP_BRANCH_PROTECTION],


@ -74,7 +74,7 @@ AC_DEFUN([FLAGS_SETUP_LDFLAGS_HELPER],
# Clang needs the lld linker to work correctly
BASIC_LDFLAGS="-fuse-ld=lld -Wl,--exclude-libs,ALL"
if test "x$CXX_IS_USER_SUPPLIED" = xfalse && test "x$CC_IS_USER_SUPPLIED" = xfalse; then
UTIL_REQUIRE_PROGS(LLD, lld, $TOOLCHAIN_PATH:$PATH)
UTIL_REQUIRE_TOOLCHAIN_PROGS(LLD, lld)
fi
fi
if test "x$OPENJDK_TARGET_OS" = xaix; then


@ -107,6 +107,62 @@ AC_DEFUN([FLAGS_SETUP_NMFLAGS],
AC_SUBST(NMFLAGS)
])
# Check whether the compiler supports the Arm C Language Extensions (ACLE)
# for SVE. Set SVE_CFLAGS to -march=armv8-a+sve if it does.
# ACLE and this flag are required to build the aarch64 SVE related functions
# in libvectormath.
AC_DEFUN([FLAGS_SETUP_SVE],
[
AARCH64_SVE_AVAILABLE=false
# Apple Silicon does not support SVE; use macOS as a proxy for that check.
if test "x$OPENJDK_TARGET_CPU" = "xaarch64" && test "x$OPENJDK_TARGET_OS" = "xlinux"; then
if test "x$TOOLCHAIN_TYPE" = xgcc || test "x$TOOLCHAIN_TYPE" = xclang; then
# check whether the compiler and binutils support SVE
AC_MSG_CHECKING([if Arm SVE ACLE is supported])
AC_LANG_PUSH([C])
saved_cflags="$CFLAGS"
CFLAGS="$CFLAGS -march=armv8-a+sve $CFLAGS_WARNINGS_ARE_ERRORS ARG_ARGUMENT"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
[
#include <arm_sve.h>
svfloat64_t a() {}
],
[
svint32_t r = svdup_n_s32(1)
])],
[
AARCH64_SVE_AVAILABLE=true
]
)
CFLAGS="$saved_cflags"
AC_LANG_POP([C])
AC_MSG_RESULT([$AARCH64_SVE_AVAILABLE])
fi
fi
UTIL_ARG_ENABLE(NAME: aarch64-sve, DEFAULT: auto,
RESULT: AARCH64_SVE_ENABLED,
DESC: [Use SVE when compiling libsleef],
AVAILABLE: $AARCH64_SVE_AVAILABLE)
SVE_CFLAGS=""
if test "x$AARCH64_SVE_ENABLED" = xtrue; then
SVE_CFLAGS="-march=armv8-a+sve"
# Switching the initialization mode with gcc from 'pattern' to 'zero'
# avoids the use of unsupported `__builtin_clear_padding` for variable
# length aggregates
if test "x$DEBUG_LEVEL" != xrelease && test "x$TOOLCHAIN_TYPE" = xgcc ; then
AC_MSG_CHECKING([Switching the initialization mode with gcc from pattern to zero])
INIT_ZERO_FLAG="-ftrivial-auto-var-init=zero"
FLAGS_COMPILER_CHECK_ARGUMENTS(ARGUMENT: [$INIT_ZERO_FLAG],
IF_TRUE: [
SVE_CFLAGS="${SVE_CFLAGS} $INIT_ZERO_FLAG"
]
)
fi
fi
AC_SUBST(SVE_CFLAGS)
])
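For context, a minimal sketch of the kind of SVE ACLE code that requires
SVE_CFLAGS, i.e. -march=armv8-a+sve (hypothetical example; the actual
libvectormath sources are not part of this change):
#include <arm_sve.h>
#include <cstdint>

// Any ACLE intrinsic forces SVE code generation; svcntb() returns the
// number of bytes in one SVE vector register on the running hardware.
extern "C" uint64_t sve_vector_bytes() {
  svint32_t lanes = svdup_n_s32(1);
  (void) lanes;
  return svcntb();
}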
################################################################################
# platform independent
AC_DEFUN([FLAGS_SETUP_ASFLAGS],


@ -374,6 +374,7 @@ AC_DEFUN([FLAGS_SETUP_FLAGS],
FLAGS_SETUP_RCFLAGS
FLAGS_SETUP_NMFLAGS
FLAGS_SETUP_SVE
FLAGS_SETUP_ASFLAGS
FLAGS_SETUP_ASFLAGS_CPU_DEP([TARGET])
FLAGS_SETUP_ASFLAGS_CPU_DEP([BUILD], [OPENJDK_BUILD_])


@ -898,12 +898,14 @@ JDK_MACOSX_BUNDLE_DIR = $(IMAGES_OUTPUTDIR)/$(JDK_MACOSX_BUNDLE_SUBDIR)
JRE_MACOSX_BUNDLE_DIR = $(IMAGES_OUTPUTDIR)/$(JRE_MACOSX_BUNDLE_SUBDIR)
JDK_MACOSX_BUNDLE_DIR_SIGNED = $(IMAGES_OUTPUTDIR)/$(JDK_MACOSX_BUNDLE_SUBDIR_SIGNED)
JRE_MACOSX_BUNDLE_DIR_SIGNED = $(IMAGES_OUTPUTDIR)/$(JRE_MACOSX_BUNDLE_SUBDIR_SIGNED)
JDK_MACOSX_BUNDLE_TOP_DIR = jdk-$(VERSION_NUMBER).jdk
JRE_MACOSX_BUNDLE_TOP_DIR = jre-$(VERSION_NUMBER).jre
JDK_MACOSX_CONTENTS_SUBDIR = $(JDK_MACOSX_BUNDLE_TOP_DIR)/Contents
JRE_MACOSX_CONTENTS_SUBDIR = $(JRE_MACOSX_BUNDLE_TOP_DIR)/Contents
JDK_MACOSX_BUNDLE_TOP_SUBDIR = jdk-$(VERSION_NUMBER).jdk
JRE_MACOSX_BUNDLE_TOP_SUBDIR = jre-$(VERSION_NUMBER).jre
JDK_MACOSX_CONTENTS_SUBDIR = $(JDK_MACOSX_BUNDLE_TOP_SUBDIR)/Contents
JRE_MACOSX_CONTENTS_SUBDIR = $(JRE_MACOSX_BUNDLE_TOP_SUBDIR)/Contents
JDK_MACOSX_CONTENTS_DIR = $(JDK_MACOSX_BUNDLE_DIR)/$(JDK_MACOSX_CONTENTS_SUBDIR)
JRE_MACOSX_CONTENTS_DIR = $(JRE_MACOSX_BUNDLE_DIR)/$(JRE_MACOSX_CONTENTS_SUBDIR)
JDK_MACOSX_BUNDLE_TOP_DIR = $(JDK_MACOSX_BUNDLE_DIR)/$(JDK_MACOSX_BUNDLE_TOP_SUBDIR)
JRE_MACOSX_BUNDLE_TOP_DIR = $(JRE_MACOSX_BUNDLE_DIR)/$(JRE_MACOSX_BUNDLE_TOP_SUBDIR)
# Bundle names
ifneq ($(VERSION_BUILD), )


@ -276,9 +276,6 @@ AC_DEFUN_ONCE([TOOLCHAIN_PRE_DETECTION],
ORG_CFLAGS="$CFLAGS"
ORG_CXXFLAGS="$CXXFLAGS"
# autoconf magic only relies on PATH, so update it if tools dir is specified
OLD_PATH="$PATH"
if test "x$OPENJDK_BUILD_OS" = "xmacosx"; then
if test "x$XCODEBUILD" != x; then
XCODE_VERSION_OUTPUT=`"$XCODEBUILD" -version 2> /dev/null | $HEAD -n 1`
@ -300,9 +297,10 @@ AC_DEFUN_ONCE([TOOLCHAIN_PRE_DETECTION],
fi
AC_SUBST(TOOLCHAIN_VERSION)
# Finally prepend TOOLCHAIN_PATH to the PATH, to allow --with-tools-dir to
# override all other locations.
if test "x$TOOLCHAIN_PATH" != x; then
# For the microsoft toolchain the toolchain path needs to be added to the
# normal path, or the compiler will not work in some situations in later
# configure checks.
if test "x$TOOLCHAIN_TYPE" = "xmicrosoft" && test "x$TOOLCHAIN_PATH" != x; then
export PATH=$TOOLCHAIN_PATH:$PATH
fi
])
@ -310,13 +308,6 @@ AC_DEFUN_ONCE([TOOLCHAIN_PRE_DETECTION],
# Restore path, etc
AC_DEFUN_ONCE([TOOLCHAIN_POST_DETECTION],
[
# Restore old path, except for the microsoft toolchain, which requires the
# toolchain path to remain in place. Otherwise the compiler will not work in
# some situations in later configure checks.
if test "x$TOOLCHAIN_TYPE" != "xmicrosoft"; then
PATH="$OLD_PATH"
fi
# Restore the flags to the user specified values.
# This is necessary since AC_PROG_CC defaults CFLAGS to "-g -O2"
CFLAGS="$ORG_CFLAGS"


@ -458,17 +458,18 @@ AC_DEFUN([UTIL_LOOKUP_PROGS],
################################################################################
# Call UTIL_SETUP_TOOL with AC_CHECK_TOOLS to locate the tool. This will look
# first for cross-compilation tools.
# first for tools using the cross-compilation prefix, and then for tools without
# this prefix. For each of these name variants, it will look first in the
# toolchain path, and then in the normal path.
# $1: variable to set
# $2: executable name (or list of names) to look for
# $3: [path]
AC_DEFUN([UTIL_LOOKUP_TOOLCHAIN_PROGS],
[
if test "x$ac_tool_prefix" = x; then
UTIL_LOOKUP_PROGS($1, $2, $3)
UTIL_LOOKUP_PROGS($1, $2, [$TOOLCHAIN_PATH:$PATH])
else
prefixed_names=$(for name in $2; do echo ${ac_tool_prefix}${name} $name; done)
UTIL_LOOKUP_PROGS($1, $prefixed_names, $3)
UTIL_LOOKUP_PROGS($1, $prefixed_names, [$TOOLCHAIN_PATH:$PATH])
fi
])
@ -497,10 +498,9 @@ AC_DEFUN([UTIL_REQUIRE_PROGS],
# Like UTIL_LOOKUP_PROGS but fails if no tool was found.
# $1: variable to set
# $2: executable name (or list of names) to look for
# $3: [path]
AC_DEFUN([UTIL_REQUIRE_TOOLCHAIN_PROGS],
[
UTIL_LOOKUP_TOOLCHAIN_PROGS($1, $2, $3)
UTIL_LOOKUP_TOOLCHAIN_PROGS($1, $2)
UTIL_CHECK_NONEMPTY($1)
])


@ -82,6 +82,8 @@ ifeq ($(INCLUDE), true)
# INFO : Message to display at LOG=info level when running command (optional)
# WARN : Message to display at LOG=warn level when running command (optional)
# DEPS : Dependencies for the execution to take place
# DRYRUN : Set to true to perform everything but executing the command \
# (defaults to false, primarily intended for debugging)
#
# Setup make rules for copying files, with an option to do more complex
@ -161,8 +163,13 @@ define SetupExecuteBody
$$(TOUCH) $$@
$$($1_EXEC_RESULT): $$($1_PRE_MARKER)
$$(call ExecuteWithLog, $$($1_BASE)_exec, \
cd $$($1_WORKING_DIR) && $$($1_COMMAND))
ifneq ($$($1_DRYRUN), true)
$$(call ExecuteWithLog, $$($1_BASE)_exec, \
cd $$($1_WORKING_DIR) && $$($1_COMMAND))
else
$$(call LogWarn, DRYRUN enabled for $1, not actually running command)
$$(TOUCH) $$@
endif
ifeq ($$($1_EXEC_RESULT), $$($1_EXEC_MARKER))
$$(TOUCH) $$@
endif
@ -177,8 +184,13 @@ define SetupExecuteBody
$$(call LogInfo, $$($1_INFO))
endif
$$(call MakeDir, $$(call EncodeSpace, $$($1_WORKING_DIR)) $$(call EncodeSpace, $$($1_SUPPORT_DIR)) $$(call EncodeSpace, $$($1_OUTPUT_DIR)))
$$(call ExecuteWithLog, $$($1_BASE)_exec, \
cd $$($1_WORKING_DIR) && $$($1_COMMAND))
ifneq ($$($1_DRYRUN), true)
$$(call ExecuteWithLog, $$($1_BASE)_exec, \
cd $$($1_WORKING_DIR) && $$($1_COMMAND))
else
$$(call LogWarn, DRYRUN enabled for $1, not actually running command)
$$(TOUCH) $$@
endif
ifeq ($$($1_EXEC_RESULT), $$($1_EXEC_MARKER))
$$(TOUCH) $$@
endif


@ -1,36 +0,0 @@
#!/bin/bash
#
# Copyright (c) 2012, 2020, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 only, as
# published by the Free Software Foundation.
#
# This code is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# version 2 for more details (a copy is included in the LICENSE file that
# accompanied this code).
#
# You should have received a copy of the GNU General Public License version
# 2 along with this work; if not, write to the Free Software Foundation,
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
# or visit www.oracle.com if you need additional information or have any
# questions.
#
GREP=grep
#
EXP="Note: Some input files use or override a deprecated API."
EXP="${EXP}|Note: Recompile with -Xlint:deprecation for details."
EXP="${EXP}|Note: Some input files use unchecked or unsafe operations."
EXP="${EXP}|Note: Recompile with -Xlint:unchecked for details."
EXP="${EXP}| warning"
EXP="${EXP}|uses or overrides a deprecated API."
EXP="${EXP}|uses unchecked or unsafe operations."
#
${GREP} --line-buffered -v -E "${EXP}"


@ -5965,9 +5965,6 @@ attributes %{
instruction_unit_size = 4; // An instruction is 4 bytes long
instruction_fetch_unit_size = 64; // The processor fetches one line
instruction_fetch_units = 1; // of 64 bytes
// List of nop instructions
nops( MachNop );
%}
// We don't use an actual pipeline model so don't care about resources


@ -2585,11 +2585,6 @@ void LIR_Assembler::emit_profile_call(LIR_OpProfileCall* op) {
}
void LIR_Assembler::emit_delay(LIR_OpDelay*) {
Unimplemented();
}
void LIR_Assembler::monitor_address(int monitor_no, LIR_Opr dst) {
__ lea(dst->as_register(), frame_map()->address_for_monitor_lock(monitor_no));
}


@ -275,7 +275,7 @@ address BarrierSetAssembler::patching_epoch_addr() {
}
void BarrierSetAssembler::increment_patching_epoch() {
Atomic::inc(&_patching_epoch);
AtomicAccess::inc(&_patching_epoch);
}
void BarrierSetAssembler::clear_patching_epoch() {


@ -112,11 +112,25 @@ public:
}
int get_value() {
return Atomic::load_acquire(guard_addr());
return AtomicAccess::load_acquire(guard_addr());
}
void set_value(int value) {
Atomic::release_store(guard_addr(), value);
void set_value(int value, int bit_mask) {
if (bit_mask == ~0) {
AtomicAccess::release_store(guard_addr(), value);
return;
}
assert((value & ~bit_mask) == 0, "trying to set bits outside the mask");
value &= bit_mask;
int old_value = AtomicAccess::load(guard_addr());
while (true) {
// Only bits in the mask are changed
int new_value = value | (old_value & ~bit_mask);
if (new_value == old_value) break;
int v = AtomicAccess::cmpxchg(guard_addr(), old_value, new_value, memory_order_release);
if (v == old_value) break;
old_value = v;
}
}
bool check_barrier(err_msg& msg) const;
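The same masked update pattern, written as a standalone sketch with
std::atomic rather than HotSpot's AtomicAccess (illustrative only;
set_masked is a hypothetical name):
#include <atomic>
#include <cassert>

// Update only the bits selected by bit_mask, leaving the other bits of the
// guard word untouched even if another thread changes them concurrently.
void set_masked(std::atomic<int>& guard, int value, int bit_mask) {
  if (bit_mask == ~0) {
    guard.store(value, std::memory_order_release);
    return;
  }
  assert((value & ~bit_mask) == 0 && "trying to set bits outside the mask");
  int old_value = guard.load(std::memory_order_relaxed);
  while (true) {
    int new_value = value | (old_value & ~bit_mask);
    if (new_value == old_value) break;   // the masked bits already hold value
    if (guard.compare_exchange_weak(old_value, new_value,
                                    std::memory_order_release,
                                    std::memory_order_relaxed)) {
      break;                             // our bits were published
    }
    // on failure, compare_exchange_weak reloaded old_value; retry
  }
}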
@ -179,7 +193,7 @@ void BarrierSetNMethod::deoptimize(nmethod* nm, address* return_address_ptr) {
new_frame->pc = SharedRuntime::get_handle_wrong_method_stub();
}
void BarrierSetNMethod::set_guard_value(nmethod* nm, int value) {
void BarrierSetNMethod::set_guard_value(nmethod* nm, int value, int bit_mask) {
if (!supports_entry_barrier(nm)) {
return;
}
@ -196,7 +210,7 @@ void BarrierSetNMethod::set_guard_value(nmethod* nm, int value) {
}
NativeNMethodBarrier barrier(nm);
barrier.set_value(value);
barrier.set_value(value, bit_mask);
}
int BarrierSetNMethod::guard_value(nmethod* nm) {


@ -1623,7 +1623,7 @@ public:
FloatRegister p, FloatRegister z, FloatRegister t1);
void ghash_reduce_wide(int index, FloatRegister result, FloatRegister lo, FloatRegister hi,
FloatRegister p, FloatRegister z, FloatRegister t1);
void ghash_processBlocks_wide(address p, Register state, Register subkeyH,
void ghash_processBlocks_wide(Label& p, Register state, Register subkeyH,
Register data, Register blocks, int unrolls);


@ -507,7 +507,7 @@ void MacroAssembler::ghash_modmul(FloatRegister result,
//
// Clobbers all vector registers.
//
void MacroAssembler::ghash_processBlocks_wide(address field_polynomial, Register state,
void MacroAssembler::ghash_processBlocks_wide(Label& field_polynomial, Register state,
Register subkeyH,
Register data, Register blocks, int unrolls) {
int register_stride = 7;
@ -531,7 +531,10 @@ void MacroAssembler::ghash_processBlocks_wide(address field_polynomial, Register
FloatRegister p = v31;
eor(vzr, T16B, vzr, vzr); // zero register
ldrq(p, field_polynomial); // The field polynomial
// load polynomial via label which must identify local data in the
// same code stub
adr(rscratch1, field_polynomial);
ldrq(p, rscratch1); // The field polynomial
ldrq(v0, Address(state));
ldrq(Hprime, Address(subkeyH));


@ -42,7 +42,7 @@
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomic.hpp"
#include "runtime/atomicAccess.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
@ -802,7 +802,7 @@ class StubGenerator: public StubCodeGenerator {
//
// s and d are adjusted to point to the remaining words to copy
//
void generate_copy_longs(StubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
BasicType type;
copy_direction direction;
@ -854,7 +854,7 @@ class StubGenerator: public StubCodeGenerator {
StubCodeMark mark(this, stub_id);
__ bind(start);
address start = __ pc();
Label unaligned_copy_long;
if (AvoidUnalignedAccesses) {
@ -894,9 +894,9 @@ class StubGenerator: public StubCodeGenerator {
int prefetch = PrefetchCopyIntervalInBytes;
bool use_stride = false;
if (direction == copy_backwards) {
use_stride = prefetch > 256;
prefetch = -prefetch;
if (use_stride) __ mov(stride, prefetch);
use_stride = prefetch > 256;
prefetch = -prefetch;
if (use_stride) __ mov(stride, prefetch);
}
__ bind(again);
@ -1026,9 +1026,9 @@ class StubGenerator: public StubCodeGenerator {
int prefetch = PrefetchCopyIntervalInBytes;
bool use_stride = false;
if (direction == copy_backwards) {
use_stride = prefetch > 256;
prefetch = -prefetch;
if (use_stride) __ mov(stride, prefetch);
use_stride = prefetch > 256;
prefetch = -prefetch;
if (use_stride) __ mov(stride, prefetch);
}
__ bind(again);
@ -1037,15 +1037,15 @@ class StubGenerator: public StubCodeGenerator {
__ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
if (direction == copy_forwards) {
// allowing for the offset of -8 the store instructions place
// registers into the target 64 bit block at the following
// offsets
//
// t0 at offset 0
// t1 at offset 8, t2 at offset 16
// t3 at offset 24, t4 at offset 32
// t5 at offset 40, t6 at offset 48
// t7 at offset 56
// allowing for the offset of -8 the store instructions place
// registers into the target 64 bit block at the following
// offsets
//
// t0 at offset 0
// t1 at offset 8, t2 at offset 16
// t3 at offset 24, t4 at offset 32
// t5 at offset 40, t6 at offset 48
// t7 at offset 56
bs.copy_store_at_8(Address(d, 1 * unit), t0);
bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
@ -1057,18 +1057,18 @@ class StubGenerator: public StubCodeGenerator {
bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
} else {
// d was not offset when we started so the registers are
// written into the 64 bit block preceding d with the following
// offsets
//
// t1 at offset -8
// t3 at offset -24, t0 at offset -16
// t5 at offset -48, t2 at offset -32
// t7 at offset -56, t4 at offset -48
// t6 at offset -64
//
// note that this matches the offsets previously noted for the
// loads
// d was not offset when we started so the registers are
// written into the 64 bit block preceding d with the following
// offsets
//
// t1 at offset -8
// t3 at offset -24, t0 at offset -16
// t5 at offset -48, t2 at offset -32
// t7 at offset -56, t4 at offset -48
// t6 at offset -64
//
// note that this matches the offsets previously noted for the
// loads
bs.copy_store_at_8(Address(d, 1 * unit), t1);
bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
@ -1109,10 +1109,10 @@ class StubGenerator: public StubCodeGenerator {
{
Label L1, L2;
__ tbz(count, exact_log2(4), L1);
// this is the same as above but copying only 4 longs hence
// with only one intervening stp between the str instructions
// but note that the offsets and registers still follow the
// same pattern
// this is the same as above but copying only 4 longs hence
// with only one intervening stp between the str instructions
// but note that the offsets and registers still follow the
// same pattern
bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
if (direction == copy_forwards) {
@ -1127,10 +1127,10 @@ class StubGenerator: public StubCodeGenerator {
__ bind(L1);
__ tbz(count, 1, L2);
// this is the same as above but copying only 2 longs hence
// there is no intervening stp between the str instructions
// but note that the offset and register patterns are still
// the same
// this is the same as above but copying only 2 longs hence
// there is no intervening stp between the str instructions
// but note that the offset and register patterns are still
// the same
bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
if (direction == copy_forwards) {
bs.copy_store_at_8(Address(d, 1 * unit), t0);
@ -1141,18 +1141,20 @@ class StubGenerator: public StubCodeGenerator {
}
__ bind(L2);
// for forwards copy we need to re-adjust the offsets we
// applied so that s and d follow the last words written
// for forwards copy we need to re-adjust the offsets we
// applied so that s and d follow the last words written
if (direction == copy_forwards) {
__ add(s, s, 16);
__ add(d, d, 8);
}
if (direction == copy_forwards) {
__ add(s, s, 16);
__ add(d, d, 8);
}
}
__ ret(lr);
}
}
return start;
}
// Small copy: less than 16 bytes.
@ -1206,10 +1208,6 @@ class StubGenerator: public StubCodeGenerator {
}
}
Label copy_f, copy_b;
Label copy_obj_f, copy_obj_b;
Label copy_obj_uninit_f, copy_obj_uninit_b;
// All-singing all-dancing memory copy.
//
// Copy count units of memory from s to d. The size of a unit is
@ -1447,19 +1445,19 @@ class StubGenerator: public StubCodeGenerator {
}
if (direction == copy_forwards) {
if (type != T_OBJECT) {
__ bl(copy_f);
__ bl(StubRoutines::aarch64::copy_byte_f());
} else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
__ bl(copy_obj_uninit_f);
__ bl(StubRoutines::aarch64::copy_oop_uninit_f());
} else {
__ bl(copy_obj_f);
__ bl(StubRoutines::aarch64::copy_oop_f());
}
} else {
if (type != T_OBJECT) {
__ bl(copy_b);
__ bl(StubRoutines::aarch64::copy_byte_b());
} else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
__ bl(copy_obj_uninit_b);
__ bl(StubRoutines::aarch64::copy_oop_uninit_b());
} else {
__ bl(copy_obj_b);
__ bl(StubRoutines::aarch64::copy_oop_b());
}
}
@ -1522,11 +1520,11 @@ class StubGenerator: public StubCodeGenerator {
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects: entry is set to the (post push) entry point so it
// can be used by the corresponding conjoint copy
// method
// Side Effects: nopush_entry is set to the (post push) entry point
// so it can be used by the corresponding conjoint
// copy method
//
address generate_disjoint_copy(StubId stub_id, address *entry) {
address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
RegSet saved_reg = RegSet::of(s, d, count);
int size;
@ -1615,8 +1613,8 @@ class StubGenerator: public StubCodeGenerator {
address start = __ pc();
__ enter();
if (entry != nullptr) {
*entry = __ pc();
if (nopush_entry != nullptr) {
*nopush_entry = __ pc();
// caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
BLOCK_COMMENT("Entry:");
}
@ -1679,10 +1677,10 @@ class StubGenerator: public StubCodeGenerator {
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects:
// entry is set to the no-overlap entry point so it can be used by
// some other conjoint copy method
// nopush_entry is set to the no-overlap entry point so it can be
// used by some other conjoint copy method
//
address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *entry) {
address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
RegSet saved_regs = RegSet::of(s, d, count);
int size;
@ -1769,16 +1767,19 @@ class StubGenerator: public StubCodeGenerator {
address start = __ pc();
__ enter();
if (entry != nullptr) {
*entry = __ pc();
if (nopush_entry != nullptr) {
*nopush_entry = __ pc();
// caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
BLOCK_COMMENT("Entry:");
}
// use fwd copy when (d-s) above_equal (count*size)
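// (the unsigned compare intentionally also sends d < s to the forward copy:
//  d - s wraps to a large value, and a forward copy is safe whenever the
//  destination starts below the source, even if the two ranges overlap)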
Label L_overlapping;
__ sub(rscratch1, d, s);
__ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
__ br(Assembler::HS, nooverlap_target);
__ br(Assembler::LO, L_overlapping);
__ b(RuntimeAddress(nooverlap_target));
__ bind(L_overlapping);
DecoratorSet decorators = IN_HEAP | IS_ARRAY;
if (dest_uninitialized) {
@ -1850,7 +1851,7 @@ class StubGenerator: public StubCodeGenerator {
// r0 == 0 - success
// r0 == -1^K - failure, where K is partial transfer count
//
address generate_checkcast_copy(StubId stub_id, address *entry) {
address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
bool dest_uninitialized;
switch (stub_id) {
case StubId::stubgen_checkcast_arraycopy_id:
@ -1911,8 +1912,8 @@ class StubGenerator: public StubCodeGenerator {
#endif //ASSERT
// Caller of this entry point must set up the argument registers.
if (entry != nullptr) {
*entry = __ pc();
if (nopush_entry != nullptr) {
*nopush_entry = __ pc();
BLOCK_COMMENT("Entry:");
}
@ -2724,13 +2725,21 @@ class StubGenerator: public StubCodeGenerator {
}
void generate_arraycopy_stubs() {
address entry;
address entry_jbyte_arraycopy;
address entry_jshort_arraycopy;
address entry_jint_arraycopy;
address entry_oop_arraycopy;
address entry_jlong_arraycopy;
address entry_checkcast_arraycopy;
// Some copy stubs publish a normal entry and then a 2nd 'fallback'
// entry immediately following their stack push. This can be used
// as a post-push branch target for compatible stubs when they
// identify a special case that can be handled by the fallback
// stub, e.g. a disjoint copy stub may be used as a special-case
// fallback for its compatible conjoint copy stub.
//
// A no push entry is always returned in the following local and
// then published by assigning to the appropriate entry field in
// class StubRoutines. The entry value is then passed to the
// generator for the compatible stub. That means the entry must be
// listed when saving to/restoring from the AOT cache, ensuring
// that the inter-stub jumps are noted at AOT-cache save and
// relocated at AOT cache load.
address nopush_entry;
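// Minimal sketch of the chaining pattern (illustrative only; a name such as
// disjoint_nopush is a placeholder, not a symbol defined by this change):
//
//   __ enter();                              // conjoint stub pushes its own frame
//   ... overlap test ...
//   __ b(RuntimeAddress(disjoint_nopush));   // branch past the disjoint stub's enter()
//
// so only a single frame push/pop is ever live along the fallback path.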
// generate the common exit first so later stubs can rely on it if
// they want an UnsafeMemoryAccess exit non-local to the stub
@ -2738,83 +2747,123 @@ class StubGenerator: public StubCodeGenerator {
// register the stub as the default exit with class UnsafeMemoryAccess
UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15);
generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15);
// generate and publish aarch64-specific bulk copy routines first
// so we can call them from other copy stubs
StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15);
generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15);
StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15);
generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15);
StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
//*** jbyte
// Always need aligned and unaligned versions
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &entry);
StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, entry, nullptr);
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
//*** jshort
// Always need aligned and unaligned versions
StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &entry);
StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, entry, nullptr);
StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is used by generic/unsafe copy
StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
//*** jint
// Aligned versions
StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
// In 64 bit we need both aligned and unaligned versions of jint arraycopy.
// entry_jint_arraycopy always points to the unaligned version
StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &entry);
StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, entry, &entry_jint_arraycopy);
// jint_arraycopy_nopush always points to the unaligned version
StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jint_arraycopy_nopush = nopush_entry;
//*** jlong
// It is always aligned
StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
// disjoint normal/nopush and conjoint normal entries are not
// generated since the arrayof versions are the same
StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
//*** oops
{
// With compressed oops we need unaligned versions; notice that
// we overwrite entry_oop_arraycopy.
bool aligned = !UseCompressedOops;
StubRoutines::_arrayof_oop_disjoint_arraycopy
= generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &entry);
= generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_oop_arraycopy
= generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
= generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint arrayof nopush entry is needed by generic/unsafe copy
StubRoutines::_oop_arraycopy_nopush = nopush_entry;
// Aligned versions without pre-barriers
StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
= generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
= generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
// disjoint arrayof+uninit nopush entry is needed by conjoint copy
StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
// note that we don't need a returned nopush entry because the
// generic/unsafe copy does not cater for uninit arrays.
StubRoutines::_arrayof_oop_arraycopy_uninit
= generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, entry, nullptr);
= generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
}
// for oop copies reuse arrayof entries for non-arrayof cases
StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &entry_checkcast_arraycopy);
StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
// checkcast nopush entry is needed by generic copy
StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
// note that we don't need a returned nopush entry because the
// generic copy does not cater for uninit arrays.
StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy,
entry_jshort_arraycopy,
entry_jint_arraycopy,
entry_jlong_arraycopy);
// unsafe arraycopy may fallback on conjoint stubs
StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
StubRoutines::_jshort_arraycopy_nopush,
StubRoutines::_jint_arraycopy_nopush,
StubRoutines::_jlong_arraycopy_nopush);
StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy,
entry_jshort_arraycopy,
entry_jint_arraycopy,
entry_oop_arraycopy,
entry_jlong_arraycopy,
entry_checkcast_arraycopy);
// generic arraycopy may fallback on conjoint stubs
StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
StubRoutines::_jshort_arraycopy_nopush,
StubRoutines::_jint_arraycopy_nopush,
StubRoutines::_oop_arraycopy_nopush,
StubRoutines::_jlong_arraycopy_nopush,
StubRoutines::_checkcast_arraycopy_nopush);
StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
@ -3402,14 +3451,9 @@ class StubGenerator: public StubCodeGenerator {
// counter = c_rarg7 - 16 bytes of CTR
// return - number of processed bytes
address generate_galoisCounterMode_AESCrypt() {
address ghash_polynomial = __ pc();
__ emit_int64(0x87); // The low-order bits of the field
// polynomial (i.e. p = z^7+z^2+z+1)
// repeated in the low and high parts of a
// 128-bit vector
__ emit_int64(0x87);
Label ghash_polynomial; // local data generated after code
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
StubCodeMark mark(this, stub_id);
address start = __ pc();
@ -3514,7 +3558,17 @@ class StubGenerator: public StubCodeGenerator {
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(lr);
// bind label and generate polynomial data
__ align(wordSize * 2);
__ bind(ghash_polynomial);
__ emit_int64(0x87); // The low-order bits of the field
// polynomial (i.e. p = z^7+z^2+z+1)
// repeated in the low and high parts of a
// 128-bit vector
__ emit_int64(0x87);
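// (emitting the literal after the final ret means it is never executed yet
//  still lies between the stub's start and end; it is presumably loaded
//  pc-relative, e.g. via adr, as done for the ghash stubs below -- the
//  matching load is outside this hunk, so that detail is an assumption)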
return start;
}
class Cached64Bytes {
@ -4559,16 +4613,6 @@ class StubGenerator: public StubCodeGenerator {
// by the second lane from all vectors and so on.
address generate_chacha20Block_blockpar() {
Label L_twoRounds, L_cc20_const;
// The constant data is broken into two 128-bit segments to be loaded
// onto FloatRegisters. The first 128 bits are a counter add overlay
// that adds +0/+1/+2/+3 to the vector holding replicated state[12].
// The second 128-bits is a table constant used for 8-bit left rotations.
__ BIND(L_cc20_const);
__ emit_int64(0x0000000100000000UL);
__ emit_int64(0x0000000300000002UL);
__ emit_int64(0x0605040702010003UL);
__ emit_int64(0x0E0D0C0F0A09080BUL);
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_chacha20Block_id;
StubCodeMark mark(this, stub_id);
@ -4716,6 +4760,17 @@ class StubGenerator: public StubCodeGenerator {
__ leave();
__ ret(lr);
// bind label and generate local constant data used by this stub
// The constant data is broken into two 128-bit segments to be loaded
// onto FloatRegisters. The first 128 bits are a counter add overlay
// that adds +0/+1/+2/+3 to the vector holding replicated state[12].
// The second 128-bits is a table constant used for 8-bit left rotations.
__ BIND(L_cc20_const);
__ emit_int64(0x0000000100000000UL);
__ emit_int64(0x0000000300000002UL);
__ emit_int64(0x0605040702010003UL);
__ emit_int64(0x0E0D0C0F0A09080BUL);
return start;
}
@ -6036,10 +6091,6 @@ class StubGenerator: public StubCodeGenerator {
address generate_kyber12To16() {
Label L_F00, L_loop, L_end;
__ BIND(L_F00);
__ emit_int64(0x0f000f000f000f00);
__ emit_int64(0x0f000f000f000f00);
__ align(CodeEntryAlignment);
StubId stub_id = StubId::stubgen_kyber12To16_id;
StubCodeMark mark(this, stub_id);
@ -6233,6 +6284,11 @@ class StubGenerator: public StubCodeGenerator {
__ mov(r0, zr); // return 0
__ ret(lr);
// bind label and generate constant data used by this stub
__ BIND(L_F00);
__ emit_int64(0x0f000f000f000f00);
__ emit_int64(0x0f000f000f000f00);
return start;
}
@ -9642,14 +9698,7 @@ class StubGenerator: public StubCodeGenerator {
StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
StubCodeMark mark(this, stub_id);
__ align(wordSize * 2);
address p = __ pc();
__ emit_int64(0x87); // The low-order bits of the field
// polynomial (i.e. p = z^7+z^2+z+1)
// repeated in the low and high parts of a
// 128-bit vector
__ emit_int64(0x87);
Label polynomial; // local data generated at end of stub
__ align(CodeEntryAlignment);
address start = __ pc();
@ -9661,7 +9710,8 @@ class StubGenerator: public StubCodeGenerator {
FloatRegister vzr = v30;
__ eor(vzr, __ T16B, vzr, vzr); // zero register
__ ldrq(v24, p); // The field polynomial
__ adr(rscratch1, polynomial);
__ ldrq(v24, rscratch1); // The field polynomial
__ ldrq(v0, Address(state));
__ ldrq(v1, Address(subkeyH));
@ -9701,6 +9751,15 @@ class StubGenerator: public StubCodeGenerator {
__ st1(v0, __ T16B, state);
__ ret(lr);
// bind label and generate local polynomial data
__ align(wordSize * 2);
__ bind(polynomial);
__ emit_int64(0x87); // The low-order bits of the field
// polynomial (i.e. p = z^7+z^2+z+1)
// repeated in the low and high parts of a
// 128-bit vector
__ emit_int64(0x87);
return start;
}
@ -9709,14 +9768,7 @@ class StubGenerator: public StubCodeGenerator {
StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
StubCodeMark mark(this, stub_id);
__ align(wordSize * 2);
address p = __ pc();
__ emit_int64(0x87); // The low-order bits of the field
// polynomial (i.e. p = z^7+z^2+z+1)
// repeated in the low and high parts of a
// 128-bit vector
__ emit_int64(0x87);
Label polynomial; // local data generated after stub
__ align(CodeEntryAlignment);
address start = __ pc();
@ -9738,7 +9790,7 @@ class StubGenerator: public StubCodeGenerator {
__ st1(v8, v9, v10, v11, __ T16B, Address(sp));
}
__ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
__ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
if (unroll > 1) {
// And restore state
@ -9751,7 +9803,17 @@ class StubGenerator: public StubCodeGenerator {
__ ret(lr);
// bind label and generate polynomial data
__ align(wordSize * 2);
__ bind(polynomial);
__ emit_int64(0x87); // The low-order bits of the field
// polynomial (i.e. p = z^7+z^2+z+1)
// repeated in the low and high parts of a
// 128-bit vector
__ emit_int64(0x87);
return start;
}
void generate_base64_encode_simdround(Register src, Register dst,
@ -10265,7 +10327,7 @@ class StubGenerator: public StubCodeGenerator {
#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
// ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
// ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
//
// If LSE is in use, generate LSE versions of all the stubs. The
// non-LSE versions are in atomic_aarch64.S.

View File

@ -2638,9 +2638,6 @@ attributes %{
instruction_unit_size = 4; // An instruction is 4 bytes long
instruction_fetch_unit_size = 16; // The processor fetches one line
instruction_fetch_units = 1; // of 16 bytes
// List of nop instructions
nops( Nop_A0, Nop_A1, Nop_MS, Nop_FA, Nop_BR );
%}
//----------RESOURCES----------------------------------------------------------
@ -3284,18 +3281,18 @@ pipe_class loadPollP(iRegP poll) %{
%}
pipe_class br(Universe br, label labl) %{
single_instruction_with_delay_slot;
single_instruction;
BR : R;
%}
pipe_class br_cc(Universe br, cmpOp cmp, flagsReg cr, label labl) %{
single_instruction_with_delay_slot;
single_instruction;
cr : E(read);
BR : R;
%}
pipe_class br_reg(Universe br, cmpOp cmp, iRegI op1, label labl) %{
single_instruction_with_delay_slot;
single_instruction;
op1 : E(read);
BR : R;
MS : R;
@ -3326,14 +3323,14 @@ pipe_class call(method meth) %{
%}
pipe_class tail_call(Universe ignore, label labl) %{
single_instruction; has_delay_slot;
single_instruction;
fixed_latency(100);
BR : R(1);
MS : R(1);
%}
pipe_class ret(Universe ignore) %{
single_instruction; has_delay_slot;
single_instruction;
BR : R(1);
MS : R(1);
%}
@ -3376,14 +3373,6 @@ pipe_class cadd_cmpltmask( iRegI p, iRegI q, iRegI y ) %{
IALU : R(3)
%}
// Perform a compare, then move conditionally in a branch delay slot.
pipe_class min_max( iRegI src2, iRegI srcdst ) %{
src2 : E(read);
srcdst : E(read);
IALU : R;
BR : R;
%}
// Define the class for the Nop node
define %{
MachNop = ialu_nop;
@ -9056,7 +9045,7 @@ instruct clear_array(iRegX cnt, iRegP base, iRegI temp, iRegX zero, Universe dum
format %{ "MOV $zero,0\n"
" MOV $temp,$cnt\n"
"loop: SUBS $temp,$temp,4\t! Count down a dword of bytes\n"
" STR.ge $zero,[$base+$temp]\t! delay slot"
" STR.ge $zero,[$base+$temp]\n"
" B.gt loop\t\t! Clearing loop\n" %}
ins_encode %{
__ mov($zero$$Register, 0);

View File

@ -2552,11 +2552,6 @@ void LIR_Assembler::emit_profile_type(LIR_OpProfileType* op) {
fatal("Type profiling not implemented on this platform");
}
void LIR_Assembler::emit_delay(LIR_OpDelay*) {
Unimplemented();
}
void LIR_Assembler::monitor_address(int monitor_no, LIR_Opr dst) {
Address mon_addr = frame_map()->address_for_monitor_lock(monitor_no);
__ add_slow(dst->as_pointer_register(), mon_addr.base(), mon_addr.disp());

View File

@ -48,11 +48,25 @@ class NativeNMethodBarrier: public NativeInstruction {
public:
int get_value() {
return Atomic::load_acquire(guard_addr());
return AtomicAccess::load_acquire(guard_addr());
}
void set_value(int value) {
Atomic::release_store(guard_addr(), value);
void set_value(int value, int bit_mask) {
if (bit_mask == ~0) {
AtomicAccess::release_store(guard_addr(), value);
return;
}
assert((value & ~bit_mask) == 0, "trying to set bits outside the mask");
value &= bit_mask;
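// Merge update (illustrative values): e.g. set_value(0x2, 0x0f) rewrites only
// the low nibble of the guard word; the CAS loop below preserves all other bits.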
int old_value = AtomicAccess::load(guard_addr());
while (true) {
// Only bits in the mask are changed
int new_value = value | (old_value & ~bit_mask);
if (new_value == old_value) break;
int v = AtomicAccess::cmpxchg(guard_addr(), old_value, new_value, memory_order_release);
if (v == old_value) break;
old_value = v;
}
}
void verify() const;
@ -115,7 +129,7 @@ void BarrierSetNMethod::deoptimize(nmethod* nm, address* return_address_ptr) {
new_frame->pc = SharedRuntime::get_handle_wrong_method_stub();
}
void BarrierSetNMethod::set_guard_value(nmethod* nm, int value) {
void BarrierSetNMethod::set_guard_value(nmethod* nm, int value, int bit_mask) {
if (!supports_entry_barrier(nm)) {
return;
}
@ -123,7 +137,7 @@ void BarrierSetNMethod::set_guard_value(nmethod* nm, int value) {
// Disarms the nmethod guard emitted by BarrierSetAssembler::nmethod_entry_barrier.
// Symmetric "LDR; DMB ISHLD" is in the nmethod barrier.
NativeNMethodBarrier* barrier = native_nmethod_barrier(nm);
barrier->set_value(value);
barrier->set_value(value, bit_mask);
}
int BarrierSetNMethod::guard_value(nmethod* nm) {

View File

@ -421,7 +421,8 @@ class StubGenerator: public StubCodeGenerator {
}
// As per atomic.hpp the Atomic read-modify-write operations must be logically implemented as:
// As per atomicAccess.hpp the atomic read-modify-write operations must be
// logically implemented as:
// <fence>; <op>; <membar StoreLoad|StoreStore>
// But for load-linked/store-conditional based systems a fence here simply means
// no load/store can be reordered with respect to the initial load-linked, so we have:
@ -440,7 +441,7 @@ class StubGenerator: public StubCodeGenerator {
// be removed in the future.
// Implementation of atomic_add(jint add_value, volatile jint* dest)
// used by Atomic::add(volatile jint* dest, jint add_value)
// used by AtomicAccess::add(volatile jint* dest, jint add_value)
//
// Arguments :
//
@ -492,7 +493,7 @@ class StubGenerator: public StubCodeGenerator {
}
// Implementation of jint atomic_xchg(jint exchange_value, volatile jint* dest)
// used by Atomic::add(volatile jint* dest, jint exchange_value)
// used by AtomicAccess::add(volatile jint* dest, jint exchange_value)
//
// Arguments :
//
@ -542,7 +543,7 @@ class StubGenerator: public StubCodeGenerator {
}
// Implementation of jint atomic_cmpxchg(jint exchange_value, volatile jint *dest, jint compare_value)
// used by Atomic::cmpxchg(volatile jint *dest, jint compare_value, jint exchange_value)
// used by AtomicAccess::cmpxchg(volatile jint *dest, jint compare_value, jint exchange_value)
//
// Arguments :
//
@ -582,7 +583,7 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
// Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
// Support for jlong AtomicAccess::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
// reordered before by a wrapper to (jlong compare_value, jlong exchange_value, volatile jlong *dest)
//
// Arguments :
@ -3010,6 +3011,10 @@ class StubGenerator: public StubCodeGenerator {
// Note: the disjoint stubs must be generated first, some of
// the conjoint stubs use them.
// Note: chaining of stubs does not rely on branching to an
// auxiliary post-push entry because none of the stubs
// push/pop a frame.
// these always need status in case they are called from generic_arraycopy
StubRoutines::_jbyte_disjoint_arraycopy = generate_primitive_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id);
StubRoutines::_jshort_disjoint_arraycopy = generate_primitive_copy(StubId::stubgen_jshort_disjoint_arraycopy_id);
@ -3023,6 +3028,7 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_primitive_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id);
StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_oop_copy (StubId::stubgen_arrayof_oop_disjoint_arraycopy_id);
// disjoint copy entry is needed by conjoint copy
// these always need status in case they are called from generic_arraycopy
StubRoutines::_jbyte_arraycopy = generate_primitive_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy);
StubRoutines::_jshort_arraycopy = generate_primitive_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy);

View File

@ -2747,11 +2747,6 @@ void LIR_Assembler::align_backward_branch_target() {
}
void LIR_Assembler::emit_delay(LIR_OpDelay* op) {
Unimplemented();
}
void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest, LIR_Opr tmp) {
// tmp must be unused
assert(tmp->is_illegal(), "wasting a register if tmp is allocated");

View File

@ -183,12 +183,9 @@ void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm, Register t
BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
assert_different_registers(tmp, R0);
__ block_comment("nmethod_entry_barrier (nmethod_entry_barrier) {");
__ align(8); // must align the following block which requires atomic updates
// Load stub address using toc (fixed instruction size, unlike load_const_optimized)
__ calculate_address_from_global_toc(tmp, StubRoutines::method_entry_barrier(),
true, true, false); // 2 instructions
__ mtctr(tmp);
__ block_comment("nmethod_entry_barrier (nmethod_entry_barrier) {");
// This is a compound instruction. Patching support is provided by NativeMovRegMem.
// Actual patching is done in (platform-specific part of) BarrierSetNMethod.
@ -198,6 +195,11 @@ void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm, Register t
__ ld(R0, in_bytes(bs_nm->thread_disarmed_guard_value_offset()), R16_thread);
__ cmpw(CR0, R0, tmp);
// Load stub address using toc (fixed instruction size, unlike load_const_optimized)
__ calculate_address_from_global_toc(tmp, StubRoutines::method_entry_barrier(),
true, true, false); // 2 instructions
__ mtctr(tmp);
__ bnectrl(CR0);
// Oops may have been changed. Make those updates observable.

View File

@ -38,7 +38,7 @@ class NativeNMethodBarrier: public NativeInstruction {
NativeMovRegMem* get_patchable_instruction_handle() const {
// Endianness is handled by NativeMovRegMem
return reinterpret_cast<NativeMovRegMem*>(get_barrier_start_address() + 3 * 4);
return reinterpret_cast<NativeMovRegMem*>(get_barrier_start_address());
}
public:
@ -47,7 +47,7 @@ public:
return get_patchable_instruction_handle()->offset();
}
void release_set_guard_value(int value) {
void release_set_guard_value(int value, int bit_mask) {
// Patching is not atomic.
// Stale observations of the "armed" state are okay as invoking the barrier stub in that case has no
// unwanted side effects. Disarming is thus a non-critical operation.
@ -55,8 +55,37 @@ public:
OrderAccess::release(); // Release modified oops
// Set the guard value (naming of 'offset' function is misleading).
get_patchable_instruction_handle()->set_offset(value);
if (bit_mask == ~0) {
// Set the guard value (naming of 'offset' function is misleading).
get_patchable_instruction_handle()->set_offset(value);
return;
}
assert((value & ~bit_mask) == 0, "trying to set bits outside the mask");
value &= bit_mask;
NativeMovRegMem* mov = get_patchable_instruction_handle();
assert(align_up(mov->instruction_address(), sizeof(uint64_t)) ==
align_down(mov->instruction_address(), sizeof(uint64_t)), "instruction not aligned");
uint64_t *instr = (uint64_t*)mov->instruction_address();
assert(NativeMovRegMem::instruction_size == sizeof(*instr), "must be");
union {
u_char buf[NativeMovRegMem::instruction_size];
uint64_t u64;
} new_mov_instr, old_mov_instr;
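// A single 64-bit CAS covers both 32-bit instructions holding the hi/lo
// halves of the offset, so readers can never observe a torn immediate.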
new_mov_instr.u64 = old_mov_instr.u64 = AtomicAccess::load(instr);
while (true) {
// Only bits in the mask are changed
int old_value = nativeMovRegMem_at(old_mov_instr.buf)->offset();
int new_value = value | (old_value & ~bit_mask);
if (new_value == old_value) return; // skip icache flush if nothing changed
nativeMovRegMem_at(new_mov_instr.buf)->set_offset(new_value, false /* no icache flush */);
// Swap in the new value
uint64_t v = AtomicAccess::cmpxchg(instr, old_mov_instr.u64, new_mov_instr.u64, memory_order_relaxed);
if (v == old_mov_instr.u64) break;
old_mov_instr.u64 = v;
}
ICache::ppc64_flush_icache_bytes(addr_at(0), NativeMovRegMem::instruction_size);
}
void verify() const {
@ -66,12 +95,6 @@ public:
uint* current_instruction = reinterpret_cast<uint*>(get_barrier_start_address());
// calculate_address_from_global_toc (compound instruction)
verify_op_code_manually(current_instruction, MacroAssembler::is_addis(*current_instruction));
verify_op_code_manually(current_instruction, MacroAssembler::is_addi(*current_instruction));
verify_op_code_manually(current_instruction, MacroAssembler::is_mtctr(*current_instruction));
get_patchable_instruction_handle()->verify();
current_instruction += 2;
@ -80,6 +103,12 @@ public:
// cmpw (mnemonic)
verify_op_code(current_instruction, Assembler::CMP_OPCODE);
// calculate_address_from_global_toc (compound instruction)
verify_op_code_manually(current_instruction, MacroAssembler::is_addis(*current_instruction));
verify_op_code_manually(current_instruction, MacroAssembler::is_addi(*current_instruction));
verify_op_code_manually(current_instruction, MacroAssembler::is_mtctr(*current_instruction));
// bnectrl (mnemonic) (weak check; not checking the exact type)
verify_op_code(current_instruction, Assembler::BCCTR_OPCODE);
@ -117,13 +146,13 @@ void BarrierSetNMethod::deoptimize(nmethod* nm, address* return_address_ptr) {
// Thus, there's nothing to do here.
}
void BarrierSetNMethod::set_guard_value(nmethod* nm, int value) {
void BarrierSetNMethod::set_guard_value(nmethod* nm, int value, int bit_mask) {
if (!supports_entry_barrier(nm)) {
return;
}
NativeNMethodBarrier* barrier = get_nmethod_barrier(nm);
barrier->release_set_guard_value(value);
barrier->release_set_guard_value(value, bit_mask);
}
int BarrierSetNMethod::guard_value(nmethod* nm) {

View File

@ -347,7 +347,7 @@ void NativeGeneralJump::replace_mt_safe(address instr_addr, address code_buffer)
// Finally patch out the jump.
volatile juint *jump_addr = (volatile juint*)instr_addr;
// Release not needed because caller uses invalidate_range after copying the remaining bytes.
//Atomic::release_store(jump_addr, *((juint*)code_buffer));
//AtomicAccess::release_store(jump_addr, *((juint*)code_buffer));
*jump_addr = *((juint*)code_buffer); // atomically store code over branch instruction
ICache::ppc64_flush_icache_bytes(instr_addr, NativeGeneralJump::instruction_size);
}

View File

@ -462,7 +462,7 @@ class NativeMovRegMem: public NativeInstruction {
return ((*hi_ptr) << 16) | ((*lo_ptr) & 0xFFFF);
}
void set_offset(intptr_t x) {
void set_offset(intptr_t x, bool flush_icache = true) {
#ifdef VM_LITTLE_ENDIAN
short *hi_ptr = (short*)(addr_at(0));
short *lo_ptr = (short*)(addr_at(4));
@ -472,7 +472,9 @@ class NativeMovRegMem: public NativeInstruction {
#endif
*hi_ptr = x >> 16;
*lo_ptr = x & 0xFFFF;
ICache::ppc64_flush_icache_bytes(addr_at(0), NativeMovRegMem::instruction_size);
if (flush_icache) {
ICache::ppc64_flush_icache_bytes(addr_at(0), NativeMovRegMem::instruction_size);
}
}
void add_offset_in_bytes(intptr_t radd_offset) {

View File

@ -4920,10 +4920,6 @@ attributes %{
// ...in one line
instruction_fetch_units = 1
// Unused, list one so that array generated by adlc is not empty.
// Aix compiler chokes if _nop_count = 0.
nops(fxNop);
%}
//----------RESOURCES----------------------------------------------------------

View File

@ -3277,8 +3277,12 @@ class StubGenerator: public StubCodeGenerator {
// register the stub as the default exit with class UnsafeMemoryAccess
UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
// Note: the disjoint stubs must be generated first, some of
// the conjoint stubs use them.
// Note: the disjoint stubs must be generated first, some of the
// conjoint stubs use them.
// Note: chaining of stubs does not rely on branching to an
// auxiliary post-push entry because none of the stubs
// push/pop a frame.
// non-aligned disjoint versions
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id);

View File

@ -912,6 +912,32 @@ protected:
emit(insn);
}
public:
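// Raw encoders used when patching call sites; the bit layouts follow the
// standard RISC-V base ISA formats (stated here for reference, not defined
// by this change):
//   JAL : imm[20|10:1|11|19:12] | rd  | opcode 0b1101111
//   JALR: imm[11:0] | rs1 | funct3 000 | rd | opcode 0b1100111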
static uint32_t encode_jal(Register Rd, const int32_t offset) {
guarantee(is_simm21(offset) && ((offset % 2) == 0), "offset is invalid.");
uint32_t insn = 0;
patch((address)&insn, 6, 0, 0b1101111);
patch_reg((address)&insn, 7, Rd);
patch((address)&insn, 19, 12, (uint32_t)((offset >> 12) & 0xff));
patch((address)&insn, 20, (uint32_t)((offset >> 11) & 0x1));
patch((address)&insn, 30, 21, (uint32_t)((offset >> 1) & 0x3ff));
patch((address)&insn, 31, (uint32_t)((offset >> 20) & 0x1));
return insn;
}
static uint32_t encode_jalr(Register Rd, Register Rs, const int32_t offset) {
guarantee(is_simm12(offset), "offset is invalid.");
uint32_t insn = 0;
patch((address)&insn, 6, 0, 0b1100111);
patch_reg((address)&insn, 7, Rd);
patch((address)&insn, 14, 12, 0b000);
patch_reg((address)&insn, 15, Rs);
int32_t val = offset & 0xfff;
patch((address)&insn, 31, 20, val);
return insn;
}
protected:
enum barrier {

View File

@ -1590,8 +1590,6 @@ void LIR_Assembler::emit_profile_call(LIR_OpProfileCall* op) {
}
}
void LIR_Assembler::emit_delay(LIR_OpDelay*) { Unimplemented(); }
void LIR_Assembler::monitor_address(int monitor_no, LIR_Opr dst) {
__ la(dst->as_register(), frame_map()->address_for_monitor_lock(monitor_no));
}

View File

@ -217,7 +217,7 @@ address BarrierSetAssembler::patching_epoch_addr() {
}
void BarrierSetAssembler::increment_patching_epoch() {
Atomic::inc(&_patching_epoch);
AtomicAccess::inc(&_patching_epoch);
}
void BarrierSetAssembler::clear_patching_epoch() {

View File

@ -106,11 +106,25 @@ public:
}
int get_value() {
return Atomic::load_acquire(guard_addr());
return AtomicAccess::load_acquire(guard_addr());
}
void set_value(int value) {
Atomic::release_store(guard_addr(), value);
void set_value(int value, int bit_mask) {
if (bit_mask == ~0) {
AtomicAccess::release_store(guard_addr(), value);
return;
}
assert((value & ~bit_mask) == 0, "trying to set bits outside the mask");
value &= bit_mask;
int old_value = AtomicAccess::load(guard_addr());
while (true) {
// Only bits in the mask are changed
int new_value = value | (old_value & ~bit_mask);
if (new_value == old_value) break;
int v = AtomicAccess::cmpxchg(guard_addr(), old_value, new_value, memory_order_release);
if (v == old_value) break;
old_value = v;
}
}
bool check_barrier(err_msg& msg) const;
@ -192,7 +206,7 @@ void BarrierSetNMethod::deoptimize(nmethod* nm, address* return_address_ptr) {
new_frame->pc = SharedRuntime::get_handle_wrong_method_stub();
}
void BarrierSetNMethod::set_guard_value(nmethod* nm, int value) {
void BarrierSetNMethod::set_guard_value(nmethod* nm, int value, int bit_mask) {
if (!supports_entry_barrier(nm)) {
return;
}
@ -209,7 +223,7 @@ void BarrierSetNMethod::set_guard_value(nmethod* nm, int value) {
}
NativeNMethodBarrier barrier(nm);
barrier.set_value(value);
barrier.set_value(value, bit_mask);
}
int BarrierSetNMethod::guard_value(nmethod* nm) {

View File

@ -3402,6 +3402,8 @@ void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
assert(UseCompressedClassPointers, "should only be used for compressed headers");
assert_different_registers(dst, tmp);
assert_different_registers(src, tmp);
if (CompressedKlassPointers::base() == nullptr) {
if (CompressedKlassPointers::shift() != 0) {
@ -3412,18 +3414,13 @@ void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register
return;
}
Register xbase = dst;
if (dst == src) {
xbase = tmp;
}
Register xbase = tmp;
assert_different_registers(src, xbase);
mv(xbase, (uintptr_t)CompressedKlassPointers::base());
if (CompressedKlassPointers::shift() != 0) {
Register t = src == dst ? dst : t0;
assert_different_registers(t, xbase);
shadd(dst, src, xbase, t, CompressedKlassPointers::shift());
// dst = (src << shift) + xbase
shadd(dst, src, xbase, dst /* temporary, dst != xbase */, CompressedKlassPointers::shift());
} else {
add(dst, xbase, src);
}
@ -5874,13 +5871,14 @@ void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub. If
// you want to use it elsewhere, note that cnt must be >= CacheLineSize.
// you want to use it elsewhere, note that cnt must be >= zicboz_block_size.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
int zicboz_block_size = VM_Version::zicboz_block_size.value();
Label initial_table_end, loop;
// Align base with cache line size.
neg(tmp1, base);
andi(tmp1, tmp1, CacheLineSize - 1);
andi(tmp1, tmp1, zicboz_block_size - 1);
// tmp1: the number of bytes to be filled to align the base with cache line size.
add(base, base, tmp1);
@ -5890,16 +5888,16 @@ void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tm
la(tmp1, initial_table_end);
sub(tmp2, tmp1, tmp2);
jr(tmp2);
for (int i = -CacheLineSize + wordSize; i < 0; i += wordSize) {
for (int i = -zicboz_block_size + wordSize; i < 0; i += wordSize) {
sd(zr, Address(base, i));
}
bind(initial_table_end);
mv(tmp1, CacheLineSize / wordSize);
mv(tmp1, zicboz_block_size / wordSize);
bind(loop);
cbo_zero(base);
sub(cnt, cnt, tmp1);
addi(base, base, CacheLineSize);
addi(base, base, zicboz_block_size);
bge(cnt, tmp1, loop);
}

View File

@ -28,11 +28,13 @@
#include "code/compiledIC.hpp"
#include "nativeInst_riscv.hpp"
#include "oops/oop.inline.hpp"
#include "runtime/atomicAccess.hpp"
#include "runtime/handles.hpp"
#include "runtime/orderAccess.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/ostream.hpp"
#ifdef COMPILER1
#include "c1/c1_Runtime1.hpp"
@ -52,15 +54,15 @@ address NativeCall::destination() const {
address addr = instruction_address();
assert(NativeCall::is_at(addr), "unexpected code at call site");
address destination = MacroAssembler::target_addr_for_insn(addr);
address stub_addr = MacroAssembler::target_addr_for_insn(addr);
CodeBlob* cb = CodeCache::find_blob(addr);
assert(cb != nullptr && cb->is_nmethod(), "nmethod expected");
nmethod *nm = (nmethod *)cb;
assert(nm != nullptr, "Sanity");
assert(nm->stub_contains(destination), "Sanity");
assert(destination != nullptr, "Sanity");
return stub_address_destination_at(destination);
assert(nm->stub_contains(stub_addr), "Sanity");
assert(stub_addr != nullptr, "Sanity");
return stub_address_destination_at(stub_addr);
}
address NativeCall::reloc_destination() {
@ -89,6 +91,30 @@ void NativeCall::print() {
tty->print_cr(PTR_FORMAT ": auipc,ld,jalr x1, offset/reg, ", p2i(instruction_address()));
}
void NativeCall::optimize_call(address dest, bool mt_safe) {
// Skip over auipc + ld
address jmp_ins_pc = instruction_address() + 2 * NativeInstruction::instruction_size;
// Runtime calls may be unaligned, but they are never changed after relocation.
assert(!mt_safe || is_aligned(jmp_ins_pc, NativeInstruction::instruction_size), "Must be naturally aligned: %p", jmp_ins_pc);
// If reachable use JAL
if (Assembler::reachable_from_branch_at(jmp_ins_pc, dest)) {
int64_t distance = dest - jmp_ins_pc;
uint32_t new_jal = Assembler::encode_jal(ra, distance);
AtomicAccess::store((uint32_t *)jmp_ins_pc, new_jal);
} else if (!MacroAssembler::is_jalr_at(jmp_ins_pc)) { // The jalr is always identical: jalr ra, 0(t1)
uint32_t new_jalr = Assembler::encode_jalr(ra, t1, 0);
AtomicAccess::store((uint32_t *)jmp_ins_pc, new_jalr);
} else {
// No change to instruction stream
return;
}
// We changed instruction stream
if (mt_safe) {
// IC invalidate provides a leading full fence, it thus happens after we changed the instruction stream.
ICache::invalidate_range(jmp_ins_pc, NativeInstruction::instruction_size);
}
}
bool NativeCall::set_destination_mt_safe(address dest) {
assert(NativeCall::is_at(instruction_address()), "unexpected code at call site");
assert((CodeCache_lock->is_locked() || SafepointSynchronize::is_at_safepoint()) ||
@ -96,15 +122,17 @@ bool NativeCall::set_destination_mt_safe(address dest) {
"concurrent code patching");
address stub_addr = stub_address();
if (stub_addr != nullptr) {
set_stub_address_destination_at(stub_addr, dest);
return true;
}
assert(stub_addr != nullptr, "No stub?");
set_stub_address_destination_at(stub_addr, dest); // release
// optimize_call happens after we stored new address in addr stub.
// patches jalr -> jal/jal -> jalr depending on dest
optimize_call(dest, true);
return false;
return true;
}
bool NativeCall::reloc_set_destination(address dest) {
// The argument passed in is the address to the stub containing the destination
bool NativeCall::reloc_set_destination(address stub_addr) {
address call_addr = instruction_address();
assert(NativeCall::is_at(call_addr), "unexpected code at call site");
@ -113,10 +141,12 @@ bool NativeCall::reloc_set_destination(address dest) {
if (code->is_nmethod()) {
// TODO: Need to revisit this when porting the AOT features.
assert(dest != nullptr, "Sanity");
assert(dest == trampoline_stub_Relocation::get_trampoline_for(call_addr,
code->as_nmethod()), "Sanity");
MacroAssembler::pd_patch_instruction_size(call_addr, dest);
assert(stub_addr != nullptr, "Sanity");
assert(stub_addr == trampoline_stub_Relocation::get_trampoline_for(call_addr, code->as_nmethod()), "Sanity");
MacroAssembler::pd_patch_instruction_size(call_addr, stub_addr); // patches auipc + ld to stub_addr
address dest = stub_address_destination_at(stub_addr);
optimize_call(dest, false); // patches jalr -> jal/jal -> jalr depending on dest
}
return true;
@ -142,9 +172,9 @@ address NativeCall::stub_address() {
CodeBlob *code = CodeCache::find_blob(call_addr);
assert(code != nullptr, "Could not find the containing code blob");
address dest = MacroAssembler::target_addr_for_insn(call_addr);
assert(code->contains(dest), "Sanity");
return dest;
address stub_addr = MacroAssembler::target_addr_for_insn(call_addr);
assert(code->contains(stub_addr), "Sanity");
return stub_addr;
}
bool NativeCall::is_at(address addr) {
@ -160,6 +190,15 @@ bool NativeCall::is_at(address addr) {
(MacroAssembler::extract_rd(addr + 2 * instr_size) == x1)) {
return true;
}
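// Also accept the near form produced by optimize_call(), where the trailing
// jalr has been rewritten into a direct jal (the auipc + ld remain in place).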
if (MacroAssembler::is_auipc_at(addr) &&
MacroAssembler::is_ld_at(addr + instr_size) &&
MacroAssembler::is_jal_at(addr + 2 * instr_size) &&
(MacroAssembler::extract_rd(addr) == x6) &&
(MacroAssembler::extract_rd(addr + instr_size) == x6) &&
(MacroAssembler::extract_rs1(addr + instr_size) == x6) &&
(MacroAssembler::extract_rd(addr + 2 * instr_size) == x1)) {
return true;
}
return false;
}

View File

@ -156,6 +156,10 @@ class NativeCall: private NativeInstruction {
static void set_stub_address_destination_at(address dest, address value);
// return target address at stub
static address stub_address_destination_at(address src);
// We either have a jalr or jal depending on distance to old destination.
// This method emits a new jal if new destination is within jal reach.
// Otherwise restores the jalr which can reach any destination.
void optimize_call(address dest, bool mt_safe = true);
};
// An interface for accessing/manipulating native mov reg, imm instructions.

View File

@ -3845,9 +3845,6 @@ attributes %{
// ...in one line.
instruction_fetch_units = 1;
// List of nop instructions
nops( MachNop );
%}
// We don't use an actual pipeline model so don't care about resources
@ -8941,7 +8938,7 @@ instruct encodeKlass_not_null(iRegNNoSp dst, iRegP src) %{
instruct decodeKlass_not_null(iRegPNoSp dst, iRegN src, iRegPNoSp tmp) %{
match(Set dst (DecodeNKlass src));
effect(TEMP tmp);
effect(TEMP_DEF dst, TEMP tmp);
ins_cost(ALU_COST);
format %{ "decode_klass_not_null $dst, $src\t#@decodeKlass_not_null" %}

View File

@ -683,10 +683,11 @@ class StubGenerator: public StubCodeGenerator {
address start = __ pc();
if (UseBlockZeroing) {
// Ensure count >= 2*CacheLineSize so that it still deserves a cbo.zero
// after alignment.
int zicboz_block_size = VM_Version::zicboz_block_size.value();
// Ensure count >= 2 * zicboz_block_size so that it still deserves
// a cbo.zero after alignment.
Label small;
int low_limit = MAX2(2 * CacheLineSize, BlockZeroingLowLimit) / wordSize;
int low_limit = MAX2(2 * zicboz_block_size, (int)BlockZeroingLowLimit) / wordSize;
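// e.g. with a 64-byte cbo.zero block and BlockZeroingLowLimit <= 128,
// low_limit = 128 / wordSize = 16 words (illustrative numbers only)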
__ mv(tmp1, low_limit);
__ blt(cnt, tmp1, small);
__ zero_dcache_blocks(base, cnt, tmp1, tmp2);
@ -731,8 +732,7 @@ class StubGenerator: public StubCodeGenerator {
//
// s and d are adjusted to point to the remaining words to copy
//
void generate_copy_longs(StubId stub_id, Label &start,
Register s, Register d, Register count) {
address generate_copy_longs(StubId stub_id, Register s, Register d, Register count) {
BasicType type;
copy_direction direction;
switch (stub_id) {
@ -762,7 +762,7 @@ class StubGenerator: public StubCodeGenerator {
Label again, drain;
StubCodeMark mark(this, stub_id);
__ align(CodeEntryAlignment);
__ bind(start);
address start = __ pc();
if (direction == copy_forwards) {
__ sub(s, s, bias);
@ -878,9 +878,9 @@ class StubGenerator: public StubCodeGenerator {
}
__ ret();
}
Label copy_f, copy_b;
return start;
}
typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp);
@ -1098,8 +1098,8 @@ class StubGenerator: public StubCodeGenerator {
// stub_id - is used to name the stub and identify all details of
// how to perform the copy.
//
// entry - is assigned to the stub's post push entry point unless
// it is null
// nopush_entry - is assigned to the stub's post push entry point
// unless it is null
//
// Inputs:
// c_rarg0 - source array address
@ -1110,11 +1110,11 @@ class StubGenerator: public StubCodeGenerator {
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects: entry is set to the (post push) entry point so it
// can be used by the corresponding conjoint copy
// method
// Side Effects: nopush_entry is set to the (post push) entry point
// so it can be used by the corresponding conjoint
// copy method
//
address generate_disjoint_copy(StubId stub_id, address* entry) {
address generate_disjoint_copy(StubId stub_id, address* nopush_entry) {
size_t size;
bool aligned;
bool is_oop;
@ -1203,8 +1203,8 @@ class StubGenerator: public StubCodeGenerator {
address start = __ pc();
__ enter();
if (entry != nullptr) {
*entry = __ pc();
if (nopush_entry != nullptr) {
*nopush_entry = __ pc();
// caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
BLOCK_COMMENT("Entry:");
}
@ -1255,8 +1255,8 @@ class StubGenerator: public StubCodeGenerator {
// corresponding disjoint copy routine which can be
// jumped to if the ranges do not actually overlap
//
// entry - is assigned to the stub's post push entry point unless
// it is null
// nopush_entry - is assigned to the stub's post push entry point
// unless it is null
//
// Inputs:
// c_rarg0 - source array address
@ -1268,10 +1268,10 @@ class StubGenerator: public StubCodeGenerator {
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects:
// entry is set to the no-overlap entry point so it can be used by
// some other conjoint copy method
// nopush_entry is set to the no-overlap entry point so it can be
// used by some other conjoint copy method
//
address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *entry) {
address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
RegSet saved_regs = RegSet::of(s, d, count);
int size;
@ -1358,8 +1358,8 @@ class StubGenerator: public StubCodeGenerator {
address start = __ pc();
__ enter();
if (entry != nullptr) {
*entry = __ pc();
if (nopush_entry != nullptr) {
*nopush_entry = __ pc();
// caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
BLOCK_COMMENT("Entry:");
}
@ -1369,7 +1369,7 @@ class StubGenerator: public StubCodeGenerator {
__ slli(t1, count, exact_log2(size));
Label L_continue;
__ bltu(t0, t1, L_continue);
__ j(nooverlap_target);
__ j(RuntimeAddress(nooverlap_target));
__ bind(L_continue);
DecoratorSet decorators = IN_HEAP | IS_ARRAY;
@ -1444,7 +1444,7 @@ class StubGenerator: public StubCodeGenerator {
// x10 == 0 - success
// x10 == -1^K - failure, where K is partial transfer count
//
address generate_checkcast_copy(StubId stub_id, address* entry) {
address generate_checkcast_copy(StubId stub_id, address* nopush_entry) {
bool dest_uninitialized;
switch (stub_id) {
case StubId::stubgen_checkcast_arraycopy_id:
@ -1495,8 +1495,8 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// Caller of this entry point must set up the argument registers.
if (entry != nullptr) {
*entry = __ pc();
if (nopush_entry != nullptr) {
*nopush_entry = __ pc();
BLOCK_COMMENT("Entry:");
}
@ -2293,13 +2293,21 @@ class StubGenerator: public StubCodeGenerator {
}
void generate_arraycopy_stubs() {
address entry = nullptr;
address entry_jbyte_arraycopy = nullptr;
address entry_jshort_arraycopy = nullptr;
address entry_jint_arraycopy = nullptr;
address entry_oop_arraycopy = nullptr;
address entry_jlong_arraycopy = nullptr;
address entry_checkcast_arraycopy = nullptr;
// Some copy stubs publish a normal entry and then a 2nd 'fallback'
// entry immediately following their stack push. This can be used
// as a post-push branch target for compatible stubs when they
// identify a special case that can be handled by the fallback
// stub, e.g. a disjoint copy stub may be used as a special-case
// fallback for its compatible conjoint copy stub.
//
// A no push entry is always returned in the following local and
// then published by assigning to the appropriate entry field in
// class StubRoutines. The entry value is then passed to the
// generator for the compatible stub. That means the entry must be
// listed when saving to/restoring from the AOT cache, ensuring
// that the inter-stub jumps are noted at AOT-cache save and
// relocated at AOT cache load.
address nopush_entry = nullptr;
// generate the common exit first so later stubs can rely on it if
// they want an UnsafeMemoryAccess exit non-local to the stub
@ -2307,72 +2315,117 @@ class StubGenerator: public StubCodeGenerator {
// register the stub as the default exit with class UnsafeMemoryAccess
UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
generate_copy_longs(StubId::stubgen_copy_byte_f_id, copy_f, c_rarg0, c_rarg1, t1);
generate_copy_longs(StubId::stubgen_copy_byte_b_id, copy_b, c_rarg0, c_rarg1, t1);
// generate and publish riscv-specific bulk copy routines first
// so we can call them from other copy stubs
StubRoutines::riscv::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, c_rarg0, c_rarg1, t1);
StubRoutines::riscv::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, c_rarg0, c_rarg1, t1);
StubRoutines::riscv::_zero_blocks = generate_zero_blocks();
//*** jbyte
// Always need aligned and unaligned versions
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &entry);
StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, entry, nullptr);
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
//*** jshort
// Always need aligned and unaligned versions
StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &entry);
StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, entry, nullptr);
StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
//*** jint
// Aligned versions
StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
// In 64-bit we need both aligned and unaligned versions of jint arraycopy.
// _jint_arraycopy_nopush always points to the unaligned version
StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &entry);
StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, entry, &entry_jint_arraycopy);
StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jint_arraycopy_nopush = nopush_entry;
//*** jlong
// It is always aligned
StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &entry);
StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
// disjoint normal/nopush and conjoint normal entries are not
// generated since the arrayof versions are the same
StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
//*** oops
StubRoutines::_arrayof_oop_disjoint_arraycopy
= generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &entry);
= generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
// disjoint arrayof nopush entry is needed by conjoint copy
StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_arrayof_oop_arraycopy
= generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
= generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint arrayof nopush entry is needed by generic/unsafe copy
StubRoutines::_oop_arraycopy_nopush = nopush_entry;
// Aligned versions without pre-barriers
StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
= generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
StubRoutines::_arrayof_oop_arraycopy_uninit
= generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, entry, nullptr);
= generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
// disjoint arrayof+uninit nopush entry is needed by conjoint copy
StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
// note that we don't need a returned nopush entry because the
// generic/unsafe copy does not cater for uninit arrays.
StubRoutines::_arrayof_oop_arraycopy_uninit
= generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
// for oop copies reuse arrayof entries for non-arrayof cases
StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &entry_checkcast_arraycopy);
StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
// checkcast nopush entry is needed by generic copy
StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
// note that we don't need a returned nopush entry because the
// generic copy does not cater for uninit arrays.
StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy,
entry_jshort_arraycopy,
entry_jint_arraycopy,
entry_jlong_arraycopy);
// unsafe arraycopy may fallback on conjoint stubs
StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
StubRoutines::_jshort_arraycopy_nopush,
StubRoutines::_jint_arraycopy_nopush,
StubRoutines::_jlong_arraycopy_nopush);
StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy,
entry_jshort_arraycopy,
entry_jint_arraycopy,
entry_oop_arraycopy,
entry_jlong_arraycopy,
entry_checkcast_arraycopy);
// generic arraycopy may fallback on conjoint stubs
StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
StubRoutines::_jshort_arraycopy_nopush,
StubRoutines::_jint_arraycopy_nopush,
StubRoutines::_oop_arraycopy_nopush,
StubRoutines::_jlong_arraycopy_nopush,
StubRoutines::_checkcast_arraycopy_nopush);
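To make the fallback relationship concrete: once the array types check out, the generic stub dispatches on element size to the conjoint nopush entries wired in above. A rough C-level model of that dispatch (the function-pointer signature and names are illustrative; the real stub does this in assembly and also handles the oop and checkcast paths):

#include <cstddef>

typedef void (*copy_entry_t)(void* src, void* dst, size_t count);

void generic_copy_dispatch(void* src, void* dst, size_t count, int log2_elem_size,
                           copy_entry_t jbyte_nopush, copy_entry_t jshort_nopush,
                           copy_entry_t jint_nopush,  copy_entry_t jlong_nopush) {
    switch (log2_elem_size) {
        case 0: jbyte_nopush(src, dst, count);  break; // 1-byte elements
        case 1: jshort_nopush(src, dst, count); break; // 2-byte elements
        case 2: jint_nopush(src, dst, count);   break; // 4-byte elements
        case 3: jlong_nopush(src, dst, count);  break; // 8-byte elements
    }
}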
StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);

View File

@ -181,12 +181,13 @@ void VM_Version::common_initialize() {
FLAG_SET_DEFAULT(UsePopCountInstruction, false);
}
if (UseZicboz) {
if (UseZicboz && zicboz_block_size.enabled() && zicboz_block_size.value() > 0) {
assert(is_power_of_2(zicboz_block_size.value()), "Sanity");
if (FLAG_IS_DEFAULT(UseBlockZeroing)) {
FLAG_SET_DEFAULT(UseBlockZeroing, true);
}
if (FLAG_IS_DEFAULT(BlockZeroingLowLimit)) {
FLAG_SET_DEFAULT(BlockZeroingLowLimit, 2 * CacheLineSize);
FLAG_SET_DEFAULT(BlockZeroingLowLimit, 4 * zicboz_block_size.value());
}
} else if (UseBlockZeroing) {
warning("Block zeroing is not available");

View File

@ -162,45 +162,46 @@ class VM_Version : public Abstract_VM_Version {
// Note: the order matters, depender should be after their dependee. E.g. ext_V before ext_Zvbb.
// declaration name , extension name, bit pos ,in str, mapped flag)
#define RV_FEATURE_FLAGS(decl) \
decl(ext_I , "i" , ('I' - 'A'), true , NO_UPDATE_DEFAULT) \
decl(ext_M , "m" , ('M' - 'A'), true , NO_UPDATE_DEFAULT) \
decl(ext_A , "a" , ('A' - 'A'), true , NO_UPDATE_DEFAULT) \
decl(ext_F , "f" , ('F' - 'A'), true , NO_UPDATE_DEFAULT) \
decl(ext_D , "d" , ('D' - 'A'), true , NO_UPDATE_DEFAULT) \
decl(ext_C , "c" , ('C' - 'A'), true , UPDATE_DEFAULT(UseRVC)) \
decl(ext_Q , "q" , ('Q' - 'A'), true , NO_UPDATE_DEFAULT) \
decl(ext_H , "h" , ('H' - 'A'), true , NO_UPDATE_DEFAULT) \
decl(ext_V , "v" , ('V' - 'A'), true , UPDATE_DEFAULT(UseRVV)) \
decl(ext_Zicbom , "Zicbom" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZicbom)) \
decl(ext_Zicboz , "Zicboz" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZicboz)) \
decl(ext_Zicbop , "Zicbop" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZicbop)) \
decl(ext_Zba , "Zba" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZba)) \
decl(ext_Zbb , "Zbb" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZbb)) \
decl(ext_Zbc , "Zbc" , RV_NO_FLAG_BIT, true , NO_UPDATE_DEFAULT) \
decl(ext_Zbs , "Zbs" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZbs)) \
decl(ext_Zbkb , "Zbkb" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZbkb)) \
decl(ext_Zcb , "Zcb" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZcb)) \
decl(ext_Zfa , "Zfa" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZfa)) \
decl(ext_Zfh , "Zfh" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZfh)) \
decl(ext_Zfhmin , "Zfhmin" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZfhmin)) \
decl(ext_Zicsr , "Zicsr" , RV_NO_FLAG_BIT, true , NO_UPDATE_DEFAULT) \
decl(ext_Zicntr , "Zicntr" , RV_NO_FLAG_BIT, true , NO_UPDATE_DEFAULT) \
decl(ext_Zifencei , "Zifencei" , RV_NO_FLAG_BIT, true , NO_UPDATE_DEFAULT) \
decl(ext_Zic64b , "Zic64b" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZic64b)) \
decl(ext_Ztso , "Ztso" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZtso)) \
decl(ext_Zihintpause , "Zihintpause" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZihintpause)) \
decl(ext_Zacas , "Zacas" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZacas)) \
decl(ext_Zvbb , "Zvbb" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT_DEP(UseZvbb, ext_V)) \
decl(ext_Zvbc , "Zvbc" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT_DEP(UseZvbc, ext_V)) \
decl(ext_Zvfh , "Zvfh" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT_DEP(UseZvfh, ext_V)) \
decl(ext_Zvkn , "Zvkn" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT_DEP(UseZvkn, ext_V)) \
decl(ext_Zicond , "Zicond" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZicond)) \
decl(mvendorid , "VendorId" , RV_NO_FLAG_BIT, false, NO_UPDATE_DEFAULT) \
decl(marchid , "ArchId" , RV_NO_FLAG_BIT, false, NO_UPDATE_DEFAULT) \
decl(mimpid , "ImpId" , RV_NO_FLAG_BIT, false, NO_UPDATE_DEFAULT) \
decl(unaligned_access, "Unaligned" , RV_NO_FLAG_BIT, false, NO_UPDATE_DEFAULT) \
decl(satp_mode , "SATP" , RV_NO_FLAG_BIT, false, NO_UPDATE_DEFAULT) \
#define RV_FEATURE_FLAGS(decl) \
decl(ext_I , "i" , ('I' - 'A'), true , NO_UPDATE_DEFAULT) \
decl(ext_M , "m" , ('M' - 'A'), true , NO_UPDATE_DEFAULT) \
decl(ext_A , "a" , ('A' - 'A'), true , NO_UPDATE_DEFAULT) \
decl(ext_F , "f" , ('F' - 'A'), true , NO_UPDATE_DEFAULT) \
decl(ext_D , "d" , ('D' - 'A'), true , NO_UPDATE_DEFAULT) \
decl(ext_C , "c" , ('C' - 'A'), true , UPDATE_DEFAULT(UseRVC)) \
decl(ext_Q , "q" , ('Q' - 'A'), true , NO_UPDATE_DEFAULT) \
decl(ext_H , "h" , ('H' - 'A'), true , NO_UPDATE_DEFAULT) \
decl(ext_V , "v" , ('V' - 'A'), true , UPDATE_DEFAULT(UseRVV)) \
decl(ext_Zicbom , "Zicbom" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZicbom)) \
decl(ext_Zicboz , "Zicboz" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZicboz)) \
decl(ext_Zicbop , "Zicbop" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZicbop)) \
decl(ext_Zba , "Zba" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZba)) \
decl(ext_Zbb , "Zbb" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZbb)) \
decl(ext_Zbc , "Zbc" , RV_NO_FLAG_BIT, true , NO_UPDATE_DEFAULT) \
decl(ext_Zbs , "Zbs" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZbs)) \
decl(ext_Zbkb , "Zbkb" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZbkb)) \
decl(ext_Zcb , "Zcb" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZcb)) \
decl(ext_Zfa , "Zfa" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZfa)) \
decl(ext_Zfh , "Zfh" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZfh)) \
decl(ext_Zfhmin , "Zfhmin" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZfhmin)) \
decl(ext_Zicsr , "Zicsr" , RV_NO_FLAG_BIT, true , NO_UPDATE_DEFAULT) \
decl(ext_Zicntr , "Zicntr" , RV_NO_FLAG_BIT, true , NO_UPDATE_DEFAULT) \
decl(ext_Zifencei , "Zifencei" , RV_NO_FLAG_BIT, true , NO_UPDATE_DEFAULT) \
decl(ext_Zic64b , "Zic64b" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZic64b)) \
decl(ext_Ztso , "Ztso" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZtso)) \
decl(ext_Zihintpause , "Zihintpause" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZihintpause)) \
decl(ext_Zacas , "Zacas" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZacas)) \
decl(ext_Zvbb , "Zvbb" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT_DEP(UseZvbb, ext_V)) \
decl(ext_Zvbc , "Zvbc" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT_DEP(UseZvbc, ext_V)) \
decl(ext_Zvfh , "Zvfh" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT_DEP(UseZvfh, ext_V)) \
decl(ext_Zvkn , "Zvkn" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT_DEP(UseZvkn, ext_V)) \
decl(ext_Zicond , "Zicond" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZicond)) \
decl(mvendorid , "VendorId" , RV_NO_FLAG_BIT, false, NO_UPDATE_DEFAULT) \
decl(marchid , "ArchId" , RV_NO_FLAG_BIT, false, NO_UPDATE_DEFAULT) \
decl(mimpid , "ImpId" , RV_NO_FLAG_BIT, false, NO_UPDATE_DEFAULT) \
decl(unaligned_access , "Unaligned" , RV_NO_FLAG_BIT, false, NO_UPDATE_DEFAULT) \
decl(satp_mode , "SATP" , RV_NO_FLAG_BIT, false, NO_UPDATE_DEFAULT) \
decl(zicboz_block_size, "ZicbozBlockSize", RV_NO_FLAG_BIT, false, NO_UPDATE_DEFAULT) \
#define DECLARE_RV_FEATURE(NAME, PRETTY, BIT, FSTRING, FLAGF) \
struct NAME##RVFeatureValue : public RVFeatureValue { \

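For readers unfamiliar with the idiom, RV_FEATURE_FLAGS is an X-macro: each decl(...) row is re-expanded by caller-supplied macros such as DECLARE_RV_FEATURE, so adding a row like zicboz_block_size automatically threads the new feature through every expansion site. A minimal self-contained sketch of the technique (the feature names here are just examples):

#include <cstdio>

// One row per feature: (identifier, pretty-print string).
#define FEATURE_FLAGS(decl) \
    decl(ext_I, "i")        \
    decl(ext_M, "m")        \
    decl(ext_V, "v")

// First expansion: declare a bool per feature.
#define DECLARE_FEATURE(NAME, PRETTY) bool NAME##_supported = false;
FEATURE_FLAGS(DECLARE_FEATURE)

// Second expansion over the same table: print every feature.
#define PRINT_FEATURE(NAME, PRETTY) std::printf("%s: %d\n", PRETTY, (int)NAME##_supported);
void print_features() { FEATURE_FLAGS(PRINT_FEATURE) }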
View File

@ -2816,10 +2816,6 @@ void LIR_Assembler::align_backward_branch_target() {
__ align(OptoLoopAlignment);
}
void LIR_Assembler::emit_delay(LIR_OpDelay* op) {
ShouldNotCallThis(); // There are no delay slots on ZARCH_64.
}
void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest, LIR_Opr tmp) {
// tmp must be unused
assert(tmp->is_illegal(), "wasting a register if tmp is allocated");

View File

@ -53,11 +53,26 @@ class NativeMethodBarrier: public NativeInstruction {
return *((int32_t*)data_addr);
}
void set_guard_value(int value) {
int32_t* data_addr = (int32_t*)get_patchable_data_address();
void set_guard_value(int value, int bit_mask) {
if (bit_mask == ~0) {
int32_t* data_addr = (int32_t*)get_patchable_data_address();
// Set guard instruction value
*data_addr = value;
// Set guard instruction value
*data_addr = value;
return;
}
assert((value & ~bit_mask) == 0, "trying to set bits outside the mask");
value &= bit_mask;
int32_t* data_addr = (int32_t*)get_patchable_data_address();
int old_value = AtomicAccess::load(data_addr);
while (true) {
// Only bits in the mask are changed
int new_value = value | (old_value & ~bit_mask);
if (new_value == old_value) break;
int v = AtomicAccess::cmpxchg(data_addr, old_value, new_value, memory_order_release);
if (v == old_value) break;
old_value = v;
}
}
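The loop above is the standard "modify only the bits under a mask" compare-and-swap pattern: concurrent stores to the unmasked bits are preserved and the update is retried until it lands. A self-contained sketch of the same idea with std::atomic (AtomicAccess is HotSpot's own wrapper; this is only an illustration):

#include <atomic>
#include <cassert>
#include <cstdint>

// Merge `value` into `*word`, changing only the bits selected by `bit_mask`.
void set_bits_under_mask(std::atomic<int32_t>* word, int32_t value, int32_t bit_mask) {
    assert((value & ~bit_mask) == 0 && "value must fit inside the mask");
    int32_t old_value = word->load(std::memory_order_relaxed);
    while (true) {
        int32_t new_value = value | (old_value & ~bit_mask);
        if (new_value == old_value) break;            // the masked bits already hold `value`
        if (word->compare_exchange_weak(old_value, new_value,
                                        std::memory_order_release,
                                        std::memory_order_relaxed)) {
            break;                                    // our merged word was installed
        }
        // on failure old_value now holds the freshly loaded word; retry the merge
    }
}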
#ifdef ASSERT
@ -100,13 +115,13 @@ void BarrierSetNMethod::deoptimize(nmethod* nm, address* return_address_ptr) {
return;
}
void BarrierSetNMethod::set_guard_value(nmethod* nm, int value) {
void BarrierSetNMethod::set_guard_value(nmethod* nm, int value, int bit_mask) {
if (!supports_entry_barrier(nm)) {
return;
}
NativeMethodBarrier* barrier = get_nmethod_barrier(nm);
barrier->set_guard_value(value);
barrier->set_guard_value(value, bit_mask);
}
int BarrierSetNMethod::guard_value(nmethod* nm) {

View File

@ -1398,11 +1398,7 @@ void Assembler::addl(Address dst, Register src) {
void Assembler::eaddl(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_32bit);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_int8(0x01);
emit_operand(src2, src1, 0);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0x01, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void Assembler::addl(Register dst, int32_t imm32) {
@ -1432,11 +1428,7 @@ void Assembler::addl(Register dst, Register src) {
}
void Assembler::eaddl(Register dst, Register src1, Register src2, bool no_flags) {
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void)emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_arith(0x03, 0xC0, src1, src2);
emit_eevex_prefix_or_demote_arith_ndd(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0x03, 0xC0, no_flags, true /* is_commutative */);
}
void Assembler::addr_nop_4() {
@ -1657,17 +1649,18 @@ void Assembler::eandl(Register dst, Register src1, Address src2, bool no_flags)
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0x23, no_flags);
}
void Assembler::eandl(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0x21, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void Assembler::andl(Register dst, Register src) {
(void) prefix_and_encode(dst->encoding(), src->encoding());
emit_arith(0x23, 0xC0, dst, src);
}
void Assembler::eandl(Register dst, Register src1, Register src2, bool no_flags) {
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void) emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_arith(0x23, 0xC0, src1, src2);
emit_eevex_prefix_or_demote_arith_ndd(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0x23, 0xC0, no_flags, true /* is_commutative */);
}
void Assembler::andnl(Register dst, Register src1, Register src2) {
@ -2519,7 +2512,7 @@ void Assembler::imull(Register dst, Register src) {
}
void Assembler::eimull(Register dst, Register src1, Register src2, bool no_flags) {
emit_eevex_or_demote(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0xAF, no_flags, true /* is_map1 */, true /* swap */);
emit_eevex_or_demote(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0xAF, no_flags, true /* is_map1 */, true /* swap */, true /* is_commutative */);
}
void Assembler::imull(Register dst, Address src, int32_t value) {
@ -4419,11 +4412,7 @@ void Assembler::enotl(Register dst, Register src) {
}
void Assembler::eorw(Register dst, Register src1, Register src2, bool no_flags) {
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void) emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_arith(0x0B, 0xC0, src1, src2);
emit_eevex_prefix_or_demote_arith_ndd(dst, src1, src2, VEX_SIMD_66, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_16bit, 0x0B, 0xC0, no_flags, true /* is_commutative */);
}
void Assembler::orl(Address dst, int32_t imm32) {
@ -4467,11 +4456,7 @@ void Assembler::orl(Register dst, Register src) {
}
void Assembler::eorl(Register dst, Register src1, Register src2, bool no_flags) {
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void) emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_arith(0x0B, 0xC0, src1, src2);
emit_eevex_prefix_or_demote_arith_ndd(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0x0B, 0xC0, no_flags, true /* is_commutative */);
}
void Assembler::orl(Address dst, Register src) {
@ -4483,11 +4468,7 @@ void Assembler::orl(Address dst, Register src) {
void Assembler::eorl(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_32bit);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_int8(0x09);
emit_operand(src2, src1, 0);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0x09, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void Assembler::orb(Address dst, int imm8) {
@ -4517,11 +4498,7 @@ void Assembler::orb(Address dst, Register src) {
void Assembler::eorb(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_8bit);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_int8(0x08);
emit_operand(src2, src1, 0);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_8bit, 0x08, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void Assembler::packsswb(XMMRegister dst, XMMRegister src) {
@ -7323,11 +7300,7 @@ void Assembler::xorl(Register dst, Register src) {
}
void Assembler::exorl(Register dst, Register src1, Register src2, bool no_flags) {
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void) emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_arith(0x33, 0xC0, src1, src2);
emit_eevex_prefix_or_demote_arith_ndd(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0x33, 0xC0, no_flags, true /* is_commutative */);
}
void Assembler::xorl(Address dst, Register src) {
@ -7339,11 +7312,7 @@ void Assembler::xorl(Address dst, Register src) {
void Assembler::exorl(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_32bit);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_int8(0x31);
emit_operand(src2, src1, 0);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_32bit, 0x31, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void Assembler::xorb(Register dst, Address src) {
@ -7367,11 +7336,7 @@ void Assembler::xorb(Address dst, Register src) {
void Assembler::exorb(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_8bit);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_int8(0x30);
emit_operand(src2, src1, 0);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_8bit, 0x30, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void Assembler::xorw(Register dst, Address src) {
@ -12955,6 +12920,31 @@ void Assembler::eevex_prefix_ndd(Address adr, int ndd_enc, int xreg_enc, VexSimd
vex_prefix(adr, ndd_enc, xreg_enc, pre, opc, attributes, /* nds_is_ndd */ true, no_flags);
}
void Assembler::emit_eevex_or_demote(Register dst, Address src1, Register src2, VexSimdPrefix pre, VexOpcode opc,
int size, int opcode_byte, bool no_flags, bool is_map1, bool is_commutative) {
if (is_commutative && is_demotable(no_flags, dst->encoding(), src2->encoding())) {
// Opcode byte adjustment due to mismatch between NDD and equivalent demotable variant
opcode_byte += 2;
if (size == EVEX_64bit) {
emit_prefix_and_int8(get_prefixq(src1, dst, is_map1), opcode_byte);
} else {
// For 32-bit, 16-bit and 8-bit
if (size == EVEX_16bit) {
emit_int8(0x66);
}
prefix(src1, dst, false, is_map1);
emit_int8(opcode_byte);
}
} else {
bool vex_w = (size == EVEX_64bit) ? true : false;
InstructionAttr attributes(AVX_128bit, vex_w, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, size);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), pre, opc, &attributes, no_flags);
emit_int8(opcode_byte);
}
emit_operand(src2, src1, 0);
}
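The is_commutative path generalizes the existing demotion rule: the NDD form dst = src1 OP src2 can drop to the shorter legacy two-operand encoding not only when dst aliases src1 but, for commutative operations, also when dst aliases src2, with the operands swapped. A compact sketch of that decision (plain ints stand in for register encodings, and the real is_demotable also consults the no-flags bit; not the HotSpot API):

// Returns true if the 3-operand form can be demoted, and reports which
// source register survives as the single operand of the legacy encoding.
bool try_demote(int dst, int src1, int src2, bool is_commutative, int* remaining_src) {
    if (dst == src1) {                   // dst = dst OP src2   ->  OP dst, src2
        *remaining_src = src2;
        return true;
    }
    if (is_commutative && dst == src2) { // dst = src1 OP dst   ->  OP dst, src1
        *remaining_src = src1;
        return true;
    }
    return false;                        // keep the full EVEX NDD encoding
}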
void Assembler::emit_eevex_or_demote(Register dst, Register src1, Address src2, VexSimdPrefix pre, VexOpcode opc,
int size, int opcode_byte, bool no_flags, bool is_map1) {
if (is_demotable(no_flags, dst->encoding(), src1->encoding())) {
@ -13055,18 +13045,20 @@ void Assembler::emit_eevex_or_demote(int dst_enc, int nds_enc, int src_enc, int8
}
void Assembler::emit_eevex_or_demote(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc,
int size, int opcode_byte, bool no_flags, bool is_map1, bool swap) {
int size, int opcode_byte, bool no_flags, bool is_map1, bool swap, bool is_commutative) {
int encode;
bool is_prefixq = (size == EVEX_64bit) ? true : false;
if (is_demotable(no_flags, dst_enc, nds_enc)) {
bool first_operand_demotable = is_demotable(no_flags, dst_enc, nds_enc);
bool second_operand_demotable = is_commutative && is_demotable(no_flags, dst_enc, src_enc);
if (first_operand_demotable || second_operand_demotable) {
if (size == EVEX_16bit) {
emit_int8(0x66);
}
int src = first_operand_demotable ? src_enc : nds_enc;
if (swap) {
encode = is_prefixq ? prefixq_and_encode(dst_enc, src_enc, is_map1) : prefix_and_encode(dst_enc, src_enc, is_map1);
encode = is_prefixq ? prefixq_and_encode(dst_enc, src, is_map1) : prefix_and_encode(dst_enc, src, is_map1);
} else {
encode = is_prefixq ? prefixq_and_encode(src_enc, dst_enc, is_map1) : prefix_and_encode(src_enc, dst_enc, is_map1);
encode = is_prefixq ? prefixq_and_encode(src, dst_enc, is_map1) : prefix_and_encode(src, dst_enc, is_map1);
}
emit_opcode_prefix_and_encoding((unsigned char)opcode_byte, 0xC0, encode);
} else {
@ -13114,6 +13106,26 @@ int Assembler::eevex_prefix_and_encode_nf(int dst_enc, int nds_enc, int src_enc,
return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, attributes, /* src_is_gpr */ true, /* nds_is_ndd */ false, no_flags);
}
void Assembler::emit_eevex_prefix_or_demote_arith_ndd(Register dst, Register src1, Register src2, VexSimdPrefix pre, VexOpcode opc,
int size, int op1, int op2, bool no_flags, bool is_commutative) {
bool demotable = is_demotable(no_flags, dst->encoding(), src1->encoding());
if (!demotable && is_commutative) {
if (is_demotable(no_flags, dst->encoding(), src2->encoding())) {
// swap src1 and src2
Register tmp = src1;
src1 = src2;
src2 = tmp;
}
}
bool vex_w = (size == EVEX_64bit) ? true : false;
bool use_prefixq = vex_w;
InstructionAttr attributes(AVX_128bit, vex_w, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void)emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), pre, opc, &attributes, no_flags, use_prefixq);
emit_arith(op1, op2, src1, src2);
}
void Assembler::emit_eevex_prefix_or_demote_arith_ndd(Register dst, Register nds, int32_t imm32, VexSimdPrefix pre, VexOpcode opc,
int size, int op1, int op2, bool no_flags) {
int dst_enc = dst->encoding();
@ -13124,7 +13136,6 @@ void Assembler::emit_eevex_prefix_or_demote_arith_ndd(Register dst, Register nds
} else {
bool vex_w = (size == EVEX_64bit) ? true : false;
InstructionAttr attributes(AVX_128bit, vex_w, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
//attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, size);
attributes.set_is_evex_instruction();
vex_prefix_and_encode(0, dst_enc, nds_enc, pre, opc, &attributes, /* src_is_gpr */ true, /* nds_is_ndd */ true, no_flags);
@ -13769,7 +13780,7 @@ void Assembler::pdepq(Register dst, Register src1, Address src2) {
void Assembler::sarxl(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes, true);
emit_int16((unsigned char)0xF7, (0xC0 | encode));
}
@ -13777,7 +13788,7 @@ void Assembler::sarxl(Register dst, Register src1, Register src2) {
void Assembler::sarxl(Register dst, Address src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_32bit);
vex_prefix(src1, src2->encoding(), dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes);
emit_int8((unsigned char)0xF7);
@ -13786,7 +13797,7 @@ void Assembler::sarxl(Register dst, Address src1, Register src2) {
void Assembler::sarxq(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes, true);
emit_int16((unsigned char)0xF7, (0xC0 | encode));
}
@ -13794,7 +13805,7 @@ void Assembler::sarxq(Register dst, Register src1, Register src2) {
void Assembler::sarxq(Register dst, Address src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_64bit);
vex_prefix(src1, src2->encoding(), dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes);
emit_int8((unsigned char)0xF7);
@ -13803,7 +13814,7 @@ void Assembler::sarxq(Register dst, Address src1, Register src2) {
void Assembler::shlxl(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes, true);
emit_int16((unsigned char)0xF7, (0xC0 | encode));
}
@ -13811,7 +13822,7 @@ void Assembler::shlxl(Register dst, Register src1, Register src2) {
void Assembler::shlxl(Register dst, Address src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_32bit);
vex_prefix(src1, src2->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8((unsigned char)0xF7);
@ -13820,7 +13831,7 @@ void Assembler::shlxl(Register dst, Address src1, Register src2) {
void Assembler::shlxq(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes, true);
emit_int16((unsigned char)0xF7, (0xC0 | encode));
}
@ -13828,7 +13839,7 @@ void Assembler::shlxq(Register dst, Register src1, Register src2) {
void Assembler::shlxq(Register dst, Address src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_64bit);
vex_prefix(src1, src2->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8((unsigned char)0xF7);
@ -13837,7 +13848,7 @@ void Assembler::shlxq(Register dst, Address src1, Register src2) {
void Assembler::shrxl(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, &attributes, true);
emit_int16((unsigned char)0xF7, (0xC0 | encode));
}
@ -13845,7 +13856,7 @@ void Assembler::shrxl(Register dst, Register src1, Register src2) {
void Assembler::shrxl(Register dst, Address src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_32bit);
vex_prefix(src1, src2->encoding(), dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, &attributes);
emit_int8((unsigned char)0xF7);
@ -13854,7 +13865,7 @@ void Assembler::shrxl(Register dst, Address src1, Register src2) {
void Assembler::shrxq(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, &attributes, true);
emit_int16((unsigned char)0xF7, (0xC0 | encode));
}
@ -13862,7 +13873,7 @@ void Assembler::shrxq(Register dst, Register src1, Register src2) {
void Assembler::shrxq(Register dst, Address src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_64bit);
vex_prefix(src1, src2->encoding(), dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, &attributes);
emit_int8((unsigned char)0xF7);
@ -14623,11 +14634,7 @@ void Assembler::addq(Address dst, Register src) {
void Assembler::eaddq(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_64bit);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_int8(0x01);
emit_operand(src2, src1, 0);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0x01, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void Assembler::addq(Register dst, int32_t imm32) {
@ -14656,11 +14663,7 @@ void Assembler::addq(Register dst, Register src) {
}
void Assembler::eaddq(Register dst, Register src1, Register src2, bool no_flags) {
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void) emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags, true /* use_prefixq */);
emit_arith(0x03, 0xC0, src1, src2);
emit_eevex_prefix_or_demote_arith_ndd(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0x03, 0xC0, no_flags, true /* is_commutative */);
}
void Assembler::adcxq(Register dst, Register src) {
@ -14753,11 +14756,7 @@ void Assembler::andq(Register dst, Register src) {
}
void Assembler::eandq(Register dst, Register src1, Register src2, bool no_flags) {
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void) emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags, true /* use_prefixq */);
emit_arith(0x23, 0xC0, src1, src2);
emit_eevex_prefix_or_demote_arith_ndd(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0x23, 0xC0, no_flags, true /* is_commutative */);
}
void Assembler::andq(Address dst, Register src) {
@ -14768,11 +14767,7 @@ void Assembler::andq(Address dst, Register src) {
void Assembler::eandq(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_64bit);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_int8(0x21);
emit_operand(src2, src1, 0);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0x21, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void Assembler::andnq(Register dst, Register src1, Register src2) {
@ -15118,7 +15113,7 @@ void Assembler::eimulq(Register dst, Register src, bool no_flags) {
}
void Assembler::eimulq(Register dst, Register src1, Register src2, bool no_flags) {
emit_eevex_or_demote(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0xAF, no_flags, true /* is_map1 */, true /* swap */);
emit_eevex_or_demote(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0xAF, no_flags, true /* is_map1 */, true /* swap */, true /* is_commutative */);
}
void Assembler::imulq(Register src) {
@ -15580,11 +15575,7 @@ void Assembler::orq(Address dst, Register src) {
void Assembler::eorq(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_64bit);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_int8(0x09);
emit_operand(src2, src1, 0);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0x09, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void Assembler::orq(Register dst, int32_t imm32) {
@ -15624,13 +15615,8 @@ void Assembler::orq(Register dst, Register src) {
}
void Assembler::eorq(Register dst, Register src1, Register src2, bool no_flags) {
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void) emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags, true /* use_prefixq */);
emit_arith(0x0B, 0xC0, src1, src2);
emit_eevex_prefix_or_demote_arith_ndd(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0x0B, 0xC0, no_flags, true /* is_commutative */);
}
void Assembler::popcntq(Register dst, Address src) {
assert(VM_Version::supports_popcnt(), "must support");
InstructionMark im(this);
@ -16372,11 +16358,7 @@ void Assembler::xorq(Register dst, Register src) {
}
void Assembler::exorq(Register dst, Register src1, Register src2, bool no_flags) {
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
// NDD shares its encoding bits with NDS bits for regular EVEX instruction.
// Therefore, DST is passed as the second argument to minimize changes in the leaf level routine.
(void) emit_eevex_prefix_or_demote_ndd(src1->encoding(), dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags, true /* use_prefixq */);
emit_arith(0x33, 0xC0, src1, src2);
emit_eevex_prefix_or_demote_arith_ndd(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0x33, 0xC0, no_flags, true /* is_commutative */);
}
void Assembler::xorq(Register dst, Address src) {
@ -16430,11 +16412,7 @@ void Assembler::esetzucc(Condition cc, Register dst) {
void Assembler::exorq(Register dst, Address src1, Register src2, bool no_flags) {
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_NOSCALE, /* input_size_in_bits */ EVEX_64bit);
eevex_prefix_ndd(src1, dst->encoding(), src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, &attributes, no_flags);
emit_int8(0x31);
emit_operand(src2, src1, 0);
emit_eevex_or_demote(dst, src1, src2, VEX_SIMD_NONE, VEX_OPCODE_0F_3C /* MAP4 */, EVEX_64bit, 0x31, no_flags, false /* is_map1 */, true /* is_commutative */);
}
void InstructionAttr::set_address_attributes(int tuple_type, int input_size_in_bits) {

View File

@ -807,14 +807,20 @@ private:
int emit_eevex_prefix_or_demote_ndd(int dst_enc, int nds_enc, VexSimdPrefix pre, VexOpcode opc,
InstructionAttr *attributes, bool no_flags = false, bool use_prefixq = false);
void emit_eevex_prefix_or_demote_arith_ndd(Register dst, Register src1, Register src2, VexSimdPrefix pre, VexOpcode opc,
int size, int op1, int op2, bool no_flags = false, bool is_commutative = false);
void emit_eevex_prefix_or_demote_arith_ndd(Register dst, Register nds, int32_t imm32, VexSimdPrefix pre, VexOpcode opc,
int size, int op1, int op2, bool no_flags);
void emit_eevex_or_demote(Register dst, Register src1, Address src2, VexSimdPrefix pre, VexOpcode opc,
int size, int opcode_byte, bool no_flags = false, bool is_map1 = false);
void emit_eevex_or_demote(Register dst, Address src1, Register src2, VexSimdPrefix pre, VexOpcode opc,
int size, int opcode_byte, bool no_flags = false, bool is_map1 = false, bool is_commutative = false);
void emit_eevex_or_demote(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc,
int size, int opcode_byte, bool no_flags, bool is_map1 = false, bool swap = false);
int size, int opcode_byte, bool no_flags, bool is_map1 = false, bool swap = false, bool is_commutative = false);
void emit_eevex_or_demote(int dst_enc, int nds_enc, int src_enc, int8_t imm8, VexSimdPrefix pre, VexOpcode opc,
int size, int opcode_byte, bool no_flags, bool is_map1 = false);
@ -1149,6 +1155,7 @@ private:
void eandl(Register dst, Register src, int32_t imm32, bool no_flags);
void andl(Register dst, Address src);
void eandl(Register dst, Register src1, Address src2, bool no_flags);
void eandl(Register dst, Address src1, Register src2, bool no_flags);
void andl(Register dst, Register src);
void eandl(Register dst, Register src1, Register src2, bool no_flags);
void andl(Address dst, Register src);

View File

@ -3001,11 +3001,6 @@ void LIR_Assembler::emit_profile_type(LIR_OpProfileType* op) {
__ bind(next);
}
void LIR_Assembler::emit_delay(LIR_OpDelay*) {
Unimplemented();
}
void LIR_Assembler::monitor_address(int monitor_no, LIR_Opr dst) {
__ lea(dst->as_register(), frame_map()->address_for_monitor_lock(monitor_no));
}

View File

@ -50,8 +50,31 @@ public:
address instruction_address() const { return addr_at(0); }
address immediate_address() const { return addr_at(imm_offset); }
NativeNMethodCmpBarrier* nativeNMethodCmpBarrier_at(address a) { return (NativeNMethodCmpBarrier*)a; }
jint get_immediate() const { return int_at(imm_offset); }
void set_immediate(jint imm) { set_int_at(imm_offset, imm); }
void set_immediate(jint imm, int bit_mask) {
if (bit_mask == ~0) {
set_int_at(imm_offset, imm);
return;
}
assert((imm & ~bit_mask) == 0, "trying to set bits outside the mask");
imm &= bit_mask;
assert(align_up(immediate_address(), sizeof(jint)) ==
align_down(immediate_address(), sizeof(jint)), "immediate not aligned");
jint* data_addr = (jint*)immediate_address();
jint old_value = AtomicAccess::load(data_addr);
while (true) {
// Only bits in the mask are changed
jint new_value = imm | (old_value & ~bit_mask);
if (new_value == old_value) break;
jint v = AtomicAccess::cmpxchg(data_addr, old_value, new_value, memory_order_release);
if (v == old_value) break;
old_value = v;
}
}
bool check_barrier(err_msg& msg) const;
void verify() const {
#ifdef ASSERT
@ -159,13 +182,13 @@ static NativeNMethodCmpBarrier* native_nmethod_barrier(nmethod* nm) {
return barrier;
}
void BarrierSetNMethod::set_guard_value(nmethod* nm, int value) {
void BarrierSetNMethod::set_guard_value(nmethod* nm, int value, int bit_mask) {
if (!supports_entry_barrier(nm)) {
return;
}
NativeNMethodCmpBarrier* cmp = native_nmethod_barrier(nm);
cmp->set_immediate(value);
cmp->set_immediate(value, bit_mask);
}
int BarrierSetNMethod::guard_value(nmethod* nm) {

View File

@ -76,50 +76,95 @@ static uint& get_profile_ctr(int shift) {
#endif // !PRODUCT
void StubGenerator::generate_arraycopy_stubs() {
address entry;
address entry_jbyte_arraycopy;
address entry_jshort_arraycopy;
address entry_jint_arraycopy;
address entry_oop_arraycopy;
address entry_jlong_arraycopy;
address entry_checkcast_arraycopy;
// Some copy stubs publish a normal entry and then a 2nd 'fallback'
// entry immediately following their stack push. This can be used
// as a post-push branch target for compatible stubs when they
// identify a special case that can be handled by the fallback
// stub, e.g. a disjoint copy stub may be used as a special-case
// fallback for its compatible conjoint copy stub.
//
// A nopush entry is always returned in the following local and
// then published by assigning to the appropriate entry field in
// class StubRoutines. The entry value is then passed to the
// generator for the compatible stub. That means the entry must be
// listed when saving to/restoring from the AOT cache, ensuring
// that the inter-stub jumps are noted at AOT cache save and
// relocated at AOT cache load.
address nopush_entry;
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(&entry);
StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(entry, &entry_jbyte_arraycopy);
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(&nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(&entry);
StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(entry, &entry_jshort_arraycopy);
StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(&nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &entry);
StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(StubId::stubgen_jint_arraycopy_id, entry, &entry_jint_arraycopy);
StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jint_arraycopy_nopush = nopush_entry;
StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_oop_copy(StubId::stubgen_jlong_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_jlong_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_jlong_arraycopy = generate_conjoint_long_oop_copy(StubId::stubgen_jlong_arraycopy_id, StubRoutines::_jlong_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_oop_copy(StubId::stubgen_jlong_disjoint_arraycopy_id, &entry);
StubRoutines::_jlong_arraycopy = generate_conjoint_long_oop_copy(StubId::stubgen_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
if (UseCompressedOops) {
StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &entry);
StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_id, entry, &entry_oop_arraycopy);
StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &entry);
StubRoutines::_oop_arraycopy_uninit = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, entry, nullptr);
StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_oop_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_oop_arraycopy_nopush = nopush_entry;
StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
// note that we don't need a returned nopush entry because the
// generic/unsafe copy does not cater for uninit arrays.
StubRoutines::_oop_arraycopy_uninit = generate_conjoint_int_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
} else {
StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &entry);
StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_id, entry, &entry_oop_arraycopy);
StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &entry);
StubRoutines::_oop_arraycopy_uninit = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, entry, nullptr);
StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_oop_disjoint_arraycopy_nopush = nopush_entry;
StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_id, StubRoutines::_oop_disjoint_arraycopy_nopush, &nopush_entry);
// conjoint nopush entry is needed by generic/unsafe copy
StubRoutines::_oop_arraycopy_nopush = nopush_entry;
StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
// disjoint nopush entry is needed by conjoint copy
StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
// note that we don't need a returned nopush entry because the
// generic/unsafe copy does not cater for uninit arrays.
StubRoutines::_oop_arraycopy_uninit = generate_conjoint_long_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id, StubRoutines::_oop_disjoint_arraycopy_uninit_nopush, nullptr);
}
StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &entry_checkcast_arraycopy);
StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
// checkcast nopush entry is needed by generic copy
StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
// note that we don't need a returned nopush entry because the
// generic copy does not cater for uninit arrays.
StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy,
entry_jshort_arraycopy,
entry_jint_arraycopy,
entry_jlong_arraycopy);
StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy,
entry_jshort_arraycopy,
entry_jint_arraycopy,
entry_oop_arraycopy,
entry_jlong_arraycopy,
entry_checkcast_arraycopy);
StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
StubRoutines::_jshort_arraycopy_nopush,
StubRoutines::_jint_arraycopy_nopush,
StubRoutines::_jlong_arraycopy_nopush);
StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
StubRoutines::_jshort_arraycopy_nopush,
StubRoutines::_jint_arraycopy_nopush,
StubRoutines::_oop_arraycopy_nopush,
StubRoutines::_jlong_arraycopy_nopush,
StubRoutines::_checkcast_arraycopy_nopush);
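// In short: each disjoint nopush entry is consumed by the matching conjoint stub,
// each conjoint nopush entry is consumed by the unsafe/generic copy stubs, and the
// checkcast nopush entry by the generic copy stub; the uninit variants stop at the
// disjoint nopush entry because the generic/unsafe copies do not cater for uninit
// arrays.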
StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);


@ -1016,16 +1016,6 @@ void VM_Version::get_processor_features() {
_features.clear_feature(CPU_AVX10_2);
}
// Currently APX support is only enabled for targets supporting AVX512VL feature.
bool apx_supported = os_supports_apx_egprs() && supports_apx_f() && supports_avx512vl();
if (UseAPX && !apx_supported) {
warning("UseAPX is not supported on this CPU, setting it to false");
FLAG_SET_DEFAULT(UseAPX, false);
}
if (!UseAPX) {
_features.clear_feature(CPU_APX_F);
}
if (UseAVX < 2) {
_features.clear_feature(CPU_AVX2);
@ -1049,6 +1039,7 @@ void VM_Version::get_processor_features() {
_features.clear_feature(CPU_VZEROUPPER);
_features.clear_feature(CPU_AVX512BW);
_features.clear_feature(CPU_AVX512VL);
_features.clear_feature(CPU_APX_F);
_features.clear_feature(CPU_AVX512DQ);
_features.clear_feature(CPU_AVX512_VNNI);
_features.clear_feature(CPU_AVX512_VAES);
@ -1068,6 +1059,17 @@ void VM_Version::get_processor_features() {
}
}
// Currently APX support is only enabled for targets supporting AVX512VL feature.
bool apx_supported = os_supports_apx_egprs() && supports_apx_f() && supports_avx512vl();
if (UseAPX && !apx_supported) {
warning("UseAPX is not supported on this CPU, setting it to false");
FLAG_SET_DEFAULT(UseAPX, false);
}
if (!UseAPX) {
_features.clear_feature(CPU_APX_F);
}
if (FLAG_IS_DEFAULT(IntelJccErratumMitigation)) {
_has_intel_jcc_erratum = compute_has_intel_jcc_erratum();
FLAG_SET_ERGO(IntelJccErratumMitigation, _has_intel_jcc_erratum);


@ -3429,9 +3429,6 @@ attributes %{
instruction_unit_size = 1; // An instruction is 1 bytes long
instruction_fetch_unit_size = 16; // The processor fetches one line
instruction_fetch_units = 1; // of 16 bytes
// List of nop instructions
nops( MachNop );
%}
//----------RESOURCES----------------------------------------------------------


@ -29,7 +29,7 @@ void BarrierSetNMethod::deoptimize(nmethod* nm, address* return_address_ptr) {
ShouldNotReachHere();
}
void BarrierSetNMethod::set_guard_value(nmethod* nm, int value) {
void BarrierSetNMethod::set_guard_value(nmethod* nm, int value, int bit_mask) {
ShouldNotReachHere();
}


@ -43,7 +43,7 @@
#include "prims/jniFastGetField.hpp"
#include "prims/jvm_misc.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomic.hpp"
#include "runtime/atomicAccess.hpp"
#include "runtime/globals.hpp"
#include "runtime/globals_extension.hpp"
#include "runtime/interfaceSupport.inline.hpp"
@ -1753,10 +1753,6 @@ void os::numa_make_global(char *addr, size_t bytes) {
void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) {
}
bool os::numa_topology_changed() {
return false;
}
size_t os::numa_get_groups_num() {
return 1;
}


@ -39,7 +39,7 @@
#include "prims/jniFastGetField.hpp"
#include "prims/jvm_misc.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomic.hpp"
#include "runtime/atomicAccess.hpp"
#include "runtime/globals.hpp"
#include "runtime/globals_extension.hpp"
#include "runtime/interfaceSupport.inline.hpp"
@ -809,7 +809,7 @@ jlong os::javaTimeNanos() {
if (now <= prev) {
return prev; // same or retrograde time;
}
const uint64_t obsv = Atomic::cmpxchg(&Bsd::_max_abstime, prev, now);
const uint64_t obsv = AtomicAccess::cmpxchg(&Bsd::_max_abstime, prev, now);
assert(obsv >= prev, "invariant"); // Monotonicity
// If the CAS succeeded then we're done and return "now".
// If the CAS failed and the observed value "obsv" is >= now then
@ -1599,8 +1599,6 @@ void os::numa_make_global(char *addr, size_t bytes) {
void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) {
}
bool os::numa_topology_changed() { return false; }
size_t os::numa_get_groups_num() {
return 1;
}
@ -2135,14 +2133,14 @@ uint os::processor_id() {
__asm__ ("cpuid\n\t" : "+a" (eax), "+b" (ebx), "+c" (ecx), "+d" (edx) : );
uint apic_id = (ebx >> 24) & (processor_id_map_size - 1);
int processor_id = Atomic::load(&processor_id_map[apic_id]);
int processor_id = AtomicAccess::load(&processor_id_map[apic_id]);
while (processor_id < 0) {
// Assign processor id to APIC id
processor_id = Atomic::cmpxchg(&processor_id_map[apic_id], processor_id_unassigned, processor_id_assigning);
processor_id = AtomicAccess::cmpxchg(&processor_id_map[apic_id], processor_id_unassigned, processor_id_assigning);
if (processor_id == processor_id_unassigned) {
processor_id = Atomic::fetch_then_add(&processor_id_next, 1) % os::processor_count();
Atomic::store(&processor_id_map[apic_id], processor_id);
processor_id = AtomicAccess::fetch_then_add(&processor_id_next, 1) % os::processor_count();
AtomicAccess::store(&processor_id_map[apic_id], processor_id);
}
}


@ -42,7 +42,7 @@
#include "prims/jniFastGetField.hpp"
#include "prims/jvm_misc.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomic.hpp"
#include "runtime/atomicAccess.hpp"
#include "runtime/globals.hpp"
#include "runtime/globals_extension.hpp"
#include "runtime/init.hpp"
@ -370,11 +370,20 @@ size_t os::physical_memory() {
return phys_mem;
}
// Returns the resident set size (RSS) of the process.
// Falls back to using VmRSS from /proc/self/status if /proc/self/smaps_rollup is unavailable.
// Note: On kernels with memory cgroups or shared memory, VmRSS may underreport RSS.
// Users requiring accurate RSS values should be aware of this limitation.
size_t os::rss() {
size_t size = 0;
os::Linux::meminfo_t info;
if (os::Linux::query_process_memory_info(&info)) {
size = info.vmrss * K;
os::Linux::accurate_meminfo_t accurate_info;
if (os::Linux::query_accurate_process_memory_info(&accurate_info) && accurate_info.rss != -1) {
size = accurate_info.rss * K;
} else {
os::Linux::meminfo_t info;
if (os::Linux::query_process_memory_info(&info)) {
size = info.vmrss * K;
}
}
return size;
}
@ -2362,6 +2371,37 @@ bool os::Linux::query_process_memory_info(os::Linux::meminfo_t* info) {
return false;
}
// Accurate memory information needs Linux 4.14 or newer
bool os::Linux::query_accurate_process_memory_info(os::Linux::accurate_meminfo_t* info) {
FILE* f = os::fopen("/proc/self/smaps_rollup", "r");
if (f == nullptr) {
return false;
}
const size_t num_values = sizeof(os::Linux::accurate_meminfo_t) / sizeof(size_t);
size_t num_found = 0;
char buf[256];
info->rss = info->pss = info->pssdirty = info->pssanon =
info->pssfile = info->pssshmem = info->swap = info->swappss = -1;
while (::fgets(buf, sizeof(buf), f) != nullptr && num_found < num_values) {
if ( (info->rss == -1 && sscanf(buf, "Rss: %zd kB", &info->rss) == 1) ||
(info->pss == -1 && sscanf(buf, "Pss: %zd kB", &info->pss) == 1) ||
(info->pssdirty == -1 && sscanf(buf, "Pss_Dirty: %zd kB", &info->pssdirty) == 1) ||
(info->pssanon == -1 && sscanf(buf, "Pss_Anon: %zd kB", &info->pssanon) == 1) ||
(info->pssfile == -1 && sscanf(buf, "Pss_File: %zd kB", &info->pssfile) == 1) ||
(info->pssshmem == -1 && sscanf(buf, "Pss_Shmem: %zd kB", &info->pssshmem) == 1) ||
(info->swap == -1 && sscanf(buf, "Swap: %zd kB", &info->swap) == 1) ||
(info->swappss == -1 && sscanf(buf, "SwapPss: %zd kB", &info->swappss) == 1)
)
{
num_found ++;
}
}
fclose(f);
return true;
}
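// For reference, the fields matched above appear in /proc/self/smaps_rollup as
// lines of the form "<Key>: <value> kB" (values below are illustrative; any
// other lines are simply skipped):
//   Rss:              123456 kB
//   Pss:               98765 kB
//   Pss_Dirty:          4321 kB
//   Pss_Anon:          87654 kB
//   Pss_File:          11111 kB
//   Pss_Shmem:           222 kB
//   Swap:                  0 kB
//   SwapPss:               0 kB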
#ifdef __GLIBC__
// For Glibc, print a one-liner with the malloc tunables.
// Most important and popular is MALLOC_ARENA_MAX, but we are
@ -2988,8 +3028,6 @@ void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) {
Linux::numa_tonode_memory(addr, bytes, lgrp_hint);
}
bool os::numa_topology_changed() { return false; }
size_t os::numa_get_groups_num() {
// Return just the number of nodes in which it's possible to allocate memory
// (in numa terminology, configured nodes).
@ -4783,8 +4821,8 @@ static bool should_warn_invalid_processor_id() {
static volatile int warn_once = 1;
if (Atomic::load(&warn_once) == 0 ||
Atomic::xchg(&warn_once, 0) == 0) {
if (AtomicAccess::load(&warn_once) == 0 ||
AtomicAccess::xchg(&warn_once, 0) == 0) {
// Don't warn more than once
return false;
}


@ -181,6 +181,23 @@ class os::Linux {
// fields will contain -1.
static bool query_process_memory_info(meminfo_t* info);
// Output structure for query_accurate_process_memory_info() (all values in KB)
struct accurate_meminfo_t {
ssize_t rss; // current resident set size
ssize_t pss; // current proportional set size
ssize_t pssdirty; // proportional set size (dirty)
ssize_t pssanon; // proportional set size (anonymous mappings)
ssize_t pssfile; // proportional set size (file mappings)
ssize_t pssshmem; // proportional set size (shared mappings)
ssize_t swap; // swapped out
ssize_t swappss; // proportional set size (swapped out)
};
// Attempts to query accurate memory information from /proc/self/smaps_rollup and return it in the output structure.
// May fail (returns false) or succeed (returns true) even if not all output fields are available; unavailable
// fields will contain -1.
static bool query_accurate_process_memory_info(accurate_meminfo_t* info);
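// A minimal usage sketch (hypothetical caller, for illustration only; see os::rss()
// for the real in-tree user):
//
//   os::Linux::accurate_meminfo_t mi;
//   if (os::Linux::query_accurate_process_memory_info(&mi) && mi.rss != -1) {
//     size_t rss_in_bytes = (size_t)mi.rss * K;  // values are reported in KB
//   }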
// Tells if the user asked for transparent huge pages.
static bool _thp_requested;


@ -37,28 +37,28 @@
#endif
// If needed, add os::strndup and use that instead.
FORBID_C_FUNCTION(char* strndup(const char*, size_t), "don't use");
FORBID_C_FUNCTION(char* strndup(const char*, size_t), noexcept, "don't use");
// These are unimplementable for Windows, and they aren't useful for a
// POSIX implementation of NMT either.
// https://stackoverflow.com/questions/62962839/stdaligned-alloc-missing-from-visual-studio-2019
FORBID_C_FUNCTION(int posix_memalign(void**, size_t, size_t), "don't use");
FORBID_C_FUNCTION(void* aligned_alloc(size_t, size_t), "don't use");
FORBID_C_FUNCTION(int posix_memalign(void**, size_t, size_t), noexcept, "don't use");
FORBID_C_FUNCTION(void* aligned_alloc(size_t, size_t), noexcept, "don't use");
// realpath with a null second argument mallocs a string for the result.
// With a non-null second argument, there is a risk of buffer overrun.
PRAGMA_DIAG_PUSH
FORBIDDEN_FUNCTION_IGNORE_CLANG_FORTIFY_WARNING
FORBID_C_FUNCTION(char* realpath(const char*, char*), "use os::realpath");
FORBID_C_FUNCTION(char* realpath(const char*, char*), noexcept, "use os::realpath");
PRAGMA_DIAG_POP
// Returns a malloc'ed string.
FORBID_C_FUNCTION(char* get_current_dir_name(), "use os::get_current_directory");
FORBID_C_FUNCTION(char* get_current_dir_name(), noexcept, "use os::get_current_directory");
// Problematic API that should never be used.
FORBID_C_FUNCTION(char* getwd(char*), "use os::get_current_directory");
FORBID_C_FUNCTION(char* getwd(char*), noexcept, "use os::get_current_directory");
// BSD utility that is subtly different from realloc.
FORBID_C_FUNCTION(void* reallocf(void*, size_t), "use os::realloc");
FORBID_C_FUNCTION(void* reallocf(void*, size_t), /* not noexcept */, "use os::realloc");
#endif // OS_POSIX_FORBIDDENFUNCTIONS_POSIX_HPP


@ -31,7 +31,7 @@
#include "nmt/memTracker.hpp"
#include "os_posix.inline.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomic.hpp"
#include "runtime/atomicAccess.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/globals_extension.hpp"
#include "runtime/interfaceSupport.inline.hpp"
@ -1691,7 +1691,7 @@ void PlatformEvent::park() { // AKA "down()"
// atomically decrement _event
for (;;) {
v = _event;
if (Atomic::cmpxchg(&_event, v, v - 1) == v) break;
if (AtomicAccess::cmpxchg(&_event, v, v - 1) == v) break;
}
guarantee(v >= 0, "invariant");
@ -1738,7 +1738,7 @@ int PlatformEvent::park_nanos(jlong nanos) {
// atomically decrement _event
for (;;) {
v = _event;
if (Atomic::cmpxchg(&_event, v, v - 1) == v) break;
if (AtomicAccess::cmpxchg(&_event, v, v - 1) == v) break;
}
guarantee(v >= 0, "invariant");
@ -1794,7 +1794,7 @@ void PlatformEvent::unpark() {
// but only in the correctly written condition checking loops of ObjectMonitor,
// Mutex/Monitor, and JavaThread::sleep
if (Atomic::xchg(&_event, 1) >= 0) return;
if (AtomicAccess::xchg(&_event, 1) >= 0) return;
int status = pthread_mutex_lock(_mutex);
assert_status(status == 0, status, "mutex_lock");
@ -1847,9 +1847,9 @@ void Parker::park(bool isAbsolute, jlong time) {
// Optional fast-path check:
// Return immediately if a permit is available.
// We depend on Atomic::xchg() having full barrier semantics
// We depend on AtomicAccess::xchg() having full barrier semantics
// since we are doing a lock-free update to _counter.
if (Atomic::xchg(&_counter, 0) > 0) return;
if (AtomicAccess::xchg(&_counter, 0) > 0) return;
JavaThread *jt = JavaThread::current();


@ -28,7 +28,7 @@
#include "jvm.h"
#include "logging/log.hpp"
#include "os_posix.hpp"
#include "runtime/atomic.hpp"
#include "runtime/atomicAccess.hpp"
#include "runtime/globals.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/java.hpp"
@ -356,7 +356,7 @@ static void jdk_misc_signal_init() {
void os::signal_notify(int sig) {
if (sig_semaphore != nullptr) {
Atomic::inc(&pending_signals[sig]);
AtomicAccess::inc(&pending_signals[sig]);
sig_semaphore->signal();
} else {
// Signal thread is not created with ReduceSignalUsage and jdk_misc_signal_init
@ -369,7 +369,7 @@ static int check_pending_signals() {
for (;;) {
for (int i = 0; i < NSIG + 1; i++) {
jint n = pending_signals[i];
if (n > 0 && n == Atomic::cmpxchg(&pending_signals[i], n, n - 1)) {
if (n > 0 && n == AtomicAccess::cmpxchg(&pending_signals[i], n, n - 1)) {
return i;
}
}


@ -22,7 +22,7 @@
*
*/
#include "runtime/atomic.hpp"
#include "runtime/atomicAccess.hpp"
#include "suspendResume_posix.hpp"
/* try to switch state from state "from" to state "to"
@ -31,7 +31,7 @@
SuspendResume::State SuspendResume::switch_state(SuspendResume::State from,
SuspendResume::State to)
{
SuspendResume::State result = Atomic::cmpxchg(&_state, from, to);
SuspendResume::State result = AtomicAccess::cmpxchg(&_state, from, to);
if (result == from) {
// success
return to;


@ -29,10 +29,12 @@
#include <stddef.h> // for size_t
// NOTE: The Windows C standard library doesn't declare functions "noexcept".
// _fullpath with a null first argument mallocs a string for the result.
FORBID_IMPORTED_C_FUNCTION(char* _fullpath(char*, const char*, size_t), "use os::realpath");
FORBID_IMPORTED_C_FUNCTION(char* _fullpath(char*, const char*, size_t), /* not noexcept */, "use os::realpath");
// _snprintf does NOT null terminate if the output would exceed the buffer size.
FORBID_C_FUNCTION(int _snprintf(char*, size_t, const char*, ...), "use os::snprintf");
FORBID_C_FUNCTION(int _snprintf(char*, size_t, const char*, ...), /* not noexcept */, "use os::snprintf");
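// Illustrative example of the pitfall described above: with a 4-char buffer,
// _snprintf(buf, 4, "%s", "abcdef") fills all four bytes with 'a','b','c','d'
// and writes no terminating NUL, whereas os::snprintf always NUL-terminates.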
#endif // OS_WINDOWS_FORBIDDENFUNCTIONS_WINDOWS_HPP


@ -42,7 +42,7 @@
#include "prims/jniFastGetField.hpp"
#include "prims/jvm_misc.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomic.hpp"
#include "runtime/atomicAccess.hpp"
#include "runtime/globals.hpp"
#include "runtime/globals_extension.hpp"
#include "runtime/interfaceSupport.inline.hpp"
@ -2462,7 +2462,7 @@ static void jdk_misc_signal_init() {
void os::signal_notify(int sig) {
if (sig_sem != nullptr) {
Atomic::inc(&pending_signals[sig]);
AtomicAccess::inc(&pending_signals[sig]);
sig_sem->signal();
} else {
// Signal thread is not created with ReduceSignalUsage and jdk_misc_signal_init
@ -2475,7 +2475,7 @@ static int check_pending_signals() {
while (true) {
for (int i = 0; i < NSIG + 1; i++) {
jint n = pending_signals[i];
if (n > 0 && n == Atomic::cmpxchg(&pending_signals[i], n, n - 1)) {
if (n > 0 && n == AtomicAccess::cmpxchg(&pending_signals[i], n, n - 1)) {
return i;
}
}
@ -3794,7 +3794,6 @@ size_t os::pd_pretouch_memory(void* first, void* last, size_t page_size) {
void os::numa_make_global(char *addr, size_t bytes) { }
void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) { }
bool os::numa_topology_changed() { return false; }
size_t os::numa_get_groups_num() { return MAX2(numa_node_list_holder.get_count(), 1); }
int os::numa_get_group_id() { return 0; }
size_t os::numa_get_leaf_groups(uint *ids, size_t size) {
@ -4298,15 +4297,15 @@ static void exit_process_or_thread(Ept what, int exit_code) {
// The first thread that reached this point, initializes the critical section.
if (!InitOnceExecuteOnce(&init_once_crit_sect, init_crit_sect_call, &crit_sect, nullptr)) {
warning("crit_sect initialization failed in %s: %d\n", __FILE__, __LINE__);
} else if (Atomic::load_acquire(&process_exiting) == 0) {
} else if (AtomicAccess::load_acquire(&process_exiting) == 0) {
if (what != EPT_THREAD) {
// Atomically set process_exiting before the critical section
// to increase the visibility between racing threads.
Atomic::cmpxchg(&process_exiting, (DWORD)0, GetCurrentThreadId());
AtomicAccess::cmpxchg(&process_exiting, (DWORD)0, GetCurrentThreadId());
}
EnterCriticalSection(&crit_sect);
if (what == EPT_THREAD && Atomic::load_acquire(&process_exiting) == 0) {
if (what == EPT_THREAD && AtomicAccess::load_acquire(&process_exiting) == 0) {
// Remove from the array those handles of the threads that have completed exiting.
for (i = 0, j = 0; i < handle_count; ++i) {
res = WaitForSingleObject(handles[i], 0 /* don't wait */);
@ -4419,7 +4418,7 @@ static void exit_process_or_thread(Ept what, int exit_code) {
}
if (!registered &&
Atomic::load_acquire(&process_exiting) != 0 &&
AtomicAccess::load_acquire(&process_exiting) != 0 &&
process_exiting != GetCurrentThreadId()) {
// Some other thread is about to call exit(), so we don't let
// the current unregistered thread proceed to exit() or _endthreadex()
@ -5585,7 +5584,7 @@ int PlatformEvent::park(jlong Millis) {
int v;
for (;;) {
v = _Event;
if (Atomic::cmpxchg(&_Event, v, v-1) == v) break;
if (AtomicAccess::cmpxchg(&_Event, v, v-1) == v) break;
}
guarantee((v == 0) || (v == 1), "invariant");
if (v != 0) return OS_OK;
@ -5648,7 +5647,7 @@ void PlatformEvent::park() {
int v;
for (;;) {
v = _Event;
if (Atomic::cmpxchg(&_Event, v, v-1) == v) break;
if (AtomicAccess::cmpxchg(&_Event, v, v-1) == v) break;
}
guarantee((v == 0) || (v == 1), "invariant");
if (v != 0) return;
@ -5695,7 +5694,7 @@ void PlatformEvent::unpark() {
// from the first park() call after an unpark() call which will help
// shake out uses of park() and unpark() without condition variables.
if (Atomic::xchg(&_Event, 1) >= 0) return;
if (AtomicAccess::xchg(&_Event, 1) >= 0) return;
::SetEvent(_ParkHandle);
}


@ -1,5 +1,5 @@
/*
* Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2019 SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -93,7 +93,7 @@ inline void post_membar(atomic_memory_order order) {
template<size_t byte_size>
struct Atomic::PlatformAdd {
struct AtomicAccess::PlatformAdd {
template<typename D, typename I>
D add_then_fetch(D volatile* dest, I add_value, atomic_memory_order order) const;
@ -105,8 +105,8 @@ struct Atomic::PlatformAdd {
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<4>::add_then_fetch(D volatile* dest, I add_value,
atomic_memory_order order) const {
inline D AtomicAccess::PlatformAdd<4>::add_then_fetch(D volatile* dest, I add_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(I));
STATIC_ASSERT(4 == sizeof(D));
@ -131,8 +131,8 @@ inline D Atomic::PlatformAdd<4>::add_then_fetch(D volatile* dest, I add_value,
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<8>::add_then_fetch(D volatile* dest, I add_value,
atomic_memory_order order) const {
inline D AtomicAccess::PlatformAdd<8>::add_then_fetch(D volatile* dest, I add_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(I));
STATIC_ASSERT(8 == sizeof(D));
@ -156,9 +156,9 @@ inline D Atomic::PlatformAdd<8>::add_then_fetch(D volatile* dest, I add_value,
template<>
template<typename T>
inline T Atomic::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
// Note that xchg doesn't necessarily do an acquire
// (see synchronizer.cpp).
@ -195,9 +195,9 @@ inline T Atomic::PlatformXchg<4>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformXchg<8>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformXchg<8>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(T));
// Note that xchg doesn't necessarily do an acquire
// (see synchronizer.cpp).
@ -235,15 +235,15 @@ inline T Atomic::PlatformXchg<8>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<1>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<1>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(1 == sizeof(T));
// Note that cmpxchg guarantees a two-way memory barrier across
// the cmpxchg, so it's really a 'fence_cmpxchg_fence' if not
// specified otherwise (see atomic.hpp).
// specified otherwise (see atomicAccess.hpp).
// Using 32 bit internally.
volatile int *dest_base = (volatile int*)((uintptr_t)dest & ~3);
@ -305,15 +305,15 @@ inline T Atomic::PlatformCmpxchg<1>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<4>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(T));
// Note that cmpxchg guarantees a two-way memory barrier across
// the cmpxchg, so it's really a 'fence_cmpxchg_fence' if not
// specified otherwise (see atomic.hpp).
// specified otherwise (see atomicAccess.hpp).
T old_value;
const uint64_t zero = 0;
@ -355,15 +355,15 @@ inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(T));
// Note that cmpxchg guarantees a two-way memory barrier across
// the cmpxchg, so it's really a 'fence_cmpxchg_fence' if not
// specified otherwise (see atomic.hpp).
// specified otherwise (see atomicAccess.hpp).
T old_value;
const uint64_t zero = 0;
@ -404,10 +404,10 @@ inline T Atomic::PlatformCmpxchg<8>::operator()(T volatile* dest,
}
template<size_t byte_size>
struct Atomic::PlatformOrderedLoad<byte_size, X_ACQUIRE> {
struct AtomicAccess::PlatformOrderedLoad<byte_size, X_ACQUIRE> {
template <typename T>
T operator()(const volatile T* p) const {
T t = Atomic::load(p);
T t = AtomicAccess::load(p);
// Use twi-isync for load_acquire (faster than lwsync).
__asm__ __volatile__ ("twi 0,%0,0\n isync\n" : : "r" (t) : "memory");
return t;


@ -1,5 +1,5 @@
/*
* Copyright (c) 1999, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1999, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
* Copyright (c) 2021, Azul Systems, Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@ -34,7 +34,7 @@
// See https://patchwork.kernel.org/patch/3575821/
template<size_t byte_size>
struct Atomic::PlatformAdd {
struct AtomicAccess::PlatformAdd {
template<typename D, typename I>
D add_then_fetch(D volatile* dest, I add_value, atomic_memory_order order) const {
if (order == memory_order_relaxed) {
@ -54,9 +54,9 @@ struct Atomic::PlatformAdd {
template<size_t byte_size>
template<typename T>
inline T Atomic::PlatformXchg<byte_size>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformXchg<byte_size>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(byte_size == sizeof(T));
T res = __atomic_exchange_n(dest, exchange_value, __ATOMIC_RELEASE);
FULL_MEM_BARRIER;
@ -65,10 +65,10 @@ inline T Atomic::PlatformXchg<byte_size>::operator()(T volatile* dest,
template<size_t byte_size>
template<typename T>
inline T Atomic::PlatformCmpxchg<byte_size>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<byte_size>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(byte_size == sizeof(T));
if (order == memory_order_conservative) {
T value = compare_value;
@ -109,21 +109,21 @@ inline T Atomic::PlatformCmpxchg<byte_size>::operator()(T volatile* dest,
}
template<size_t byte_size>
struct Atomic::PlatformOrderedLoad<byte_size, X_ACQUIRE>
struct AtomicAccess::PlatformOrderedLoad<byte_size, X_ACQUIRE>
{
template <typename T>
T operator()(const volatile T* p) const { T data; __atomic_load(const_cast<T*>(p), &data, __ATOMIC_ACQUIRE); return data; }
};
template<size_t byte_size>
struct Atomic::PlatformOrderedStore<byte_size, RELEASE_X>
struct AtomicAccess::PlatformOrderedStore<byte_size, RELEASE_X>
{
template <typename T>
void operator()(volatile T* p, T v) const { __atomic_store(const_cast<T*>(p), &v, __ATOMIC_RELEASE); }
};
template<size_t byte_size>
struct Atomic::PlatformOrderedStore<byte_size, RELEASE_X_FENCE>
struct AtomicAccess::PlatformOrderedStore<byte_size, RELEASE_X_FENCE>
{
template <typename T>
void operator()(volatile T* p, T v) const { release_store(p, v); OrderAccess::fence(); }


@ -1,5 +1,5 @@
/*
* Copyright (c) 1999, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1999, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -28,7 +28,7 @@
// Implementation of class atomic
template<size_t byte_size>
struct Atomic::PlatformAdd {
struct AtomicAccess::PlatformAdd {
template<typename D, typename I>
D fetch_then_add(D volatile* dest, I add_value, atomic_memory_order /* order */) const;
@ -40,8 +40,8 @@ struct Atomic::PlatformAdd {
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<4>::fetch_then_add(D volatile* dest, I add_value,
atomic_memory_order /* order */) const {
inline D AtomicAccess::PlatformAdd<4>::fetch_then_add(D volatile* dest, I add_value,
atomic_memory_order /* order */) const {
STATIC_ASSERT(4 == sizeof(I));
STATIC_ASSERT(4 == sizeof(D));
D old_value;
@ -54,9 +54,9 @@ inline D Atomic::PlatformAdd<4>::fetch_then_add(D volatile* dest, I add_value,
template<>
template<typename T>
inline T Atomic::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order /* order */) const {
inline T AtomicAccess::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order /* order */) const {
STATIC_ASSERT(4 == sizeof(T));
__asm__ volatile ( "xchgl (%2),%0"
: "=r" (exchange_value)
@ -67,10 +67,10 @@ inline T Atomic::PlatformXchg<4>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<1>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order /* order */) const {
inline T AtomicAccess::PlatformCmpxchg<1>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order /* order */) const {
STATIC_ASSERT(1 == sizeof(T));
__asm__ volatile ( "lock cmpxchgb %1,(%3)"
: "=a" (exchange_value)
@ -81,10 +81,10 @@ inline T Atomic::PlatformCmpxchg<1>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order /* order */) const {
inline T AtomicAccess::PlatformCmpxchg<4>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order /* order */) const {
STATIC_ASSERT(4 == sizeof(T));
__asm__ volatile ( "lock cmpxchgl %1,(%3)"
: "=a" (exchange_value)
@ -96,8 +96,8 @@ inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest,
#ifdef AMD64
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<8>::fetch_then_add(D volatile* dest, I add_value,
atomic_memory_order /* order */) const {
inline D AtomicAccess::PlatformAdd<8>::fetch_then_add(D volatile* dest, I add_value,
atomic_memory_order /* order */) const {
STATIC_ASSERT(8 == sizeof(I));
STATIC_ASSERT(8 == sizeof(D));
D old_value;
@ -110,9 +110,9 @@ inline D Atomic::PlatformAdd<8>::fetch_then_add(D volatile* dest, I add_value,
template<>
template<typename T>
inline T Atomic::PlatformXchg<8>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order /* order */) const {
inline T AtomicAccess::PlatformXchg<8>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order /* order */) const {
STATIC_ASSERT(8 == sizeof(T));
__asm__ __volatile__ ("xchgq (%2),%0"
: "=r" (exchange_value)
@ -123,10 +123,10 @@ inline T Atomic::PlatformXchg<8>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order /* order */) const {
inline T AtomicAccess::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order /* order */) const {
STATIC_ASSERT(8 == sizeof(T));
__asm__ __volatile__ ( "lock cmpxchgq %1,(%3)"
: "=a" (exchange_value)
@ -145,25 +145,25 @@ extern "C" {
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order /* order */) const {
inline T AtomicAccess::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order /* order */) const {
STATIC_ASSERT(8 == sizeof(T));
return cmpxchg_using_helper<int64_t>(_Atomic_cmpxchg_long, dest, compare_value, exchange_value);
}
// No direct support for 8-byte xchg; emulate using cmpxchg.
template<>
struct Atomic::PlatformXchg<8> : Atomic::XchgUsingCmpxchg<8> {};
struct AtomicAccess::PlatformXchg<8> : AtomicAccess::XchgUsingCmpxchg<8> {};
// No direct support for 8-byte add; emulate using cmpxchg.
template<>
struct Atomic::PlatformAdd<8> : Atomic::AddUsingCmpxchg<8> {};
struct AtomicAccess::PlatformAdd<8> : AtomicAccess::AddUsingCmpxchg<8> {};
template<>
template<typename T>
inline T Atomic::PlatformLoad<8>::operator()(T const volatile* src) const {
inline T AtomicAccess::PlatformLoad<8>::operator()(T const volatile* src) const {
STATIC_ASSERT(8 == sizeof(T));
volatile int64_t dest;
_Atomic_move_long(reinterpret_cast<const volatile int64_t*>(src), reinterpret_cast<volatile int64_t*>(&dest));
@ -172,8 +172,8 @@ inline T Atomic::PlatformLoad<8>::operator()(T const volatile* src) const {
template<>
template<typename T>
inline void Atomic::PlatformStore<8>::operator()(T volatile* dest,
T store_value) const {
inline void AtomicAccess::PlatformStore<8>::operator()(T volatile* dest,
T store_value) const {
STATIC_ASSERT(8 == sizeof(T));
_Atomic_move_long(reinterpret_cast<const volatile int64_t*>(&store_value), reinterpret_cast<volatile int64_t*>(dest));
}
@ -181,7 +181,7 @@ inline void Atomic::PlatformStore<8>::operator()(T volatile* dest,
#endif // AMD64
template<>
struct Atomic::PlatformOrderedStore<1, RELEASE_X_FENCE>
struct AtomicAccess::PlatformOrderedStore<1, RELEASE_X_FENCE>
{
template <typename T>
void operator()(volatile T* p, T v) const {
@ -193,7 +193,7 @@ struct Atomic::PlatformOrderedStore<1, RELEASE_X_FENCE>
};
template<>
struct Atomic::PlatformOrderedStore<2, RELEASE_X_FENCE>
struct AtomicAccess::PlatformOrderedStore<2, RELEASE_X_FENCE>
{
template <typename T>
void operator()(volatile T* p, T v) const {
@ -205,7 +205,7 @@ struct Atomic::PlatformOrderedStore<2, RELEASE_X_FENCE>
};
template<>
struct Atomic::PlatformOrderedStore<4, RELEASE_X_FENCE>
struct AtomicAccess::PlatformOrderedStore<4, RELEASE_X_FENCE>
{
template <typename T>
void operator()(volatile T* p, T v) const {
@ -218,7 +218,7 @@ struct Atomic::PlatformOrderedStore<4, RELEASE_X_FENCE>
#ifdef AMD64
template<>
struct Atomic::PlatformOrderedStore<8, RELEASE_X_FENCE>
struct AtomicAccess::PlatformOrderedStore<8, RELEASE_X_FENCE>
{
template <typename T>
void operator()(volatile T* p, T v) const {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright 2007, 2008, 2011, 2015, Red Hat, Inc.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -32,7 +32,7 @@
// Implementation of class atomic
template<size_t byte_size>
struct Atomic::PlatformAdd {
struct AtomicAccess::PlatformAdd {
template<typename D, typename I>
D add_then_fetch(D volatile* dest, I add_value, atomic_memory_order order) const;
@ -44,8 +44,8 @@ struct Atomic::PlatformAdd {
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<4>::add_then_fetch(D volatile* dest, I add_value,
atomic_memory_order order) const {
inline D AtomicAccess::PlatformAdd<4>::add_then_fetch(D volatile* dest, I add_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(I));
STATIC_ASSERT(4 == sizeof(D));
@ -56,8 +56,8 @@ inline D Atomic::PlatformAdd<4>::add_then_fetch(D volatile* dest, I add_value,
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<8>::add_then_fetch(D volatile* dest, I add_value,
atomic_memory_order order) const {
inline D AtomicAccess::PlatformAdd<8>::add_then_fetch(D volatile* dest, I add_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(I));
STATIC_ASSERT(8 == sizeof(D));
@ -68,9 +68,9 @@ inline D Atomic::PlatformAdd<8>::add_then_fetch(D volatile* dest, I add_value,
template<>
template<typename T>
inline T Atomic::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(T));
FULL_MEM_BARRIER;
T result = __atomic_exchange_n(dest, exchange_value, __ATOMIC_RELAXED);
@ -80,9 +80,9 @@ inline T Atomic::PlatformXchg<4>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformXchg<8>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformXchg<8>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(T));
FULL_MEM_BARRIER;
T result = __atomic_exchange_n(dest, exchange_value, __ATOMIC_RELAXED);
@ -92,14 +92,14 @@ inline T Atomic::PlatformXchg<8>::operator()(T volatile* dest,
// No direct support for cmpxchg of bytes; emulate using int.
template<>
struct Atomic::PlatformCmpxchg<1> : Atomic::CmpxchgByteUsingInt {};
struct AtomicAccess::PlatformCmpxchg<1> : AtomicAccess::CmpxchgByteUsingInt {};
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<4>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(T));
T value = compare_value;
FULL_MEM_BARRIER;
@ -111,10 +111,10 @@ inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(T));
T value = compare_value;
@ -134,7 +134,7 @@ inline void atomic_copy64(const volatile void *src, volatile void *dst) {
template<>
template<typename T>
inline T Atomic::PlatformLoad<8>::operator()(T const volatile* src) const {
inline T AtomicAccess::PlatformLoad<8>::operator()(T const volatile* src) const {
STATIC_ASSERT(8 == sizeof(T));
T dest;
__atomic_load(const_cast<T*>(src), &dest, __ATOMIC_RELAXED);
@ -143,8 +143,8 @@ inline T Atomic::PlatformLoad<8>::operator()(T const volatile* src) const {
template<>
template<typename T>
inline void Atomic::PlatformStore<8>::operator()(T volatile* dest,
T store_value) const {
inline void AtomicAccess::PlatformStore<8>::operator()(T volatile* dest,
T store_value) const {
STATIC_ASSERT(8 == sizeof(T));
__atomic_store(dest, &store_value, __ATOMIC_RELAXED);
}


@ -1,5 +1,5 @@
/*
* Copyright (c) 1999, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1999, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -70,7 +70,7 @@ inline D atomic_fastcall(F stub, volatile D *dest, T1 arg1, T2 arg2) {
}
template<size_t byte_size>
struct Atomic::PlatformAdd {
struct AtomicAccess::PlatformAdd {
template<typename D, typename I>
D fetch_then_add(D volatile* dest, I add_value, atomic_memory_order order) const;
@ -83,8 +83,8 @@ struct Atomic::PlatformAdd {
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<4>::fetch_then_add(D volatile* dest, I add_value,
atomic_memory_order order) const {
inline D AtomicAccess::PlatformAdd<4>::fetch_then_add(D volatile* dest, I add_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(I));
STATIC_ASSERT(4 == sizeof(D));
aarch64_atomic_stub_t stub;
@ -99,8 +99,8 @@ inline D Atomic::PlatformAdd<4>::fetch_then_add(D volatile* dest, I add_value,
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<8>::fetch_then_add(D volatile* dest, I add_value,
atomic_memory_order order) const {
inline D AtomicAccess::PlatformAdd<8>::fetch_then_add(D volatile* dest, I add_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(I));
STATIC_ASSERT(8 == sizeof(D));
aarch64_atomic_stub_t stub;
@ -115,9 +115,9 @@ inline D Atomic::PlatformAdd<8>::fetch_then_add(D volatile* dest, I add_value,
template<>
template<typename T>
inline T Atomic::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(T));
T old_value = atomic_fastcall(aarch64_atomic_xchg_4_impl, dest, exchange_value);
return old_value;
@ -125,8 +125,8 @@ inline T Atomic::PlatformXchg<4>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformXchg<8>::operator()(T volatile* dest, T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformXchg<8>::operator()(T volatile* dest, T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(T));
T old_value = atomic_fastcall(aarch64_atomic_xchg_8_impl, dest, exchange_value);
return old_value;
@ -134,10 +134,10 @@ inline T Atomic::PlatformXchg<8>::operator()(T volatile* dest, T exchange_value,
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<1>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<1>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(1 == sizeof(T));
aarch64_atomic_stub_t stub;
switch (order) {
@ -152,10 +152,10 @@ inline T Atomic::PlatformCmpxchg<1>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<4>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(T));
aarch64_atomic_stub_t stub;
switch (order) {
@ -175,10 +175,10 @@ inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(T));
aarch64_atomic_stub_t stub;
switch (order) {
@ -197,21 +197,21 @@ inline T Atomic::PlatformCmpxchg<8>::operator()(T volatile* dest,
}
template<size_t byte_size>
struct Atomic::PlatformOrderedLoad<byte_size, X_ACQUIRE>
struct AtomicAccess::PlatformOrderedLoad<byte_size, X_ACQUIRE>
{
template <typename T>
T operator()(const volatile T* p) const { T data; __atomic_load(const_cast<T*>(p), &data, __ATOMIC_ACQUIRE); return data; }
};
template<size_t byte_size>
struct Atomic::PlatformOrderedStore<byte_size, RELEASE_X>
struct AtomicAccess::PlatformOrderedStore<byte_size, RELEASE_X>
{
template <typename T>
void operator()(volatile T* p, T v) const { __atomic_store(const_cast<T*>(p), &v, __ATOMIC_RELEASE); }
};
template<size_t byte_size>
struct Atomic::PlatformOrderedStore<byte_size, RELEASE_X_FENCE>
struct AtomicAccess::PlatformOrderedStore<byte_size, RELEASE_X_FENCE>
{
template <typename T>
void operator()(volatile T* p, T v) const { release_store(p, v); OrderAccess::fence(); }


@ -1,5 +1,5 @@
/*
* Copyright (c) 2008, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2008, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -78,7 +78,7 @@ public:
template<>
template<typename T>
inline T Atomic::PlatformLoad<8>::operator()(T const volatile* src) const {
inline T AtomicAccess::PlatformLoad<8>::operator()(T const volatile* src) const {
STATIC_ASSERT(8 == sizeof(T));
return PrimitiveConversions::cast<T>(
(*ARMAtomicFuncs::_load_long_func)(reinterpret_cast<const volatile int64_t*>(src)));
@ -86,20 +86,20 @@ inline T Atomic::PlatformLoad<8>::operator()(T const volatile* src) const {
template<>
template<typename T>
inline void Atomic::PlatformStore<8>::operator()(T volatile* dest,
T store_value) const {
inline void AtomicAccess::PlatformStore<8>::operator()(T volatile* dest,
T store_value) const {
STATIC_ASSERT(8 == sizeof(T));
(*ARMAtomicFuncs::_store_long_func)(
PrimitiveConversions::cast<int64_t>(store_value), reinterpret_cast<volatile int64_t*>(dest));
}
// As per atomic.hpp all read-modify-write operations have to provide two-way
// As per atomicAccess.hpp all read-modify-write operations have to provide two-way
// barriers semantics.
//
// For ARMv7 we add explicit barriers in the stubs.
template<size_t byte_size>
struct Atomic::PlatformAdd {
struct AtomicAccess::PlatformAdd {
template<typename D, typename I>
D add_then_fetch(D volatile* dest, I add_value, atomic_memory_order order) const;
@ -111,8 +111,8 @@ struct Atomic::PlatformAdd {
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<4>::add_then_fetch(D volatile* dest, I add_value,
atomic_memory_order order) const {
inline D AtomicAccess::PlatformAdd<4>::add_then_fetch(D volatile* dest, I add_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(I));
STATIC_ASSERT(4 == sizeof(D));
return add_using_helper<int32_t>(ARMAtomicFuncs::_add_func, dest, add_value);
@ -121,26 +121,26 @@ inline D Atomic::PlatformAdd<4>::add_then_fetch(D volatile* dest, I add_value,
template<>
template<typename T>
inline T Atomic::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(T));
return xchg_using_helper<int32_t>(ARMAtomicFuncs::_xchg_func, dest, exchange_value);
}
// No direct support for 8-byte xchg; emulate using cmpxchg.
template<>
struct Atomic::PlatformXchg<8> : Atomic::XchgUsingCmpxchg<8> {};
struct AtomicAccess::PlatformXchg<8> : AtomicAccess::XchgUsingCmpxchg<8> {};
// No direct support for 8-byte add; emulate using cmpxchg.
template<>
struct Atomic::PlatformAdd<8> : Atomic::AddUsingCmpxchg<8> {};
struct AtomicAccess::PlatformAdd<8> : AtomicAccess::AddUsingCmpxchg<8> {};
// The memory_order parameter is ignored - we always provide the strongest/most-conservative ordering
// No direct support for cmpxchg of bytes; emulate using int.
template<>
struct Atomic::PlatformCmpxchg<1> : Atomic::CmpxchgByteUsingInt {};
struct AtomicAccess::PlatformCmpxchg<1> : AtomicAccess::CmpxchgByteUsingInt {};
inline int32_t reorder_cmpxchg_func(int32_t exchange_value,
@ -160,20 +160,20 @@ inline int64_t reorder_cmpxchg_long_func(int64_t exchange_value,
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<4>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(T));
return cmpxchg_using_helper<int32_t>(reorder_cmpxchg_func, dest, compare_value, exchange_value);
}
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(T));
return cmpxchg_using_helper<int64_t>(reorder_cmpxchg_long_func, dest, compare_value, exchange_value);
}


@ -1,5 +1,5 @@
/*
* Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2019 SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -93,7 +93,7 @@ inline void post_membar(atomic_memory_order order) {
template<size_t byte_size>
struct Atomic::PlatformAdd {
struct AtomicAccess::PlatformAdd {
template<typename D, typename I>
D add_then_fetch(D volatile* dest, I add_value, atomic_memory_order order) const;
@ -105,8 +105,8 @@ struct Atomic::PlatformAdd {
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<4>::add_then_fetch(D volatile* dest, I add_value,
atomic_memory_order order) const {
inline D AtomicAccess::PlatformAdd<4>::add_then_fetch(D volatile* dest, I add_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(I));
STATIC_ASSERT(4 == sizeof(D));
@ -131,8 +131,8 @@ inline D Atomic::PlatformAdd<4>::add_then_fetch(D volatile* dest, I add_value,
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<8>::add_then_fetch(D volatile* dest, I add_value,
atomic_memory_order order) const {
inline D AtomicAccess::PlatformAdd<8>::add_then_fetch(D volatile* dest, I add_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(I));
STATIC_ASSERT(8 == sizeof(D));
@ -156,9 +156,9 @@ inline D Atomic::PlatformAdd<8>::add_then_fetch(D volatile* dest, I add_value,
template<>
template<typename T>
inline T Atomic::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
// Note that xchg doesn't necessarily do an acquire
// (see synchronizer.cpp).
@ -195,9 +195,9 @@ inline T Atomic::PlatformXchg<4>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformXchg<8>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformXchg<8>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(T));
// Note that xchg doesn't necessarily do an acquire
// (see synchronizer.cpp).
@ -235,15 +235,15 @@ inline T Atomic::PlatformXchg<8>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<1>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<1>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(1 == sizeof(T));
// Note that cmpxchg guarantees a two-way memory barrier across
// the cmpxchg, so it's really a 'fence_cmpxchg_fence' if not
// specified otherwise (see atomic.hpp).
// specified otherwise (see atomicAccess.hpp).
// Using 32 bit internally.
unsigned int old_value, loaded_value;
@ -282,15 +282,15 @@ inline T Atomic::PlatformCmpxchg<1>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<4>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(T));
// Note that cmpxchg guarantees a two-way memory barrier across
// the cmpxchg, so it's really a 'fence_cmpxchg_fence' if not
// specified otherwise (see atomic.hpp).
// specified otherwise (see atomicAccess.hpp).
T old_value;
const uint64_t zero = 0;
@ -332,15 +332,15 @@ inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(T));
// Note that cmpxchg guarantees a two-way memory barrier across
// the cmpxchg, so it's really a 'fence_cmpxchg_fence' if not
// specified otherwise (see atomic.hpp).
// specified otherwise (see atomicAccess.hpp).
T old_value;
const uint64_t zero = 0;
@ -381,11 +381,11 @@ inline T Atomic::PlatformCmpxchg<8>::operator()(T volatile* dest,
}
template<size_t byte_size>
struct Atomic::PlatformOrderedLoad<byte_size, X_ACQUIRE>
struct AtomicAccess::PlatformOrderedLoad<byte_size, X_ACQUIRE>
{
template <typename T>
T operator()(const volatile T* p) const {
T t = Atomic::load(p);
T t = AtomicAccess::load(p);
// Use twi-isync for load_acquire (faster than lwsync).
__asm__ __volatile__ ("twi 0,%0,0\n isync\n" : : "r" (t) : "memory");
return t;


@ -1,5 +1,5 @@
/*
* Copyright (c) 1999, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1999, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2020, 2021, Huawei Technologies Co., Ltd. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -40,7 +40,7 @@
#endif
template<size_t byte_size>
struct Atomic::PlatformAdd {
struct AtomicAccess::PlatformAdd {
template<typename D, typename I>
D add_then_fetch(D volatile* dest, I add_value, atomic_memory_order order) const {
@ -71,10 +71,10 @@ struct Atomic::PlatformAdd {
#ifndef FULL_COMPILER_ATOMIC_SUPPORT
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<1>::operator()(T volatile* dest __attribute__((unused)),
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<1>::operator()(T volatile* dest __attribute__((unused)),
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(1 == sizeof(T));
if (order != memory_order_relaxed) {
@ -122,10 +122,10 @@ inline T Atomic::PlatformCmpxchg<1>::operator()(T volatile* dest __attribute__((
// See also JDK-8326936.
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest __attribute__((unused)),
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<4>::operator()(T volatile* dest __attribute__((unused)),
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(T));
int32_t old_value;
@ -154,9 +154,9 @@ inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest __attribute__((
template<size_t byte_size>
template<typename T>
inline T Atomic::PlatformXchg<byte_size>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformXchg<byte_size>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
#ifndef FULL_COMPILER_ATOMIC_SUPPORT
// If we add xchg for sub word and are using older compiler
// it must be added here due to not using lib atomic.
@ -180,10 +180,10 @@ inline T Atomic::PlatformXchg<byte_size>::operator()(T volatile* dest,
// __attribute__((unused)) on dest is to get rid of spurious GCC warnings.
template<size_t byte_size>
template<typename T>
inline T Atomic::PlatformCmpxchg<byte_size>::operator()(T volatile* dest __attribute__((unused)),
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<byte_size>::operator()(T volatile* dest __attribute__((unused)),
T compare_value,
T exchange_value,
atomic_memory_order order) const {
#ifndef FULL_COMPILER_ATOMIC_SUPPORT
STATIC_ASSERT(byte_size > 4);
@ -204,21 +204,21 @@ inline T Atomic::PlatformCmpxchg<byte_size>::operator()(T volatile* dest __attri
}
template<size_t byte_size>
struct Atomic::PlatformOrderedLoad<byte_size, X_ACQUIRE>
struct AtomicAccess::PlatformOrderedLoad<byte_size, X_ACQUIRE>
{
template <typename T>
T operator()(const volatile T* p) const { T data; __atomic_load(const_cast<T*>(p), &data, __ATOMIC_ACQUIRE); return data; }
};
template<size_t byte_size>
struct Atomic::PlatformOrderedStore<byte_size, RELEASE_X>
struct AtomicAccess::PlatformOrderedStore<byte_size, RELEASE_X>
{
template <typename T>
void operator()(volatile T* p, T v) const { __atomic_store(const_cast<T*>(p), &v, __ATOMIC_RELEASE); }
};
template<size_t byte_size>
struct Atomic::PlatformOrderedStore<byte_size, RELEASE_X_FENCE>
struct AtomicAccess::PlatformOrderedStore<byte_size, RELEASE_X_FENCE>
{
template <typename T>
void operator()(volatile T* p, T v) const { release_store(p, v); OrderAccess::fence(); }


@ -89,6 +89,8 @@
#define RISCV_HWPROBE_MISALIGNED_UNSUPPORTED (4 << 0)
#define RISCV_HWPROBE_MISALIGNED_MASK (7 << 0)
#define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE 6
#ifndef NR_riscv_hwprobe
#ifndef NR_arch_specific_syscall
#define NR_arch_specific_syscall 244
@ -114,7 +116,8 @@ static struct riscv_hwprobe query[] = {{RISCV_HWPROBE_KEY_MVENDORID, 0},
{RISCV_HWPROBE_KEY_MIMPID, 0},
{RISCV_HWPROBE_KEY_BASE_BEHAVIOR, 0},
{RISCV_HWPROBE_KEY_IMA_EXT_0, 0},
{RISCV_HWPROBE_KEY_CPUPERF_0, 0}};
{RISCV_HWPROBE_KEY_CPUPERF_0, 0},
{RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE, 0}};
bool RiscvHwprobe::probe_features() {
assert(!rw_hwprobe_completed, "Called twice.");
@ -188,6 +191,9 @@ void RiscvHwprobe::add_features_from_query_result() {
VM_Version::ext_Zbs.enable_feature();
}
#ifndef PRODUCT
if (is_set(RISCV_HWPROBE_KEY_IMA_EXT_0, RISCV_HWPROBE_EXT_ZICBOZ)) {
VM_Version::ext_Zicboz.enable_feature();
}
if (is_set(RISCV_HWPROBE_KEY_IMA_EXT_0, RISCV_HWPROBE_EXT_ZBKB)) {
VM_Version::ext_Zbkb.enable_feature();
}
@ -244,4 +250,7 @@ void RiscvHwprobe::add_features_from_query_result() {
VM_Version::unaligned_access.enable_feature(
query[RISCV_HWPROBE_KEY_CPUPERF_0].value & RISCV_HWPROBE_MISALIGNED_MASK);
}
if (is_valid(RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE)) {
VM_Version::zicboz_block_size.enable_feature(query[RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE].value);
}
}
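// Illustrative sketch, not part of this changeset: how a single key such as
// RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE is obtained from the kernel. The pair
// layout mirrors the struct used above; the helper name and the syscall number
// (assumed to be NR_arch_specific_syscall + 14 = 258) are assumptions of this
// sketch only.
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

struct riscv_hwprobe_pair { int64_t key; uint64_t value; };

static uint64_t query_zicboz_block_size_sketch() {
  riscv_hwprobe_pair pair = { 6 /* RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE */, 0 };
  // riscv_hwprobe(pairs, pair_count, cpusetsize, cpus, flags)
  long rc = syscall(258 /* assumed NR_riscv_hwprobe */, &pair, 1, 0, nullptr, 0);
  return rc == 0 ? pair.value : 0;   // 0 means the probe failed or the key is unsupported
}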


@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016, 2019 SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -26,7 +26,7 @@
#ifndef OS_CPU_LINUX_S390_ATOMIC_LINUX_S390_HPP
#define OS_CPU_LINUX_S390_ATOMIC_LINUX_S390_HPP
#include "runtime/atomic.hpp"
#include "runtime/atomicAccess.hpp"
#include "runtime/os.hpp"
#include "runtime/vm_version.hpp"
@ -55,7 +55,7 @@
// before the other store becomes visible.
//------------
// Atomic::add
// AtomicAccess::add
//------------
// These methods force the value in memory to be augmented by the passed increment.
// Both, memory value and increment, are treated as 32bit signed binary integers.
@ -75,7 +75,7 @@ inline void z196_fast_sync() {
}
template<size_t byte_size>
struct Atomic::PlatformAdd {
struct AtomicAccess::PlatformAdd {
template<typename D, typename I>
D add_then_fetch(D volatile* dest, I add_value, atomic_memory_order order) const;
@ -87,8 +87,8 @@ struct Atomic::PlatformAdd {
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<4>::add_then_fetch(D volatile* dest, I inc,
atomic_memory_order order) const {
inline D AtomicAccess::PlatformAdd<4>::add_then_fetch(D volatile* dest, I inc,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(I));
STATIC_ASSERT(4 == sizeof(D));
@ -141,8 +141,8 @@ inline D Atomic::PlatformAdd<4>::add_then_fetch(D volatile* dest, I inc,
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<8>::add_then_fetch(D volatile* dest, I inc,
atomic_memory_order order) const {
inline D AtomicAccess::PlatformAdd<8>::add_then_fetch(D volatile* dest, I inc,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(I));
STATIC_ASSERT(8 == sizeof(D));
@ -194,7 +194,7 @@ inline D Atomic::PlatformAdd<8>::add_then_fetch(D volatile* dest, I inc,
//-------------
// Atomic::xchg
// AtomicAccess::xchg
//-------------
// These methods force the value in memory to be replaced by the new value passed
// in as argument.
@ -211,9 +211,9 @@ inline D Atomic::PlatformAdd<8>::add_then_fetch(D volatile* dest, I inc,
// replacement succeeded.
template<>
template<typename T>
inline T Atomic::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order unused) const {
inline T AtomicAccess::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order unused) const {
STATIC_ASSERT(4 == sizeof(T));
T old;
@ -235,9 +235,9 @@ inline T Atomic::PlatformXchg<4>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformXchg<8>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order unused) const {
inline T AtomicAccess::PlatformXchg<8>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order unused) const {
STATIC_ASSERT(8 == sizeof(T));
T old;
@ -258,7 +258,7 @@ inline T Atomic::PlatformXchg<8>::operator()(T volatile* dest,
}
//----------------
// Atomic::cmpxchg
// AtomicAccess::cmpxchg
//----------------
// These methods compare the value in memory with a given compare value.
// If both values compare equal, the value in memory is replaced with
@ -288,14 +288,14 @@ inline T Atomic::PlatformXchg<8>::operator()(T volatile* dest,
// No direct support for cmpxchg of bytes; emulate using int.
template<>
struct Atomic::PlatformCmpxchg<1> : Atomic::CmpxchgByteUsingInt {};
struct AtomicAccess::PlatformCmpxchg<1> : AtomicAccess::CmpxchgByteUsingInt {};
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest,
T cmp_val,
T xchg_val,
atomic_memory_order unused) const {
inline T AtomicAccess::PlatformCmpxchg<4>::operator()(T volatile* dest,
T cmp_val,
T xchg_val,
atomic_memory_order unused) const {
STATIC_ASSERT(4 == sizeof(T));
T old;
@ -316,10 +316,10 @@ inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<8>::operator()(T volatile* dest,
T cmp_val,
T xchg_val,
atomic_memory_order unused) const {
inline T AtomicAccess::PlatformCmpxchg<8>::operator()(T volatile* dest,
T cmp_val,
T xchg_val,
atomic_memory_order unused) const {
STATIC_ASSERT(8 == sizeof(T));
T old;
@ -339,7 +339,7 @@ inline T Atomic::PlatformCmpxchg<8>::operator()(T volatile* dest,
}
template<size_t byte_size>
struct Atomic::PlatformOrderedLoad<byte_size, X_ACQUIRE>
struct AtomicAccess::PlatformOrderedLoad<byte_size, X_ACQUIRE>
{
template <typename T>
T operator()(const volatile T* p) const { T t = *p; OrderAccess::acquire(); return t; }
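// Illustrative usage sketch, not part of this changeset. The front-end entry
// points are assumed to keep the same shape as the Platform* operators above
// after the Atomic -> AtomicAccess rename: add returns the augmented value,
// xchg returns the prior value, and cmpxchg takes (dest, compare, exchange)
// and returns the value found in memory.
// Assumed include: "runtime/atomicAccess.hpp"
static volatile int32_t s_counter = 0;

void atomic_access_usage_sketch() {
  int32_t after  = AtomicAccess::add(&s_counter, 5);          // s_counter == 5, after == 5
  int32_t before = AtomicAccess::xchg(&s_counter, 42);        // before == 5, s_counter == 42
  int32_t found  = AtomicAccess::cmpxchg(&s_counter, 42, 7);  // found == 42 => swap happened, s_counter == 7
  (void)after; (void)before; (void)found;                     // silence unused-variable warnings
}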


@ -1,5 +1,5 @@
/*
* Copyright (c) 1999, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1999, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -28,7 +28,7 @@
// Implementation of class atomic
template<size_t byte_size>
struct Atomic::PlatformAdd {
struct AtomicAccess::PlatformAdd {
template<typename D, typename I>
D fetch_then_add(D volatile* dest, I add_value, atomic_memory_order order) const;
@ -40,8 +40,8 @@ struct Atomic::PlatformAdd {
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<4>::fetch_then_add(D volatile* dest, I add_value,
atomic_memory_order order) const {
inline D AtomicAccess::PlatformAdd<4>::fetch_then_add(D volatile* dest, I add_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(I));
STATIC_ASSERT(4 == sizeof(D));
D old_value;
@ -54,9 +54,9 @@ inline D Atomic::PlatformAdd<4>::fetch_then_add(D volatile* dest, I add_value,
template<>
template<typename T>
inline T Atomic::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(T));
__asm__ volatile ( "xchgl (%2),%0"
: "=r" (exchange_value)
@ -67,10 +67,10 @@ inline T Atomic::PlatformXchg<4>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<1>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order /* order */) const {
inline T AtomicAccess::PlatformCmpxchg<1>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order /* order */) const {
STATIC_ASSERT(1 == sizeof(T));
__asm__ volatile ("lock cmpxchgb %1,(%3)"
: "=a" (exchange_value)
@ -81,10 +81,10 @@ inline T Atomic::PlatformCmpxchg<1>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order /* order */) const {
inline T AtomicAccess::PlatformCmpxchg<4>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order /* order */) const {
STATIC_ASSERT(4 == sizeof(T));
__asm__ volatile ("lock cmpxchgl %1,(%3)"
: "=a" (exchange_value)
@ -97,8 +97,8 @@ inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest,
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<8>::fetch_then_add(D volatile* dest, I add_value,
atomic_memory_order order) const {
inline D AtomicAccess::PlatformAdd<8>::fetch_then_add(D volatile* dest, I add_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(I));
STATIC_ASSERT(8 == sizeof(D));
D old_value;
@ -111,8 +111,8 @@ inline D Atomic::PlatformAdd<8>::fetch_then_add(D volatile* dest, I add_value,
template<>
template<typename T>
inline T Atomic::PlatformXchg<8>::operator()(T volatile* dest, T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformXchg<8>::operator()(T volatile* dest, T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(T));
__asm__ __volatile__ ("xchgq (%2),%0"
: "=r" (exchange_value)
@ -123,10 +123,10 @@ inline T Atomic::PlatformXchg<8>::operator()(T volatile* dest, T exchange_value,
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order /* order */) const {
inline T AtomicAccess::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order /* order */) const {
STATIC_ASSERT(8 == sizeof(T));
__asm__ __volatile__ ("lock cmpxchgq %1,(%3)"
: "=a" (exchange_value)
@ -145,25 +145,25 @@ extern "C" {
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(T));
return cmpxchg_using_helper<int64_t>(_Atomic_cmpxchg_long, dest, compare_value, exchange_value);
}
// No direct support for 8-byte xchg; emulate using cmpxchg.
template<>
struct Atomic::PlatformXchg<8> : Atomic::XchgUsingCmpxchg<8> {};
struct AtomicAccess::PlatformXchg<8> : AtomicAccess::XchgUsingCmpxchg<8> {};
// No direct support for 8-byte add; emulate using cmpxchg.
template<>
struct Atomic::PlatformAdd<8> : Atomic::AddUsingCmpxchg<8> {};
struct AtomicAccess::PlatformAdd<8> : AtomicAccess::AddUsingCmpxchg<8> {};
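// Illustrative sketch, not part of this changeset: what a fallback such as
// XchgUsingCmpxchg boils down to. The real helper is not shown in this diff,
// so this is only an assumed equivalent: retry a compare-and-swap against the
// most recently observed value until it succeeds, then return that value.
inline int64_t xchg_via_cmpxchg_sketch(volatile int64_t* dest, int64_t exchange_value) {
  int64_t observed = *dest;
  while (true) {
    int64_t prior = AtomicAccess::cmpxchg(dest, observed, exchange_value);
    if (prior == observed) {
      return observed;            // the swap installed exchange_value
    }
    observed = prior;             // lost a race; retry with the fresh value
  }
}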
template<>
template<typename T>
inline T Atomic::PlatformLoad<8>::operator()(T const volatile* src) const {
inline T AtomicAccess::PlatformLoad<8>::operator()(T const volatile* src) const {
STATIC_ASSERT(8 == sizeof(T));
volatile int64_t dest;
_Atomic_move_long(reinterpret_cast<const volatile int64_t*>(src), reinterpret_cast<volatile int64_t*>(&dest));
@ -172,8 +172,8 @@ inline T Atomic::PlatformLoad<8>::operator()(T const volatile* src) const {
template<>
template<typename T>
inline void Atomic::PlatformStore<8>::operator()(T volatile* dest,
T store_value) const {
inline void AtomicAccess::PlatformStore<8>::operator()(T volatile* dest,
T store_value) const {
STATIC_ASSERT(8 == sizeof(T));
_Atomic_move_long(reinterpret_cast<const volatile int64_t*>(&store_value), reinterpret_cast<volatile int64_t*>(dest));
}
@ -181,7 +181,7 @@ inline void Atomic::PlatformStore<8>::operator()(T volatile* dest,
#endif // AMD64
template<>
struct Atomic::PlatformOrderedStore<1, RELEASE_X_FENCE>
struct AtomicAccess::PlatformOrderedStore<1, RELEASE_X_FENCE>
{
template <typename T>
void operator()(volatile T* p, T v) const {
@ -193,7 +193,7 @@ struct Atomic::PlatformOrderedStore<1, RELEASE_X_FENCE>
};
template<>
struct Atomic::PlatformOrderedStore<2, RELEASE_X_FENCE>
struct AtomicAccess::PlatformOrderedStore<2, RELEASE_X_FENCE>
{
template <typename T>
void operator()(volatile T* p, T v) const {
@ -205,7 +205,7 @@ struct Atomic::PlatformOrderedStore<2, RELEASE_X_FENCE>
};
template<>
struct Atomic::PlatformOrderedStore<4, RELEASE_X_FENCE>
struct AtomicAccess::PlatformOrderedStore<4, RELEASE_X_FENCE>
{
template <typename T>
void operator()(volatile T* p, T v) const {
@ -218,7 +218,7 @@ struct Atomic::PlatformOrderedStore<4, RELEASE_X_FENCE>
#ifdef AMD64
template<>
struct Atomic::PlatformOrderedStore<8, RELEASE_X_FENCE>
struct AtomicAccess::PlatformOrderedStore<8, RELEASE_X_FENCE>
{
template <typename T>
void operator()(volatile T* p, T v) const {


@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright 2007, 2008, 2011, 2015, Red Hat, Inc.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -31,7 +31,7 @@
// Implementation of class atomic
template<size_t byte_size>
struct Atomic::PlatformAdd {
struct AtomicAccess::PlatformAdd {
template<typename D, typename I>
D add_then_fetch(D volatile* dest, I add_value, atomic_memory_order order) const;
@ -43,8 +43,8 @@ struct Atomic::PlatformAdd {
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<4>::add_then_fetch(D volatile* dest, I add_value,
atomic_memory_order order) const {
inline D AtomicAccess::PlatformAdd<4>::add_then_fetch(D volatile* dest, I add_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(I));
STATIC_ASSERT(4 == sizeof(D));
@ -55,8 +55,8 @@ inline D Atomic::PlatformAdd<4>::add_then_fetch(D volatile* dest, I add_value,
template<>
template<typename D, typename I>
inline D Atomic::PlatformAdd<8>::add_then_fetch(D volatile* dest, I add_value,
atomic_memory_order order) const {
inline D AtomicAccess::PlatformAdd<8>::add_then_fetch(D volatile* dest, I add_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(I));
STATIC_ASSERT(8 == sizeof(D));
@ -67,9 +67,9 @@ inline D Atomic::PlatformAdd<8>::add_then_fetch(D volatile* dest, I add_value,
template<>
template<typename T>
inline T Atomic::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformXchg<4>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(T));
FULL_MEM_BARRIER;
T result = __atomic_exchange_n(dest, exchange_value, __ATOMIC_RELAXED);
@ -79,9 +79,9 @@ inline T Atomic::PlatformXchg<4>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformXchg<8>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformXchg<8>::operator()(T volatile* dest,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(T));
FULL_MEM_BARRIER;
T result = __atomic_exchange_n(dest, exchange_value, __ATOMIC_RELAXED);
@ -91,14 +91,14 @@ inline T Atomic::PlatformXchg<8>::operator()(T volatile* dest,
// No direct support for cmpxchg of bytes; emulate using int.
template<>
struct Atomic::PlatformCmpxchg<1> : Atomic::CmpxchgByteUsingInt {};
struct AtomicAccess::PlatformCmpxchg<1> : AtomicAccess::CmpxchgByteUsingInt {};
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<4>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(4 == sizeof(T));
T value = compare_value;
@ -111,10 +111,10 @@ inline T Atomic::PlatformCmpxchg<4>::operator()(T volatile* dest,
template<>
template<typename T>
inline T Atomic::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
inline T AtomicAccess::PlatformCmpxchg<8>::operator()(T volatile* dest,
T compare_value,
T exchange_value,
atomic_memory_order order) const {
STATIC_ASSERT(8 == sizeof(T));
FULL_MEM_BARRIER;
@ -134,7 +134,7 @@ inline void atomic_copy64(const volatile void *src, volatile void *dst) {
template<>
template<typename T>
inline T Atomic::PlatformLoad<8>::operator()(T const volatile* src) const {
inline T AtomicAccess::PlatformLoad<8>::operator()(T const volatile* src) const {
STATIC_ASSERT(8 == sizeof(T));
T dest;
__atomic_load(const_cast<T*>(src), &dest, __ATOMIC_RELAXED);
@ -143,8 +143,8 @@ inline T Atomic::PlatformLoad<8>::operator()(T const volatile* src) const {
template<>
template<typename T>
inline void Atomic::PlatformStore<8>::operator()(T volatile* dest,
T store_value) const {
inline void AtomicAccess::PlatformStore<8>::operator()(T volatile* dest,
T store_value) const {
STATIC_ASSERT(8 == sizeof(T));
__atomic_store(dest, &store_value, __ATOMIC_RELAXED);
}
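// Illustrative sketch, not part of this changeset: the GCC builtin behind the
// PlatformCmpxchg pattern above updates 'expected' in place with the value it
// found in memory and returns whether the exchange happened, so returning
// 'expected' yields the old value in either case.
#include <stdint.h>

inline int32_t cmpxchg_relaxed_sketch(volatile int32_t* dest,
                                      int32_t compare_value,
                                      int32_t exchange_value) {
  int32_t expected = compare_value;
  __atomic_compare_exchange_n(dest, &expected, exchange_value,
                              /* weak */ false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  return expected;   // equals compare_value iff the swap succeeded
}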


@ -1,4 +1,5 @@
/*
* Copyright (c) 1999, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2020, Microsoft Corporation. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -30,14 +31,14 @@
#include "runtime/vm_version.hpp"
// As per atomic.hpp all read-modify-write operations have to provide two-way
// As per atomicAccess.hpp all read-modify-write operations have to provide two-way
// barriers semantics. The memory_order parameter is ignored - we always provide
// the strongest/most-conservative ordering
//
// For AARCH64 we add explicit barriers in the stubs.
template<size_t byte_size>
struct Atomic::PlatformAdd {
struct AtomicAccess::PlatformAdd {
template<typename D, typename I>
D add_then_fetch(D volatile* dest, I add_value, atomic_memory_order order) const;
@ -53,9 +54,9 @@ struct Atomic::PlatformAdd {
#define DEFINE_INTRINSIC_ADD(IntrinsicName, IntrinsicType) \
template<> \
template<typename D, typename I> \
inline D Atomic::PlatformAdd<sizeof(IntrinsicType)>::add_then_fetch(D volatile* dest, \
I add_value, \
atomic_memory_order order) const { \
inline D AtomicAccess::PlatformAdd<sizeof(IntrinsicType)>::add_then_fetch(D volatile* dest, \
I add_value, \
atomic_memory_order order) const { \
STATIC_ASSERT(sizeof(IntrinsicType) == sizeof(D)); \
return PrimitiveConversions::cast<D>( \
IntrinsicName(reinterpret_cast<IntrinsicType volatile *>(dest), \
@ -70,9 +71,9 @@ DEFINE_INTRINSIC_ADD(InterlockedAdd64, __int64)
#define DEFINE_INTRINSIC_XCHG(IntrinsicName, IntrinsicType) \
template<> \
template<typename T> \
inline T Atomic::PlatformXchg<sizeof(IntrinsicType)>::operator()(T volatile* dest, \
T exchange_value, \
atomic_memory_order order) const { \
inline T AtomicAccess::PlatformXchg<sizeof(IntrinsicType)>::operator()(T volatile* dest, \
T exchange_value, \
atomic_memory_order order) const { \
STATIC_ASSERT(sizeof(IntrinsicType) == sizeof(T)); \
return PrimitiveConversions::cast<T>( \
IntrinsicName(reinterpret_cast<IntrinsicType volatile *>(dest), \
@ -85,16 +86,16 @@ DEFINE_INTRINSIC_XCHG(InterlockedExchange64, __int64)
#undef DEFINE_INTRINSIC_XCHG
// Note: the order of the parameters is different between
// Atomic::PlatformCmpxchg<*>::operator() and the
// AtomicAccess::PlatformCmpxchg<*>::operator() and the
// InterlockedCompareExchange* API.
#define DEFINE_INTRINSIC_CMPXCHG(IntrinsicName, IntrinsicType) \
template<> \
template<typename T> \
inline T Atomic::PlatformCmpxchg<sizeof(IntrinsicType)>::operator()(T volatile* dest, \
T compare_value, \
T exchange_value, \
atomic_memory_order order) const { \
inline T AtomicAccess::PlatformCmpxchg<sizeof(IntrinsicType)>::operator()(T volatile* dest, \
T compare_value, \
T exchange_value, \
atomic_memory_order order) const { \
STATIC_ASSERT(sizeof(IntrinsicType) == sizeof(T)); \
return PrimitiveConversions::cast<T>( \
IntrinsicName(reinterpret_cast<IntrinsicType volatile *>(dest), \


@ -1,6 +1,6 @@
/*
* Copyright (c) 2020, Microsoft Corporation. All rights reserved.
* Copyright (c) 2022, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2022, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -26,7 +26,7 @@
#ifndef OS_CPU_WINDOWS_AARCH64_COPY_WINDOWS_AARCH64_HPP
#define OS_CPU_WINDOWS_AARCH64_COPY_WINDOWS_AARCH64_HPP
#include "runtime/atomic.hpp"
#include "runtime/atomicAccess.hpp"
#include <string.h>
@ -35,14 +35,14 @@ static void pd_conjoint_atomic_helper(const T* from, T* to, size_t count) {
if (from > to) {
while (count-- > 0) {
// Copy forwards
Atomic::store(to++, Atomic::load(from++));
AtomicAccess::store(to++, AtomicAccess::load(from++));
}
} else {
from += count - 1;
to += count - 1;
while (count-- > 0) {
// Copy backwards
Atomic::store(to--, Atomic::load(from--));
AtomicAccess::store(to--, AtomicAccess::load(from--));
}
}
}


@ -1,5 +1,5 @@
/*
* Copyright (c) 1999, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1999, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -32,15 +32,15 @@
// guaranteed to have acquire release semantics (w.r.t. compiler
// reordering) and therefore does not even need a compiler barrier
// for normal acquire release accesses. And all generalized
// bound calls like release_store go through Atomic::load
// and Atomic::store which do volatile memory accesses.
// bound calls like release_store go through AtomicAccess::load
// and AtomicAccess::store which do volatile memory accesses.
template<> inline void ScopedFence<X_ACQUIRE>::postfix() { }
template<> inline void ScopedFence<RELEASE_X>::prefix() { }
template<> inline void ScopedFence<RELEASE_X_FENCE>::prefix() { }
template<> inline void ScopedFence<RELEASE_X_FENCE>::postfix() { OrderAccess::fence(); }
template<size_t byte_size>
struct Atomic::PlatformAdd {
struct AtomicAccess::PlatformAdd {
template<typename D, typename I>
D add_then_fetch(D volatile* dest, I add_value, atomic_memory_order order) const;
@ -56,9 +56,9 @@ struct Atomic::PlatformAdd {
#define DEFINE_INTRINSIC_ADD(IntrinsicName, IntrinsicType) \
template<> \
template<typename D, typename I> \
inline D Atomic::PlatformAdd<sizeof(IntrinsicType)>::add_then_fetch(D volatile* dest, \
I add_value, \
atomic_memory_order order) const { \
inline D AtomicAccess::PlatformAdd<sizeof(IntrinsicType)>::add_then_fetch(D volatile* dest, \
I add_value, \
atomic_memory_order order) const { \
STATIC_ASSERT(sizeof(IntrinsicType) == sizeof(D)); \
return PrimitiveConversions::cast<D>( \
IntrinsicName(reinterpret_cast<IntrinsicType volatile *>(dest), \
@ -73,9 +73,9 @@ DEFINE_INTRINSIC_ADD(InterlockedAdd64, __int64)
#define DEFINE_INTRINSIC_XCHG(IntrinsicName, IntrinsicType) \
template<> \
template<typename T> \
inline T Atomic::PlatformXchg<sizeof(IntrinsicType)>::operator()(T volatile* dest, \
T exchange_value, \
atomic_memory_order order) const { \
inline T AtomicAccess::PlatformXchg<sizeof(IntrinsicType)>::operator()(T volatile* dest, \
T exchange_value, \
atomic_memory_order order) const { \
STATIC_ASSERT(sizeof(IntrinsicType) == sizeof(T)); \
return PrimitiveConversions::cast<T>( \
IntrinsicName(reinterpret_cast<IntrinsicType volatile *>(dest), \
@ -88,16 +88,16 @@ DEFINE_INTRINSIC_XCHG(InterlockedExchange64, __int64)
#undef DEFINE_INTRINSIC_XCHG
// Note: the order of the parameters is different between
// Atomic::PlatformCmpxchg<*>::operator() and the
// AtomicAccess::PlatformCmpxchg<*>::operator() and the
// InterlockedCompareExchange* API.
#define DEFINE_INTRINSIC_CMPXCHG(IntrinsicName, IntrinsicType) \
template<> \
template<typename T> \
inline T Atomic::PlatformCmpxchg<sizeof(IntrinsicType)>::operator()(T volatile* dest, \
T compare_value, \
T exchange_value, \
atomic_memory_order order) const { \
inline T AtomicAccess::PlatformCmpxchg<sizeof(IntrinsicType)>::operator()(T volatile* dest, \
T compare_value, \
T exchange_value, \
atomic_memory_order order) const { \
STATIC_ASSERT(sizeof(IntrinsicType) == sizeof(T)); \
return PrimitiveConversions::cast<T>( \
IntrinsicName(reinterpret_cast<IntrinsicType volatile *>(dest), \


@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -25,21 +25,21 @@
#ifndef OS_CPU_WINDOWS_X86_COPY_WINDOWS_X86_HPP
#define OS_CPU_WINDOWS_X86_COPY_WINDOWS_X86_HPP
#include "runtime/atomic.hpp"
#include "runtime/atomicAccess.hpp"
template <typename T>
static void pd_conjoint_atomic_helper(const T* from, T* to, size_t count) {
if (from > to) {
while (count-- > 0) {
// Copy forwards
Atomic::store(to++, Atomic::load(from++));
AtomicAccess::store(to++, AtomicAccess::load(from++));
}
} else {
from += count - 1;
to += count - 1;
while (count-- > 0) {
// Copy backwards
Atomic::store(to--, Atomic::load(from--));
AtomicAccess::store(to--, AtomicAccess::load(from--));
}
}
}
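// Illustrative sketch, not part of this changeset: why the copy direction in
// pd_conjoint_atomic_helper matters. When the ranges overlap and the source
// starts below the destination, a forward copy would clobber elements before
// reading them; the backwards loop above reads each element while it is intact.
void overlap_copy_sketch() {
  int32_t buf[6] = {1, 2, 3, 4, 5, 6};
  pd_conjoint_atomic_helper(buf, buf + 2, 4);   // shift the first four slots up by two
  // buf is now {1, 2, 1, 2, 3, 4}; a forward element-by-element copy would have
  // produced {1, 2, 1, 2, 1, 2} instead.
}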


@ -1,5 +1,5 @@
/*
* Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -1389,13 +1389,8 @@ void ADLParser::pipe_parse(void) {
}
if (!strcmp(ident, "branch_has_delay_slot")) {
skipws();
if (_curchar == ';') {
next_char(); skipws();
}
pipeline->_branchHasDelaySlot = true;
continue;
parse_err(SYNERR, "Using obsolete token, branch_has_delay_slot");
break;
}
if (!strcmp(ident, "max_instructions_per_bundle")) {
@ -1507,36 +1502,8 @@ void ADLParser::pipe_parse(void) {
}
if (!strcmp(ident, "nops")) {
parse_err(WARN, "Using obsolete token, nops");
skipws();
if (_curchar != '(') {
parse_err(SYNERR, "expected `(`, found '%c'\n", _curchar);
break;
}
next_char(); skipws();
while (_curchar != ')') {
ident = get_ident();
if (ident == nullptr) {
parse_err(SYNERR, "expected identifier for nop instruction, found '%c'\n", _curchar);
break;
}
pipeline->_noplist.addName(ident);
pipeline->_nopcnt++;
skipws();
if (_curchar == ',') {
next_char(); skipws();
}
}
next_char(); skipws();
if (_curchar == ';') {
next_char(); skipws();
}
continue;
}
@ -1790,16 +1757,8 @@ void ADLParser::pipe_class_parse(PipelineForm &pipeline) {
if (!strcmp(ident, "one_instruction_with_delay_slot") ||
!strcmp(ident, "single_instruction_with_delay_slot")) {
skipws();
if (_curchar != ';') {
parse_err(SYNERR, "missing \";\" in latency definition\n");
return;
}
pipe_class->setInstructionCount(1);
pipe_class->setBranchDelay(true);
next_char(); skipws();
continue;
parse_err(SYNERR, "Using obsolete token, %s", ident);
return;
}
if (!strcmp(ident, "one_instruction") ||
@ -1859,15 +1818,8 @@ void ADLParser::pipe_class_parse(PipelineForm &pipeline) {
}
if (!strcmp(ident, "has_delay_slot")) {
skipws();
if (_curchar != ';') {
parse_err(SYNERR, "missing \";\" after \"has_delay_slot\"\n");
return;
}
pipe_class->setBranchDelay(true);
next_char(); skipws();
continue;
parse_err(SYNERR, "Using obsolete token, %s", ident);
return;
}
if (!strcmp(ident, "force_serialization")) {


@ -1,5 +1,5 @@
/*
* Copyright (c) 1998, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1998, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -511,10 +511,7 @@ PipelineForm::PipelineForm()
, _stagecnt (0)
, _classlist ()
, _classcnt (0)
, _noplist ()
, _nopcnt (0)
, _variableSizeInstrs (false)
, _branchHasDelaySlot (false)
, _maxInstrsPerBundle (0)
, _maxBundlesPerCycle (1)
, _instrUnitSize (0)
@ -533,7 +530,6 @@ void PipelineForm::output(FILE *fp) { // Write info to output files
const char *res;
const char *stage;
const char *cls;
const char *nop;
int count = 0;
fprintf(fp,"\nPipeline:");
@ -549,8 +545,6 @@ void PipelineForm::output(FILE *fp) { // Write info to output files
fprintf(fp," fixed-sized bundles of %d bytes", _bundleUnitSize);
else
fprintf(fp," fixed-sized instructions");
if (_branchHasDelaySlot)
fprintf(fp,", branch has delay slot");
if (_maxInstrsPerBundle > 0)
fprintf(fp,", max of %d instruction%s in parallel",
_maxInstrsPerBundle, _maxInstrsPerBundle > 1 ? "s" : "");
@ -574,9 +568,6 @@ void PipelineForm::output(FILE *fp) { // Write info to output files
for ( _classlist.reset(); (cls = _classlist.iter()) != nullptr; )
_classdict[cls]->is_pipeclass()->output(fp);
fprintf(fp,"\nNop Instructions:");
for ( _noplist.reset(); (nop = _noplist.iter()) != nullptr; )
fprintf(fp, " \"%s\"", nop);
fprintf(fp,"\n");
}
@ -643,7 +634,6 @@ PipeClassForm::PipeClassForm(const char *id, int num)
, _fixed_latency(0)
, _instruction_count(0)
, _has_multiple_bundles(false)
, _has_branch_delay_slot(false)
, _force_serialization(false)
, _may_have_no_code(false) {
}


@ -1,5 +1,5 @@
/*
* Copyright (c) 1998, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1998, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -386,11 +386,7 @@ public:
FormDict _classdict; // Class Name -> PipeClassForm mapping
int _classcnt; // Number of classes
NameList _noplist; // List of NOP instructions
int _nopcnt; // Number of nop instructions
bool _variableSizeInstrs; // Indicates if this architecture has variable sized instructions
bool _branchHasDelaySlot; // Indicates that branches have delay slot instructions
int _maxInstrsPerBundle; // Indicates the maximum number of instructions for ILP
int _maxBundlesPerCycle; // Indicates the maximum number of bundles for ILP
int _instrUnitSize; // The minimum instruction unit size, in bytes
@ -502,7 +498,6 @@ public:
int _fixed_latency; // Always takes this number of cycles
int _instruction_count; // Number of instructions in first bundle
bool _has_multiple_bundles; // Indicates if 1 or multiple bundles
bool _has_branch_delay_slot; // Has branch delay slot as last instruction
bool _force_serialization; // This node serializes relative to surrounding nodes
bool _may_have_no_code; // This node may generate no code based on register allocation
@ -521,13 +516,11 @@ public:
void setInstructionCount(int i) { _instruction_count = i; }
void setMultipleBundles(bool b) { _has_multiple_bundles = b; }
void setBranchDelay(bool s) { _has_branch_delay_slot = s; }
void setForceSerialization(bool s) { _force_serialization = s; }
void setMayHaveNoCode(bool s) { _may_have_no_code = s; }
int InstructionCount() const { return _instruction_count; }
bool hasMultipleBundles() const { return _has_multiple_bundles; }
bool hasBranchDelay() const { return _has_branch_delay_slot; }
bool forceSerialization() const { return _force_serialization; }
bool mayHaveNoCode() const { return _may_have_no_code; }

View File

@ -794,8 +794,8 @@ void ArchDesc::build_pipe_classes(FILE *fp_cpp) {
// Create the pipeline class description
fprintf(fp_cpp, "static const Pipeline pipeline_class_Zero_Instructions(0, 0, true, 0, 0, false, false, false, false, nullptr, nullptr, nullptr, Pipeline_Use(0, 0, 0, nullptr));\n\n");
fprintf(fp_cpp, "static const Pipeline pipeline_class_Unknown_Instructions(0, 0, true, 0, 0, false, true, true, false, nullptr, nullptr, nullptr, Pipeline_Use(0, 0, 0, nullptr));\n\n");
fprintf(fp_cpp, "static const Pipeline pipeline_class_Zero_Instructions(0, 0, true, 0, 0, false, false, false, nullptr, nullptr, nullptr, Pipeline_Use(0, 0, 0, nullptr));\n\n");
fprintf(fp_cpp, "static const Pipeline pipeline_class_Unknown_Instructions(0, 0, true, 0, 0, true, true, false, nullptr, nullptr, nullptr, Pipeline_Use(0, 0, 0, nullptr));\n\n");
fprintf(fp_cpp, "const Pipeline_Use_Element Pipeline_Use::elaborated_elements[%d] = {\n", _pipeline->_rescount);
for (int i1 = 0; i1 < _pipeline->_rescount; i1++) {
@ -895,12 +895,11 @@ void ArchDesc::build_pipe_classes(FILE *fp_cpp) {
fprintf(fp_cpp, "(uint)stage_%s", _pipeline->_stages.name(maxWriteStage));
else
fprintf(fp_cpp, "((uint)stage_%s)+%d", _pipeline->_stages.name(maxWriteStage), maxMoreInstrs);
fprintf(fp_cpp, ", %d, %s, %d, %d, %s, %s, %s, %s,\n",
fprintf(fp_cpp, ", %d, %s, %d, %d, %s, %s, %s,\n",
paramcount,
pipeclass->hasFixedLatency() ? "true" : "false",
pipeclass->fixedLatency(),
pipeclass->InstructionCount(),
pipeclass->hasBranchDelay() ? "true" : "false",
pipeclass->hasMultipleBundles() ? "true" : "false",
pipeclass->forceSerialization() ? "true" : "false",
pipeclass->mayHaveNoCode() ? "true" : "false" );
@ -977,34 +976,12 @@ void ArchDesc::build_pipe_classes(FILE *fp_cpp) {
}
fprintf(fp_cpp, "}\n\n");
// Output the list of nop nodes
fprintf(fp_cpp, "// Descriptions for emitting different functional unit nops\n");
const char *nop;
int nopcnt = 0;
for ( _pipeline->_noplist.reset(); (nop = _pipeline->_noplist.iter()) != nullptr; nopcnt++ );
fprintf(fp_cpp, "void Bundle::initialize_nops(MachNode * nop_list[%d]) {\n", nopcnt);
int i = 0;
for ( _pipeline->_noplist.reset(); (nop = _pipeline->_noplist.iter()) != nullptr; i++ ) {
fprintf(fp_cpp, " nop_list[%d] = (MachNode *) new %sNode();\n", i, nop);
}
fprintf(fp_cpp, "};\n\n");
fprintf(fp_cpp, "#ifndef PRODUCT\n");
fprintf(fp_cpp, "void Bundle::dump(outputStream *st) const {\n");
fprintf(fp_cpp, " static const char * bundle_flags[] = {\n");
fprintf(fp_cpp, " \"\",\n");
fprintf(fp_cpp, " \"use nop delay\",\n");
fprintf(fp_cpp, " \"use unconditional delay\",\n");
fprintf(fp_cpp, " \"use conditional delay\",\n");
fprintf(fp_cpp, " \"used in conditional delay\",\n");
fprintf(fp_cpp, " \"used in unconditional delay\",\n");
fprintf(fp_cpp, " \"used in all conditional delays\",\n");
fprintf(fp_cpp, " };\n\n");
fprintf(fp_cpp, " static const char *resource_names[%d] = {", _pipeline->_rescount);
// Don't add compound resources to the list of resource names
const char* resource;
i = 0;
int i = 0;
for (_pipeline->_reslist.reset(); (resource = _pipeline->_reslist.iter()) != nullptr;) {
if (_pipeline->_resdict[resource]->is_resource()->is_discrete()) {
fprintf(fp_cpp, " \"%s\"%c", resource, i < _pipeline->_rescount - 1 ? ',' : ' ');
@ -1015,12 +992,8 @@ void ArchDesc::build_pipe_classes(FILE *fp_cpp) {
// See if the same string is in the table
fprintf(fp_cpp, " bool needs_comma = false;\n\n");
fprintf(fp_cpp, " if (_flags) {\n");
fprintf(fp_cpp, " st->print(\"%%s\", bundle_flags[_flags]);\n");
fprintf(fp_cpp, " needs_comma = true;\n");
fprintf(fp_cpp, " };\n");
fprintf(fp_cpp, " if (instr_count()) {\n");
fprintf(fp_cpp, " st->print(\"%%s%%d instr%%s\", needs_comma ? \", \" : \"\", instr_count(), instr_count() != 1 ? \"s\" : \"\");\n");
fprintf(fp_cpp, " st->print(\"%%d instr%%s\", instr_count(), instr_count() != 1 ? \"s\" : \"\");\n");
fprintf(fp_cpp, " needs_comma = true;\n");
fprintf(fp_cpp, " };\n");
fprintf(fp_cpp, " uint r = resources_used();\n");


@ -98,7 +98,7 @@ void ArchDesc::buildMachRegisterNumbers(FILE *fp_hpp) {
}
fprintf(fp_hpp, "\n// Size of register-mask in ints\n");
fprintf(fp_hpp, "#define RM_SIZE %d\n", RegisterForm::RegMask_Size());
fprintf(fp_hpp, "#define RM_SIZE_IN_INTS %d\n", RegisterForm::RegMask_Size());
fprintf(fp_hpp, "// Unroll factor for loops over the data in a RegMask\n");
fprintf(fp_hpp, "#define FORALL_BODY ");
int len = RegisterForm::RegMask_Size();
@ -935,8 +935,6 @@ void ArchDesc::declare_pipe_classes(FILE *fp_hpp) {
_pipeline->_variableSizeInstrs ? 1 : 0);
fprintf(fp_hpp, " _fixed_size_instructions = %d,\n",
_pipeline->_variableSizeInstrs ? 0 : 1);
fprintf(fp_hpp, " _branch_has_delay_slot = %d,\n",
_pipeline->_branchHasDelaySlot ? 1 : 0);
fprintf(fp_hpp, " _max_instrs_per_bundle = %d,\n",
_pipeline->_maxInstrsPerBundle);
fprintf(fp_hpp, " _max_bundles_per_cycle = %d,\n",
@ -983,7 +981,6 @@ void ArchDesc::declare_pipe_classes(FILE *fp_hpp) {
fprintf(fp_hpp, " const unsigned char _fixed_latency;\n");
fprintf(fp_hpp, " const unsigned char _instruction_count;\n");
fprintf(fp_hpp, " const bool _has_fixed_latency;\n");
fprintf(fp_hpp, " const bool _has_branch_delay;\n");
fprintf(fp_hpp, " const bool _has_multiple_bundles;\n");
fprintf(fp_hpp, " const bool _force_serialization;\n");
fprintf(fp_hpp, " const bool _may_have_no_code;\n");
@ -998,7 +995,6 @@ void ArchDesc::declare_pipe_classes(FILE *fp_hpp) {
fprintf(fp_hpp, " bool has_fixed_latency,\n");
fprintf(fp_hpp, " uint fixed_latency,\n");
fprintf(fp_hpp, " uint instruction_count,\n");
fprintf(fp_hpp, " bool has_branch_delay,\n");
fprintf(fp_hpp, " bool has_multiple_bundles,\n");
fprintf(fp_hpp, " bool force_serialization,\n");
fprintf(fp_hpp, " bool may_have_no_code,\n");
@ -1011,7 +1007,6 @@ void ArchDesc::declare_pipe_classes(FILE *fp_hpp) {
fprintf(fp_hpp, " , _fixed_latency(fixed_latency)\n");
fprintf(fp_hpp, " , _instruction_count(instruction_count)\n");
fprintf(fp_hpp, " , _has_fixed_latency(has_fixed_latency)\n");
fprintf(fp_hpp, " , _has_branch_delay(has_branch_delay)\n");
fprintf(fp_hpp, " , _has_multiple_bundles(has_multiple_bundles)\n");
fprintf(fp_hpp, " , _force_serialization(force_serialization)\n");
fprintf(fp_hpp, " , _may_have_no_code(may_have_no_code)\n");
@ -1046,8 +1041,6 @@ void ArchDesc::declare_pipe_classes(FILE *fp_hpp) {
fprintf(fp_hpp, " return (_resource_use._count); }\n\n");
fprintf(fp_hpp, " uint instructionCount() const {\n");
fprintf(fp_hpp, " return (_instruction_count); }\n\n");
fprintf(fp_hpp, " bool hasBranchDelay() const {\n");
fprintf(fp_hpp, " return (_has_branch_delay); }\n\n");
fprintf(fp_hpp, " bool hasMultipleBundles() const {\n");
fprintf(fp_hpp, " return (_has_multiple_bundles); }\n\n");
fprintf(fp_hpp, " bool forceSerialization() const {\n");
@ -1071,56 +1064,19 @@ void ArchDesc::declare_pipe_classes(FILE *fp_hpp) {
uint rshift = rescount;
fprintf(fp_hpp, "protected:\n");
fprintf(fp_hpp, " enum {\n");
fprintf(fp_hpp, " _unused_delay = 0x%x,\n", 0);
fprintf(fp_hpp, " _use_nop_delay = 0x%x,\n", 1);
fprintf(fp_hpp, " _use_unconditional_delay = 0x%x,\n", 2);
fprintf(fp_hpp, " _use_conditional_delay = 0x%x,\n", 3);
fprintf(fp_hpp, " _used_in_conditional_delay = 0x%x,\n", 4);
fprintf(fp_hpp, " _used_in_unconditional_delay = 0x%x,\n", 5);
fprintf(fp_hpp, " _used_in_all_conditional_delays = 0x%x,\n", 6);
fprintf(fp_hpp, "\n");
fprintf(fp_hpp, " _use_delay = 0x%x,\n", 3);
fprintf(fp_hpp, " _used_in_delay = 0x%x\n", 4);
fprintf(fp_hpp, " };\n\n");
fprintf(fp_hpp, " uint _flags : 3,\n");
fprintf(fp_hpp, " _starts_bundle : 1,\n");
fprintf(fp_hpp, " uint _starts_bundle : 1,\n");
fprintf(fp_hpp, " _instr_count : %d,\n", mshift);
fprintf(fp_hpp, " _resources_used : %d;\n", rshift);
fprintf(fp_hpp, "public:\n");
fprintf(fp_hpp, " Bundle() : _flags(_unused_delay), _starts_bundle(0), _instr_count(0), _resources_used(0) {}\n\n");
fprintf(fp_hpp, " Bundle() : _starts_bundle(0), _instr_count(0), _resources_used(0) {}\n\n");
fprintf(fp_hpp, " void set_instr_count(uint i) { _instr_count = i; }\n");
fprintf(fp_hpp, " void set_resources_used(uint i) { _resources_used = i; }\n");
fprintf(fp_hpp, " void clear_usage() { _flags = _unused_delay; }\n");
fprintf(fp_hpp, " void set_starts_bundle() { _starts_bundle = true; }\n");
fprintf(fp_hpp, " uint flags() const { return (_flags); }\n");
fprintf(fp_hpp, " uint instr_count() const { return (_instr_count); }\n");
fprintf(fp_hpp, " uint resources_used() const { return (_resources_used); }\n");
fprintf(fp_hpp, " bool starts_bundle() const { return (_starts_bundle != 0); }\n");
fprintf(fp_hpp, " void set_use_nop_delay() { _flags = _use_nop_delay; }\n");
fprintf(fp_hpp, " void set_use_unconditional_delay() { _flags = _use_unconditional_delay; }\n");
fprintf(fp_hpp, " void set_use_conditional_delay() { _flags = _use_conditional_delay; }\n");
fprintf(fp_hpp, " void set_used_in_unconditional_delay() { _flags = _used_in_unconditional_delay; }\n");
fprintf(fp_hpp, " void set_used_in_conditional_delay() { _flags = _used_in_conditional_delay; }\n");
fprintf(fp_hpp, " void set_used_in_all_conditional_delays() { _flags = _used_in_all_conditional_delays; }\n");
fprintf(fp_hpp, " bool use_nop_delay() { return (_flags == _use_nop_delay); }\n");
fprintf(fp_hpp, " bool use_unconditional_delay() { return (_flags == _use_unconditional_delay); }\n");
fprintf(fp_hpp, " bool use_conditional_delay() { return (_flags == _use_conditional_delay); }\n");
fprintf(fp_hpp, " bool used_in_unconditional_delay() { return (_flags == _used_in_unconditional_delay); }\n");
fprintf(fp_hpp, " bool used_in_conditional_delay() { return (_flags == _used_in_conditional_delay); }\n");
fprintf(fp_hpp, " bool used_in_all_conditional_delays() { return (_flags == _used_in_all_conditional_delays); }\n");
fprintf(fp_hpp, " bool use_delay() { return ((_flags & _use_delay) != 0); }\n");
fprintf(fp_hpp, " bool used_in_delay() { return ((_flags & _used_in_delay) != 0); }\n\n");
fprintf(fp_hpp, " enum {\n");
fprintf(fp_hpp, " _nop_count = %d\n",
_pipeline->_nopcnt);
fprintf(fp_hpp, " };\n\n");
fprintf(fp_hpp, " static void initialize_nops(MachNode *nop_list[%d]);\n\n",
_pipeline->_nopcnt);
fprintf(fp_hpp, "#ifndef PRODUCT\n");
fprintf(fp_hpp, " void dump(outputStream *st = tty) const;\n");
fprintf(fp_hpp, "#endif\n");


@ -800,15 +800,6 @@ void LIR_OpVisitState::visit(LIR_Op* op) {
}
// LIR_OpDelay
case lir_delay_slot: {
assert(op->as_OpDelay() != nullptr, "must be");
LIR_OpDelay* opDelay = (LIR_OpDelay*)op;
visit(opDelay->delay_op());
break;
}
// LIR_OpTypeCheck
case lir_instanceof:
case lir_checkcast:
@ -1073,10 +1064,6 @@ void LIR_OpAssert::emit_code(LIR_Assembler* masm) {
}
#endif
void LIR_OpDelay::emit_code(LIR_Assembler* masm) {
masm->emit_delay(this);
}
void LIR_OpProfileCall::emit_code(LIR_Assembler* masm) {
masm->emit_profile_call(this);
}
@ -1761,8 +1748,6 @@ const char * LIR_Op::name() const {
// LIR_OpLock
case lir_lock: s = "lock"; break;
case lir_unlock: s = "unlock"; break;
// LIR_OpDelay
case lir_delay_slot: s = "delay"; break;
// LIR_OpTypeCheck
case lir_instanceof: s = "instanceof"; break;
case lir_checkcast: s = "checkcast"; break;
@ -2044,11 +2029,6 @@ void LIR_OpAssert::print_instr(outputStream* out) const {
#endif
void LIR_OpDelay::print_instr(outputStream* out) const {
_op->print_on(out);
}
// LIR_OpProfileCall
void LIR_OpProfileCall::print_instr(outputStream* out) const {
profiled_method()->name()->print_symbol_on(out);


@ -879,7 +879,6 @@ class LIR_OpConvert;
class LIR_OpAllocObj;
class LIR_OpReturn;
class LIR_Op2;
class LIR_OpDelay;
class LIR_Op3;
class LIR_OpAllocArray;
class LIR_Op4;
@ -985,9 +984,6 @@ enum LIR_Code {
, lir_lock
, lir_unlock
, end_opLock
, begin_delay_slot
, lir_delay_slot
, end_delay_slot
, begin_opTypeCheck
, lir_instanceof
, lir_checkcast
@ -1124,7 +1120,6 @@ class LIR_Op: public CompilationResourceObj {
virtual LIR_OpCall* as_OpCall() { return nullptr; }
virtual LIR_OpJavaCall* as_OpJavaCall() { return nullptr; }
virtual LIR_OpLabel* as_OpLabel() { return nullptr; }
virtual LIR_OpDelay* as_OpDelay() { return nullptr; }
virtual LIR_OpLock* as_OpLock() { return nullptr; }
virtual LIR_OpAllocArray* as_OpAllocArray() { return nullptr; }
virtual LIR_OpAllocObj* as_OpAllocObj() { return nullptr; }
@ -1886,25 +1881,6 @@ class LIR_OpLoadKlass: public LIR_Op {
void print_instr(outputStream* out) const PRODUCT_RETURN;
};
class LIR_OpDelay: public LIR_Op {
friend class LIR_OpVisitState;
private:
LIR_Op* _op;
public:
LIR_OpDelay(LIR_Op* op, CodeEmitInfo* info):
LIR_Op(lir_delay_slot, LIR_OprFact::illegalOpr, info),
_op(op) {
assert(op->code() == lir_nop, "should be filling with nops");
}
virtual void emit_code(LIR_Assembler* masm);
virtual LIR_OpDelay* as_OpDelay() { return this; }
void print_instr(outputStream* out) const PRODUCT_RETURN;
LIR_Op* delay_op() const { return _op; }
CodeEmitInfo* call_info() const { return info(); }
};
#ifdef ASSERT
// LIR_OpAssert
class LIR_OpAssert : public LIR_Op2 {


@ -194,8 +194,7 @@ void LIR_Assembler::emit_exception_entries(ExceptionInfoList* info_list) {
XHandler* handler = handlers->handler_at(j);
assert(handler->lir_op_id() != -1, "handler not processed by LinearScan");
assert(handler->entry_code() == nullptr ||
handler->entry_code()->instructions_list()->last()->code() == lir_branch ||
handler->entry_code()->instructions_list()->last()->code() == lir_delay_slot, "last operation must be branch");
handler->entry_code()->instructions_list()->last()->code() == lir_branch, "last operation must be branch");
if (handler->entry_pco() == -1) {
// entry code not emitted yet


@ -1,5 +1,5 @@
/*
* Copyright (c) 2000, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2000, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -154,8 +154,7 @@ class LIR_Assembler: public CompilationResourceObj {
void emit_block(BlockBegin* block);
void emit_lir_list(LIR_List* list);
// any last minute peephole optimizations are performed here. In
// particular sparc uses this for delay slot filling.
// any last minute peephole optimizations are performed here.
void peephole(LIR_List* list);
void return_op(LIR_Opr result, C1SafepointPollStub* code_stub);
@ -204,7 +203,6 @@ class LIR_Assembler: public CompilationResourceObj {
void emit_rtcall(LIR_OpRTCall* op);
void emit_profile_call(LIR_OpProfileCall* op);
void emit_profile_type(LIR_OpProfileType* op);
void emit_delay(LIR_OpDelay* op);
void arith_op(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr dest, CodeEmitInfo* info);
void arithmetic_idiv(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr temp, LIR_Opr result, CodeEmitInfo* info);


@ -54,7 +54,7 @@
#include "oops/objArrayOop.inline.hpp"
#include "oops/oop.inline.hpp"
#include "prims/jvmtiExport.hpp"
#include "runtime/atomic.hpp"
#include "runtime/atomicAccess.hpp"
#include "runtime/fieldDescriptor.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"


@ -57,7 +57,7 @@ bool AOTLinkedClassBulkLoader::class_preloading_finished() {
// The ConstantPools of preloaded classes have references to other preloaded classes. We don't
// want any Java code (including JVMCI compiler) to use these classes until all of them
// are loaded.
return Atomic::load_acquire(&_all_completed);
return AtomicAccess::load_acquire(&_all_completed);
}
}
@ -90,7 +90,7 @@ void AOTLinkedClassBulkLoader::load_non_javabase_classes(JavaThread* current) {
}
_app_completed = true;
Atomic::release_store(&_all_completed, true);
AtomicAccess::release_store(&_all_completed, true);
}
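// Illustrative sketch, not part of this changeset: the publication pattern the
// two accesses above implement. The release store publishes every write made
// before it, and a reader that sees 'true' through the acquire load is
// guaranteed to also see those earlier writes. The names below are standalone
// stand-ins, not the real class members.
// Assumed include: "runtime/atomicAccess.hpp"
static volatile bool s_ready = false;
static int s_payload = 0;

void publish_payload_sketch() {
  s_payload = 42;                                // ordinary writes first
  AtomicAccess::release_store(&s_ready, true);   // then publish the flag
}

bool try_consume_payload_sketch(int* out) {
  if (AtomicAccess::load_acquire(&s_ready)) {    // acquire pairs with the release
    *out = s_payload;                            // guaranteed to observe 42
    return true;
  }
  return false;
}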
void AOTLinkedClassBulkLoader::load_classes_in_loader(JavaThread* current, AOTLinkedClassCategory class_category, oop class_loader_oop) {
