Merge branch 'master' into 8044609-ssl

This commit is contained in:
Seán Coffey 2024-11-25 20:33:55 +00:00
commit a2e311facf
983 changed files with 29033 additions and 19885 deletions

2
.gitignore vendored
View File

@ -20,3 +20,5 @@ NashornProfile.txt
/.settings/
/compile_commands.json
/.cache
/.gdbinit
/.lldbinit

View File

@ -253,6 +253,7 @@ var getJibProfilesCommon = function (input, data) {
configure_args: concat(
"--with-exclude-translations=es,fr,it,ko,pt_BR,sv,ca,tr,cs,sk,ja_JP_A,ja_JP_HA,ja_JP_HI,ja_JP_I,zh_TW,zh_HK",
"--disable-jvm-feature-shenandoahgc",
"--disable-cds-archive-coh",
versionArgs(input, common))
};

View File

@ -37,6 +37,10 @@ ifeq ($(TOOLCHAIN_TYPE), gcc)
# Need extra inlining to collapse shared marking code into the hot marking loop
BUILD_LIBJVM_shenandoahMark.cpp_CXXFLAGS := --param inline-unit-growth=1000
endif
# disable lto in g1ParScanThreadState because of special inlining/flattening used there
ifeq ($(call check-jvm-feature, link-time-opt), true)
BUILD_LIBJVM_g1ParScanThreadState.cpp_CXXFLAGS := -fno-lto
endif
endif
LIBJVM_FDLIBM_COPY_OPT_FLAG := $(CXX_O_FLAG_NONE)

View File

@ -46,18 +46,6 @@ EXCLUDE_FILES += \
javax/swing/plaf/nimbus/SpinnerPainter.java \
javax/swing/plaf/nimbus/SplitPanePainter.java \
javax/swing/plaf/nimbus/TabbedPanePainter.java \
sun/awt/resources/security-icon-bw16.png \
sun/awt/resources/security-icon-bw24.png \
sun/awt/resources/security-icon-bw32.png \
sun/awt/resources/security-icon-bw48.png \
sun/awt/resources/security-icon-interim16.png \
sun/awt/resources/security-icon-interim24.png \
sun/awt/resources/security-icon-interim32.png \
sun/awt/resources/security-icon-interim48.png \
sun/awt/resources/security-icon-yellow16.png \
sun/awt/resources/security-icon-yellow24.png \
sun/awt/resources/security-icon-yellow32.png \
sun/awt/resources/security-icon-yellow48.png \
sun/awt/X11/java-icon16.png \
sun/awt/X11/java-icon24.png \
sun/awt/X11/java-icon32.png \

View File

@ -37,23 +37,6 @@ GENSRC_AWT_ICONS_SRC += \
$(X11_ICONS_PATH_PREFIX)/classes/sun/awt/X11/java-icon32.png \
$(X11_ICONS_PATH_PREFIX)/classes/sun/awt/X11/java-icon48.png
AWT_ICONPATH := $(MODULE_SRC)/share/classes/sun/awt/resources
GENSRC_AWT_ICONS_SRC += \
$(AWT_ICONPATH)/security-icon-bw16.png \
$(AWT_ICONPATH)/security-icon-interim16.png \
$(AWT_ICONPATH)/security-icon-yellow16.png \
$(AWT_ICONPATH)/security-icon-bw24.png \
$(AWT_ICONPATH)/security-icon-interim24.png \
$(AWT_ICONPATH)/security-icon-yellow24.png \
$(AWT_ICONPATH)/security-icon-bw32.png \
$(AWT_ICONPATH)/security-icon-interim32.png \
$(AWT_ICONPATH)/security-icon-yellow32.png \
$(AWT_ICONPATH)/security-icon-bw48.png \
$(AWT_ICONPATH)/security-icon-interim48.png \
$(AWT_ICONPATH)/security-icon-yellow48.png
GENSRC_AWT_ICONS_FILES := $(notdir $(GENSRC_AWT_ICONS_SRC))
GENSRC_AWT_ICONS_SHORT_NAME = $(subst .,_,$(subst -,_,$(1)))

View File

@ -1114,13 +1114,7 @@ public final class FontPanel extends JPanel implements AdjustmentListener {
/// Position and set size of zoom window as needed
zoomWindow.setLocation( canvasLoc.x + zoomAreaX, canvasLoc.y + zoomAreaY );
if ( !nowZooming ) {
if ( zoomWindow.getWarningString() != null )
/// If this is not opened as a "secure" window,
/// it has a banner below the zoom dialog which makes it look really BAD
/// So enlarge it by a bit
zoomWindow.setSize( zoomAreaWidth + 1, zoomAreaHeight + 20 );
else
zoomWindow.setSize( zoomAreaWidth + 1, zoomAreaHeight + 1 );
zoomWindow.setSize( zoomAreaWidth + 1, zoomAreaHeight + 1 );
}
/// Prepare zoomed image

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -142,7 +142,6 @@ public final class SampleTree {
/** Constructs a JPanel containing check boxes for the different
* options that tree supports. */
@SuppressWarnings("serial")
private JPanel constructOptionsPanel() {
JCheckBox aCheckbox;
JPanel retPanel = new JPanel(false);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -97,7 +97,6 @@ public class TableExample3 {
};
// Create a model of the data.
@SuppressWarnings("serial")
TableModel dataModel = new AbstractTableModel() {
// These methods always need to be implemented.

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -99,7 +99,6 @@ public class TableExample4 {
};
// Create a model of the data.
@SuppressWarnings("serial")
TableModel dataModel = new AbstractTableModel() {
// These methods always need to be implemented.
@ -180,7 +179,6 @@ public class TableExample4 {
// Show the values in the "Favorite Number" column in different colors.
TableColumn numbersColumn = tableView.getColumn("Favorite Number");
@SuppressWarnings("serial")
DefaultTableCellRenderer numberColumnRenderer
= new DefaultTableCellRenderer() {

View File

@ -2632,6 +2632,23 @@ bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
// to be subsumed into complex addressing expressions or compute them
// into registers?
bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
// Loads and stores with indirect memory input (e.g., volatile loads and
// stores) do not subsume the input into complex addressing expressions. If
// the addressing expression is input to at least one such load or store, do
// not clone the addressing expression. Query needs_acquiring_load and
// needs_releasing_store as a proxy for indirect memory input, as it is not
// possible to directly query for indirect memory input at this stage.
for (DUIterator_Fast imax, i = m->fast_outs(imax); i < imax; i++) {
Node* n = m->fast_out(i);
if (n->is_Load() && needs_acquiring_load(n)) {
return false;
}
if (n->is_Store() && needs_releasing_store(n)) {
return false;
}
}
if (clone_base_plus_offset_address(m, mstack, address_visited)) {
return true;
}

View File

@ -66,6 +66,7 @@ define_pd_global(bool, OptoScheduling, false);
define_pd_global(bool, OptoBundling, false);
define_pd_global(bool, OptoRegScheduling, false);
define_pd_global(bool, SuperWordLoopUnrollAnalysis, true);
define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 8);
define_pd_global(bool, IdealizeClearArrayNode, true);
define_pd_global(intx, ReservedCodeCacheSize, 48*M);

View File

@ -393,7 +393,13 @@ void InterpreterMacroAssembler::dispatch_base(TosState state,
bool verifyoop,
bool generate_poll) {
if (VerifyActivationFrameSize) {
Unimplemented();
Label L;
sub(rscratch2, rfp, esp);
int min_frame_size = (frame::link_offset - frame::interpreter_frame_initial_sp_offset) * wordSize;
subs(rscratch2, rscratch2, min_frame_size);
br(Assembler::GE, L);
stop("broken stack frame");
bind(L);
}
if (verifyoop) {
interp_verify_oop(r0, state);

View File

@ -620,7 +620,7 @@ address TemplateInterpreterGenerator::generate_cont_resume_interpreter_adapter()
// Restore Java expression stack pointer
__ ldr(rscratch1, Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
__ lea(esp, Address(rfp, rscratch1, Address::lsl(Interpreter::logStackElementSize)));
// and NULL it as marker that esp is now tos until next java call
// and null it as marker that esp is now tos until next java call
__ str(zr, Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
// Restore machine SP

View File

@ -64,6 +64,7 @@ define_pd_global(bool, OptoBundling, false);
define_pd_global(bool, OptoScheduling, true);
define_pd_global(bool, OptoRegScheduling, false);
define_pd_global(bool, SuperWordLoopUnrollAnalysis, false);
define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16);
define_pd_global(bool, IdealizeClearArrayNode, true);
#ifdef _LP64

View File

@ -59,6 +59,7 @@ define_pd_global(bool, UseCISCSpill, false);
define_pd_global(bool, OptoBundling, false);
define_pd_global(bool, OptoRegScheduling, false);
define_pd_global(bool, SuperWordLoopUnrollAnalysis, true);
define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16);
// GL:
// Detected a problem with unscaled compressed oops and
// narrow_oop_use_complex_address() == false.

View File

@ -2339,83 +2339,6 @@ void C2_MacroAssembler::signum_fp_v(VectorRegister dst, VectorRegister one, Basi
vfsgnj_vv(dst, one, dst, v0_t);
}
void C2_MacroAssembler::compress_bits_v(Register dst, Register src, Register mask, bool is_long) {
Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
// intrinsic is enabled when MaxVectorSize >= 16
Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
long len = is_long ? 64 : 32;
// load the src data(in bits) to be compressed.
vsetivli(x0, 1, sew, Assembler::m1);
vmv_s_x(v0, src);
// reset the src data(in bytes) to zero.
mv(t0, len);
vsetvli(x0, t0, Assembler::e8, lmul);
vmv_v_i(v4, 0);
// convert the src data from bits to bytes.
vmerge_vim(v4, v4, 1); // v0 as the implicit mask register
// reset the dst data(in bytes) to zero.
vmv_v_i(v8, 0);
// load the mask data(in bits).
vsetivli(x0, 1, sew, Assembler::m1);
vmv_s_x(v0, mask);
// compress the src data(in bytes) to dst(in bytes).
vsetvli(x0, t0, Assembler::e8, lmul);
vcompress_vm(v8, v4, v0);
// convert the dst data from bytes to bits.
vmseq_vi(v0, v8, 1);
// store result back.
vsetivli(x0, 1, sew, Assembler::m1);
vmv_x_s(dst, v0);
}
void C2_MacroAssembler::compress_bits_i_v(Register dst, Register src, Register mask) {
compress_bits_v(dst, src, mask, /* is_long */ false);
}
void C2_MacroAssembler::compress_bits_l_v(Register dst, Register src, Register mask) {
compress_bits_v(dst, src, mask, /* is_long */ true);
}
void C2_MacroAssembler::expand_bits_v(Register dst, Register src, Register mask, bool is_long) {
Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
// intrinsic is enabled when MaxVectorSize >= 16
Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
long len = is_long ? 64 : 32;
// load the src data(in bits) to be expanded.
vsetivli(x0, 1, sew, Assembler::m1);
vmv_s_x(v0, src);
// reset the src data(in bytes) to zero.
mv(t0, len);
vsetvli(x0, t0, Assembler::e8, lmul);
vmv_v_i(v4, 0);
// convert the src data from bits to bytes.
vmerge_vim(v4, v4, 1); // v0 as implicit mask register
// reset the dst data(in bytes) to zero.
vmv_v_i(v12, 0);
// load the mask data(in bits).
vsetivli(x0, 1, sew, Assembler::m1);
vmv_s_x(v0, mask);
// expand the src data(in bytes) to dst(in bytes).
vsetvli(x0, t0, Assembler::e8, lmul);
viota_m(v8, v0);
vrgather_vv(v12, v4, v8, VectorMask::v0_t); // v0 as implicit mask register
// convert the dst data from bytes to bits.
vmseq_vi(v0, v12, 1);
// store result back.
vsetivli(x0, 1, sew, Assembler::m1);
vmv_x_s(dst, v0);
}
void C2_MacroAssembler::expand_bits_i_v(Register dst, Register src, Register mask) {
expand_bits_v(dst, src, mask, /* is_long */ false);
}
void C2_MacroAssembler::expand_bits_l_v(Register dst, Register src, Register mask) {
expand_bits_v(dst, src, mask, /* is_long */ true);
}
// j.l.Math.round(float)
// Returns the closest int to the argument, with ties rounding to positive infinity.
// We need to handle 3 special cases defined by java api spec:
@ -3124,13 +3047,3 @@ void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src, Basi
vfmv_f_s(dst, tmp);
}
}
void C2_MacroAssembler::load_narrow_klass_compact_c2(Register dst, Address src) {
// The incoming address is pointing into obj-start + klass_offset_in_bytes. We need to extract
// obj-start, so that we can load from the object's mark-word instead. Usually the address
// comes as obj-start in obj and klass_offset_in_bytes in disp.
assert(UseCompactObjectHeaders, "must");
int offset = oopDesc::mark_offset_in_bytes() - oopDesc::klass_offset_in_bytes();
ld(dst, Address(src.base(), src.offset() + offset));
srli(dst, dst, markWord::klass_shift);
}

View File

@ -39,9 +39,6 @@
VectorRegister vrs,
bool is_latin, Label& DONE, Assembler::LMUL lmul);
void compress_bits_v(Register dst, Register src, Register mask, bool is_long);
void expand_bits_v(Register dst, Register src, Register mask, bool is_long);
public:
// Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
void fast_lock(Register object, Register box,
@ -184,13 +181,6 @@
// intrinsic methods implemented by rvv instructions
// compress bits, i.e. j.l.Integer/Long::compress.
void compress_bits_i_v(Register dst, Register src, Register mask);
void compress_bits_l_v(Register dst, Register src, Register mask);
// expand bits, i.e. j.l.Integer/Long::expand.
void expand_bits_i_v(Register dst, Register src, Register mask);
void expand_bits_l_v(Register dst, Register src, Register mask);
void java_round_float_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp, BasicType bt, uint vector_length);
void java_round_double_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp, BasicType bt, uint vector_length);
@ -281,6 +271,4 @@
void extract_v(Register dst, VectorRegister src, BasicType bt, int idx, VectorRegister tmp);
void extract_fp_v(FloatRegister dst, VectorRegister src, BasicType bt, int idx, VectorRegister tmp);
void load_narrow_klass_compact_c2(Register dst, Address src);
#endif // CPU_RISCV_C2_MACROASSEMBLER_RISCV_HPP

View File

@ -66,6 +66,7 @@ define_pd_global(bool, OptoScheduling, true);
define_pd_global(bool, OptoBundling, false);
define_pd_global(bool, OptoRegScheduling, false);
define_pd_global(bool, SuperWordLoopUnrollAnalysis, true);
define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16);
define_pd_global(bool, IdealizeClearArrayNode, true);
define_pd_global(intx, ReservedCodeCacheSize, 48*M);

View File

@ -117,8 +117,6 @@ define_pd_global(intx, InlineSmallCode, 1000);
product(bool, UseZvfh, false, DIAGNOSTIC, "Use Zvfh instructions") \
product(bool, UseZvkn, false, EXPERIMENTAL, \
"Use Zvkn group extension, Zvkned, Zvknhb, Zvkb, Zvkt") \
product(bool, UseRVVForBigIntegerShiftIntrinsics, true, \
"Use RVV instructions for left/right shift of BigInteger") \
product(bool, UseCtxFencei, false, EXPERIMENTAL, \
"Use PR_RISCV_CTX_SW_FENCEI_ON to avoid explicit icache flush")

View File

@ -441,7 +441,14 @@ void InterpreterMacroAssembler::dispatch_base(TosState state,
Register Rs) {
// Pay attention to the argument Rs, which is acquiesce in t0.
if (VerifyActivationFrameSize) {
Unimplemented();
Label L;
sub(t1, fp, esp);
int min_frame_size =
(frame::link_offset - frame::interpreter_frame_initial_sp_offset + frame::metadata_words) * wordSize;
sub(t1, t1, min_frame_size);
bgez(t1, L);
stop("broken stack frame");
bind(L);
}
if (verifyoop && state == atos) {
verify_oop(x10);

View File

@ -3466,6 +3466,17 @@ void MacroAssembler::cmpxchg(Register addr, Register expected,
assert_different_registers(expected, t0);
assert_different_registers(new_val, t0);
// NOTE:
// Register _result_ may be the same register as _new_val_ or _expected_.
// Hence do NOT use _result_ until after 'cas'.
//
// Register _expected_ may be the same register as _new_val_ and is assumed to be preserved.
// Hence do NOT change _expected_ or _new_val_.
//
// Having _expected_ and _new_val_ being the same register is a very puzzling cas.
//
// TODO: Address these issues.
if (UseZacas) {
if (result_as_bool) {
mv(t0, expected);
@ -3473,8 +3484,9 @@ void MacroAssembler::cmpxchg(Register addr, Register expected,
xorr(t0, t0, expected);
seqz(result, t0);
} else {
mv(result, expected);
atomic_cas(result, new_val, addr, size, acquire, release);
mv(t0, expected);
atomic_cas(t0, new_val, addr, size, acquire, release);
mv(result, t0);
}
return;
}
@ -3510,15 +3522,16 @@ void MacroAssembler::cmpxchg_weak(Register addr, Register expected,
enum operand_size size,
Assembler::Aqrl acquire, Assembler::Aqrl release,
Register result) {
if (UseZacas) {
cmpxchg(addr, expected, new_val, size, acquire, release, result, true);
return;
}
assert_different_registers(addr, t0);
assert_different_registers(expected, t0);
assert_different_registers(new_val, t0);
if (UseZacas) {
cmpxchg(addr, expected, new_val, size, acquire, release, result, true);
return;
}
Label fail, done;
load_reserved(t0, addr, size, acquire);
bne(t0, expected, fail);
@ -3581,83 +3594,18 @@ ATOMIC_XCHGU(xchgalwu, xchgalw)
#undef ATOMIC_XCHGU
#define ATOMIC_CAS(OP, AOP, ACQUIRE, RELEASE) \
void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
assert(UseZacas, "invariant"); \
prev = prev->is_valid() ? prev : zr; \
AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE)); \
return; \
}
ATOMIC_CAS(cas, amocas_d, Assembler::relaxed, Assembler::relaxed)
ATOMIC_CAS(casw, amocas_w, Assembler::relaxed, Assembler::relaxed)
ATOMIC_CAS(casl, amocas_d, Assembler::relaxed, Assembler::rl)
ATOMIC_CAS(caslw, amocas_w, Assembler::relaxed, Assembler::rl)
ATOMIC_CAS(casal, amocas_d, Assembler::aq, Assembler::rl)
ATOMIC_CAS(casalw, amocas_w, Assembler::aq, Assembler::rl)
#undef ATOMIC_CAS
#define ATOMIC_CASU(OP1, OP2) \
void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) { \
atomic_##OP2(prev, newv, addr); \
zero_extend(prev, prev, 32); \
return; \
}
ATOMIC_CASU(caswu, casw)
ATOMIC_CASU(caslwu, caslw)
ATOMIC_CASU(casalwu, casalw)
#undef ATOMIC_CASU
void MacroAssembler::atomic_cas(
Register prev, Register newv, Register addr, enum operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) {
void MacroAssembler::atomic_cas(Register prev, Register newv, Register addr,
enum operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) {
switch (size) {
case int64:
switch ((Assembler::Aqrl)(acquire | release)) {
case Assembler::relaxed:
atomic_cas(prev, newv, addr);
break;
case Assembler::rl:
atomic_casl(prev, newv, addr);
break;
case Assembler::aqrl:
atomic_casal(prev, newv, addr);
break;
default:
ShouldNotReachHere();
}
amocas_d(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
break;
case int32:
switch ((Assembler::Aqrl)(acquire | release)) {
case Assembler::relaxed:
atomic_casw(prev, newv, addr);
break;
case Assembler::rl:
atomic_caslw(prev, newv, addr);
break;
case Assembler::aqrl:
atomic_casalw(prev, newv, addr);
break;
default:
ShouldNotReachHere();
}
amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
break;
case uint32:
switch ((Assembler::Aqrl)(acquire | release)) {
case Assembler::relaxed:
atomic_caswu(prev, newv, addr);
break;
case Assembler::rl:
atomic_caslwu(prev, newv, addr);
break;
case Assembler::aqrl:
atomic_casalwu(prev, newv, addr);
break;
default:
ShouldNotReachHere();
}
amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
zero_extend(prev, prev, 32);
break;
default:
ShouldNotReachHere();

View File

@ -1175,16 +1175,6 @@ public:
void atomic_xchgwu(Register prev, Register newv, Register addr);
void atomic_xchgalwu(Register prev, Register newv, Register addr);
void atomic_cas(Register prev, Register newv, Register addr);
void atomic_casw(Register prev, Register newv, Register addr);
void atomic_casl(Register prev, Register newv, Register addr);
void atomic_caslw(Register prev, Register newv, Register addr);
void atomic_casal(Register prev, Register newv, Register addr);
void atomic_casalw(Register prev, Register newv, Register addr);
void atomic_caswu(Register prev, Register newv, Register addr);
void atomic_caslwu(Register prev, Register newv, Register addr);
void atomic_casalwu(Register prev, Register newv, Register addr);
void atomic_cas(Register prev, Register newv, Register addr, enum operand_size size,
Assembler::Aqrl acquire = Assembler::relaxed, Assembler::Aqrl release = Assembler::relaxed);

View File

@ -942,26 +942,6 @@ reg_class v11_reg(
V11, V11_H, V11_J, V11_K
);
// class for vector register v12
reg_class v12_reg(
V12, V12_H, V12_J, V12_K
);
// class for vector register v13
reg_class v13_reg(
V13, V13_H, V13_J, V13_K
);
// class for vector register v14
reg_class v14_reg(
V14, V14_H, V14_J, V14_K
);
// class for vector register v15
reg_class v15_reg(
V15, V15_H, V15_J, V15_K
);
// class for condition codes
reg_class reg_flags(RFLAGS);
@ -1896,9 +1876,6 @@ bool Matcher::match_rule_supported(int opcode) {
}
break;
case Op_ExpandBits: // fall through
case Op_CompressBits: // fall through
guarantee(UseRVV == (MaxVectorSize >= 16), "UseRVV and MaxVectorSize not matched");
case Op_StrCompressedCopy: // fall through
case Op_StrInflatedCopy: // fall through
case Op_CountPositives: // fall through
@ -3541,46 +3518,6 @@ operand vReg_V11()
interface(REG_INTER);
%}
operand vReg_V12()
%{
constraint(ALLOC_IN_RC(v12_reg));
match(VecA);
match(vReg);
op_cost(0);
format %{ %}
interface(REG_INTER);
%}
operand vReg_V13()
%{
constraint(ALLOC_IN_RC(v13_reg));
match(VecA);
match(vReg);
op_cost(0);
format %{ %}
interface(REG_INTER);
%}
operand vReg_V14()
%{
constraint(ALLOC_IN_RC(v14_reg));
match(VecA);
match(vReg);
op_cost(0);
format %{ %}
interface(REG_INTER);
%}
operand vReg_V15()
%{
constraint(ALLOC_IN_RC(v15_reg));
match(VecA);
match(vReg);
op_cost(0);
format %{ %}
interface(REG_INTER);
%}
operand vRegMask()
%{
constraint(ALLOC_IN_RC(vmask_reg));
@ -4814,10 +4751,14 @@ instruct loadNKlassCompactHeaders(iRegNNoSp dst, memory mem)
match(Set dst (LoadNKlass mem));
ins_cost(LOAD_COST);
format %{ "load_narrow_klass_compact $dst, $mem\t# compressed class ptr, #@loadNKlassCompactHeaders" %}
format %{
"lwu $dst, $mem\t# compressed klass ptr, shifted\n\t"
"srli $dst, $dst, markWord::klass_shift_at_offset"
%}
ins_encode %{
__ load_narrow_klass_compact_c2(as_Register($dst$$reg), Address(as_Register($mem$$base), $mem$$disp));
__ lwu(as_Register($dst$$reg), Address(as_Register($mem$$base), $mem$$disp));
__ srli(as_Register($dst$$reg), as_Register($dst$$reg), (unsigned) markWord::klass_shift_at_offset);
%}
ins_pipe(iload_reg_mem);

View File

@ -3843,116 +3843,6 @@ instruct vclearArray_reg_reg(iRegL_R29 cnt, iRegP_R28 base, Universe dummy,
ins_pipe(pipe_class_memory);
%}
// CompressBits of Long & Integer
instruct compressBitsI(iRegINoSp dst, iRegIorL2I src, iRegIorL2I mask, vRegMask_V0 v0,
vReg_V4 v4, vReg_V5 v5, vReg_V8 v8, vReg_V9 v9) %{
match(Set dst (CompressBits src mask));
effect(TEMP v0, TEMP v4, TEMP v5, TEMP v8, TEMP v9);
format %{ "vsetivli x0, 1, e32, m1, tu, mu\t#@compressBitsI\n\t"
"vmv.s.x $v0, $src\n\t"
"mv t0, 32\n\t"
"vsetvli x0, t0, e8, m2, tu, mu\n\t"
"vmv.v.i $v4, 0\n\t"
"vmerge.vim $v4, $v4, 1, $v0\n\t"
"vmv.v.i $v8, 0\n\t"
"vsetivli x0, 1, e32, m1, tu, mu\n\t"
"vmv.s.x $v0, $mask\n\t"
"vsetvli x0, t0, e8, m2, tu, mu\n\t"
"vcompress.vm $v8, $v4, $v0\n\t"
"vmseq.vi $v0, $v8, 1\n\t"
"vsetivli x0, 1, e32, m1, tu, mu\n\t"
"vmv.x.s $dst, $v0\t#@compressBitsI\n\t"
%}
ins_encode %{
__ compress_bits_i_v(as_Register($dst$$reg), as_Register($src$$reg), as_Register($mask$$reg));
%}
ins_pipe(pipe_slow);
%}
instruct compressBitsL(iRegLNoSp dst, iRegL src, iRegL mask, vRegMask_V0 v0,
vReg_V4 v4, vReg_V5 v5, vReg_V6 v6, vReg_V7 v7,
vReg_V8 v8, vReg_V9 v9, vReg_V10 v10, vReg_V11 v11) %{
match(Set dst (CompressBits src mask));
effect(TEMP v0, TEMP v4, TEMP v5, TEMP v6, TEMP v7, TEMP v8, TEMP v9, TEMP v10, TEMP v11);
format %{ "vsetivli x0, 1, e64, m1, tu, mu\t#@compressBitsL\n\t"
"vmv.s.x $v0, $src\n\t"
"mv t0, 64\n\t"
"vsetvli x0, t0, e8, m4, tu, mu\n\t"
"vmv.v.i $v4, 0\n\t"
"vmerge.vim $v4, $v4, 1, $v0\n\t"
"vmv.v.i $v8, 0\n\t"
"vsetivli x0, 1, e64, m1, tu, mu\n\t"
"vmv.s.x $v0, $mask\n\t"
"vsetvli x0, t0, e8, m4, tu, mu\n\t"
"vcompress.vm $v8, $v4, $v0\n\t"
"vmseq.vi $v0, $v8, 1\n\t"
"vsetivli x0, 1, e64, m1, tu, mu\n\t"
"vmv.x.s $dst, $v0\t#@compressBitsL\n\t"
%}
ins_encode %{
__ compress_bits_l_v(as_Register($dst$$reg), as_Register($src$$reg), as_Register($mask$$reg));
%}
ins_pipe(pipe_slow);
%}
// ExpandBits of Long & Integer
instruct expandBitsI(iRegINoSp dst, iRegIorL2I src, iRegIorL2I mask, vRegMask_V0 v0,
vReg_V4 v4, vReg_V5 v5, vReg_V8 v8, vReg_V9 v9, vReg_V12 v12, vReg_V13 v13) %{
match(Set dst (ExpandBits src mask));
effect(TEMP v0, TEMP v4, TEMP v5, TEMP v8, TEMP v9, TEMP v12, TEMP v13);
format %{ "vsetivli x0, 1, e32, m1, tu, mu\t#@expandBitsI\n\t"
"vmv.s.x $v0, $src\n\t"
"mv t0, 32\n\t"
"vsetvli x0, t0, e8, m2, tu, mu\n\t"
"vmv.v.i $v4, 0\n\t"
"vmerge.vim $v4, $v4, 1, $v0\n\t"
"vmv.v.i $v12, 0\n\t"
"vsetivli x0, 1, e32, m1, tu, mu\n\t"
"vmv.s.x $v0, $mask\n\t"
"vsetvli x0, t0, e8, m2, tu, mu\n\t"
"viota.m $v8, $v0\n\t"
"vrgather.vv $v12, $v4, $v8, $v0.t\n\t"
"vmseq.vi $v0, $v12, 1\n\t"
"vsetivli x0, 1, e32, m1, tu, mu\n\t"
"vmv.x.s $dst, $v0\t#@expandBitsI\n\t"
%}
ins_encode %{
__ expand_bits_i_v(as_Register($dst$$reg), as_Register($src$$reg), as_Register($mask$$reg));
%}
ins_pipe(pipe_slow);
%}
instruct expandBitsL(iRegLNoSp dst, iRegL src, iRegL mask, vRegMask_V0 v0,
vReg_V4 v4, vReg_V5 v5, vReg_V6 v6, vReg_V7 v7,
vReg_V8 v8, vReg_V9 v9, vReg_V10 v10, vReg_V11 v11,
vReg_V12 v12, vReg_V13 v13, vReg_V14 v14, vReg_V15 v15) %{
match(Set dst (ExpandBits src mask));
effect(TEMP v0, TEMP v4, TEMP v5, TEMP v6, TEMP v7, TEMP v8, TEMP v9, TEMP v10, TEMP v11,
TEMP v12, TEMP v13, TEMP v14, TEMP v15);
format %{ "vsetivli x0, 1, e64, m1, tu, mu\t#@expandBitsL\n\t"
"vmv.s.x $v0, $src\n\t"
"mv t0, 64\n\t"
"vsetvli x0, t0, e8, m4, tu, mu\n\t"
"vmv.v.i $v4, 0\n\t"
"vmerge.vim $v4, $v4, 1, $v0\n\t"
"vmv.v.i $v12, 0\n\t"
"vsetivli x0, 1, e64, m1, tu, mu\n\t"
"vmv.s.x $v0, $mask\n\t"
"vsetvli x0, t0, e8, m4, tu, mu\n\t"
"viota.m $v8, $v0\n\t"
"vrgather.vv $v12, $v4, $v8, $v0.t\n\t"
"vmseq.vi $v0, $v12, 1\n\t"
"vsetivli x0, 1, e64, m1, tu, mu\n\t"
"vmv.x.s $dst, $v0\t#@expandBitsL\n\t"
%}
ins_encode %{
__ expand_bits_l_v(as_Register($dst$$reg), as_Register($src$$reg), as_Register($mask$$reg));
%}
ins_pipe(pipe_slow);
%}
// Vector Load Const
instruct vloadcon(vReg dst, immI0 src) %{
match(Set dst (VectorLoadConst src));

View File

@ -6502,7 +6502,7 @@ static const int64_t right_3_bits = right_n_bits(3);
StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
}
if (UseRVVForBigIntegerShiftIntrinsics) {
if (UseRVV) {
StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
}

View File

@ -552,7 +552,7 @@ address TemplateInterpreterGenerator::generate_cont_resume_interpreter_adapter()
// Restore Java expression stack pointer
__ ld(t0, Address(fp, frame::interpreter_frame_last_sp_offset * wordSize));
__ shadd(esp, t0, fp, t0, Interpreter::logStackElementSize);
// and NULL it as marker that esp is now tos until next java call
// and null it as marker that esp is now tos until next java call
__ sd(zr, Address(fp, frame::interpreter_frame_last_sp_offset * wordSize));
// Restore machine SP

View File

@ -150,11 +150,12 @@ void VM_Version::common_initialize() {
}
if (FLAG_IS_DEFAULT(AvoidUnalignedAccesses)) {
if (unaligned_access.value() != MISALIGNED_FAST) {
FLAG_SET_DEFAULT(AvoidUnalignedAccesses, true);
} else {
FLAG_SET_DEFAULT(AvoidUnalignedAccesses, false);
}
FLAG_SET_DEFAULT(AvoidUnalignedAccesses,
unaligned_access.value() != MISALIGNED_FAST);
}
if (FLAG_IS_DEFAULT(AlignVector)) {
FLAG_SET_DEFAULT(AlignVector, AvoidUnalignedAccesses);
}
// See JDK-8026049
@ -233,7 +234,6 @@ void VM_Version::c2_initialize() {
if (!UseRVV) {
FLAG_SET_DEFAULT(MaxVectorSize, 0);
FLAG_SET_DEFAULT(UseRVVForBigIntegerShiftIntrinsics, false);
} else {
if (!FLAG_IS_DEFAULT(MaxVectorSize) && MaxVectorSize != _initial_vector_length) {
warning("Current system does not support RVV vector length for MaxVectorSize %d. Set MaxVectorSize to %d",

View File

@ -61,6 +61,7 @@ define_pd_global(bool, OptoBundling, false);
define_pd_global(bool, OptoScheduling, false);
define_pd_global(bool, OptoRegScheduling, false);
define_pd_global(bool, SuperWordLoopUnrollAnalysis, true);
define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16);
// On s390x, we can clear the array with a single instruction,
// so don't idealize it.
define_pd_global(bool, IdealizeClearArrayNode, false);

View File

@ -218,10 +218,10 @@ SaveLiveRegisters::SaveLiveRegisters(MacroAssembler *masm, BarrierStubC2 *stub)
const int register_save_size = iterate_over_register_mask(ACTION_COUNT_ONLY) * BytesPerWord;
_frame_size = align_up(register_save_size, frame::alignment_in_bytes) + frame::z_abi_160_size; // FIXME: this could be restricted to argument only
_frame_size = align_up(register_save_size, frame::alignment_in_bytes) + frame::z_abi_160_size;
__ save_return_pc();
__ push_frame(_frame_size, Z_R14); // FIXME: check if Z_R1_scaratch can do a job here;
__ push_frame(_frame_size, Z_R14);
__ z_lg(Z_R14, _z_common_abi(return_pc) + _frame_size, Z_SP);
@ -240,6 +240,7 @@ int SaveLiveRegisters::iterate_over_register_mask(IterationAction action, int of
int reg_save_index = 0;
RegMaskIterator live_regs_iterator(_reg_mask);
// Going to preserve the volatile registers which can be used by Register Allocator.
while(live_regs_iterator.has_next()) {
const OptoReg::Name opto_reg = live_regs_iterator.next();
@ -251,8 +252,11 @@ int SaveLiveRegisters::iterate_over_register_mask(IterationAction action, int of
const VMReg vm_reg = OptoReg::as_VMReg(opto_reg);
if (vm_reg->is_Register()) {
Register std_reg = vm_reg->as_Register();
if (std_reg->encoding() >= Z_R2->encoding() && std_reg->encoding() <= Z_R15->encoding()) {
// Z_R0 and Z_R1 will not be allocated by the register allocator, see s390.ad (Integer Register Classes)
// Z_R6 to Z_R15 are saved registers, except Z_R14 (see Z-Abi)
if (std_reg->encoding() == Z_R14->encoding() ||
(std_reg->encoding() >= Z_R2->encoding() &&
std_reg->encoding() <= Z_R5->encoding())) {
reg_save_index++;
if (action == ACTION_SAVE) {
@ -265,8 +269,10 @@ int SaveLiveRegisters::iterate_over_register_mask(IterationAction action, int of
}
} else if (vm_reg->is_FloatRegister()) {
FloatRegister fp_reg = vm_reg->as_FloatRegister();
if (fp_reg->encoding() >= Z_F0->encoding() && fp_reg->encoding() <= Z_F15->encoding()
&& fp_reg->encoding() != Z_F1->encoding()) {
// Z_R1 will not be allocated by the register allocator, see s390.ad (Float Register Classes)
if (fp_reg->encoding() >= Z_F0->encoding() &&
fp_reg->encoding() <= Z_F7->encoding() &&
fp_reg->encoding() != Z_F1->encoding()) {
reg_save_index++;
if (action == ACTION_SAVE) {
@ -277,8 +283,20 @@ int SaveLiveRegisters::iterate_over_register_mask(IterationAction action, int of
assert(action == ACTION_COUNT_ONLY, "Sanity");
}
}
} else if (false /* vm_reg->is_VectorRegister() */){
fatal("Vector register support is not there yet!");
} else if (vm_reg->is_VectorRegister()) {
VectorRegister vs_reg = vm_reg->as_VectorRegister();
// Z_V0 to Z_V15 will not be allocated by the register allocator, see s390.ad (reg class z_v_reg)
if (vs_reg->encoding() >= Z_V16->encoding() &&
vs_reg->encoding() <= Z_V31->encoding()) {
reg_save_index += 2;
if (action == ACTION_SAVE) {
__ z_vst(vs_reg, Address(Z_SP, offset - reg_save_index * BytesPerWord));
} else if (action == ACTION_RESTORE) {
__ z_vl(vs_reg, Address(Z_SP, offset - reg_save_index * BytesPerWord));
} else {
assert(action == ACTION_COUNT_ONLY, "Sanity");
}
}
} else {
fatal("Register type is not known");
}

View File

@ -320,6 +320,12 @@ void VM_Version::initialize() {
if (FLAG_IS_DEFAULT(UseUnalignedAccesses)) {
FLAG_SET_DEFAULT(UseUnalignedAccesses, true);
}
// The OptoScheduling information is not maintained in s390.ad.
if (OptoScheduling) {
warning("OptoScheduling is not supported on this CPU.");
FLAG_SET_DEFAULT(OptoScheduling, false);
}
}

View File

@ -76,6 +76,7 @@ define_pd_global(bool, OptoScheduling, false);
define_pd_global(bool, OptoBundling, false);
define_pd_global(bool, OptoRegScheduling, true);
define_pd_global(bool, SuperWordLoopUnrollAnalysis, true);
define_pd_global(uint, SuperWordStoreToLoadForwardingFailureDetection, 16);
define_pd_global(bool, IdealizeClearArrayNode, true);
define_pd_global(uintx, ReservedCodeCacheSize, 48*M);

View File

@ -400,7 +400,7 @@ address TemplateInterpreterGenerator::generate_cont_resume_interpreter_adapter()
// Restore stack bottom
__ movptr(rcx, Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize));
__ lea(rsp, Address(rbp, rcx, Address::times_ptr));
// and NULL it as marker that esp is now tos until next java call
// and null it as marker that esp is now tos until next java call
__ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
__ jmp(rax);

View File

@ -6179,6 +6179,7 @@ instruct evmulL_reg(vec dst, vec src1, vec src2) %{
VM_Version::supports_avx512dq()) ||
VM_Version::supports_avx512vldq());
match(Set dst (MulVL src1 src2));
ins_cost(500);
format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
ins_encode %{
assert(UseAVX > 2, "required");
@ -6195,6 +6196,7 @@ instruct evmulL_mem(vec dst, vec src, memory mem) %{
VM_Version::supports_avx512vldq()));
match(Set dst (MulVL src (LoadVector mem)));
format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
ins_cost(500);
ins_encode %{
assert(UseAVX > 2, "required");
int vlen_enc = vector_length_encoding(this);
@ -6206,6 +6208,7 @@ instruct evmulL_mem(vec dst, vec src, memory mem) %{
instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
predicate(UseAVX == 0);
match(Set dst (MulVL src1 src2));
ins_cost(500);
effect(TEMP dst, TEMP xtmp);
format %{ "mulVL $dst, $src1, $src2\t! using $xtmp as TEMP" %}
ins_encode %{
@ -6232,6 +6235,7 @@ instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
!VM_Version::supports_avx512vldq())));
match(Set dst (MulVL src1 src2));
effect(TEMP xtmp1, TEMP xtmp2);
ins_cost(500);
format %{ "vmulVL $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this);
@ -6248,6 +6252,30 @@ instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
ins_pipe( pipe_slow );
%}
instruct vmuludq_reg(vec dst, vec src1, vec src2) %{
predicate(UseAVX > 0 && n->as_MulVL()->has_uint_inputs());
match(Set dst (MulVL src1 src2));
ins_cost(100);
format %{ "vpmuludq $dst,$src1,$src2\t! muludq packedL" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this);
__ vpmuludq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
instruct vmuldq_reg(vec dst, vec src1, vec src2) %{
predicate(UseAVX > 0 && n->as_MulVL()->has_int_inputs());
match(Set dst (MulVL src1 src2));
ins_cost(100);
format %{ "vpmuldq $dst,$src1,$src2\t! muldq packedL" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this);
__ vpmuldq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
// Floats vector mul
instruct vmulF(vec dst, vec src) %{
predicate(UseAVX == 0);

View File

@ -161,7 +161,7 @@ public:
// Win32AttachOperationRequest is an element of AttachOperation request list.
class Win32AttachOperationRequest {
class Win32AttachOperationRequest: public CHeapObj<mtServiceability> {
private:
AttachAPIVersion _ver;
char _name[AttachOperation::name_length_max + 1];

View File

@ -162,7 +162,7 @@ public:
DumpRegion(const char* name, uintx max_delta = 0)
: _name(name), _base(nullptr), _top(nullptr), _end(nullptr),
_max_delta(max_delta), _is_packed(false),
_rs(NULL), _vs(NULL) {}
_rs(nullptr), _vs(nullptr) {}
char* expand_top_to(char* newtop);
char* allocate(size_t num_bytes, size_t alignment = 0);

View File

@ -315,7 +315,7 @@ static GrowableArrayCHeap<OopHandle, mtClassShared>* _extra_interned_strings = n
// Extra Symbols to be added to the archive
static GrowableArrayCHeap<Symbol*, mtClassShared>* _extra_symbols = nullptr;
// Methods managed by SystemDictionary::find_method_handle_intrinsic() to be added to the archive
static GrowableArray<Method*>* _pending_method_handle_intrinsics = NULL;
static GrowableArray<Method*>* _pending_method_handle_intrinsics = nullptr;
void MetaspaceShared::read_extra_data(JavaThread* current, const char* filename) {
_extra_interned_strings = new GrowableArrayCHeap<OopHandle, mtClassShared>(10000);

View File

@ -34,6 +34,10 @@
PRAGMA_DIAG_PUSH
PRAGMA_FORMAT_NONLITERAL_IGNORED
// None of the error routines below take in a free-form, potentially unbounded
// string, and names are all limited to < 64K, so we know that all formatted
// strings passed to fthrow will not be excessively large.
void ClassFileParser::classfile_parse_error(const char* msg, TRAPS) const {
assert(_class_name != nullptr, "invariant");
ResourceMark rm(THREAD);

View File

@ -1794,6 +1794,7 @@ void ClassFileParser::throwIllegalSignature(const char* type,
assert(sig != nullptr, "invariant");
ResourceMark rm(THREAD);
// Names are all known to be < 64k so we know this formatted message is not excessively large.
Exceptions::fthrow(THREAD_AND_LOCATION,
vmSymbols::java_lang_ClassFormatError(),
"%s \"%s\" in class %s has illegal signature \"%s\"", type,
@ -4073,6 +4074,8 @@ void ClassFileParser::check_super_class_access(const InstanceKlass* this_klass,
char* msg = Reflection::verify_class_access_msg(this_klass,
InstanceKlass::cast(super),
vca_result);
// Names are all known to be < 64k so we know this formatted message is not excessively large.
if (msg == nullptr) {
bool same_module = (this_klass->module() == super->module());
Exceptions::fthrow(
@ -4121,6 +4124,8 @@ void ClassFileParser::check_super_interface_access(const InstanceKlass* this_kla
char* msg = Reflection::verify_class_access_msg(this_klass,
k,
vca_result);
// Names are all known to be < 64k so we know this formatted message is not excessively large.
if (msg == nullptr) {
bool same_module = (this_klass->module() == k->module());
Exceptions::fthrow(
@ -4217,6 +4222,8 @@ static void check_illegal_static_method(const InstanceKlass* this_klass, TRAPS)
// if m is static and not the init method, throw a verify error
if ((m->is_static()) && (m->name() != vmSymbols::class_initializer_name())) {
ResourceMark rm(THREAD);
// Names are all known to be < 64k so we know this formatted message is not excessively large.
Exceptions::fthrow(
THREAD_AND_LOCATION,
vmSymbols::java_lang_VerifyError(),
@ -4236,6 +4243,7 @@ void ClassFileParser::verify_legal_class_modifiers(jint flags, TRAPS) const {
assert(_major_version >= JAVA_9_VERSION || !is_module, "JVM_ACC_MODULE should not be set");
if (is_module) {
ResourceMark rm(THREAD);
// Names are all known to be < 64k so we know this formatted message is not excessively large.
Exceptions::fthrow(
THREAD_AND_LOCATION,
vmSymbols::java_lang_NoClassDefFoundError(),
@ -4259,6 +4267,7 @@ void ClassFileParser::verify_legal_class_modifiers(jint flags, TRAPS) const {
(is_interface && major_gte_1_5 && (is_super || is_enum)) ||
(!is_interface && major_gte_1_5 && is_annotation)) {
ResourceMark rm(THREAD);
// Names are all known to be < 64k so we know this formatted message is not excessively large.
Exceptions::fthrow(
THREAD_AND_LOCATION,
vmSymbols::java_lang_ClassFormatError(),
@ -4295,6 +4304,7 @@ void ClassFileParser::verify_class_version(u2 major, u2 minor, Symbol* class_nam
}
if (major > max_version) {
// Names are all known to be < 64k so we know this formatted message is not excessively large.
Exceptions::fthrow(
THREAD_AND_LOCATION,
vmSymbols::java_lang_UnsupportedClassVersionError(),
@ -4310,6 +4320,7 @@ void ClassFileParser::verify_class_version(u2 major, u2 minor, Symbol* class_nam
if (minor == JAVA_PREVIEW_MINOR_VERSION) {
if (major != max_version) {
// Names are all known to be < 64k so we know this formatted message is not excessively large.
Exceptions::fthrow(
THREAD_AND_LOCATION,
vmSymbols::java_lang_UnsupportedClassVersionError(),
@ -4362,6 +4373,7 @@ void ClassFileParser::verify_legal_field_modifiers(jint flags,
if (is_illegal) {
ResourceMark rm(THREAD);
// Names are all known to be < 64k so we know this formatted message is not excessively large.
Exceptions::fthrow(
THREAD_AND_LOCATION,
vmSymbols::java_lang_ClassFormatError(),
@ -4445,6 +4457,7 @@ void ClassFileParser::verify_legal_method_modifiers(jint flags,
if (is_illegal) {
ResourceMark rm(THREAD);
// Names are all known to be < 64k so we know this formatted message is not excessively large.
Exceptions::fthrow(
THREAD_AND_LOCATION,
vmSymbols::java_lang_ClassFormatError(),
@ -4686,6 +4699,7 @@ void ClassFileParser::verify_legal_class_name(const Symbol* name, TRAPS) const {
if (!legal) {
ResourceMark rm(THREAD);
assert(_class_name != nullptr, "invariant");
// Names are all known to be < 64k so we know this formatted message is not excessively large.
Exceptions::fthrow(
THREAD_AND_LOCATION,
vmSymbols::java_lang_ClassFormatError(),
@ -4719,6 +4733,7 @@ void ClassFileParser::verify_legal_field_name(const Symbol* name, TRAPS) const {
if (!legal) {
ResourceMark rm(THREAD);
assert(_class_name != nullptr, "invariant");
// Names are all known to be < 64k so we know this formatted message is not excessively large.
Exceptions::fthrow(
THREAD_AND_LOCATION,
vmSymbols::java_lang_ClassFormatError(),
@ -4756,6 +4771,7 @@ void ClassFileParser::verify_legal_method_name(const Symbol* name, TRAPS) const
if (!legal) {
ResourceMark rm(THREAD);
assert(_class_name != nullptr, "invariant");
// Names are all known to be < 64k so we know this formatted message is not excessively large.
Exceptions::fthrow(
THREAD_AND_LOCATION,
vmSymbols::java_lang_ClassFormatError(),
@ -5527,6 +5543,7 @@ void ClassFileParser::parse_stream(const ClassFileStream* const stream,
if (_class_name != class_name_in_cp) {
if (_class_name != vmSymbols::unknown_class_name()) {
ResourceMark rm(THREAD);
// Names are all known to be < 64k so we know this formatted message is not excessively large.
Exceptions::fthrow(THREAD_AND_LOCATION,
vmSymbols::java_lang_NoClassDefFoundError(),
"%s (wrong name: %s)",

View File

@ -44,7 +44,7 @@ const size_t REHASH_LEN = 100;
Dictionary::Dictionary(ClassLoaderData* loader_data, size_t table_size)
: _number_of_entries(0), _loader_data(loader_data) {
size_t start_size_log_2 = MAX2(ceil_log2(table_size), (size_t)2); // 2 is minimum size even though some dictionaries only have one entry
size_t start_size_log_2 = MAX2(log2i_ceil(table_size), 2); // 2 is minimum size even though some dictionaries only have one entry
size_t current_size = ((size_t)1) << start_size_log_2;
log_info(class, loader, data)("Dictionary start size: " SIZE_FORMAT " (" SIZE_FORMAT ")",
current_size, start_size_log_2);

View File

@ -2052,6 +2052,7 @@ int java_lang_VirtualThread::_next_offset;
int java_lang_VirtualThread::_onWaitingList_offset;
int java_lang_VirtualThread::_notified_offset;
int java_lang_VirtualThread::_timeout_offset;
int java_lang_VirtualThread::_objectWaiter_offset;
#define VTHREAD_FIELDS_DO(macro) \
macro(static_vthread_scope_offset, k, "VTHREAD_SCOPE", continuationscope_signature, true); \
@ -2067,6 +2068,7 @@ int java_lang_VirtualThread::_timeout_offset;
void java_lang_VirtualThread::compute_offsets() {
InstanceKlass* k = vmClasses::VirtualThread_klass();
VTHREAD_FIELDS_DO(FIELD_COMPUTE_OFFSET);
VTHREAD_INJECTED_FIELDS(INJECTED_FIELD_COMPUTE_OFFSET);
}
bool java_lang_VirtualThread::is_instance(oop obj) {
@ -2182,6 +2184,22 @@ JavaThreadStatus java_lang_VirtualThread::map_state_to_thread_status(int state)
return status;
}
ObjectMonitor* java_lang_VirtualThread::current_pending_monitor(oop vthread) {
ObjectWaiter* waiter = objectWaiter(vthread);
if (waiter != nullptr && waiter->at_monitorenter()) {
return waiter->monitor();
}
return nullptr;
}
ObjectMonitor* java_lang_VirtualThread::current_waiting_monitor(oop vthread) {
ObjectWaiter* waiter = objectWaiter(vthread);
if (waiter != nullptr && waiter->is_wait()) {
return waiter->monitor();
}
return nullptr;
}
bool java_lang_VirtualThread::is_preempted(oop vthread) {
oop continuation = java_lang_VirtualThread::continuation(vthread);
assert(continuation != nullptr, "vthread with no continuation");
@ -2192,6 +2210,7 @@ bool java_lang_VirtualThread::is_preempted(oop vthread) {
#if INCLUDE_CDS
void java_lang_VirtualThread::serialize_offsets(SerializeClosure* f) {
VTHREAD_FIELDS_DO(FIELD_SERIALIZE_OFFSET);
VTHREAD_INJECTED_FIELDS(INJECTED_FIELD_SERIALIZE_OFFSET);
}
#endif

View File

@ -38,6 +38,8 @@
class JvmtiThreadState;
class RecordComponent;
class SerializeClosure;
class ObjectWaiter;
class ObjectMonitor;
#define CHECK_INIT(offset) assert(offset != 0, "should be initialized"); return offset;
@ -537,6 +539,8 @@ class java_lang_ThreadGroup : AllStatic {
// Interface to java.lang.VirtualThread objects
#define VTHREAD_INJECTED_FIELDS(macro) \
macro(java_lang_VirtualThread, objectWaiter, intptr_signature, false)
class java_lang_VirtualThread : AllStatic {
private:
@ -549,6 +553,7 @@ class java_lang_VirtualThread : AllStatic {
static int _notified_offset;
static int _recheckInterval_offset;
static int _timeout_offset;
static int _objectWaiter_offset;
JFR_ONLY(static int _jfr_epoch_offset;)
public:
enum {
@ -600,6 +605,11 @@ class java_lang_VirtualThread : AllStatic {
static void set_notified(oop vthread, jboolean value);
static bool is_preempted(oop vthread);
static JavaThreadStatus map_state_to_thread_status(int state);
static inline ObjectWaiter* objectWaiter(oop vthread);
static inline void set_objectWaiter(oop vthread, ObjectWaiter* waiter);
static ObjectMonitor* current_pending_monitor(oop vthread);
static ObjectMonitor* current_waiting_monitor(oop vthread);
};

View File

@ -220,6 +220,14 @@ inline oop java_lang_VirtualThread::vthread_scope() {
return base->obj_field(static_vthread_scope_offset);
}
inline ObjectWaiter* java_lang_VirtualThread::objectWaiter(oop vthread) {
return (ObjectWaiter*)vthread->address_field(_objectWaiter_offset);
}
inline void java_lang_VirtualThread::set_objectWaiter(oop vthread, ObjectWaiter* value) {
vthread->address_field_put(_objectWaiter_offset, (address)value);
}
#if INCLUDE_JFR
inline u2 java_lang_Thread::jfr_epoch(oop ref) {
return ref->short_field(_jfr_epoch_offset);

View File

@ -40,6 +40,7 @@
STACKFRAMEINFO_INJECTED_FIELDS(macro) \
MODULE_INJECTED_FIELDS(macro) \
THREAD_INJECTED_FIELDS(macro) \
VTHREAD_INJECTED_FIELDS(macro) \
INTERNALERROR_INJECTED_FIELDS(macro) \
STACKCHUNK_INJECTED_FIELDS(macro)

View File

@ -309,7 +309,7 @@ public:
};
void StringTable::create_table() {
size_t start_size_log_2 = ceil_log2(StringTableSize);
size_t start_size_log_2 = log2i_ceil(StringTableSize);
_current_size = ((size_t)1) << start_size_log_2;
log_trace(stringtable)("Start size: " SIZE_FORMAT " (" SIZE_FORMAT ")",
_current_size, start_size_log_2);

View File

@ -212,7 +212,7 @@ private:
};
void SymbolTable::create_table () {
size_t start_size_log_2 = ceil_log2(SymbolTableSize);
size_t start_size_log_2 = log2i_ceil(SymbolTableSize);
_current_size = ((size_t)1) << start_size_log_2;
log_trace(symboltable)("Start size: " SIZE_FORMAT " (" SIZE_FORMAT ")",
_current_size, start_size_log_2);

View File

@ -105,11 +105,13 @@ static verify_byte_codes_fn_t verify_byte_codes_fn() {
// Methods in Verifier
// This method determines whether we run the verifier and class file format checking code.
bool Verifier::should_verify_for(oop class_loader) {
return class_loader == nullptr ?
BytecodeVerificationLocal : BytecodeVerificationRemote;
}
// This method determines whether we allow package access in access checks in reflection.
bool Verifier::relax_access_for(oop loader) {
bool trusted = java_lang_ClassLoader::is_trusted_loader(loader);
bool need_verify =
@ -120,6 +122,21 @@ bool Verifier::relax_access_for(oop loader) {
return !need_verify;
}
// Callers will pass should_verify_class as true, depending on the results of should_verify_for() above,
// or pass true for redefinition of any class.
static bool is_eligible_for_verification(InstanceKlass* klass, bool should_verify_class) {
Symbol* name = klass->name();
return (should_verify_class &&
// Can not verify the bytecodes for shared classes because they have
// already been rewritten to contain constant pool cache indices,
// which the verifier can't understand.
// Shared classes shouldn't have stackmaps either.
// However, bytecodes for shared old classes can be verified because
// they have not been rewritten.
!(klass->is_shared() && klass->is_rewritten()));
}
void Verifier::trace_class_resolution(Klass* resolve_class, InstanceKlass* verify_class) {
assert(verify_class != nullptr, "Unexpected null verify_class");
ResourceMark rm;
@ -273,27 +290,6 @@ bool Verifier::verify(InstanceKlass* klass, bool should_verify_class, TRAPS) {
}
}
bool Verifier::is_eligible_for_verification(InstanceKlass* klass, bool should_verify_class) {
Symbol* name = klass->name();
return (should_verify_class &&
// return if the class is a bootstrapping class
// or defineClass specified not to verify by default (flags override passed arg)
// We need to skip the following four for bootstraping
name != vmSymbols::java_lang_Object() &&
name != vmSymbols::java_lang_Class() &&
name != vmSymbols::java_lang_String() &&
name != vmSymbols::java_lang_Throwable() &&
// Can not verify the bytecodes for shared classes because they have
// already been rewritten to contain constant pool cache indices,
// which the verifier can't understand.
// Shared classes shouldn't have stackmaps either.
// However, bytecodes for shared old classes can be verified because
// they have not been rewritten.
!(klass->is_shared() && klass->is_rewritten()));
}
Symbol* Verifier::inference_verify(
InstanceKlass* klass, char* message, size_t message_len, TRAPS) {
JavaThread* thread = THREAD;

View File

@ -61,7 +61,6 @@ class Verifier : AllStatic {
static void trace_class_resolution(Klass* resolve_class, InstanceKlass* verify_class);
private:
static bool is_eligible_for_verification(InstanceKlass* klass, bool should_verify_class);
static Symbol* inference_verify(
InstanceKlass* klass, char* msg, size_t msg_len, TRAPS);
};

View File

@ -244,10 +244,7 @@ void G1Arguments::initialize() {
if (max_parallel_refinement_threads > UINT_MAX / divisor) {
vm_exit_during_initialization("Too large parallelism for remembered sets.");
}
}
void G1Arguments::initialize_heap_flags_and_sizes() {
GCArguments::initialize_heap_flags_and_sizes();
FullGCForwarding::initialize_flags(heap_reserved_size_bytes());
}

View File

@ -39,7 +39,6 @@ class G1Arguments : public GCArguments {
static void parse_verification_type(const char* type);
virtual void initialize_alignments();
virtual void initialize_heap_flags_and_sizes();
virtual void initialize();
virtual size_t conservative_max_heap_alignment();

View File

@ -2483,6 +2483,214 @@ bool G1ConcurrentMark::try_stealing(uint worker_id, G1TaskQueueEntry& task_entry
return _task_queues->steal(worker_id, task_entry);
}
void G1CMTask::process_current_region(G1CMBitMapClosure& bitmap_closure) {
if (has_aborted() || _curr_region == nullptr) {
return;
}
// This means that we're already holding on to a region.
assert(_finger != nullptr, "if region is not null, then the finger "
"should not be null either");
// We might have restarted this task after an evacuation pause
// which might have evacuated the region we're holding on to
// underneath our feet. Let's read its limit again to make sure
// that we do not iterate over a region of the heap that
// contains garbage (update_region_limit() will also move
// _finger to the start of the region if it is found empty).
update_region_limit();
// We will start from _finger not from the start of the region,
// as we might be restarting this task after aborting half-way
// through scanning this region. In this case, _finger points to
// the address where we last found a marked object. If this is a
// fresh region, _finger points to start().
MemRegion mr = MemRegion(_finger, _region_limit);
assert(!_curr_region->is_humongous() || mr.start() == _curr_region->bottom(),
"humongous regions should go around loop once only");
// Some special cases:
// If the memory region is empty, we can just give up the region.
// If the current region is humongous then we only need to check
// the bitmap for the bit associated with the start of the object,
// scan the object if it's live, and give up the region.
// Otherwise, let's iterate over the bitmap of the part of the region
// that is left.
// If the iteration is successful, give up the region.
if (mr.is_empty()) {
giveup_current_region();
abort_marking_if_regular_check_fail();
} else if (_curr_region->is_humongous() && mr.start() == _curr_region->bottom()) {
if (_mark_bitmap->is_marked(mr.start())) {
// The object is marked - apply the closure
bitmap_closure.do_addr(mr.start());
}
// Even if this task aborted while scanning the humongous object
// we can (and should) give up the current region.
giveup_current_region();
abort_marking_if_regular_check_fail();
} else if (_mark_bitmap->iterate(&bitmap_closure, mr)) {
giveup_current_region();
abort_marking_if_regular_check_fail();
} else {
assert(has_aborted(), "currently the only way to do so");
// The only way to abort the bitmap iteration is to return
// false from the do_bit() method. However, inside the
// do_bit() method we move the _finger to point to the
// object currently being looked at. So, if we bail out, we
// have definitely set _finger to something non-null.
assert(_finger != nullptr, "invariant");
// Region iteration was actually aborted. So now _finger
// points to the address of the object we last scanned. If we
// leave it there, when we restart this task, we will rescan
// the object. It is easy to avoid this. We move the finger by
// enough to point to the next possible object header.
assert(_finger < _region_limit, "invariant");
HeapWord* const new_finger = _finger + cast_to_oop(_finger)->size();
if (new_finger >= _region_limit) {
giveup_current_region();
} else {
move_finger_to(new_finger);
}
}
}
void G1CMTask::claim_new_region() {
// Read the note on the claim_region() method on why it might
// return null with potentially more regions available for
// claiming and why we have to check out_of_regions() to determine
// whether we're done or not.
while (!has_aborted() && _curr_region == nullptr && !_cm->out_of_regions()) {
// We are going to try to claim a new region. We should have
// given up on the previous one.
// Separated the asserts so that we know which one fires.
assert(_curr_region == nullptr, "invariant");
assert(_finger == nullptr, "invariant");
assert(_region_limit == nullptr, "invariant");
G1HeapRegion* claimed_region = _cm->claim_region(_worker_id);
if (claimed_region != nullptr) {
// Yes, we managed to claim one
setup_for_region(claimed_region);
assert(_curr_region == claimed_region, "invariant");
}
// It is important to call the regular clock here. It might take
// a while to claim a region if, for example, we hit a large
// block of empty regions. So we need to call the regular clock
// method once round the loop to make sure it's called
// frequently enough.
abort_marking_if_regular_check_fail();
}
}
void G1CMTask::attempt_stealing() {
// We cannot check whether the global stack is empty, since other
// tasks might be pushing objects to it concurrently.
assert(_cm->out_of_regions() && _task_queue->size() == 0,
"only way to reach here");
while (!has_aborted()) {
G1TaskQueueEntry entry;
if (_cm->try_stealing(_worker_id, entry)) {
scan_task_entry(entry);
// And since we're towards the end, let's totally drain the
// local queue and global stack.
drain_local_queue(false);
drain_global_stack(false);
} else {
break;
}
}
}
void G1CMTask::attempt_termination(bool is_serial) {
// We cannot check whether the global stack is empty, since other
// tasks might be concurrently pushing objects on it.
// Separated the asserts so that we know which one fires.
assert(_cm->out_of_regions(), "only way to reach here");
assert(_task_queue->size() == 0, "only way to reach here");
double termination_start_time_ms = os::elapsedTime() * 1000.0;
// The G1CMTask class also extends the TerminatorTerminator class,
// hence its should_exit_termination() method will also decide
// whether to exit the termination protocol or not.
bool finished = (is_serial ||
_cm->terminator()->offer_termination(this));
_termination_time_ms += (os::elapsedTime() * 1000.0 - termination_start_time_ms);
if (finished) {
// We're all done.
// We can now guarantee that the global stack is empty, since
// all other tasks have finished. We separated the guarantees so
// that, if a condition is false, we can immediately find out
// which one.
guarantee(_cm->out_of_regions(), "only way to reach here");
guarantee(_cm->mark_stack_empty(), "only way to reach here");
guarantee(_task_queue->size() == 0, "only way to reach here");
guarantee(!_cm->has_overflown(), "only way to reach here");
guarantee(!has_aborted(), "should never happen if termination has completed");
} else {
// Apparently there's more work to do. Let's abort this task. We
// will restart it and hopefully we can find more things to do.
set_has_aborted();
}
}
void G1CMTask::handle_abort(bool is_serial, double elapsed_time_ms) {
if (_has_timed_out) {
double diff_ms = elapsed_time_ms - _time_target_ms;
// Keep statistics of how well we did with respect to hitting
// our target only if we actually timed out (if we aborted for
// other reasons, then the results might get skewed).
_marking_step_diff_ms.add(diff_ms);
}
if (!_cm->has_overflown()) {
return;
}
// This is the interesting one. We aborted because a global
// overflow was raised. This means we have to restart the
// marking phase and start iterating over regions. However, in
// order to do this we have to make sure that all tasks stop
// what they are doing and re-initialize in a safe manner. We
// will achieve this with the use of two barrier sync points.
if (!is_serial) {
// We only need to enter the sync barrier if being called
// from a parallel context
_cm->enter_first_sync_barrier(_worker_id);
// When we exit this sync barrier we know that all tasks have
// stopped doing marking work. So, it's now safe to
// re-initialize our data structures.
}
clear_region_fields();
flush_mark_stats_cache();
if (!is_serial) {
// If we're executing the concurrent phase of marking, reset the marking
// state; otherwise the marking state is reset after reference processing,
// during the remark pause.
// If we reset here as a result of an overflow during the remark we will
// see assertion failures from any subsequent set_concurrency_and_phase()
// calls.
if (_cm->concurrent() && _worker_id == 0) {
// Worker 0 is responsible for clearing the global data structures because
// of an overflow. During STW we should not clear the overflow flag (in
// G1ConcurrentMark::reset_marking_state()) since we rely on it being true when we exit
// method to abort the pause and restart concurrent marking.
_cm->reset_marking_for_restart();
log_info(gc, marking)("Concurrent Mark reset for overflow");
}
// ...and enter the second barrier.
_cm->enter_second_sync_barrier(_worker_id);
}
}
/*****************************************************************************
The do_marking_step(time_target_ms, ...) method is the building
@ -2653,123 +2861,27 @@ void G1CMTask::do_marking_step(double time_target_ms,
drain_global_stack(true);
do {
if (!has_aborted() && _curr_region != nullptr) {
// This means that we're already holding on to a region.
assert(_finger != nullptr, "if region is not null, then the finger "
"should not be null either");
// We might have restarted this task after an evacuation pause
// which might have evacuated the region we're holding on to
// underneath our feet. Let's read its limit again to make sure
// that we do not iterate over a region of the heap that
// contains garbage (update_region_limit() will also move
// _finger to the start of the region if it is found empty).
update_region_limit();
// We will start from _finger not from the start of the region,
// as we might be restarting this task after aborting half-way
// through scanning this region. In this case, _finger points to
// the address where we last found a marked object. If this is a
// fresh region, _finger points to start().
MemRegion mr = MemRegion(_finger, _region_limit);
assert(!_curr_region->is_humongous() || mr.start() == _curr_region->bottom(),
"humongous regions should go around loop once only");
// Some special cases:
// If the memory region is empty, we can just give up the region.
// If the current region is humongous then we only need to check
// the bitmap for the bit associated with the start of the object,
// scan the object if it's live, and give up the region.
// Otherwise, let's iterate over the bitmap of the part of the region
// that is left.
// If the iteration is successful, give up the region.
if (mr.is_empty()) {
giveup_current_region();
abort_marking_if_regular_check_fail();
} else if (_curr_region->is_humongous() && mr.start() == _curr_region->bottom()) {
if (_mark_bitmap->is_marked(mr.start())) {
// The object is marked - apply the closure
bitmap_closure.do_addr(mr.start());
}
// Even if this task aborted while scanning the humongous object
// we can (and should) give up the current region.
giveup_current_region();
abort_marking_if_regular_check_fail();
} else if (_mark_bitmap->iterate(&bitmap_closure, mr)) {
giveup_current_region();
abort_marking_if_regular_check_fail();
} else {
assert(has_aborted(), "currently the only way to do so");
// The only way to abort the bitmap iteration is to return
// false from the do_bit() method. However, inside the
// do_bit() method we move the _finger to point to the
// object currently being looked at. So, if we bail out, we
// have definitely set _finger to something non-null.
assert(_finger != nullptr, "invariant");
// Region iteration was actually aborted. So now _finger
// points to the address of the object we last scanned. If we
// leave it there, when we restart this task, we will rescan
// the object. It is easy to avoid this. We move the finger by
// enough to point to the next possible object header.
assert(_finger < _region_limit, "invariant");
HeapWord* const new_finger = _finger + cast_to_oop(_finger)->size();
// Check if bitmap iteration was aborted while scanning the last object
if (new_finger >= _region_limit) {
giveup_current_region();
} else {
move_finger_to(new_finger);
}
}
}
process_current_region(bitmap_closure);
// At this point we have either completed iterating over the
// region we were holding on to, or we have aborted.
// We then partially drain the local queue and the global stack.
// (Do we really need this?)
drain_local_queue(true);
drain_global_stack(true);
// Read the note on the claim_region() method on why it might
// return null with potentially more regions available for
// claiming and why we have to check out_of_regions() to determine
// whether we're done or not.
while (!has_aborted() && _curr_region == nullptr && !_cm->out_of_regions()) {
// We are going to try to claim a new region. We should have
// given up on the previous one.
// Separated the asserts so that we know which one fires.
assert(_curr_region == nullptr, "invariant");
assert(_finger == nullptr, "invariant");
assert(_region_limit == nullptr, "invariant");
G1HeapRegion* claimed_region = _cm->claim_region(_worker_id);
if (claimed_region != nullptr) {
// Yes, we managed to claim one
setup_for_region(claimed_region);
assert(_curr_region == claimed_region, "invariant");
}
// It is important to call the regular clock here. It might take
// a while to claim a region if, for example, we hit a large
// block of empty regions. So we need to call the regular clock
// method once round the loop to make sure it's called
// frequently enough.
abort_marking_if_regular_check_fail();
}
claim_new_region();
if (!has_aborted() && _curr_region == nullptr) {
assert(_cm->out_of_regions(),
"at this point we should be out of regions");
}
assert(has_aborted() || _curr_region != nullptr || _cm->out_of_regions(),
"at this point we should be out of regions");
} while ( _curr_region != nullptr && !has_aborted());
if (!has_aborted()) {
// We cannot check whether the global stack is empty, since other
// tasks might be pushing objects to it concurrently.
assert(_cm->out_of_regions(),
"at this point we should be out of regions");
// Try to reduce the number of available SATB buffers so that
// remark has less work to do.
drain_satb_buffers();
}
// We cannot check whether the global stack is empty, since other
// tasks might be pushing objects to it concurrently.
assert(has_aborted() || _cm->out_of_regions(),
"at this point we should be out of regions");
// Try to reduce the number of available SATB buffers so that
// remark has less work to do.
drain_satb_buffers();
// Since we've done everything else, we can now totally drain the
// local queue and global stack.
@ -2780,60 +2892,13 @@ void G1CMTask::do_marking_step(double time_target_ms,
if (do_stealing && !has_aborted()) {
// We have not aborted. This means that we have finished all that
// we could. Let's try to do some stealing...
// We cannot check whether the global stack is empty, since other
// tasks might be pushing objects to it concurrently.
assert(_cm->out_of_regions() && _task_queue->size() == 0,
"only way to reach here");
while (!has_aborted()) {
G1TaskQueueEntry entry;
if (_cm->try_stealing(_worker_id, entry)) {
scan_task_entry(entry);
// And since we're towards the end, let's totally drain the
// local queue and global stack.
drain_local_queue(false);
drain_global_stack(false);
} else {
break;
}
}
attempt_stealing();
}
// We still haven't aborted. Now, let's try to get into the
// termination protocol.
if (do_termination && !has_aborted()) {
// We cannot check whether the global stack is empty, since other
// tasks might be concurrently pushing objects on it.
// Separated the asserts so that we know which one fires.
assert(_cm->out_of_regions(), "only way to reach here");
assert(_task_queue->size() == 0, "only way to reach here");
double termination_start_time_ms = os::elapsedTime() * 1000.0;
// The G1CMTask class also extends the TerminatorTerminator class,
// hence its should_exit_termination() method will also decide
// whether to exit the termination protocol or not.
bool finished = (is_serial ||
_cm->terminator()->offer_termination(this));
_termination_time_ms += (os::elapsedTime() * 1000.0 - termination_start_time_ms);
if (finished) {
// We're all done.
// We can now guarantee that the global stack is empty, since
// all other tasks have finished. We separated the guarantees so
// that, if a condition is false, we can immediately find out
// which one.
guarantee(_cm->out_of_regions(), "only way to reach here");
guarantee(_cm->mark_stack_empty(), "only way to reach here");
guarantee(_task_queue->size() == 0, "only way to reach here");
guarantee(!_cm->has_overflown(), "only way to reach here");
guarantee(!has_aborted(), "should never happen if termination has completed");
} else {
// Apparently there's more work to do. Let's abort this task. It
// will restart it and we can hopefully find more things to do.
set_has_aborted();
}
attempt_termination(is_serial);
}
// Mainly for debugging purposes to make sure that a pointer to the
@ -2847,59 +2912,7 @@ void G1CMTask::do_marking_step(double time_target_ms,
if (has_aborted()) {
// The task was aborted for some reason.
if (_has_timed_out) {
double diff_ms = elapsed_time_ms - _time_target_ms;
// Keep statistics of how well we did with respect to hitting
// our target only if we actually timed out (if we aborted for
// other reasons, then the results might get skewed).
_marking_step_diff_ms.add(diff_ms);
}
if (_cm->has_overflown()) {
// This is the interesting one. We aborted because a global
// overflow was raised. This means we have to restart the
// marking phase and start iterating over regions. However, in
// order to do this we have to make sure that all tasks stop
// what they are doing and re-initialize in a safe manner. We
// will achieve this with the use of two barrier sync points.
if (!is_serial) {
// We only need to enter the sync barrier if being called
// from a parallel context
_cm->enter_first_sync_barrier(_worker_id);
// When we exit this sync barrier we know that all tasks have
// stopped doing marking work. So, it's now safe to
// re-initialize our data structures.
}
clear_region_fields();
flush_mark_stats_cache();
if (!is_serial) {
// If we're executing the concurrent phase of marking, reset the marking
// state; otherwise the marking state is reset after reference processing,
// during the remark pause.
// If we reset here as a result of an overflow during the remark we will
// see assertion failures from any subsequent set_concurrency_and_phase()
// calls.
if (_cm->concurrent() && _worker_id == 0) {
// Worker 0 is responsible for clearing the global data structures because
// of an overflow. During STW we should not clear the overflow flag (in
// G1ConcurrentMark::reset_marking_state()) since we rely on it being true when we exit
// method to abort the pause and restart concurrent marking.
_cm->reset_marking_for_restart();
log_info(gc, marking)("Concurrent Mark reset for overflow");
}
// ...and enter the second barrier.
_cm->enter_second_sync_barrier(_worker_id);
}
// At this point, if we're during the concurrent phase of
// marking, everything has been re-initialized and we're
// ready to restart.
}
handle_abort(is_serial, elapsed_time_ms);
}
}

View File

@ -810,6 +810,21 @@ private:
// Makes the limit of the region up-to-date
void update_region_limit();
// Handles the processing of the current region.
void process_current_region(G1CMBitMapClosure& bitmap_closure);
// Claims a new region if available.
void claim_new_region();
// Attempts to steal work from other tasks.
void attempt_stealing();
// Handles the termination protocol.
void attempt_termination(bool is_serial);
// Handles the has_aborted scenario.
void handle_abort(bool is_serial, double elapsed_time_ms);
// Called when either the words scanned or the refs visited limit
// has been reached
void reached_limit();

View File

@ -83,6 +83,8 @@ void ParallelArguments::initialize() {
if (FLAG_IS_DEFAULT(ParallelRefProcEnabled) && ParallelGCThreads > 1) {
FLAG_SET_DEFAULT(ParallelRefProcEnabled, true);
}
FullGCForwarding::initialize_flags(heap_reserved_size_bytes());
}
// The alignment used for boundary between young gen and old gen
@ -128,7 +130,6 @@ void ParallelArguments::initialize_heap_flags_and_sizes() {
// Redo everything from the start
initialize_heap_flags_and_sizes_one_pass();
}
FullGCForwarding::initialize_flags(heap_reserved_size_bytes());
}
size_t ParallelArguments::heap_reserved_size_bytes() {

View File

@ -24,12 +24,12 @@
#include "precompiled.hpp"
#include "gc/shared/fullGCForwarding.hpp"
#include "gc/shared/genArguments.hpp"
#include "gc/shared/gcArguments.hpp"
#include "gc/serial/serialArguments.hpp"
#include "gc/serial/serialHeap.hpp"
void SerialArguments::initialize_heap_flags_and_sizes() {
GenArguments::initialize_heap_flags_and_sizes();
void SerialArguments::initialize() {
GCArguments::initialize();
FullGCForwarding::initialize_flags(MaxHeapSize);
}

View File

@ -31,8 +31,8 @@ class CollectedHeap;
class SerialArguments : public GenArguments {
private:
virtual void initialize();
virtual CollectedHeap* create_heap();
virtual void initialize_heap_flags_and_sizes();
};
#endif // SHARE_GC_SERIAL_SERIALARGUMENTS_HPP

View File

@ -140,7 +140,8 @@ LIR_Opr BarrierSetC1::atomic_add_at(LIRAccess& access, LIRItem& value) {
void BarrierSetC1::store_at_resolved(LIRAccess& access, LIR_Opr value) {
DecoratorSet decorators = access.decorators();
bool is_volatile = (((decorators & MO_SEQ_CST) != 0) || AlwaysAtomicAccesses);
bool is_volatile = (decorators & MO_SEQ_CST) != 0;
bool is_atomic = is_volatile || AlwaysAtomicAccesses;
bool needs_patching = (decorators & C1_NEEDS_PATCHING) != 0;
bool mask_boolean = (decorators & C1_MASK_BOOLEAN) != 0;
LIRGenerator* gen = access.gen();
@ -154,7 +155,7 @@ void BarrierSetC1::store_at_resolved(LIRAccess& access, LIR_Opr value) {
}
LIR_PatchCode patch_code = needs_patching ? lir_patch_normal : lir_patch_none;
if (is_volatile && !needs_patching) {
if (is_atomic && !needs_patching) {
gen->volatile_field_store(value, access.resolved_addr()->as_address_ptr(), access.access_emit_info());
} else {
__ store(value, access.resolved_addr()->as_address_ptr(), access.access_emit_info(), patch_code);
@ -168,7 +169,8 @@ void BarrierSetC1::store_at_resolved(LIRAccess& access, LIR_Opr value) {
void BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) {
LIRGenerator *gen = access.gen();
DecoratorSet decorators = access.decorators();
bool is_volatile = (((decorators & MO_SEQ_CST) != 0) || AlwaysAtomicAccesses);
bool is_volatile = (decorators & MO_SEQ_CST) != 0;
bool is_atomic = is_volatile || AlwaysAtomicAccesses;
bool needs_patching = (decorators & C1_NEEDS_PATCHING) != 0;
bool mask_boolean = (decorators & C1_MASK_BOOLEAN) != 0;
bool in_native = (decorators & IN_NATIVE) != 0;
@ -180,7 +182,7 @@ void BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) {
LIR_PatchCode patch_code = needs_patching ? lir_patch_normal : lir_patch_none;
if (in_native) {
__ move_wide(access.resolved_addr()->as_address_ptr(), result);
} else if (is_volatile && !needs_patching) {
} else if (is_atomic && !needs_patching) {
gen->volatile_field_load(access.resolved_addr()->as_address_ptr(), result, access.access_emit_info());
} else {
__ load(access.resolved_addr()->as_address_ptr(), result, access.access_emit_info(), patch_code);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2019, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -54,7 +54,7 @@ bool BlockLocationPrinter<CollectedHeapT>::print_location(outputStream* st, void
// Check if addr points into Java heap.
bool in_heap = CollectedHeapT::heap()->is_in(addr);
if (in_heap) {
// base_oop_or_null() might be unimplemented and return NULL for some GCs/generations
// base_oop_or_null() might be unimplemented and return null for some GCs/generations
oop o = base_oop_or_null(addr);
if (o != nullptr) {
if ((void*)o == addr) {

View File

@ -176,6 +176,8 @@ void ShenandoahArguments::initialize() {
if (FLAG_IS_DEFAULT(TLABAllocationWeight)) {
FLAG_SET_DEFAULT(TLABAllocationWeight, 90);
}
FullGCForwarding::initialize_flags(MaxHeapSize);
}
size_t ShenandoahArguments::conservative_max_heap_alignment() {
@ -199,11 +201,6 @@ void ShenandoahArguments::initialize_alignments() {
HeapAlignment = align;
}
void ShenandoahArguments::initialize_heap_flags_and_sizes() {
GCArguments::initialize_heap_flags_and_sizes();
FullGCForwarding::initialize_flags(MaxHeapSize);
}
CollectedHeap* ShenandoahArguments::create_heap() {
return new ShenandoahHeap(new ShenandoahCollectorPolicy());
}

View File

@ -35,7 +35,6 @@ private:
virtual void initialize();
virtual size_t conservative_max_heap_alignment();
virtual void initialize_heap_flags_and_sizes();
virtual CollectedHeap* create_heap();
};

View File

@ -253,9 +253,9 @@ void ShenandoahPacer::pace_for_alloc(size_t words) {
return;
}
jlong const max_delay = ShenandoahPacingMaxDelay * NANOSECS_PER_MILLISEC;
jlong const start_time = os::elapsed_counter();
while (!claimed && (os::elapsed_counter() - start_time) < max_delay) {
jlong const start_time = os::javaTimeNanos();
jlong const deadline = start_time + (ShenandoahPacingMaxDelay * NANOSECS_PER_MILLISEC);
while (!claimed && os::javaTimeNanos() < deadline) {
// We could instead assist GC, but this would suffice for now.
wait(1);
claimed = claim_for_alloc<false>(words);
@ -267,7 +267,7 @@ void ShenandoahPacer::pace_for_alloc(size_t words) {
claimed = claim_for_alloc<true>(words);
assert(claimed, "Should always succeed");
}
ShenandoahThreadLocalData::add_paced_time(current, (double)(os::elapsed_counter() - start_time) / NANOSECS_PER_SEC);
ShenandoahThreadLocalData::add_paced_time(current, (double)(os::javaTimeNanos() - start_time) / NANOSECS_PER_SEC);
}
void ShenandoahPacer::wait(size_t time_ms) {
@ -276,7 +276,7 @@ void ShenandoahPacer::wait(size_t time_ms) {
assert(time_ms > 0, "Should not call this with zero argument, as it would stall until notify");
assert(time_ms <= LONG_MAX, "Sanity");
MonitorLocker locker(_wait_monitor);
_wait_monitor->wait((long)time_ms);
_wait_monitor->wait(time_ms);
}
void ShenandoahPacer::notify_waiters() {

View File

@ -185,33 +185,33 @@ void ShenandoahPhaseTimings::flush_par_workers_to_cycle() {
for (uint pi = 0; pi < _num_phases; pi++) {
Phase phase = Phase(pi);
if (is_worker_phase(phase)) {
double s = uninitialized();
double sum = uninitialized();
for (uint i = 1; i < _num_par_phases; i++) {
ShenandoahWorkerData* wd = worker_data(phase, ParPhase(i));
double ws = uninitialized();
double worker_sum = uninitialized();
for (uint c = 0; c < _max_workers; c++) {
double v = wd->get(c);
if (v != ShenandoahWorkerData::uninitialized()) {
if (ws == uninitialized()) {
ws = v;
double worker_time = wd->get(c);
if (worker_time != ShenandoahWorkerData::uninitialized()) {
if (worker_sum == uninitialized()) {
worker_sum = worker_time;
} else {
ws += v;
worker_sum += worker_time;
}
}
}
if (ws != uninitialized()) {
if (worker_sum != uninitialized()) {
// add to each line in phase
set_cycle_data(Phase(phase + i + 1), ws);
if (s == uninitialized()) {
s = ws;
set_cycle_data(Phase(phase + i + 1), worker_sum);
if (sum == uninitialized()) {
sum = worker_sum;
} else {
s += ws;
sum += worker_sum;
}
}
}
if (s != uninitialized()) {
if (sum != uninitialized()) {
// add to total for phase
set_cycle_data(Phase(phase + 1), s);
set_cycle_data(Phase(phase + 1), sum);
}
}
}

View File

@ -323,6 +323,9 @@ void LinkResolver::check_klass_accessibility(Klass* ref_klass, Klass* sel_klass,
char* msg = Reflection::verify_class_access_msg(ref_klass,
InstanceKlass::cast(base_klass),
vca_result);
// Names are all known to be < 64k so we know this formatted message is not excessively large.
bool same_module = (base_klass->module() == ref_klass->module());
if (msg == nullptr) {
Exceptions::fthrow(
@ -615,6 +618,7 @@ void LinkResolver::check_method_accessability(Klass* ref_klass,
print_nest_host_error_on(&ss, ref_klass, sel_klass);
}
// Names are all known to be < 64k so we know this formatted message is not excessively large.
Exceptions::fthrow(THREAD_AND_LOCATION,
vmSymbols::java_lang_IllegalAccessError(),
"%s",
@ -968,6 +972,7 @@ void LinkResolver::check_field_accessability(Klass* ref_klass,
if (fd.is_private()) {
print_nest_host_error_on(&ss, ref_klass, sel_klass);
}
// Names are all known to be < 64k so we know this formatted message is not excessively large.
Exceptions::fthrow(THREAD_AND_LOCATION,
vmSymbols::java_lang_IllegalAccessError(),
"%s",
@ -1187,6 +1192,7 @@ Method* LinkResolver::linktime_resolve_special_method(const LinkInfo& link_info,
ss.print(" %s(", resolved_method->name()->as_C_string());
resolved_method->signature()->print_as_signature_external_parameters(&ss);
ss.print(")' not found");
// Names are all known to be < 64k so we know this formatted message is not excessively large.
Exceptions::fthrow(
THREAD_AND_LOCATION,
vmSymbols::java_lang_NoSuchMethodError(),

View File

@ -258,12 +258,25 @@ void ObjectSampler::add(HeapWord* obj, size_t allocated, traceid thread_id, bool
// quick reject, will not fit
return;
}
sample = _list->reuse(_priority_queue->pop());
ObjectSample* popped = _priority_queue->pop();
size_t popped_span = popped->span();
ObjectSample* previous = popped->prev();
sample = _list->reuse(popped);
assert(sample != nullptr, "invariant");
if (previous != nullptr) {
push_span(previous, popped_span);
sample->set_span(span);
} else {
// The removed sample was the youngest sample in the list, which means the new sample is now the youngest
// sample. It should cover the spans of both.
sample->set_span(span + popped_span);
}
} else {
sample = _list->get();
assert(sample != nullptr, "invariant");
sample->set_span(span);
}
assert(sample != nullptr, "invariant");
signal_unresolved_entry();
sample->set_thread_id(thread_id);
if (virtual_thread) {
@ -278,7 +291,6 @@ void ObjectSampler::add(HeapWord* obj, size_t allocated, traceid thread_id, bool
sample->set_stack_trace_hash(stacktrace_hash);
}
sample->set_span(allocated);
sample->set_object(cast_to_oop(obj));
sample->set_allocated(allocated);
sample->set_allocation_time(JfrTicks::now());
@ -305,14 +317,18 @@ void ObjectSampler::remove_dead(ObjectSample* sample) {
ObjectSample* const previous = sample->prev();
// push span onto previous
if (previous != nullptr) {
_priority_queue->remove(previous);
previous->add_span(sample->span());
_priority_queue->push(previous);
push_span(previous, sample->span());
}
_priority_queue->remove(sample);
_list->release(sample);
}
void ObjectSampler::push_span(ObjectSample* sample, size_t span) {
_priority_queue->remove(sample);
sample->add_span(span);
_priority_queue->push(sample);
}
ObjectSample* ObjectSampler::last() const {
return _list->last();
}

View File

@ -64,6 +64,7 @@ class ObjectSampler : public CHeapObj<mtTracing> {
void add(HeapWord* object, size_t size, traceid thread_id, bool virtual_thread, const JfrBlobHandle& bh, JavaThread* thread);
void scavenge();
void remove_dead(ObjectSample* sample);
void push_span(ObjectSample* sample, size_t span);
const ObjectSample* item_at(int index) const;
ObjectSample* item_at(int index);

View File

@ -353,7 +353,7 @@ class MetaspaceObj {
void* operator new(size_t size, ClassLoaderData* loader_data,
size_t word_size,
Type type) throw();
void operator delete(void* p) { ShouldNotCallThis(); }
void operator delete(void* p) = delete;
// Declare a *static* method with the same signature in any subclass of MetaspaceObj
// that should be read-only by default. See symbol.hpp for an example. This function

View File

@ -62,6 +62,7 @@
#include "oops/instanceMirrorKlass.hpp"
#include "oops/klass.inline.hpp"
#include "oops/objArrayOop.inline.hpp"
#include "oops/objLayout.hpp"
#include "oops/oop.inline.hpp"
#include "oops/oopHandle.inline.hpp"
#include "oops/typeArrayKlass.hpp"
@ -868,6 +869,8 @@ jint universe_init() {
// Initialize CPUTimeCounters object, which must be done before creation of the heap.
CPUTimeCounters::initialize();
ObjLayout::initialize();
#ifdef _LP64
MetaspaceShared::adjust_heap_sizes_for_dumping();
#endif // _LP64

View File

@ -179,15 +179,11 @@ const GrowableArrayCHeap<MemoryFileTracker::MemoryFile*, mtNMT>& MemoryFileTrack
};
void MemoryFileTracker::summary_snapshot(VirtualMemorySnapshot* snapshot) const {
for (int d = 0; d < _files.length(); d++) {
const MemoryFile* file = _files.at(d);
for (int i = 0; i < mt_number_of_tags; i++) {
VirtualMemory* snap = snapshot->by_type(NMTUtil::index_to_tag(i));
const VirtualMemory* current = file->_summary.by_type(NMTUtil::index_to_tag(i));
// Only account the committed memory.
snap->commit_memory(current->committed());
}
}
iterate_summary([&](MemTag tag, const VirtualMemory* current) {
VirtualMemory* snap = snapshot->by_type(tag);
// Only account the committed memory.
snap->commit_memory(current->committed());
});
}
void MemoryFileTracker::Instance::summary_snapshot(VirtualMemorySnapshot* snapshot) {

View File

@ -39,6 +39,8 @@
// The MemoryFileTracker tracks memory of 'memory files',
// storage with its own memory space separate from the process.
// A typical example of such a file is a memory mapped file.
// All memory is accounted as committed, there is no reserved memory.
// Any reserved memory is expected to exist in the VirtualMemoryTracker.
class MemoryFileTracker {
friend class NMTMemoryFileTrackerTest;
@ -72,6 +74,16 @@ public:
MemoryFile* make_file(const char* descriptive_name);
void free_file(MemoryFile* file);
template<typename F>
void iterate_summary(F f) const {
for (int d = 0; d < _files.length(); d++) {
const MemoryFile* file = _files.at(d);
for (int i = 0; i < mt_number_of_tags; i++) {
f(NMTUtil::index_to_tag(i), file->_summary.by_type(NMTUtil::index_to_tag(i)));
}
}
}
void summary_snapshot(VirtualMemorySnapshot* snapshot) const;
// Print detailed report of file
@ -99,6 +111,11 @@ public:
const NativeCallStack& stack, MemTag mem_tag);
static void free_memory(MemoryFile* device, size_t offset, size_t size);
template<typename F>
static void iterate_summary(F f) {
_tracker->iterate_summary(f);
};
static void summary_snapshot(VirtualMemorySnapshot* snapshot);
static void print_report_on(const MemoryFile* device, outputStream* stream, size_t scale);

View File

@ -24,6 +24,7 @@
#include "precompiled.hpp"
#include "nmt/mallocTracker.hpp"
#include "nmt/memoryFileTracker.hpp"
#include "nmt/nmtCommon.hpp"
#include "nmt/nmtUsage.hpp"
#include "nmt/threadStackTracker.hpp"
@ -90,6 +91,16 @@ void NMTUsage::update_vm_usage() {
_vm_total.reserved += vm->reserved();
_vm_total.committed += vm->committed();
}
{ // MemoryFileTracker addition
using MFT = MemoryFileTracker::Instance;
MFT::Locker lock;
MFT::iterate_summary([&](MemTag tag, const VirtualMemory* vm) {
int i = NMTUtil::tag_to_index(tag);
_vm_by_type[i].committed += vm->committed();
_vm_total.committed += vm->committed();
});
}
}
void NMTUsage::refresh() {

View File

@ -26,6 +26,7 @@
#define SHARE_NMT_NMTUSAGE_HPP
#include "memory/allocation.hpp"
#include "nmt/memTag.hpp"
#include "utilities/globalDefinitions.hpp"
struct NMTUsagePair {

View File

@ -428,7 +428,7 @@ void ConstantPool::restore_unshareable_info(TRAPS) {
assert(is_shared(), "should always be set for shared constant pools");
if (is_for_method_handle_intrinsic()) {
// See the same check in remove_unshareable_info() below.
assert(cache() == NULL, "must not have cpCache");
assert(cache() == nullptr, "must not have cpCache");
return;
}
assert(_cache != nullptr, "constant pool _cache should not be null");
@ -474,7 +474,7 @@ void ConstantPool::remove_unshareable_info() {
// This CP was created by Method::make_method_handle_intrinsic() and has nothing
// that need to be removed/restored. It has no cpCache since the intrinsic methods
// don't have any bytecodes.
assert(cache() == NULL, "must not have cpCache");
assert(cache() == nullptr, "must not have cpCache");
return;
}
@ -1266,6 +1266,7 @@ oop ConstantPool::resolve_constant_at_impl(const constantPoolHandle& this_cp,
cp_index,
callee->is_interface() ? "CONSTANT_MethodRef" : "CONSTANT_InterfaceMethodRef",
callee->is_interface() ? "CONSTANT_InterfaceMethodRef" : "CONSTANT_MethodRef");
// Names are all known to be < 64k so we know this formatted message is not excessively large.
Exceptions::fthrow(THREAD_AND_LOCATION, vmSymbols::java_lang_IncompatibleClassChangeError(), "%s", ss.as_string());
save_and_throw_exception(this_cp, cp_index, tag, CHECK_NULL);
}

View File

@ -829,9 +829,15 @@ oop ConstantPoolCache::appendix_if_resolved(ResolvedMethodEntry* method_entry) c
void ConstantPoolCache::print_on(outputStream* st) const {
st->print_cr("%s", internal_name());
// print constant pool cache entries
print_resolved_field_entries(st);
print_resolved_method_entries(st);
print_resolved_indy_entries(st);
if (_resolved_field_entries != nullptr) {
print_resolved_field_entries(st);
}
if (_resolved_method_entries != nullptr) {
print_resolved_method_entries(st);
}
if (_resolved_indy_entries != nullptr) {
print_resolved_indy_entries(st);
}
}
void ConstantPoolCache::print_resolved_field_entries(outputStream* st) const {

View File

@ -899,6 +899,7 @@ bool InstanceKlass::link_class_impl(TRAPS) {
// if we are executing Java code. This is not a problem for CDS dumping phase since
// it doesn't execute any Java code.
ResourceMark rm(THREAD);
// Names are all known to be < 64k so we know this formatted message is not excessively large.
Exceptions::fthrow(THREAD_AND_LOCATION,
vmSymbols::java_lang_NoClassDefFoundError(),
"Class %s, or one of its supertypes, failed class initialization",
@ -919,6 +920,7 @@ bool InstanceKlass::link_class_impl(TRAPS) {
if (super_klass != nullptr) {
if (super_klass->is_interface()) { // check if super class is an interface
ResourceMark rm(THREAD);
// Names are all known to be < 64k so we know this formatted message is not excessively large.
Exceptions::fthrow(
THREAD_AND_LOCATION,
vmSymbols::java_lang_IncompatibleClassChangeError(),
@ -3286,6 +3288,7 @@ InstanceKlass* InstanceKlass::compute_enclosing_class(bool* inner_is_member, TRA
// If the outer class is not an instance klass then it cannot have
// declared any inner classes.
ResourceMark rm(THREAD);
// Names are all known to be < 64k so we know this formatted message is not excessively large.
Exceptions::fthrow(
THREAD_AND_LOCATION,
vmSymbols::java_lang_IncompatibleClassChangeError(),

View File

@ -45,7 +45,6 @@
class ConstantPool;
class DeoptimizationScope;
class klassItable;
class Monitor;
class RecordComponent;
// An InstanceKlass is the VM level representation of a Java class.
@ -68,7 +67,6 @@ class ClassFileStream;
class KlassDepChange;
class DependencyContext;
class fieldDescriptor;
class jniIdMapBase;
class JNIid;
class JvmtiCachedClassFieldMap;
class nmethodBucket;

View File

@ -25,8 +25,6 @@
#ifndef SHARE_OOPS_KLASS_HPP
#define SHARE_OOPS_KLASS_HPP
#include "memory/iterator.hpp"
#include "memory/memRegion.hpp"
#include "oops/klassFlags.hpp"
#include "oops/markWord.hpp"
#include "oops/metadata.hpp"
@ -60,8 +58,6 @@ class fieldDescriptor;
class klassVtable;
class ModuleEntry;
class PackageEntry;
class ParCompactionManager;
class PSPromotionManager;
class vtableEntry;
class Klass : public Metadata {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2011, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -44,11 +44,10 @@ class Metadata : public MetaspaceObj {
virtual bool is_method() const { return false; }
virtual bool is_methodData() const { return false; }
virtual bool is_constantPool() const { return false; }
virtual bool is_methodCounters() const { return false; }
virtual int size() const = 0;
virtual MetaspaceObj::Type type() const = 0;
virtual const char* internal_name() const = 0;
virtual void metaspace_pointers_do(MetaspaceClosure* iter) {}
virtual void metaspace_pointers_do(MetaspaceClosure* iter) = 0;
void print() const;
void print_value() const;

View File

@ -335,7 +335,8 @@ int Method::bci_from(address bcp) const {
int Method::validate_bci(int bci) const {
return (bci == 0 || bci < code_size()) ? bci : -1;
// Called from the verifier, and should return -1 if not valid.
return ((is_native() && bci == 0) || (!is_native() && 0 <= bci && bci < code_size())) ? bci : -1;
}
// Return bci if it appears to be a valid bcp

View File

@ -32,7 +32,6 @@
#include "oops/methodFlags.hpp"
#include "oops/instanceKlass.hpp"
#include "oops/oop.hpp"
#include "oops/typeArrayOop.hpp"
#include "utilities/accessFlags.hpp"
#include "utilities/align.hpp"
#include "utilities/growableArray.hpp"

View File

@ -50,12 +50,12 @@ MethodCounters::MethodCounters(const methodHandle& mh) :
MethodCounters* MethodCounters::allocate_no_exception(const methodHandle& mh) {
ClassLoaderData* loader_data = mh->method_holder()->class_loader_data();
return new(loader_data, method_counters_size(), MetaspaceObj::MethodCountersType) MethodCounters(mh);
return new(loader_data, size(), MetaspaceObj::MethodCountersType) MethodCounters(mh);
}
MethodCounters* MethodCounters::allocate_with_exception(const methodHandle& mh, TRAPS) {
ClassLoaderData* loader_data = mh->method_holder()->class_loader_data();
return new(loader_data, method_counters_size(), MetaspaceObj::MethodCountersType, THREAD) MethodCounters(mh);
return new(loader_data, size(), MetaspaceObj::MethodCountersType, THREAD) MethodCounters(mh);
}
void MethodCounters::clear_counters() {
@ -70,7 +70,6 @@ void MethodCounters::clear_counters() {
}
void MethodCounters::print_value_on(outputStream* st) const {
assert(is_methodCounters(), "must be methodCounters");
st->print("method counters");
print_address_on(st);
}

View File

@ -30,7 +30,7 @@
#include "interpreter/invocationCounter.hpp"
#include "utilities/align.hpp"
class MethodCounters : public Metadata {
class MethodCounters : public MetaspaceObj {
friend class VMStructs;
friend class JVMCIVMStructs;
private:
@ -52,19 +52,18 @@ class MethodCounters : public Metadata {
MethodCounters(const methodHandle& mh);
public:
virtual bool is_methodCounters() const { return true; }
static MethodCounters* allocate_no_exception(const methodHandle& mh);
static MethodCounters* allocate_with_exception(const methodHandle& mh, TRAPS);
DEBUG_ONLY(bool on_stack() { return false; })
void deallocate_contents(ClassLoaderData* loader_data) {}
static int method_counters_size() {
void metaspace_pointers_do(MetaspaceClosure* it) { return; }
static int size() {
return align_up((int)sizeof(MethodCounters), wordSize) / wordSize;
}
virtual int size() const {
return method_counters_size();
}
MetaspaceObj::Type type() const { return MethodCountersType; }
void clear_counters();
@ -128,8 +127,7 @@ class MethodCounters : public Metadata {
return byte_offset_of(MethodCounters, _backedge_mask);
}
virtual const char* internal_name() const { return "{method counters}"; }
virtual void print_value_on(outputStream* st) const;
const char* internal_name() const { return "{method counters}"; }
void print_value_on(outputStream* st) const;
};
#endif // SHARE_OOPS_METHODCOUNTERS_HPP

View File

@ -98,12 +98,6 @@ public:
inline uint8_t lockstack_size() const;
inline void set_lockstack_size(uint8_t value);
inline ObjectWaiter* object_waiter() const;
inline void set_object_waiter(ObjectWaiter* obj_waiter);
inline ObjectMonitor* current_pending_monitor() const;
inline ObjectMonitor* current_waiting_monitor() const;
inline oop cont() const;
template<typename P>
inline oop cont() const;

View File

@ -92,9 +92,6 @@ inline void stackChunkOopDesc::set_max_thawing_size(int value) {
inline uint8_t stackChunkOopDesc::lockstack_size() const { return jdk_internal_vm_StackChunk::lockStackSize(as_oop()); }
inline void stackChunkOopDesc::set_lockstack_size(uint8_t value) { jdk_internal_vm_StackChunk::set_lockStackSize(this, value); }
inline ObjectWaiter* stackChunkOopDesc::object_waiter() const { return (ObjectWaiter*)jdk_internal_vm_StackChunk::objectWaiter(as_oop()); }
inline void stackChunkOopDesc::set_object_waiter(ObjectWaiter* obj) { jdk_internal_vm_StackChunk::set_objectWaiter(this, (address)obj); }
inline oop stackChunkOopDesc::cont() const { return jdk_internal_vm_StackChunk::cont(as_oop()); }
inline void stackChunkOopDesc::set_cont(oop value) { jdk_internal_vm_StackChunk::set_cont(this, value); }
template<typename P>
@ -171,22 +168,6 @@ inline void stackChunkOopDesc::set_preempted(bool value) {
set_flag(FLAG_PREEMPTED, value);
}
inline ObjectMonitor* stackChunkOopDesc::current_pending_monitor() const {
ObjectWaiter* waiter = object_waiter();
if (waiter != nullptr && waiter->at_monitorenter()) {
return waiter->monitor();
}
return nullptr;
}
inline ObjectMonitor* stackChunkOopDesc::current_waiting_monitor() const {
ObjectWaiter* waiter = object_waiter();
if (waiter != nullptr && waiter->is_wait()) {
return waiter->monitor();
}
return nullptr;
}
inline bool stackChunkOopDesc::has_lockstack() const { return is_flag(FLAG_HAS_LOCKSTACK); }
inline void stackChunkOopDesc::set_has_lockstack(bool value) { set_flag(FLAG_HAS_LOCKSTACK, value); }

View File

@ -355,6 +355,12 @@
product(bool, SuperWordReductions, true, \
"Enable reductions support in superword.") \
\
product_pd(uint, SuperWordStoreToLoadForwardingFailureDetection, DIAGNOSTIC, \
"if >0, auto-vectorization detects possible store-to-load " \
"forwarding failures. The number specifies over how many " \
"loop iterations this detection spans.") \
range(0, 4096) \
\
product(bool, UseCMoveUnconditionally, false, \
"Use CMove (scalar and vector) ignoring profitability test.") \
\

View File

@ -3257,7 +3257,10 @@ bool LibraryCallKit::inline_native_getEventWriter() {
set_all_memory(input_memory_state);
Node* input_io_state = i_o();
Node* excluded_mask = _gvn.intcon(32768);
// The most significant bit of the u2 is used to denote thread exclusion
Node* excluded_shift = _gvn.intcon(15);
Node* excluded_mask = _gvn.intcon(1 << 15);
// The epoch generation is the range [1-32767]
Node* epoch_mask = _gvn.intcon(32767);
// TLS
@ -3411,7 +3414,7 @@ bool LibraryCallKit::inline_native_getEventWriter() {
record_for_igvn(vthread_compare_io);
PhiNode* tid = new PhiNode(vthread_compare_rgn, TypeLong::LONG);
record_for_igvn(tid);
PhiNode* exclusion = new PhiNode(vthread_compare_rgn, TypeInt::BOOL);
PhiNode* exclusion = new PhiNode(vthread_compare_rgn, TypeInt::CHAR);
record_for_igvn(exclusion);
PhiNode* pinVirtualThread = new PhiNode(vthread_compare_rgn, TypeInt::BOOL);
record_for_igvn(pinVirtualThread);
@ -3476,7 +3479,8 @@ bool LibraryCallKit::inline_native_getEventWriter() {
store_to_memory(tid_is_not_equal, event_writer_pin_field, _gvn.transform(pinVirtualThread), T_BOOLEAN, MemNode::unordered);
// Store the exclusion state to the event writer.
store_to_memory(tid_is_not_equal, event_writer_excluded_field, _gvn.transform(exclusion), T_BOOLEAN, MemNode::unordered);
Node* excluded_bool = _gvn.transform(new URShiftINode(_gvn.transform(exclusion), excluded_shift));
store_to_memory(tid_is_not_equal, event_writer_excluded_field, excluded_bool, T_BOOLEAN, MemNode::unordered);
// Store the tid to the event writer.
store_to_memory(tid_is_not_equal, event_writer_tid_field, tid, T_LONG, MemNode::unordered);
@ -3543,7 +3547,9 @@ void LibraryCallKit::extend_setCurrentThread(Node* jt, Node* thread) {
Node* input_memory_state = reset_memory();
set_all_memory(input_memory_state);
Node* excluded_mask = _gvn.intcon(32768);
// The most significant bit of the u2 is used to denote thread exclusion
Node* excluded_mask = _gvn.intcon(1 << 15);
// The epoch generation is the range [1-32767]
Node* epoch_mask = _gvn.intcon(32767);
Node* const carrierThread = generate_current_thread(jt);

View File

@ -348,8 +348,6 @@ PhaseIdealLoop::clone_assertion_predicate_for_unswitched_loops(IfTrueNode* templ
ParsePredicateNode* unswitched_loop_parse_predicate) {
TemplateAssertionPredicate template_assertion_predicate(template_assertion_predicate_success_proj);
IfTrueNode* template_success_proj = template_assertion_predicate.clone(unswitched_loop_parse_predicate->in(0), this);
assert(assertion_predicate_has_loop_opaque_node(template_success_proj->in(0)->as_If()),
"must find Assertion Predicate for fast loop");
_igvn.replace_input_of(unswitched_loop_parse_predicate, 0, template_success_proj);
set_idom(unswitched_loop_parse_predicate, template_success_proj, dom_depth(template_success_proj));
return template_success_proj;

View File

@ -1312,80 +1312,6 @@ void PhaseIdealLoop::ensure_zero_trip_guard_proj(Node* node, bool is_main_loop)
}
#endif
#ifdef ASSERT
bool PhaseIdealLoop::assertion_predicate_has_loop_opaque_node(IfNode* iff) {
uint init;
uint stride;
count_opaque_loop_nodes(iff->in(1)->in(1), init, stride);
ResourceMark rm;
Unique_Node_List wq;
wq.clear();
wq.push(iff->in(1)->in(1));
uint verif_init = 0;
uint verif_stride = 0;
for (uint i = 0; i < wq.size(); i++) {
Node* n = wq.at(i);
int op = n->Opcode();
if (!n->is_CFG()) {
if (n->Opcode() == Op_OpaqueLoopInit) {
verif_init++;
} else if (n->Opcode() == Op_OpaqueLoopStride) {
verif_stride++;
} else {
for (uint j = 1; j < n->req(); j++) {
Node* m = n->in(j);
if (m != nullptr) {
wq.push(m);
}
}
}
}
}
assert(init == verif_init && stride == verif_stride, "missed opaque node");
assert(stride == 0 || init != 0, "init should be there every time stride is");
return init != 0;
}
void PhaseIdealLoop::count_opaque_loop_nodes(Node* n, uint& init, uint& stride) {
init = 0;
stride = 0;
ResourceMark rm;
Unique_Node_List wq;
wq.push(n);
for (uint i = 0; i < wq.size(); i++) {
Node* n = wq.at(i);
if (TemplateAssertionExpressionNode::is_maybe_in_expression(n)) {
if (n->is_OpaqueLoopInit()) {
init++;
} else if (n->is_OpaqueLoopStride()) {
stride++;
} else {
for (uint j = 1; j < n->req(); j++) {
Node* m = n->in(j);
if (m != nullptr) {
wq.push(m);
}
}
}
}
}
}
#endif // ASSERT
// Create an Initialized Assertion Predicate from the template_assertion_predicate
IfTrueNode* PhaseIdealLoop::create_initialized_assertion_predicate(IfNode* template_assertion_predicate, Node* new_init,
Node* new_stride, Node* new_control) {
assert(assertion_predicate_has_loop_opaque_node(template_assertion_predicate),
"must find OpaqueLoop* nodes for Template Assertion Predicate");
InitializedAssertionPredicateCreator initialized_assertion_predicate(this);
IfTrueNode* success_proj = initialized_assertion_predicate.create_from_template(template_assertion_predicate,
new_control, new_init, new_stride);
assert(!assertion_predicate_has_loop_opaque_node(success_proj->in(0)->as_If()),
"Initialized Assertion Predicates do not have OpaqueLoop* nodes in the bool expression anymore");
return success_proj;
}
//------------------------------insert_pre_post_loops--------------------------
// Insert pre and post loops. If peel_only is set, the pre-loop can not have
// more iterations added. It acts as a 'peel' only, no lower-bound RCE, no
@ -2761,7 +2687,6 @@ void PhaseIdealLoop::do_range_check(IdealLoopTree *loop, Node_List &old_new) {
loop_entry = initialized_assertion_predicate_creator.create(final_iv_placeholder, loop_entry, stride_con,
scale_con, int_offset, int_limit,
AssertionPredicateType::FinalIv);
assert(!assertion_predicate_has_loop_opaque_node(loop_entry->in(0)->as_If()), "unexpected");
}
// Add two Template Assertion Predicates to create new Initialized Assertion Predicates from when either
@ -2769,13 +2694,11 @@ void PhaseIdealLoop::do_range_check(IdealLoopTree *loop, Node_List &old_new) {
TemplateAssertionPredicateCreator template_assertion_predicate_creator(cl, scale_con , int_offset, int_limit,
this);
loop_entry = template_assertion_predicate_creator.create(loop_entry);
assert(assertion_predicate_has_loop_opaque_node(loop_entry->in(0)->as_If()), "unexpected");
// Initialized Assertion Predicate for the value of the initial main-loop.
loop_entry = initialized_assertion_predicate_creator.create(init, loop_entry, stride_con, scale_con,
int_offset, int_limit,
AssertionPredicateType::InitValue);
assert(!assertion_predicate_has_loop_opaque_node(loop_entry->in(0)->as_If()), "unexpected");
} else {
if (PrintOpto) {

View File

@ -941,12 +941,7 @@ private:
#ifdef ASSERT
static void ensure_zero_trip_guard_proj(Node* node, bool is_main_loop);
#endif
public:
IfTrueNode* create_initialized_assertion_predicate(IfNode* template_assertion_predicate, Node* new_init,
Node* new_stride, Node* control);
DEBUG_ONLY(static bool assertion_predicate_has_loop_opaque_node(IfNode* iff);)
private:
DEBUG_ONLY(static void count_opaque_loop_nodes(Node* n, uint& init, uint& stride);)
static void get_template_assertion_predicates(ParsePredicateSuccessProj* parse_predicate_proj, Unique_Node_List& list, bool get_opaque = false);
void update_main_loop_assertion_predicates(CountedLoopNode* main_loop_head);
void initialize_assertion_predicates_for_peeled_loop(CountedLoopNode* peeled_loop_head,

View File

@ -789,7 +789,6 @@ Node *PhaseIdealLoop::conditional_move( Node *region ) {
assert(!bol->is_OpaqueInitializedAssertionPredicate(), "Initialized Assertion Predicates cannot form a diamond with Halt");
if (bol->is_OpaqueTemplateAssertionPredicate()) {
// Ignore Template Assertion Predicates with OpaqueTemplateAssertionPredicate nodes.
assert(assertion_predicate_has_loop_opaque_node(iff), "must find OpaqueLoop* nodes");
return nullptr;
}
assert(bol->Opcode() == Op_Bool, "Unexpected node");

View File

@ -193,6 +193,7 @@ class VectorUnboxNode;
class VectorSet;
class VectorReinterpretNode;
class ShiftVNode;
class MulVLNode;
class ExpandVNode;
class CompressVNode;
class CompressMNode;
@ -743,6 +744,7 @@ public:
DEFINE_CLASS_ID(Reduction, Vector, 7)
DEFINE_CLASS_ID(NegV, Vector, 8)
DEFINE_CLASS_ID(SaturatingVector, Vector, 9)
DEFINE_CLASS_ID(MulVL, Vector, 10)
DEFINE_CLASS_ID(Con, Type, 8)
DEFINE_CLASS_ID(ConI, Con, 0)
DEFINE_CLASS_ID(SafePointScalarMerge, Type, 9)
@ -970,6 +972,7 @@ public:
DEFINE_CLASS_QUERY(Mul)
DEFINE_CLASS_QUERY(Multi)
DEFINE_CLASS_QUERY(MultiBranch)
DEFINE_CLASS_QUERY(MulVL)
DEFINE_CLASS_QUERY(Neg)
DEFINE_CLASS_QUERY(NegV)
DEFINE_CLASS_QUERY(NeverBranch)
@ -2111,4 +2114,51 @@ inline int Op_DivModIL(BasicType bt, bool is_unsigned) {
}
}
// Interface to define actions that should be taken when running DataNodeBFS. Each use can extend this class to specify
// a customized BFS.
class BFSActions : public StackObj {
public:
// Should a node's inputs further be visited in the BFS traversal? By default, we visit all data inputs. Override this
// method to provide a custom filter.
virtual bool should_visit(Node* node) const {
// By default, visit all inputs.
return true;
};
// Is the visited node a target node that we are looking for in the BFS traversal? We do not visit its inputs further
// but the BFS will continue to visit all unvisited nodes in the queue.
virtual bool is_target_node(Node* node) const = 0;
// Defines an action that should be taken when we visit a target node in the BFS traversal.
virtual void target_node_action(Node* target_node) = 0;
};
// Class to perform a BFS traversal on the data nodes from a given start node. The provided BFSActions guide which
// data node's inputs should be further visited, which data nodes are target nodes and what to do with the target nodes.
class DataNodeBFS : public StackObj {
BFSActions& _bfs_actions;
public:
explicit DataNodeBFS(BFSActions& bfs_action) : _bfs_actions(bfs_action) {}
// Run the BFS starting from 'start_node' and apply the actions provided to this class.
void run(Node* start_node) {
ResourceMark rm;
Unique_Node_List _nodes_to_visit;
_nodes_to_visit.push(start_node);
for (uint i = 0; i < _nodes_to_visit.size(); i++) {
Node* next = _nodes_to_visit[i];
for (uint j = 1; j < next->req(); j++) {
Node* input = next->in(j);
if (_bfs_actions.is_target_node(input)) {
assert(_bfs_actions.should_visit(input), "must also pass node filter");
_bfs_actions.target_node_action(input);
} else if (_bfs_actions.should_visit(input)) {
_nodes_to_visit.push(input);
}
}
}
}
};
#endif // SHARE_OPTO_NODE_HPP

View File

@ -402,7 +402,6 @@ bool PhaseChaitin::eliminate_copy_of_constant(Node* val, Node* n,
// as they get encountered with the merge node and keep adding these defs to the merge inputs.
void PhaseChaitin::merge_multidefs() {
Compile::TracePhase tp(_t_mergeMultidefs);
ResourceMark rm;
// Keep track of the defs seen in registers and collect their uses in the block.
RegToDefUseMap reg2defuse(_max_reg, _max_reg, RegDefUse());
for (uint i = 0; i < _cfg.number_of_blocks(); i++) {

View File

@ -153,24 +153,21 @@ bool TemplateAssertionPredicate::is_predicate(Node* node) {
// Clone this Template Assertion Predicate and replace the OpaqueLoopInitNode with the provided 'new_opaque_init' node.
IfTrueNode* TemplateAssertionPredicate::clone(Node* new_control, PhaseIdealLoop* phase) const {
assert(PhaseIdealLoop::assertion_predicate_has_loop_opaque_node(_if_node),
"must find OpaqueLoop* nodes for Template Assertion Predicate");
DEBUG_ONLY(verify();)
TemplateAssertionExpression template_assertion_expression(opaque_node());
OpaqueTemplateAssertionPredicateNode* new_opaque_node = template_assertion_expression.clone(new_control, phase);
AssertionPredicateIfCreator assertion_predicate_if_creator(phase);
IfTrueNode* success_proj = assertion_predicate_if_creator.create_for_template(new_control, _if_node->Opcode(),
new_opaque_node,
_if_node->assertion_predicate_type());
assert(PhaseIdealLoop::assertion_predicate_has_loop_opaque_node(success_proj->in(0)->as_If()),
"Template Assertion Predicates must have OpaqueLoop* nodes in the bool expression");
DEBUG_ONLY(TemplateAssertionPredicate::verify(success_proj);)
return success_proj;
}
// Clone this Template Assertion Predicate and replace the OpaqueLoopInitNode with the provided 'new_opaque_init' node.
IfTrueNode* TemplateAssertionPredicate::clone_and_replace_init(Node* new_control, OpaqueLoopInitNode* new_opaque_init,
PhaseIdealLoop* phase) const {
assert(PhaseIdealLoop::assertion_predicate_has_loop_opaque_node(_if_node),
"must find OpaqueLoop* nodes for Template Assertion Predicate");
DEBUG_ONLY(verify();)
TemplateAssertionExpression template_assertion_expression(opaque_node());
OpaqueTemplateAssertionPredicateNode* new_opaque_node =
template_assertion_expression.clone_and_replace_init(new_control, new_opaque_init, phase);
@ -178,13 +175,13 @@ IfTrueNode* TemplateAssertionPredicate::clone_and_replace_init(Node* new_control
IfTrueNode* success_proj = assertion_predicate_if_creator.create_for_template(new_control, _if_node->Opcode(),
new_opaque_node,
_if_node->assertion_predicate_type());
assert(PhaseIdealLoop::assertion_predicate_has_loop_opaque_node(success_proj->in(0)->as_If()),
"Template Assertion Predicates must have OpaqueLoop* nodes in the bool expression");
DEBUG_ONLY(TemplateAssertionPredicate::verify(success_proj);)
return success_proj;
}
// Replace the input to OpaqueLoopStrideNode with 'new_stride' and leave the other nodes unchanged.
void TemplateAssertionPredicate::replace_opaque_stride_input(Node* new_stride, PhaseIterGVN& igvn) const {
DEBUG_ONLY(verify();)
TemplateAssertionExpression expression(opaque_node());
expression.replace_opaque_stride_input(new_stride, igvn);
}
@ -192,15 +189,80 @@ void TemplateAssertionPredicate::replace_opaque_stride_input(Node* new_stride, P
// Create a new Initialized Assertion Predicate from this template at 'new_control' and return the success projection
// of the newly created Initialized Assertion Predicate.
IfTrueNode* TemplateAssertionPredicate::initialize(PhaseIdealLoop* phase, Node* new_control) const {
assert(phase->assertion_predicate_has_loop_opaque_node(head()),
"must find OpaqueLoop* nodes for Template Assertion Predicate");
InitializedAssertionPredicateCreator initialized_assertion_predicate(phase);
IfTrueNode* success_proj = initialized_assertion_predicate.create_from_template(head(), new_control);
assert(!phase->assertion_predicate_has_loop_opaque_node(success_proj->in(0)->as_If()),
"Initialized Assertion Predicates do not have OpaqueLoop* nodes in the bool expression anymore");
DEBUG_ONLY(verify();)
InitializedAssertionPredicateCreator initialized_assertion_predicate_creator(phase);
IfTrueNode* success_proj = initialized_assertion_predicate_creator.create_from_template(head(), new_control);
DEBUG_ONLY(InitializedAssertionPredicate::verify(success_proj);)
return success_proj;
}
#ifdef ASSERT
// Class to verify Initialized and Template Assertion Predicates by trying to find OpaqueLoop*Nodes.
class OpaqueLoopNodesVerifier : public BFSActions {
bool _found_init;
bool _found_stride;
public:
OpaqueLoopNodesVerifier()
: _found_init(false),
_found_stride(false) {}
// A Template Assertion Predicate has:
// - Always an OpaqueLoopInitNode
// - Only an OpaqueLoopStrideNode for the last value.
void verify(const TemplateAssertionPredicate& template_assertion_predicate) {
DataNodeBFS bfs(*this);
bfs.run(template_assertion_predicate.opaque_node());
if (template_assertion_predicate.is_last_value()) {
assert(_found_init && _found_stride,
"must find OpaqueLoopInit and OpaqueLoopStride for last value Template Assertion Predicate");
} else {
assert(_found_init && !_found_stride,
"must find OpaqueLoopInit but not OpaqueLoopStride for init value Template Assertion Predicate");
}
}
// An Initialized Assertion Predicate never has any OpaqueLoop*Nodes.
void verify(const InitializedAssertionPredicate& initialized_assertion_predicate) {
DataNodeBFS bfs(*this);
bfs.run(initialized_assertion_predicate.opaque_node());
assert(!_found_init && !_found_stride,
"must neither find OpaqueLoopInit nor OpaqueLoopStride for Initialized Assertion Predicate");
}
bool should_visit(Node* node) const override {
return TemplateAssertionExpressionNode::is_maybe_in_expression(node);
}
bool is_target_node(Node* node) const override {
return node->is_Opaque1();
}
void target_node_action(Node* target_node) override {
if (target_node->is_OpaqueLoopInit()) {
assert(!_found_init, "should only find one OpaqueLoopInitNode");
_found_init = true;
} else {
assert(target_node->is_OpaqueLoopStride(), "unexpected Opaque1 node");
assert(!_found_stride, "should only find one OpaqueLoopStrideNode");
_found_stride = true;
}
}
};
// Verify that the Template Assertion Predicate has the correct OpaqueLoop*Nodes.
void TemplateAssertionPredicate::verify() const {
OpaqueLoopNodesVerifier opaque_loop_nodes_verifier;
opaque_loop_nodes_verifier.verify(*this);
}
// Verify that the Initialized Assertion Predicate has no OpaqueLoop*Node.
void InitializedAssertionPredicate::verify() const {
OpaqueLoopNodesVerifier opaque_loop_nodes_verifier;
opaque_loop_nodes_verifier.verify(*this);
}
#endif // ASSERT
// Initialized Assertion Predicates always have the dedicated OpaqueInitiailizedAssertionPredicate node to identify
// them.
bool InitializedAssertionPredicate::is_predicate(Node* node) {
@ -418,36 +480,38 @@ TemplateAssertionExpression::clone(const TransformStrategyForOpaqueLoopNodes& tr
// This class is used to replace the input to OpaqueLoopStrideNode with a new node while leaving the other nodes
// unchanged.
class ReplaceOpaqueStrideInput : public StackObj {
class ReplaceOpaqueStrideInput : public BFSActions {
Node* _new_opaque_stride_input;
PhaseIterGVN& _igvn;
Unique_Node_List _nodes_to_visit;
public:
ReplaceOpaqueStrideInput(OpaqueTemplateAssertionPredicateNode* start_node, PhaseIterGVN& igvn) : _igvn(igvn) {
_nodes_to_visit.push(start_node);
}
ReplaceOpaqueStrideInput(Node* new_opaque_stride_input, PhaseIterGVN& igvn)
: _new_opaque_stride_input(new_opaque_stride_input),
_igvn(igvn) {}
NONCOPYABLE(ReplaceOpaqueStrideInput);
void replace(Node* new_opaque_stride_input) {
for (uint i = 0; i < _nodes_to_visit.size(); i++) {
Node* next = _nodes_to_visit[i];
for (uint j = 1; j < next->req(); j++) {
Node* input = next->in(j);
if (input->is_OpaqueLoopStride()) {
assert(TemplateAssertionExpressionNode::is_maybe_in_expression(input), "must also pass node filter");
_igvn.replace_input_of(input, 1, new_opaque_stride_input);
} else if (TemplateAssertionExpressionNode::is_maybe_in_expression(input)) {
_nodes_to_visit.push(input);
}
}
}
void replace_for(OpaqueTemplateAssertionPredicateNode* opaque_node) {
DataNodeBFS bfs(*this);
bfs.run(opaque_node);
}
bool should_visit(Node* node) const override {
return TemplateAssertionExpressionNode::is_maybe_in_expression(node);
}
bool is_target_node(Node* node) const override {
return node->is_OpaqueLoopStride();
}
void target_node_action(Node* target_node) override {
_igvn.replace_input_of(target_node, 1, _new_opaque_stride_input);
}
};
// Replace the input to OpaqueLoopStrideNode with 'new_stride' and leave the other nodes unchanged.
void TemplateAssertionExpression::replace_opaque_stride_input(Node* new_stride, PhaseIterGVN& igvn) {
ReplaceOpaqueStrideInput replace_opaque_stride_input(_opaque_node, igvn);
replace_opaque_stride_input.replace(new_stride);
ReplaceOpaqueStrideInput replace_opaque_stride_input(new_stride, igvn);
replace_opaque_stride_input.replace_for(_opaque_node);
}
// The transformations of this class fold the OpaqueLoop* nodes by returning their inputs.
@ -676,10 +740,15 @@ IfTrueNode* TemplateAssertionPredicateCreator::create(Node* new_control) {
IfTrueNode* template_predicate_success_proj =
create_if_node(new_control, template_assertion_predicate_expression, does_overflow,
AssertionPredicateType::InitValue);
DEBUG_ONLY(TemplateAssertionPredicate::verify(template_predicate_success_proj);)
template_assertion_predicate_expression = create_for_last_value(template_predicate_success_proj, opaque_init,
does_overflow);
return create_if_node(template_predicate_success_proj, template_assertion_predicate_expression,
does_overflow, AssertionPredicateType::LastValue);
template_predicate_success_proj = create_if_node(template_predicate_success_proj,
template_assertion_predicate_expression, does_overflow,
AssertionPredicateType::LastValue);
DEBUG_ONLY(TemplateAssertionPredicate::verify(template_predicate_success_proj);)
return template_predicate_success_proj;
}
InitializedAssertionPredicateCreator::InitializedAssertionPredicateCreator(PhaseIdealLoop* phase)
@ -735,8 +804,10 @@ IfTrueNode* InitializedAssertionPredicateCreator::create(Node* operand, Node* ne
bool does_overflow;
OpaqueInitializedAssertionPredicateNode* assertion_expression =
expression_creator.create_for_initialized(new_control, operand, does_overflow);
return create_control_nodes(new_control, does_overflow ? Op_If : Op_RangeCheck, assertion_expression,
assertion_predicate_type);
IfTrueNode* success_proj = create_control_nodes(new_control, does_overflow ? Op_If : Op_RangeCheck,
assertion_expression, assertion_predicate_type);
DEBUG_ONLY(InitializedAssertionPredicate::verify(success_proj);)
return success_proj;
}
// Creates the CFG nodes for the Initialized Assertion Predicate.
@ -832,9 +903,13 @@ void CreateAssertionPredicatesVisitor::visit(const TemplateAssertionPredicate& t
// Create an Initialized Assertion Predicate from the provided Template Assertion Predicate.
IfTrueNode* CreateAssertionPredicatesVisitor::initialize_from_template(
const TemplateAssertionPredicate& template_assertion_predicate) const {
DEBUG_ONLY(template_assertion_predicate.verify();)
IfNode* template_head = template_assertion_predicate.head();
IfTrueNode* initialized_predicate = _phase->create_initialized_assertion_predicate(template_head, _init, _stride,
_new_control);
InitializedAssertionPredicateCreator initialized_assertion_predicate_creator(_phase);
IfTrueNode* initialized_predicate = initialized_assertion_predicate_creator.create_from_template(template_head,
_new_control,
_init, _stride);
DEBUG_ONLY(InitializedAssertionPredicate::verify(initialized_predicate);)
template_assertion_predicate.rewire_loop_data_dependencies(initialized_predicate, _node_in_loop_body, _phase);
return initialized_predicate;
}

View File

@ -400,6 +400,15 @@ class TemplateAssertionPredicate : public Predicate {
void rewire_loop_data_dependencies(IfTrueNode* target_predicate, const NodeInLoopBody& data_in_loop_body,
PhaseIdealLoop* phase) const;
static bool is_predicate(Node* node);
#ifdef ASSERT
static void verify(IfTrueNode* template_assertion_predicate_success_proj) {
TemplateAssertionPredicate template_assertion_predicate(template_assertion_predicate_success_proj);
template_assertion_predicate.verify();
}
void verify() const;
#endif // ASSERT
};
// Class to represent an Initialized Assertion Predicate which always has a halt node on the failing path.
@ -419,6 +428,10 @@ class InitializedAssertionPredicate : public Predicate {
return _if_node->in(0);
}
OpaqueInitializedAssertionPredicateNode* opaque_node() const {
return _if_node->in(1)->as_OpaqueInitializedAssertionPredicate();
}
IfNode* head() const override {
return _if_node;
}
@ -433,6 +446,15 @@ class InitializedAssertionPredicate : public Predicate {
void kill(PhaseIdealLoop* phase) const;
static bool is_predicate(Node* node);
#ifdef ASSERT
static void verify(IfTrueNode* initialized_assertion_predicate_success_proj) {
InitializedAssertionPredicate initialized_assertion_predicate(initialized_assertion_predicate_success_proj);
initialized_assertion_predicate.verify();
}
void verify() const;
#endif // ASSERT
};
// Interface to transform OpaqueLoopInit and OpaqueLoopStride nodes of a Template Assertion Expression.

View File

@ -1868,6 +1868,7 @@ bool SuperWord::schedule_and_apply() const {
}
if (!vtransform.schedule()) { return false; }
if (vtransform.has_store_to_load_forwarding_failure()) { return false; }
vtransform.apply();
return true;
}

View File

@ -31,7 +31,7 @@
#include "opto/vectorization.hpp"
#ifndef PRODUCT
static void print_con_or_idx(const Node* n) {
void VPointer::print_con_or_idx(const Node* n) {
if (n == nullptr) {
tty->print("( 0)");
} else if (n->is_ConI()) {
@ -1369,12 +1369,12 @@ void VPointer::print() const {
tty->print("adr: %4d, ", _adr != nullptr ? _adr->_idx : 0);
tty->print(" base");
print_con_or_idx(_base);
VPointer::print_con_or_idx(_base);
tty->print(" + offset(%4d)", _offset);
tty->print(" + invar");
print_con_or_idx(_invar);
VPointer::print_con_or_idx(_invar);
tty->print_cr(" + scale(%4d) * iv]", _scale);
}
@ -2168,15 +2168,15 @@ void AlignmentSolver::trace_start_solve() const {
// iv = init + pre_iter * pre_stride + main_iter * main_stride
tty->print(" iv = init");
print_con_or_idx(_init_node);
VPointer::print_con_or_idx(_init_node);
tty->print_cr(" + pre_iter * pre_stride(%d) + main_iter * main_stride(%d)",
_pre_stride, _main_stride);
// adr = base + offset + invar + scale * iv
tty->print(" adr = base");
print_con_or_idx(_base);
VPointer::print_con_or_idx(_base);
tty->print(" + offset(%d) + invar", _offset);
print_con_or_idx(_invar);
VPointer::print_con_or_idx(_invar);
tty->print_cr(" + scale(%d) * iv", _scale);
}
}

View File

@ -870,6 +870,7 @@ class VPointer : public ArenaObj {
static int cmp_for_sort(const VPointer** p1, const VPointer** p2);
NOT_PRODUCT( void print() const; )
NOT_PRODUCT( static void print_con_or_idx(const Node* n); )
#ifndef PRODUCT
class Tracer {

View File

@ -2085,6 +2085,55 @@ Node* VectorBlendNode::Identity(PhaseGVN* phase) {
}
return this;
}
static bool is_replicate_uint_constant(const Node* n) {
return n->Opcode() == Op_Replicate &&
n->in(1)->is_Con() &&
n->in(1)->bottom_type()->isa_long() &&
n->in(1)->bottom_type()->is_long()->get_con() <= 0xFFFFFFFFL;
}
static bool has_vector_elements_fit_uint(Node* n) {
auto is_lower_doubleword_mask_pattern = [](const Node* n) {
return n->Opcode() == Op_AndV &&
(is_replicate_uint_constant(n->in(1)) ||
is_replicate_uint_constant(n->in(2)));
};
auto is_clear_upper_doubleword_uright_shift_pattern = [](const Node* n) {
return n->Opcode() == Op_URShiftVL &&
n->in(2)->Opcode() == Op_RShiftCntV && n->in(2)->in(1)->is_Con() &&
n->in(2)->in(1)->bottom_type()->isa_int() &&
n->in(2)->in(1)->bottom_type()->is_int()->get_con() >= 32;
};
return is_lower_doubleword_mask_pattern(n) || // (AndV SRC (Replicate C)) where C <= 0xFFFFFFFF
is_clear_upper_doubleword_uright_shift_pattern(n); // (URShiftV SRC S) where S >= 32
}
static bool has_vector_elements_fit_int(Node* n) {
auto is_cast_integer_to_long_pattern = [](const Node* n) {
return n->Opcode() == Op_VectorCastI2X && Matcher::vector_element_basic_type(n) == T_LONG;
};
auto is_clear_upper_doubleword_right_shift_pattern = [](const Node* n) {
return n->Opcode() == Op_RShiftVL &&
n->in(2)->Opcode() == Op_RShiftCntV && n->in(2)->in(1)->is_Con() &&
n->in(2)->in(1)->bottom_type()->isa_int() &&
n->in(2)->in(1)->bottom_type()->is_int()->get_con() >= 32;
};
return is_cast_integer_to_long_pattern(n) || // (VectorCastI2X SRC)
is_clear_upper_doubleword_right_shift_pattern(n); // (RShiftV SRC S) where S >= 32
}
bool MulVLNode::has_int_inputs() const {
return has_vector_elements_fit_int(in(1)) &&
has_vector_elements_fit_int(in(2));
}
bool MulVLNode::has_uint_inputs() const {
return has_vector_elements_fit_uint(in(1)) &&
has_vector_elements_fit_uint(in(2));
}
#ifndef PRODUCT
void VectorBoxAllocateNode::dump_spec(outputStream *st) const {

View File

@ -441,8 +441,12 @@ class MulVINode : public VectorNode {
// Vector multiply long
class MulVLNode : public VectorNode {
public:
MulVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
MulVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {
init_class_id(Class_MulVL);
}
virtual int Opcode() const;
bool has_int_inputs() const;
bool has_uint_inputs() const;
};
//------------------------------MulVFNode--------------------------------------

View File

@ -144,6 +144,274 @@ void VTransformApplyResult::trace(VTransformNode* vtnode) const {
}
#endif
// We use two comparisons, because a subtraction could underflow.
#define RETURN_CMP_VALUE_IF_NOT_EQUAL(a, b) \
if (a < b) { return -1; } \
if (a > b) { return 1; }
// Helper-class for VTransformGraph::has_store_to_load_forwarding_failure.
// It represents a memory region: [ptr, ptr + memory_size)
class VMemoryRegion : public StackObj {
private:
Node* _base; // ptr = base + offset + invar + scale * iv
int _scale;
Node* _invar;
int _offset;
uint _memory_size;
bool _is_load; // load or store?
uint _schedule_order;
public:
VMemoryRegion() {} // empty constructor for GrowableArray
VMemoryRegion(const VPointer& vpointer, int iv_offset, int vector_length, uint schedule_order) :
_base(vpointer.base()),
_scale(vpointer.scale_in_bytes()),
_invar(vpointer.invar()),
_offset(vpointer.offset_in_bytes() + _scale * iv_offset),
_memory_size(vpointer.memory_size() * vector_length),
_is_load(vpointer.mem()->is_Load()),
_schedule_order(schedule_order) {}
Node* base() const { return _base; }
int scale() const { return _scale; }
Node* invar() const { return _invar; }
int offset() const { return _offset; }
uint memory_size() const { return _memory_size; }
bool is_load() const { return _is_load; }
uint schedule_order() const { return _schedule_order; }
static int cmp_for_sort_by_group(VMemoryRegion* r1, VMemoryRegion* r2) {
RETURN_CMP_VALUE_IF_NOT_EQUAL(r1->base()->_idx, r2->base()->_idx);
RETURN_CMP_VALUE_IF_NOT_EQUAL(r1->scale(), r2->scale());
int r1_invar_idx = r1->invar() == nullptr ? 0 : r1->invar()->_idx;
int r2_invar_idx = r2->invar() == nullptr ? 0 : r2->invar()->_idx;
RETURN_CMP_VALUE_IF_NOT_EQUAL(r1_invar_idx, r2_invar_idx);
return 0; // equal
}
static int cmp_for_sort(VMemoryRegion* r1, VMemoryRegion* r2) {
int cmp_group = cmp_for_sort_by_group(r1, r2);
if (cmp_group != 0) { return cmp_group; }
RETURN_CMP_VALUE_IF_NOT_EQUAL(r1->offset(), r2->offset());
return 0; // equal
}
enum Aliasing { DIFFERENT_GROUP, BEFORE, EXACT_OVERLAP, PARTIAL_OVERLAP, AFTER };
Aliasing aliasing(VMemoryRegion& other) {
VMemoryRegion* p1 = this;
VMemoryRegion* p2 = &other;
if (cmp_for_sort_by_group(p1, p2) != 0) { return DIFFERENT_GROUP; }
jlong offset1 = p1->offset();
jlong offset2 = p2->offset();
jlong memory_size1 = p1->memory_size();
jlong memory_size2 = p2->memory_size();
if (offset1 >= offset2 + memory_size2) { return AFTER; }
if (offset2 >= offset1 + memory_size1) { return BEFORE; }
if (offset1 == offset2 && memory_size1 == memory_size2) { return EXACT_OVERLAP; }
return PARTIAL_OVERLAP;
}
#ifndef PRODUCT
void print() const {
tty->print("VMemoryRegion[%s %dbytes, schedule_order(%4d), base",
_is_load ? "load " : "store", _memory_size, _schedule_order);
VPointer::print_con_or_idx(_base);
tty->print(" + offset(%4d)", _offset);
tty->print(" + invar");
VPointer::print_con_or_idx(_invar);
tty->print_cr(" + scale(%4d) * iv]", _scale);
}
#endif
};
// Store-to-load-forwarding is a CPU memory optimization, where a load can directly fetch
// its value from the store-buffer, rather than from the L1 cache. This is many CPU cycles
// faster. However, this optimization comes with some restrictions, depending on the CPU.
// Generally, store-to-load-forwarding works if the load and store memory regions match
// exactly (same start and width). Generally problematic are partial overlaps - though
// some CPU's can handle even some subsets of these cases. We conservatively assume that
// all such partial overlaps lead to a store-to-load-forwarding failures, which means the
// load has to stall until the store goes from the store-buffer into the L1 cache, incurring
// a penalty of many CPU cycles.
//
// Example (with "iteration distance" 2):
// for (int i = 10; i < SIZE; i++) {
// aI[i] = aI[i - 2] + 1;
// }
//
// load_4_bytes( ptr + -8)
// store_4_bytes(ptr + 0) *
// load_4_bytes( ptr + -4) |
// store_4_bytes(ptr + 4) | *
// load_4_bytes( ptr + 0) <-+ |
// store_4_bytes(ptr + 8) |
// load_4_bytes( ptr + 4) <---+
// store_4_bytes(ptr + 12)
// ...
//
// In the scalar loop, we can forward the stores from 2 iterations back.
//
// Assume we have 2-element vectors (2*4 = 8 bytes), with the "iteration distance" 2
// example. This gives us this machine code:
// load_8_bytes( ptr + -8)
// store_8_bytes(ptr + 0) |
// load_8_bytes( ptr + 0) v
// store_8_bytes(ptr + 8) |
// load_8_bytes( ptr + 8) v
// store_8_bytes(ptr + 16)
// ...
//
// We packed 2 iterations, and the stores can perfectly forward to the loads of
// the next 2 iterations.
//
// Example (with "iteration distance" 3):
// for (int i = 10; i < SIZE; i++) {
// aI[i] = aI[i - 3] + 1;
// }
//
// load_4_bytes( ptr + -12)
// store_4_bytes(ptr + 0) *
// load_4_bytes( ptr + -8) |
// store_4_bytes(ptr + 4) |
// load_4_bytes( ptr + -4) |
// store_4_bytes(ptr + 8) |
// load_4_bytes( ptr + 0) <-+
// store_4_bytes(ptr + 12)
// ...
//
// In the scalar loop, we can forward the stores from 3 iterations back.
//
// Unfortunately, vectorization can introduce such store-to-load-forwarding failures.
// Assume we have 2-element vectors (2*4 = 8 bytes), with the "iteration distance" 3
// example. This gives us this machine code:
// load_8_bytes( ptr + -12)
// store_8_bytes(ptr + 0) | |
// load_8_bytes( ptr + -4) x |
// store_8_bytes(ptr + 8) ||
// load_8_bytes( ptr + 4) xx <-- partial overlap with 2 stores
// store_8_bytes(ptr + 16)
// ...
//
// We see that eventually all loads are dependent on earlier stores, but the values cannot
// be forwarded because there is some partial overlap.
//
// Preferably, we would have some latency-based cost-model that accounts for such forwarding
// failures, and decide if vectorization with forwarding failures is still profitable. For
// now we go with a simpler heuristic: we simply forbid vectorization if we can PROVE that
// there will be a forwarding failure. This approach has at least 2 possible weaknesses:
//
// (1) There may be forwarding failures in cases where we cannot prove it.
// Example:
// for (int i = 10; i < SIZE; i++) {
// bI[i] = aI[i - 3] + 1;
// }
//
// We do not know if aI and bI refer to the same array or not. However, it is reasonable
// to assume that if we have two different array references, they most likely refer
// to different arrays (i.e. no aliasing), where we would have no forwarding failures.
// (2) There could be some loops where vectorization introduces forwarding failures, and thus
// the latency of the loop body is high, but this does not matter because it is dominated
// by other latency/throughput based costs in the loop body.
//
// Performance measurements with the JMH benchmark StoreToLoadForwarding.java have indicated
// that there is some iteration threshold: if the failure happens between a store and load that
// have an iteration distance below this threshold, the latency is the limiting factor, and we
// should not vectorize to avoid the latency penalty of store-to-load-forwarding failures. If
// the iteration distance is larger than this threshold, the throughput is the limiting factor,
// and we should vectorize in these cases to improve throughput.
//
// Returns true if we can PROVE that the vectorized loop contains a partial
// overlap between a store and a later load of the same pointer group, which
// we predict to cause a store-to-load-forwarding failure penalty (see the
// discussion above). The caller can then reject vectorization as unprofitable.
// Detection is disabled when SuperWordStoreToLoadForwardingFailureDetection == 0.
bool VTransformGraph::has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const {
if (SuperWordStoreToLoadForwardingFailureDetection == 0) { return false; }
// Collect all pointers for scalar and vector loads/stores.
ResourceMark rm;
GrowableArray<VMemoryRegion> memory_regions;
// To detect store-to-load-forwarding failures at the iteration threshold or below, we
// simulate a super-unrolling to reach SuperWordStoreToLoadForwardingFailureDetection
// iterations at least. This is a heuristic, and we are not trying to be very precise
// with the iteration distance. If we have already unrolled more than the iteration
// threshold, i.e. if "SuperWordStoreToLoadForwardingFailureDetection < unrolled_count",
// then we simply check if there are any store-to-load-forwarding failures in the unrolled
// loop body, which may be at larger distance than the desired threshold. We cannot do any
// more fine-grained analysis, because the unrolling has lost the information about the
// iteration distance.
int simulated_unrolling_count = SuperWordStoreToLoadForwardingFailureDetection;
int unrolled_count = vloop_analyzer.vloop().cl()->unrolled_count();
uint simulated_super_unrolling_count = MAX2(1, simulated_unrolling_count / unrolled_count);
int iv_stride = vloop_analyzer.vloop().iv_stride();
int schedule_order = 0;
// Walk the schedule once per simulated super-unrolling, shifting all pointers
// by k * iv_stride to model the pointer advance of k extra loop bodies.
for (uint k = 0; k < simulated_super_unrolling_count; k++) {
int iv_offset = k * iv_stride; // virtual super-unrolling
for (int i = 0; i < _schedule.length(); i++) {
VTransformNode* vtn = _schedule.at(i);
if (vtn->is_load_or_store_in_loop()) {
const VPointer& p = vtn->vpointer(vloop_analyzer);
if (p.valid()) {
// A vector covers vector_length scalar elements; scalars count as 1.
VTransformVectorNode* vector = vtn->isa_Vector();
uint vector_length = vector != nullptr ? vector->nodes().length() : 1;
memory_regions.push(VMemoryRegion(p, iv_offset, vector_length, schedule_order++));
}
}
}
}
// Sort the pointers by group (same base, invar and stride), and then by offset.
memory_regions.sort(VMemoryRegion::cmp_for_sort);
#ifndef PRODUCT
if (_trace._verbose) {
tty->print_cr("VTransformGraph::has_store_to_load_forwarding_failure:");
tty->print_cr("  simulated_unrolling_count = %d", simulated_unrolling_count);
tty->print_cr("  simulated_super_unrolling_count = %d", simulated_super_unrolling_count);
for (int i = 0; i < memory_regions.length(); i++) {
VMemoryRegion& region = memory_regions.at(i);
region.print();
}
}
#endif
// For all pairs of pointers in the same group, check if they have a partial overlap.
// Since the list is sorted by (group, offset), for each region1 we only need to
// scan forward until we leave the group or reach regions entirely past region1.
for (int i = 0; i < memory_regions.length(); i++) {
VMemoryRegion& region1 = memory_regions.at(i);
for (int j = i + 1; j < memory_regions.length(); j++) {
VMemoryRegion& region2 = memory_regions.at(j);
const VMemoryRegion::Aliasing aliasing = region1.aliasing(region2);
if (aliasing == VMemoryRegion::Aliasing::DIFFERENT_GROUP ||
aliasing == VMemoryRegion::Aliasing::BEFORE) {
break; // We have reached the next group or pointers that are always after.
} else if (aliasing == VMemoryRegion::Aliasing::EXACT_OVERLAP) {
continue;
} else {
assert(aliasing == VMemoryRegion::Aliasing::PARTIAL_OVERLAP, "no other case can happen");
// Only a store followed (in schedule order) by a partially overlapping
// load is problematic; load/load and load-before-store pairs are fine.
if ((region1.is_load() && !region2.is_load() && region1.schedule_order() > region2.schedule_order()) ||
(!region1.is_load() && region2.is_load() && region1.schedule_order() < region2.schedule_order())) {
// We predict that this leads to a store-to-load-forwarding failure penalty.
#ifndef PRODUCT
if (_trace._rejections) {
tty->print_cr("VTransformGraph::has_store_to_load_forwarding_failure:");
tty->print_cr("  Partial overlap of store->load. We predict that this leads to");
tty->print_cr("  a store-to-load-forwarding failure penalty which makes");
tty->print_cr("  vectorization unprofitable. These are the two pointers:");
region1.print();
region2.print();
}
#endif
return true;
}
}
}
}
return false;
}
Node* VTransformNode::find_transformed_input(int i, const GrowableArray<Node*>& vnode_idx_to_transformed_node) const {
Node* n = vnode_idx_to_transformed_node.at(in(i)->_idx);
assert(n != nullptr, "must find input IR node");

View File

@ -66,6 +66,8 @@ class VTransformVectorNode;
class VTransformElementWiseVectorNode;
class VTransformBoolVectorNode;
class VTransformReductionVectorNode;
class VTransformLoadVectorNode;
class VTransformStoreVectorNode;
// Result from VTransformNode::apply
class VTransformApplyResult {
@ -157,6 +159,7 @@ public:
const GrowableArray<VTransformNode*>& vtnodes() const { return _vtnodes; }
bool schedule();
bool has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const;
void apply_memops_reordering_with_schedule() const;
void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const;
@ -221,6 +224,7 @@ public:
VTransformGraph& graph() { return _graph; }
bool schedule() { return _graph.schedule(); }
bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); }
void apply();
private:
@ -310,6 +314,11 @@ public:
virtual VTransformElementWiseVectorNode* isa_ElementWiseVector() { return nullptr; }
virtual VTransformBoolVectorNode* isa_BoolVector() { return nullptr; }
virtual VTransformReductionVectorNode* isa_ReductionVector() { return nullptr; }
virtual VTransformLoadVectorNode* isa_LoadVector() { return nullptr; }
virtual VTransformStoreVectorNode* isa_StoreVector() { return nullptr; }
virtual bool is_load_or_store_in_loop() const { return false; }
virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const { ShouldNotReachHere(); }
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const = 0;
@ -333,6 +342,8 @@ public:
VTransformNode(vtransform, n->req()), _node(n) {}
Node* node() const { return _node; }
virtual VTransformScalarNode* isa_Scalar() override { return this; }
virtual bool is_load_or_store_in_loop() const override { return _node->is_Load() || _node->is_Store(); }
virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(node()->as_Mem()); }
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
NOT_PRODUCT(virtual const char* name() const override { return "Scalar"; };)
@ -347,6 +358,7 @@ public:
VTransformInputScalarNode(VTransform& vtransform, Node* n) :
VTransformScalarNode(vtransform, n) {}
virtual VTransformInputScalarNode* isa_InputScalar() override { return this; }
virtual bool is_load_or_store_in_loop() const override { return false; }
NOT_PRODUCT(virtual const char* name() const override { return "InputScalar"; };)
};
@ -472,6 +484,9 @@ public:
VTransformLoadVectorNode(VTransform& vtransform, uint number_of_nodes) :
VTransformVectorNode(vtransform, 3, number_of_nodes) {}
LoadNode::ControlDependency control_dependency() const;
virtual VTransformLoadVectorNode* isa_LoadVector() override { return this; }
virtual bool is_load_or_store_in_loop() const override { return true; }
virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(nodes().at(0)->as_Mem()); }
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
NOT_PRODUCT(virtual const char* name() const override { return "LoadVector"; };)
@ -482,6 +497,9 @@ public:
// req = 4 -> [ctrl, mem, adr, val]
VTransformStoreVectorNode(VTransform& vtransform, uint number_of_nodes) :
VTransformVectorNode(vtransform, 4, number_of_nodes) {}
virtual VTransformStoreVectorNode* isa_StoreVector() override { return this; }
virtual bool is_load_or_store_in_loop() const override { return true; }
virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(nodes().at(0)->as_Mem()); }
virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer,
const GrowableArray<Node*>& vnode_idx_to_transformed_node) const override;
NOT_PRODUCT(virtual const char* name() const override { return "StoreVector"; };)

View File

@ -1080,12 +1080,7 @@ JvmtiEnvBase::get_locked_objects_in_frame(JavaThread* calling_thread, JavaThread
ObjectMonitor *mon = target->current_waiting_monitor();
if (mon != nullptr) wait_obj = mon->object();
} else {
assert(vthread != nullptr, "no vthread oop");
oop cont = java_lang_VirtualThread::continuation(vthread);
assert(cont != nullptr, "vthread with no continuation");
stackChunkOop chunk = jdk_internal_vm_Continuation::tail(cont);
assert(chunk != nullptr, "unmounted vthread should have a chunk");
ObjectMonitor *mon = chunk->current_waiting_monitor();
ObjectMonitor *mon = java_lang_VirtualThread::current_waiting_monitor(vthread);
if (mon != nullptr) wait_obj = mon->object();
}
}
@ -1099,12 +1094,7 @@ JvmtiEnvBase::get_locked_objects_in_frame(JavaThread* calling_thread, JavaThread
ObjectMonitor *mon = target->current_pending_monitor();
if (mon != nullptr) pending_obj = mon->object();
} else {
assert(vthread != nullptr, "no vthread oop");
oop cont = java_lang_VirtualThread::continuation(vthread);
assert(cont != nullptr, "vthread with no continuation");
stackChunkOop chunk = jdk_internal_vm_Continuation::tail(cont);
assert(chunk != nullptr, "unmounted vthread should have a chunk");
ObjectMonitor *mon = chunk->current_pending_monitor();
ObjectMonitor *mon = java_lang_VirtualThread::current_pending_monitor(vthread);
if (mon != nullptr) pending_obj = mon->object();
}
}
@ -2569,12 +2559,9 @@ GetCurrentContendedMonitorClosure::do_thread(Thread *target) {
void
GetCurrentContendedMonitorClosure::do_vthread(Handle target_h) {
if (_target_jt == nullptr) {
oop cont = java_lang_VirtualThread::continuation(target_h());
assert(cont != nullptr, "vthread with no continuation");
stackChunkOop chunk = jdk_internal_vm_Continuation::tail(cont);
assert(chunk != nullptr, "unmounted vthread should have a chunk");
if (chunk->current_pending_monitor() != nullptr) {
*_owned_monitor_ptr = JNIHandles::make_local(_calling_thread, chunk->current_pending_monitor()->object());
ObjectMonitor *mon = java_lang_VirtualThread::current_pending_monitor(target_h());
if (mon != nullptr) {
*_owned_monitor_ptr = JNIHandles::make_local(_calling_thread, mon->object());
}
_result = JVMTI_ERROR_NONE; // target virtual thread is unmounted
return;

View File

@ -186,6 +186,16 @@ WB_ENTRY(jstring, WB_PrintString(JNIEnv* env, jobject wb, jstring str, jint max_
return (jstring) JNIHandles::make_local(THREAD, result);
WB_END
WB_ENTRY(jint, WB_TakeLockAndHangInSafepoint(JNIEnv* env, jobject wb))
  // Test-only hook: take a VM mutex and then execute a VM operation that
  // hangs inside the safepoint, leaving the VM thread stuck with the lock
  // held. Used by tests that exercise stuck-safepoint detection.
  // Note: the unused local "JavaThread* self" was removed.
  // VMStatistic_lock is used to minimize interference with VM locking
  MutexLocker mu(VMStatistic_lock);
  VM_HangInSafepoint force_safepoint_stuck_op;
  VMThread::execute(&force_safepoint_stuck_op);
  // The VM operation never completes, so execution cannot get here.
  ShouldNotReachHere();
  return 0;
WB_END
class WBIsKlassAliveClosure : public LockedClassesDo {
Symbol* _name;
int _count;
@ -2988,6 +2998,7 @@ static JNINativeMethod methods[] = {
{CC"cleanMetaspaces", CC"()V", (void*)&WB_CleanMetaspaces},
{CC"rss", CC"()J", (void*)&WB_Rss},
{CC"printString", CC"(Ljava/lang/String;I)Ljava/lang/String;", (void*)&WB_PrintString},
{CC"lockAndStuckInSafepoint", CC"()V", (void*)&WB_TakeLockAndHangInSafepoint},
{CC"wordSize", CC"()J", (void*)&WB_WordSize},
{CC"rootChunkWordSize", CC"()J", (void*)&WB_RootChunkWordSize}
};

Some files were not shown because too many files have changed in this diff Show More