8323582: C2 SuperWord AlignVector: misaligned vector memory access with unaligned native memory

Reviewed-by: roland, kvn
Emanuel Peter 2025-02-27 06:58:43 +00:00
parent bb48b7319c
commit 885338b5f3
27 changed files with 1067 additions and 129 deletions


@@ -708,6 +708,7 @@
declare_constant(Deoptimization::Reason_constraint) \
declare_constant(Deoptimization::Reason_div0_check) \
declare_constant(Deoptimization::Reason_loop_limit_check) \
declare_constant(Deoptimization::Reason_auto_vectorization_check) \
declare_constant(Deoptimization::Reason_type_checked_inlining) \
declare_constant(Deoptimization::Reason_optimized_type_check) \
declare_constant(Deoptimization::Reason_aliasing) \


@@ -346,6 +346,12 @@
develop(bool, TraceLoopUnswitching, false, \
"Trace loop unswitching") \
\
product(bool, LoopMultiversioning, true, DIAGNOSTIC, \
"Enable loop multiversioning (for speculative compilation)") \
\
develop(bool, TraceLoopMultiversioning, false, \
"Trace loop multiversioning") \
\
product(bool, AllowVectorizeOnDemand, true, \
"Globally suppress vectorization set in VectorizeMethod") \
\


@@ -428,7 +428,7 @@ public:
IfNode(Node* control, Node* bol, float p, float fcnt);
IfNode(Node* control, Node* bol, float p, float fcnt, AssertionPredicateType assertion_predicate_type);
static IfNode* make_with_same_profile(IfNode* if_node_profile, Node* ctrl, BoolNode* bol);
static IfNode* make_with_same_profile(IfNode* if_node_profile, Node* ctrl, Node* bol);
virtual int Opcode() const;
virtual bool pinned() const { return true; }


@@ -277,6 +277,7 @@ macro(OnSpinWait)
macro(Opaque1)
macro(OpaqueLoopInit)
macro(OpaqueLoopStride)
macro(OpaqueMultiversioning)
macro(OpaqueZeroTripGuard)
macro(OpaqueNotNull)
macro(OpaqueInitializedAssertionPredicate)


@@ -4086,6 +4086,7 @@ void GraphKit::add_parse_predicates(int nargs) {
if (UseProfiledLoopPredicate) {
add_parse_predicate(Deoptimization::Reason_profile_predicate, nargs);
}
add_parse_predicate(Deoptimization::Reason_auto_vectorization_check, nargs);
// Loop Limit Check Predicate should be near the loop.
add_parse_predicate(Deoptimization::Reason_loop_limit_check, nargs);
}


@@ -469,7 +469,7 @@ static Node* split_if(IfNode *iff, PhaseIterGVN *igvn) {
return new ConINode(TypeInt::ZERO);
}
IfNode* IfNode::make_with_same_profile(IfNode* if_node_profile, Node* ctrl, BoolNode* bol) {
IfNode* IfNode::make_with_same_profile(IfNode* if_node_profile, Node* ctrl, Node* bol) {
// Assert here that we only try to create a clone from an If node with the same profiling if that actually makes sense.
// Some If node subtypes should not be cloned in this way. In theory, we should not clone BaseCountedLoopEndNodes.
// But they can end up being used as normal If nodes when peeling a loop - they serve as zero-trip guard.
@@ -2177,6 +2177,7 @@ ParsePredicateNode::ParsePredicateNode(Node* control, Deoptimization::DeoptReaso
switch (deopt_reason) {
case Deoptimization::Reason_predicate:
case Deoptimization::Reason_profile_predicate:
case Deoptimization::Reason_auto_vectorization_check:
case Deoptimization::Reason_loop_limit_check:
break;
default:
@@ -2214,6 +2215,9 @@ void ParsePredicateNode::dump_spec(outputStream* st) const {
case Deoptimization::DeoptReason::Reason_profile_predicate:
st->print("Profiled Loop ");
break;
case Deoptimization::DeoptReason::Reason_auto_vectorization_check:
st->print("Auto_Vectorization_Check ");
break;
case Deoptimization::DeoptReason::Reason_loop_limit_check:
st->print("Loop Limit Check ");
break;


@@ -745,6 +745,11 @@ void PhaseIdealLoop::do_peeling(IdealLoopTree *loop, Node_List &old_new) {
cl->set_trip_count(cl->trip_count() - 1);
if (cl->is_main_loop()) {
cl->set_normal_loop();
if (cl->is_multiversion()) {
// Peeling also destroys the connection of the main loop
// to the multiversion_if.
cl->set_no_multiversion();
}
#ifndef PRODUCT
if (PrintOpto && VerifyLoopOptimizations) {
tty->print("Peeling a 'main' loop; resetting to 'normal' ");
@@ -1174,8 +1179,9 @@ bool IdealLoopTree::policy_range_check(PhaseIdealLoop* phase, bool provisional,
if (!bol->is_Bool()) {
assert(bol->is_OpaqueNotNull() ||
bol->is_OpaqueTemplateAssertionPredicate() ||
bol->is_OpaqueInitializedAssertionPredicate(),
"Opaque node of a non-null-check or an Assertion Predicate");
bol->is_OpaqueInitializedAssertionPredicate() ||
bol->is_OpaqueMultiversioning(),
"Opaque node of a non-null-check or an Assertion Predicate or Multiversioning");
continue;
}
if (bol->as_Bool()->_test._test == BoolTest::ne) {
@@ -3354,6 +3360,23 @@ bool IdealLoopTree::iteration_split_impl(PhaseIdealLoop *phase, Node_List &old_n
// Do nothing special to pre- and post- loops
if (cl->is_pre_loop() || cl->is_post_loop()) return true;
// With multiversioning, we create a fast_loop and a slow_loop, and a multiversion_if that
// decides which loop is taken at runtime. At first, the multiversion_if always takes the
// fast_loop, and we only optimize the fast_loop. Since we are not sure if we will ever use
// the slow_loop, we delay optimizations for it, so we do not waste compile time and code
// size. If we never change the condition of the multiversion_if, the slow_loop is eventually
// folded away after loop-opts. While optimizing the fast_loop, we may want to perform some
// speculative optimization, for which we need a runtime-check. We add this runtime-check
// condition to the multiversion_if. Now, it becomes possible to execute the slow_loop at
// runtime, and we resume optimizations for slow_loop ("un-delay" it).
// TLDR: If the slow_loop is still in "delay" mode, check if the multiversion_if was changed
// and we should now resume optimizations for it.
if (cl->is_multiversion_delayed_slow_loop() &&
!phase->try_resume_optimizations_for_delayed_slow_loop(this)) {
// We are still delayed, so wait with further loop-opts.
return true;
}
// Compute loop trip count from profile data
compute_profile_trip_cnt(phase);
@@ -3413,6 +3436,12 @@ bool IdealLoopTree::iteration_split_impl(PhaseIdealLoop *phase, Node_List &old_n
if (!phase->may_require_nodes(estimate)) {
return false;
}
// We are going to add pre-loop and post-loop.
// But should we also multi-version for auto-vectorization speculative
// checks, i.e. fast and slow-paths?
phase->maybe_multiversion_for_auto_vectorization_runtime_checks(this, old_new);
phase->insert_pre_post_loops(this, old_new, peel_only);
}
// Adjust the pre- and main-loop limits to let the pre and post loops run


@@ -32,6 +32,23 @@
#include "opto/predicates.hpp"
#include "opto/rootnode.hpp"
// Multiversioning:
// A loop is cloned, and a selector If decides which loop is taken at run-time: the true-path-loop (original) or the
// false-path-loop (cloned).
//
// Use-cases:
// - Speculative compilation:
// The selector If checks some assumptions which allow stronger optimization in the true-path-loop. If the assumptions
// do not hold, we can still execute in the false-path-loop, although with fewer optimizations.
// See: PhaseIdealLoop::maybe_multiversion_for_auto_vectorization_runtime_checks
// PhaseIdealLoop::create_new_if_for_multiversion
//
// - Unswitching:
// The selector If has the same (loop invariant) condition as some unswitching candidate If inside the loop. This
// allows us to constant-fold the unswitching candidate If to true in the true-path-loop and to false in the
// false-path-loop, thus eliminating the unswitching candidate If from the loop.
//
//
// Loop Unswitching is a loop optimization to move an invariant, non-loop-exiting test in the loop body before the loop.
// Such a test is either always true or always false in all loop iterations and could therefore only be executed once.
// To achieve that, we duplicate the loop and change the original and cloned loop as follows:
@@ -145,14 +162,16 @@ IfNode* PhaseIdealLoop::find_unswitch_candidate(const IdealLoopTree* loop) const
return unswitch_candidate;
}
// This class creates an If node (i.e. loop selector) that selects if the true-path-loop or the false-path-loop should be
// executed at runtime. This is done by finding an invariant and non-loop-exiting unswitch candidate If node (guaranteed
// to exist at this point) to perform Loop Unswitching on.
class UnswitchedLoopSelector : public StackObj {
// LoopSelector is used for loop multiversioning and unswitching. This class creates an If node (i.e. loop selector)
// that selects if the true-path-loop or the false-path-loop should be executed at runtime.
class LoopSelector : public StackObj {
// Cached fields for construction.
PhaseIdealLoop* const _phase;
IdealLoopTree* const _outer_loop;
Node* const _original_loop_entry;
IfNode* const _unswitch_candidate;
const uint _dom_depth; // of original_loop_entry
// Constructed selector if with its projections.
IfNode* const _selector;
IfTrueNode* const _true_path_loop_proj;
IfFalseNode* const _false_path_loop_proj;
@@ -160,52 +179,59 @@ class UnswitchedLoopSelector : public StackObj {
enum PathToLoop { TRUE_PATH, FALSE_PATH };
public:
UnswitchedLoopSelector(IdealLoopTree* loop)
// For multiversioning: create a new selector (multiversion_if) from a bol condition.
LoopSelector(IdealLoopTree* loop, Node* bol, float prob, float fcnt)
: _phase(loop->_phase),
_outer_loop(loop->skip_strip_mined()->_parent),
_original_loop_entry(loop->_head->as_Loop()->skip_strip_mined()->in(LoopNode::EntryControl)),
_unswitch_candidate(find_unswitch_candidate(loop)),
_selector(create_selector_if()),
_dom_depth(_phase->dom_depth(_original_loop_entry)),
_selector(create_multiversioning_if(bol, prob, fcnt)), // multiversioning
_true_path_loop_proj(create_proj_to_loop(TRUE_PATH)->as_IfTrue()),
_false_path_loop_proj(create_proj_to_loop(FALSE_PATH)->as_IfFalse()) {
}
NONCOPYABLE(UnswitchedLoopSelector);
private:
IfNode* find_unswitch_candidate(IdealLoopTree* loop) {
IfNode* unswitch_candidate = _phase->find_unswitch_candidate(loop);
assert(unswitch_candidate != nullptr, "guaranteed to exist by policy_unswitching");
assert(_phase->is_member(loop, unswitch_candidate), "must be inside original loop");
return unswitch_candidate;
// For unswitching: create an unswitching if before the loop, from a pre-existing
// unswitch_candidate inside the loop.
LoopSelector(IdealLoopTree* loop, IfNode* unswitch_candidate)
: _phase(loop->_phase),
_outer_loop(loop->skip_strip_mined()->_parent),
_original_loop_entry(loop->_head->as_Loop()->skip_strip_mined()->in(LoopNode::EntryControl)),
_dom_depth(_phase->dom_depth(_original_loop_entry)),
_selector(create_unswitching_if(unswitch_candidate)), // unswitching
_true_path_loop_proj(create_proj_to_loop(TRUE_PATH)->as_IfTrue()),
_false_path_loop_proj(create_proj_to_loop(FALSE_PATH)->as_IfFalse()) {
}
NONCOPYABLE(LoopSelector);
IfNode* create_selector_if() const {
const uint dom_depth = _phase->dom_depth(_original_loop_entry);
IfNode* create_multiversioning_if(Node* bol, float prob, float fcnt) {
_phase->igvn().rehash_node_delayed(_original_loop_entry);
BoolNode* unswitch_candidate_bool = _unswitch_candidate->in(1)->as_Bool();
IfNode* selector_if = IfNode::make_with_same_profile(_unswitch_candidate, _original_loop_entry,
unswitch_candidate_bool);
_phase->register_node(selector_if, _outer_loop, _original_loop_entry, dom_depth);
IfNode* selector_if = new IfNode(_original_loop_entry, bol, prob, fcnt);
_phase->register_node(selector_if, _outer_loop, _original_loop_entry, _dom_depth);
return selector_if;
}
IfNode* create_unswitching_if(IfNode* unswitch_candidate) {
_phase->igvn().rehash_node_delayed(_original_loop_entry);
BoolNode* unswitch_candidate_bool = unswitch_candidate->in(1)->as_Bool();
IfNode* selector_if = IfNode::make_with_same_profile(unswitch_candidate, _original_loop_entry,
unswitch_candidate_bool);
_phase->register_node(selector_if, _outer_loop, _original_loop_entry, _dom_depth);
return selector_if;
}
private:
IfProjNode* create_proj_to_loop(const PathToLoop path_to_loop) {
const uint dom_depth = _phase->dom_depth(_original_loop_entry);
IfProjNode* proj_to_loop;
if (path_to_loop == TRUE_PATH) {
proj_to_loop = new IfTrueNode(_selector);
} else {
proj_to_loop = new IfFalseNode(_selector);
}
_phase->register_node(proj_to_loop, _outer_loop, _selector, dom_depth);
_phase->register_node(proj_to_loop, _outer_loop, _selector, _dom_depth);
return proj_to_loop;
}
public:
IfNode* unswitch_candidate() const {
return _unswitch_candidate;
}
IfNode* selector() const {
return _selector;
}
@@ -219,6 +245,37 @@ class UnswitchedLoopSelector : public StackObj {
}
};
// This class creates an If node (i.e. loop selector) that selects if the true-path-loop or the false-path-loop should be
// executed at runtime. This is done by finding an invariant and non-loop-exiting unswitch candidate If node (guaranteed
// to exist at this point) to perform Loop Unswitching on.
class UnswitchedLoopSelector : public StackObj {
IfNode* const _unswitch_candidate;
const LoopSelector _loop_selector;
public:
UnswitchedLoopSelector(IdealLoopTree* loop)
: _unswitch_candidate(find_unswitch_candidate(loop)),
_loop_selector(loop, _unswitch_candidate) {}
NONCOPYABLE(UnswitchedLoopSelector);
private:
static IfNode* find_unswitch_candidate(IdealLoopTree* loop) {
IfNode* unswitch_candidate = loop->_phase->find_unswitch_candidate(loop);
assert(unswitch_candidate != nullptr, "guaranteed to exist by policy_unswitching");
assert(loop->_phase->is_member(loop, unswitch_candidate), "must be inside original loop");
return unswitch_candidate;
}
public:
IfNode* unswitch_candidate() const {
return _unswitch_candidate;
}
const LoopSelector& loop_selector() const {
return _loop_selector;
}
};
// Class to unswitch the original loop and create Predicates at the new unswitched loop versions. The newly cloned loop
// becomes the false-path-loop while original loop becomes the true-path-loop.
class OriginalLoop : public StackObj {
@@ -238,55 +295,62 @@ class OriginalLoop : public StackObj {
// Unswitch the original loop on the invariant loop selector by creating a true-path-loop and a false-path-loop.
// Remove the unswitch candidate If from both unswitched loop versions which are now covered by the loop selector If.
void unswitch(const UnswitchedLoopSelector& unswitched_loop_selector) {
const uint first_false_path_loop_node_index = _phase->C->unique();
clone_loop(unswitched_loop_selector);
move_parse_and_template_assertion_predicates_to_unswitched_loops(unswitched_loop_selector,
first_false_path_loop_node_index);
DEBUG_ONLY(verify_unswitched_loop_versions(_loop->_head->as_Loop(), unswitched_loop_selector);)
_phase->recompute_dom_depth();
multiversion(unswitched_loop_selector.loop_selector());
remove_unswitch_candidate_from_loops(unswitched_loop_selector);
}
private:
void clone_loop(const UnswitchedLoopSelector& unswitched_loop_selector) {
_phase->clone_loop(_loop, _old_new, _phase->dom_depth(_loop_head),
PhaseIdealLoop::CloneIncludesStripMined, unswitched_loop_selector.selector());
fix_loop_entries(unswitched_loop_selector);
// Multiversion the original loop. The loop selector if selects between the original loop (true-path-loop), and
// a copy of it (false-path-loop).
void multiversion(const LoopSelector& loop_selector) {
const uint first_false_path_loop_node_index = _phase->C->unique();
clone_loop(loop_selector);
move_parse_and_template_assertion_predicates_to_unswitched_loops(loop_selector,
first_false_path_loop_node_index);
DEBUG_ONLY(verify_loop_versions(_loop->_head->as_Loop(), loop_selector);)
_phase->recompute_dom_depth();
}
void fix_loop_entries(const UnswitchedLoopSelector& unswitched_loop_selector) {
_phase->replace_loop_entry(_loop_head, unswitched_loop_selector.true_path_loop_proj());
private:
void clone_loop(const LoopSelector& loop_selector) {
_phase->clone_loop(_loop, _old_new, _phase->dom_depth(_loop_head),
PhaseIdealLoop::CloneIncludesStripMined, loop_selector.selector());
fix_loop_entries(loop_selector);
}
void fix_loop_entries(const LoopSelector& loop_selector) {
_phase->replace_loop_entry(_loop_head, loop_selector.true_path_loop_proj());
LoopNode* false_path_loop_strip_mined_head = old_to_new(_loop_head)->as_Loop();
_phase->replace_loop_entry(false_path_loop_strip_mined_head,
unswitched_loop_selector.false_path_loop_proj());
loop_selector.false_path_loop_proj());
}
// Moves the Parse And Template Assertion Predicates to the true and false path loop. They are inserted between the
// loop heads and the loop selector If projections. The old Parse and Template Assertion Predicates before
// the unswitched loop selector are killed.
void move_parse_and_template_assertion_predicates_to_unswitched_loops(
const UnswitchedLoopSelector& unswitched_loop_selector, const uint first_false_path_loop_node_index) const {
const LoopSelector& loop_selector, const uint first_false_path_loop_node_index) const {
const NodeInOriginalLoopBody node_in_true_path_loop_body(first_false_path_loop_node_index, _old_new);
const NodeInClonedLoopBody node_in_false_path_loop_body(first_false_path_loop_node_index);
CloneUnswitchedLoopPredicatesVisitor
clone_unswitched_loop_predicates_visitor(_loop_head, old_to_new(_loop_head)->as_Loop(), node_in_true_path_loop_body,
node_in_false_path_loop_body, _phase);
Node* source_loop_entry = unswitched_loop_selector.selector()->in(0);
Node* source_loop_entry = loop_selector.selector()->in(0);
PredicateIterator predicate_iterator(source_loop_entry);
predicate_iterator.for_each(clone_unswitched_loop_predicates_visitor);
}
#ifdef ASSERT
void verify_unswitched_loop_versions(LoopNode* true_path_loop_head,
const UnswitchedLoopSelector& unswitched_loop_selector) const {
verify_unswitched_loop_version(true_path_loop_head, unswitched_loop_selector.true_path_loop_proj());
verify_unswitched_loop_version(old_to_new(true_path_loop_head)->as_Loop(),
unswitched_loop_selector.false_path_loop_proj());
void verify_loop_versions(LoopNode* true_path_loop_head,
const LoopSelector& loop_selector) const {
verify_loop_version(true_path_loop_head,
loop_selector.true_path_loop_proj());
verify_loop_version(old_to_new(true_path_loop_head)->as_Loop(),
loop_selector.false_path_loop_proj());
}
static void verify_unswitched_loop_version(LoopNode* loop_head, IfProjNode* loop_selector_if_proj) {
static void verify_loop_version(LoopNode* loop_head, IfProjNode* loop_selector_if_proj) {
Node* entry = loop_head->skip_strip_mined()->in(LoopNode::EntryControl);
const Predicates predicates(entry);
// When skipping all predicates, we should end up at 'loop_selector_if_proj'.
@@ -302,15 +366,15 @@ class OriginalLoop : public StackObj {
// If node. Keep the true-path-path in the true-path-loop and the false-path-path in the false-path-loop by setting
// the bool input accordingly. The unswitch candidate If nodes are folded in the next IGVN round.
void remove_unswitch_candidate_from_loops(const UnswitchedLoopSelector& unswitched_loop_selector) {
IfNode* unswitching_candidate = unswitched_loop_selector.unswitch_candidate();
_phase->igvn().rehash_node_delayed(unswitching_candidate);
_phase->dominated_by(unswitched_loop_selector.true_path_loop_proj(), unswitching_candidate);
const LoopSelector& loop_selector = unswitched_loop_selector.loop_selector();
IfNode* unswitch_candidate = unswitched_loop_selector.unswitch_candidate();
_phase->igvn().rehash_node_delayed(unswitch_candidate);
_phase->dominated_by(loop_selector.true_path_loop_proj(), unswitch_candidate);
IfNode* unswitching_candidate_clone = _old_new[unswitching_candidate->_idx]->as_If();
_phase->igvn().rehash_node_delayed(unswitching_candidate_clone);
_phase->dominated_by(unswitched_loop_selector.false_path_loop_proj(), unswitching_candidate_clone);
IfNode* unswitch_candidate_clone = _old_new[unswitch_candidate->_idx]->as_If();
_phase->igvn().rehash_node_delayed(unswitch_candidate_clone);
_phase->dominated_by(loop_selector.false_path_loop_proj(), unswitch_candidate_clone);
}
};
// See comments below file header for more information about Loop Unswitching.
@@ -343,6 +407,172 @@ void PhaseIdealLoop::do_unswitching(IdealLoopTree* loop, Node_List& old_new) {
C->set_major_progress();
}
void PhaseIdealLoop::do_multiversioning(IdealLoopTree* lpt, Node_List& old_new) {
#ifndef PRODUCT
if (TraceLoopOpts || TraceLoopMultiversioning) {
tty->print("Multiversion ");
lpt->dump_head();
}
#endif
assert(LoopMultiversioning, "LoopMultiversioning must be enabled");
CountedLoopNode* original_head = lpt->_head->as_CountedLoop();
C->print_method(PHASE_BEFORE_LOOP_MULTIVERSIONING, 4, original_head);
Node* one = _igvn.intcon(1);
set_ctrl(one, C->root());
Node* opaque = new OpaqueMultiversioningNode(C, one);
set_ctrl(opaque, C->root());
_igvn.register_new_node_with_optimizer(opaque);
_igvn.set_type(opaque, TypeInt::BOOL);
const LoopSelector loop_selector(lpt, opaque, PROB_LIKELY_MAG(3), COUNT_UNKNOWN);
OriginalLoop original_loop(lpt, old_new);
original_loop.multiversion(loop_selector);
add_unswitched_loop_version_bodies_to_igvn(lpt, old_new);
CountedLoopNode* new_head = old_new[original_head->_idx]->as_CountedLoop();
original_head->set_multiversion_fast_loop();
new_head->set_multiversion_delayed_slow_loop();
NOT_PRODUCT(trace_loop_multiversioning_result(loop_selector, original_head, new_head);)
C->print_method(PHASE_AFTER_LOOP_MULTIVERSIONING, 4, new_head);
C->set_major_progress();
}
// Create a new if in the multiversioning pattern, adding an additional condition for the
// multiversioning fast-loop.
//
// Before:
// entry opaque
// | |
// multiversion_if
// | |
// +----------------+ +---------------+
// | |
// multiversion_fast_proj multiversion_slow_proj
// |
// +--------+
// |
// slow_path
//
//
// After:
// entry opaque <-- to be replaced by caller
// | |
// new_if
// | |
// | +-----------------------------+
// | |
// new_if_true opaque new_if_false
// | | |
// multiversion_if |
// | | |
// +----------------+ +---------------+ |
// | | |
// multiversion_fast_proj new_multiversion_slow_proj |
// | |
// +------+ |
// | |
// region
// |
// slow_path
//
IfTrueNode* PhaseIdealLoop::create_new_if_for_multiversion(IfTrueNode* multiversioning_fast_proj) {
// Give all nodes in the old sub-graph a name.
IfNode* multiversion_if = multiversioning_fast_proj->in(0)->as_If();
Node* entry = multiversion_if->in(0);
OpaqueMultiversioningNode* opaque = multiversion_if->in(1)->as_OpaqueMultiversioning();
IfFalseNode* multiversion_slow_proj = multiversion_if->proj_out(0)->as_IfFalse();
Node* slow_path = multiversion_slow_proj->unique_ctrl_out();
// The slow_loop may still be delayed, and waiting for runtime-checks to be added to the
// multiversion_if. Now that we have at least one condition for the multiversioning,
// we should resume optimizations for the slow loop.
opaque->notify_slow_loop_that_it_can_resume_optimizations();
// Create new_if with its projections.
IfNode* new_if = IfNode::make_with_same_profile(multiversion_if, entry, opaque);
IdealLoopTree* lp = get_loop(entry);
register_control(new_if, lp, entry);
IfTrueNode* new_if_true = new IfTrueNode(new_if);
IfFalseNode* new_if_false = new IfFalseNode(new_if);
register_control(new_if_true, lp, new_if);
register_control(new_if_false, lp, new_if);
// Hook new_if_true into multiversion_if.
_igvn.replace_input_of(multiversion_if, 0, new_if_true);
// Clone multiversion_slow_proj - this allows us to easily carry the dependencies to
// the new region below.
IfFalseNode* new_multiversion_slow_proj = multiversion_slow_proj->clone()->as_IfFalse();
register_control(new_multiversion_slow_proj, lp, multiversion_if);
// Create new Region.
RegionNode* region = new RegionNode(1);
region->add_req(new_multiversion_slow_proj);
region->add_req(new_if_false);
register_control(region, lp, new_multiversion_slow_proj);
// Hook region into slow_path, instead of the multiversion_slow_proj.
// This also moves all other dependencies of the multiversion_slow_proj to the region.
_igvn.replace_node(multiversion_slow_proj, region);
return new_if_true;
}
OpaqueMultiversioningNode* find_multiversion_opaque_from_multiversion_if_false(Node* maybe_multiversion_if_false) {
IfFalseNode* multiversion_if_false = maybe_multiversion_if_false->isa_IfFalse();
if (multiversion_if_false == nullptr) { return nullptr; }
IfNode* multiversion_if = multiversion_if_false->in(0)->isa_If();
if (multiversion_if == nullptr) { return nullptr; }
return multiversion_if->in(1)->isa_OpaqueMultiversioning();
}
bool PhaseIdealLoop::try_resume_optimizations_for_delayed_slow_loop(IdealLoopTree* lpt) {
CountedLoopNode* cl = lpt->_head->as_CountedLoop();
assert(cl->is_multiversion_delayed_slow_loop(), "must currently be delayed");
// Find multiversion_if.
Node* entry = cl->skip_strip_mined()->in(LoopNode::EntryControl);
const Predicates predicates(entry);
Node* slow_path = predicates.entry();
// Find opaque.
OpaqueMultiversioningNode* opaque = nullptr;
if (slow_path->is_Region()) {
for (uint i = 1; i < slow_path->req(); i++) {
Node* n = slow_path->in(i);
opaque = find_multiversion_opaque_from_multiversion_if_false(n);
if (opaque != nullptr) { break; }
}
} else {
opaque = find_multiversion_opaque_from_multiversion_if_false(slow_path);
}
assert(opaque != nullptr, "must have found multiversion opaque node");
if (opaque == nullptr) { return false; }
// We may still be delayed, if there were not yet any runtime-checks added
// for the multiversioning. We may never add any, and then this loop would
// fold away. So we wait until some runtime-checks are added, then we know
// that this loop will be reachable and it is worth optimizing further.
if (opaque->is_delayed_slow_loop()) { return false; }
// Clear away the "delayed" status, i.e. resume optimizations.
cl->set_no_multiversion();
cl->set_multiversion_slow_loop();
#ifndef PRODUCT
if (TraceLoopOpts) {
tty->print("Resume Optimizations ");
lpt->dump_head();
}
#endif
return true;
}
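
As an illustration (not part of this change): a minimal standalone C++ model of the delay/resume handshake above, assuming all state reduces to two booleans. The slow loop stays delayed until a speculative runtime-check is attached to the OpaqueMultiversioningNode via notify_slow_loop_that_it_can_resume_optimizations, mirrored here by notify().

#include <cassert>

// Stand-ins for OpaqueMultiversioningNode and the delayed slow loop.
struct OpaqueMV {
  bool delayed = true;               // is_delayed_slow_loop()
  void notify() { delayed = false; } // notify_slow_loop_that_it_can_resume_optimizations()
};
struct SlowLoop {
  bool delayed_slow = true;
  // Mirrors try_resume_optimizations_for_delayed_slow_loop: bail out while no
  // runtime-check was added yet, otherwise clear the delayed status.
  bool try_resume(const OpaqueMV& o) {
    if (o.delayed) { return false; }
    delayed_slow = false;
    return true;
  }
};

int main() {
  OpaqueMV opaque;
  SlowLoop slow;
  assert(!slow.try_resume(opaque)); // still delayed: wait with loop-opts
  opaque.notify();                  // a speculative check was added
  assert(slow.try_resume(opaque));  // optimizations resume
  return 0;
}
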
bool PhaseIdealLoop::has_control_dependencies_from_predicates(LoopNode* head) {
Node* entry = head->skip_strip_mined()->in(LoopNode::EntryControl);
const Predicates predicates(entry);
@@ -377,7 +607,7 @@ void PhaseIdealLoop::trace_loop_unswitching_result(const UnswitchedLoopSelector&
const LoopNode* original_head, const LoopNode* new_head) {
if (TraceLoopUnswitching) {
IfNode* unswitch_candidate = unswitched_loop_selector.unswitch_candidate();
IfNode* loop_selector = unswitched_loop_selector.selector();
IfNode* loop_selector = unswitched_loop_selector.loop_selector().selector();
tty->print_cr("Loop Unswitching:");
tty->print_cr("- Unswitch-Candidate-If: %d %s", unswitch_candidate->_idx, unswitch_candidate->Name());
tty->print_cr("- Loop-Selector-If: %d %s", loop_selector->_idx, loop_selector->Name());
@@ -385,22 +615,33 @@ void PhaseIdealLoop::trace_loop_unswitching_result(const UnswitchedLoopSelector&
tty->print_cr("- False-Path-Loop (=Clone): %d %s", new_head->_idx, new_head->Name());
}
}
void PhaseIdealLoop::trace_loop_multiversioning_result(const LoopSelector& loop_selector,
const LoopNode* original_head, const LoopNode* new_head) {
if (TraceLoopMultiversioning) {
IfNode* selector_if = loop_selector.selector();
tty->print_cr("Loop Multiversioning:");
tty->print_cr("- Loop-Selector-If: %d %s", selector_if->_idx, selector_if->Name());
tty->print_cr("- True-Path-Loop (=Orig / Fast): %d %s", original_head->_idx, original_head->Name());
tty->print_cr("- False-Path-Loop (=Clone / Slow): %d %s", new_head->_idx, new_head->Name());
}
}
#endif
// When unswitching a counted loop, we need to convert it back to a normal loop since it's not a proper pre, main or,
// post loop anymore after loop unswitching.
// post loop anymore after loop unswitching. We also lose the multiversion structure, including the connection to the multiversion_if.
void PhaseIdealLoop::revert_to_normal_loop(const LoopNode* loop_head) {
CountedLoopNode* cl = loop_head->isa_CountedLoop();
if (cl != nullptr && !cl->is_normal_loop()) {
cl->set_normal_loop();
}
if (cl == nullptr) { return; }
if (!cl->is_normal_loop()) { cl->set_normal_loop(); }
if (cl->is_multiversion()) { cl->set_no_multiversion(); }
}
// Hoist invariant CheckCastPPNodes out of each unswitched loop version to the appropriate loop selector If projection.
void PhaseIdealLoop::hoist_invariant_check_casts(const IdealLoopTree* loop, const Node_List& old_new,
const UnswitchedLoopSelector& unswitched_loop_selector) {
IfNode* unswitch_candidate = unswitched_loop_selector.unswitch_candidate();
IfNode* loop_selector = unswitched_loop_selector.selector();
IfNode* loop_selector = unswitched_loop_selector.loop_selector().selector();
ResourceMark rm;
GrowableArray<CheckCastPPNode*> loop_invariant_check_casts;
for (DUIterator_Fast imax, i = unswitch_candidate->fast_outs(imax); i < imax; i++) {


@@ -1090,6 +1090,14 @@ bool PhaseIdealLoop::create_loop_nest(IdealLoopTree* loop, Node_List &old_new) {
if (UseProfiledLoopPredicate) {
add_parse_predicate(Deoptimization::Reason_profile_predicate, inner_head, outer_ilt, cloned_sfpt);
}
// We only want to use the auto-vectorization check as a trap once per bci. And
// PhaseIdealLoop::add_parse_predicate only checks trap limits per method, so
// we do a custom check here.
if (!C->too_many_traps(cloned_sfpt->jvms()->method(), cloned_sfpt->jvms()->bci(), Deoptimization::Reason_auto_vectorization_check)) {
add_parse_predicate(Deoptimization::Reason_auto_vectorization_check, inner_head, outer_ilt, cloned_sfpt);
}
add_parse_predicate(Deoptimization::Reason_loop_limit_check, inner_head, outer_ilt, cloned_sfpt);
}
@@ -2511,6 +2519,9 @@ void CountedLoopNode::dump_spec(outputStream *st) const {
if (is_main_loop()) st->print("main of N%d", _idx);
if (is_post_loop()) st->print("post of N%d", _main_idx);
if (is_strip_mined()) st->print(" strip mined");
if (is_multiversion_fast_loop()) { st->print(" multiversion_fast"); }
if (is_multiversion_slow_loop()) { st->print(" multiversion_slow"); }
if (is_multiversion_delayed_slow_loop()) { st->print(" multiversion_delayed_slow"); }
}
#endif
@@ -4303,6 +4314,9 @@ void IdealLoopTree::dump_head() {
if (cl->is_post_loop()) tty->print(" post");
if (cl->is_vectorized_loop()) tty->print(" vector");
if (range_checks_present()) tty->print(" rc ");
if (cl->is_multiversion_fast_loop()) { tty->print(" multiversion_fast"); }
if (cl->is_multiversion_slow_loop()) { tty->print(" multiversion_slow"); }
if (cl->is_multiversion_delayed_slow_loop()) { tty->print(" multiversion_delayed_slow"); }
}
if (_has_call) tty->print(" has_call");
if (_has_sfpt) tty->print(" has_sfpt");
@@ -4948,18 +4962,6 @@ void PhaseIdealLoop::build_and_optimize() {
C->set_major_progress();
}
// Keep loop predicates and perform optimizations with them
// until no more loop optimizations could be done.
// After that switch predicates off and do more loop optimizations.
if (!C->major_progress() && (C->parse_predicate_count() > 0)) {
C->mark_parse_predicate_nodes_useless(_igvn);
assert(C->parse_predicate_count() == 0, "should be zero now");
if (TraceLoopOpts) {
tty->print_cr("PredicatesOff");
}
C->set_major_progress();
}
// Auto-vectorize main-loop
if (C->do_superword() && C->has_loops() && !C->major_progress()) {
Compile::TracePhase tp(_t_autoVectorize);
@@ -4992,6 +4994,18 @@ void PhaseIdealLoop::build_and_optimize() {
}
}
}
// Keep loop predicates and perform optimizations with them
// until no more loop optimizations could be done.
// After that switch predicates off and do more loop optimizations.
if (!C->major_progress() && (C->parse_predicate_count() > 0)) {
C->mark_parse_predicate_nodes_useless(_igvn);
assert(C->parse_predicate_count() == 0, "should be zero now");
if (TraceLoopOpts) {
tty->print_cr("PredicatesOff");
}
C->set_major_progress();
}
}
#ifndef PRODUCT


@@ -43,6 +43,7 @@ class OuterStripMinedLoopEndNode;
class PredicateBlock;
class PathFrequency;
class PhaseIdealLoop;
class LoopSelector;
class UnswitchedLoopSelector;
class VectorSet;
class VSharedData;
@@ -79,7 +80,12 @@ protected:
SubwordLoop = 1<<13,
ProfileTripFailed = 1<<14,
LoopNestInnerLoop = 1<<15,
LoopNestLongOuterLoop = 1<<16 };
LoopNestLongOuterLoop = 1<<16,
MultiversionFastLoop = 1<<17,
MultiversionSlowLoop = 2<<17,
MultiversionDelayedSlowLoop = 3<<17,
MultiversionFlagsMask = 3<<17,
};
char _unswitch_count;
enum { _unswitch_max=3 };
@@ -315,6 +321,32 @@ public:
void set_slp_max_unroll(int unroll_factor) { _slp_maximum_unroll_factor = unroll_factor; }
int slp_max_unroll() const { return _slp_maximum_unroll_factor; }
// Multiversioning allows us to duplicate a CountedLoop, and have two versions, and the multiversion_if
// decides which one is taken:
// (1) fast_loop: We enter this loop by default; initially the multiversion_if has its condition set to
// "true", guarded by an OpaqueMultiversioning. If we want to make a speculative assumption
// for an optimization, we can add the runtime-check to the multiversion_if, and if the
// assumption fails we take the slow_loop instead, where we do not make the same speculative
// assumption.
// We call it the "fast_loop" because it has more optimizations, enabled by the speculative
// runtime-checks at the multiversion_if, and we expect the fast_loop to execute faster.
// (2) slow_loop: By default, it is not taken, until a runtime-check is added to the multiversion_if while
// optimizing the fast_loop. If such a runtime-check is never added, then after loop-opts
// the multiversion_if constant folds to true, and the slow_loop is folded away. To save
// compile time, we delay the optimization of the slow_loop until a runtime-check is added
// to the multiversion_if, at which point we resume optimizations for the slow_loop.
// We call it the "slow_loop" because it has fewer optimizations, since this is the fall-back
// loop where we do not make any of the speculative assumptions we make for the fast_loop.
// Hence, we expect the slow_loop to execute slower.
bool is_multiversion() const { return (_loop_flags & MultiversionFlagsMask) != Normal; }
bool is_multiversion_fast_loop() const { return (_loop_flags & MultiversionFlagsMask) == MultiversionFastLoop; }
bool is_multiversion_slow_loop() const { return (_loop_flags & MultiversionFlagsMask) == MultiversionSlowLoop; }
bool is_multiversion_delayed_slow_loop() const { return (_loop_flags & MultiversionFlagsMask) == MultiversionDelayedSlowLoop; }
void set_multiversion_fast_loop() { assert(!is_multiversion(), ""); _loop_flags |= MultiversionFastLoop; }
void set_multiversion_slow_loop() { assert(!is_multiversion(), ""); _loop_flags |= MultiversionSlowLoop; }
void set_multiversion_delayed_slow_loop() { assert(!is_multiversion(), ""); _loop_flags |= MultiversionDelayedSlowLoop; }
void set_no_multiversion() { assert( is_multiversion(), ""); _loop_flags &= ~MultiversionFlagsMask; }
virtual LoopNode* skip_strip_mined(int expect_skeleton = 1);
OuterStripMinedLoopNode* outer_loop() const;
virtual IfTrueNode* outer_loop_tail() const;
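
For illustration only (not part of this change): a standalone sketch of the two-bit multiversion state encoding above. The three states share the mask 3<<17, so set_no_multiversion must clear both bits before a different state can be set, as in the resume path of try_resume_optimizations_for_delayed_slow_loop.

#include <cassert>
#include <cstdint>

enum : uint32_t {
  MultiversionFastLoop        = 1u << 17,
  MultiversionSlowLoop        = 2u << 17,
  MultiversionDelayedSlowLoop = 3u << 17,
  MultiversionFlagsMask       = 3u << 17
};

int main() {
  uint32_t flags = 0;                   // normal loop: both bits clear
  flags |= MultiversionDelayedSlowLoop; // set_multiversion_delayed_slow_loop()
  assert((flags & MultiversionFlagsMask) == MultiversionDelayedSlowLoop);
  flags &= ~MultiversionFlagsMask;      // set_no_multiversion()
  flags |= MultiversionSlowLoop;        // set_multiversion_slow_loop()
  assert((flags & MultiversionFlagsMask) == MultiversionSlowLoop);
  return 0;
}
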
@@ -1457,6 +1489,8 @@ public:
static void trace_loop_unswitching_impossible(const LoopNode* original_head);
static void trace_loop_unswitching_result(const UnswitchedLoopSelector& unswitched_loop_selector,
const LoopNode* original_head, const LoopNode* new_head);
static void trace_loop_multiversioning_result(const LoopSelector& loop_selector,
const LoopNode* original_head, const LoopNode* new_head);
#endif
public:
@@ -1483,6 +1517,11 @@ public:
};
AutoVectorizeStatus auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared);
void maybe_multiversion_for_auto_vectorization_runtime_checks(IdealLoopTree* lpt, Node_List& old_new);
void do_multiversioning(IdealLoopTree* lpt, Node_List& old_new);
IfTrueNode* create_new_if_for_multiversion(IfTrueNode* multiversioning_fast_proj);
bool try_resume_optimizations_for_delayed_slow_loop(IdealLoopTree* lpt);
// Move an unordered Reduction out of loop if possible
void move_unordered_reduction_out_of_loop(IdealLoopTree* loop);


@@ -4482,6 +4482,66 @@ PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) {
return AutoVectorizeStatus::Success;
}
// Just before insert_pre_post_loops, we can multi-version the loop:
//
// multiversion_if
// | |
// fast_loop slow_loop
//
// In the fast_loop we can make speculative assumptions, and put the
// conditions into the multiversion_if. If the conditions hold at runtime,
// we enter the fast_loop, if the conditions fail, we take the slow_loop
// instead which does not make any of the speculative assumptions.
//
// Note: we only multiversion the loop if the loop does not have any
// auto vectorization check Predicate. If we have that predicate,
// then we can simply add the speculative assumption checks to
// that Predicate. This means we do not need to duplicate the
// loop - we have a smaller graph and save compile time. Should
// the conditions ever fail, then we deopt / trap at the Predicate
// and recompile without that Predicate. At that point we will
// multiversion the loop, so that we can still have speculative
// runtime checks.
//
// We perform the multiversioning when the loop is still in its single
// iteration form, even before we insert pre and post loops. This makes
// the cloning much simpler. However, this means that both the fast
// and the slow loop have to be optimized independently (adding pre
// and post loops, unrolling the main loop, auto-vectorize etc.). And
// we may end up not needing any speculative assumptions in the fast_loop
// and then rejecting the slow_loop by constant folding the multiversion_if.
//
// Therefore, we "delay" the optimization of the slow_loop until we add
// at least one speculative assumption for the fast_loop. If we never
// add such a speculative runtime check, the OpaqueMultiversioningNode
// of the multiversion_if constant folds to true after loop opts, and the
// multiversion_if folds away the "delayed" slow_loop. If we add any
// speculative assumption, then we notify the OpaqueMultiversioningNode
// with "notify_slow_loop_that_it_can_resume_optimizations".
//
// Note: new runtime checks can be added to the multiversion_if with
// PhaseIdealLoop::create_new_if_for_multiversion
void PhaseIdealLoop::maybe_multiversion_for_auto_vectorization_runtime_checks(IdealLoopTree* lpt, Node_List& old_new) {
CountedLoopNode* cl = lpt->_head->as_CountedLoop();
LoopNode* outer_loop = cl->skip_strip_mined();
Node* entry = outer_loop->in(LoopNode::EntryControl);
// Check we have multiversioning enabled, and are not already multiversioned.
if (!LoopMultiversioning || cl->is_multiversion()) { return; }
// Check that we do not have a parse-predicate where we can add the runtime checks
// during auto-vectorization.
const Predicates predicates(entry);
const PredicateBlock* predicate_block = predicates.auto_vectorization_check_block();
if (predicate_block->has_parse_predicate()) { return; }
// Check node budget.
uint estimate = lpt->est_loop_clone_sz(2);
if (!may_require_nodes(estimate)) { return; }
do_multiversioning(lpt, old_new);
}
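
A standalone sketch (an assumed simplification, not part of this change) of the decision logic above: prefer the trapping parse predicate while it is still available, fall back to multiversioning when it is enabled, not yet done, and within the node budget, and otherwise skip speculation entirely.

#include <cstdio>

enum class Speculate { Predicate, Multiversion, None };

static Speculate pick(bool has_parse_predicate, bool multiversioning_enabled,
                      bool already_multiversioned, bool within_node_budget) {
  if (has_parse_predicate) { return Speculate::Predicate; } // trap and recompile
  if (multiversioning_enabled && !already_multiversioned && within_node_budget) {
    return Speculate::Multiversion;                         // fast/slow loops
  }
  return Speculate::None;                                   // no speculative checks
}

int main() {
  // No parse predicate left (e.g. after a trap), so we multiversion.
  printf("%d\n", static_cast<int>(pick(false, true, false, true))); // prints 1
  return 0;
}
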
// Returns true if the Reduction node is unordered.
static bool is_unordered_reduction(Node* n) {
return n->is_Reduction() && !n->as_Reduction()->requires_strict_order();


@@ -229,9 +229,12 @@
// Even if we could know that there is some base address to which we add index offsets, we cannot know
// if this reference address points to the beginning of a native memory allocation or into the middle,
// or outside it. We also have no guarantee for alignment with such a base address.
//
// Still: we would like to find such a base if possible, and if two pointers are similar (i.e. have the
// same summands), we would like to find the same base. Further, it is reasonable to speculatively
// assume that such base addresses are aligned (TODO: need to add this speculative check in JDK-8323582).
// assume that such base addresses are aligned. We perform such a speculative alignment runtime check
// in VTransform::add_speculative_alignment_check.
//
// A base pointer must have scale = 1, and be accepted by MemPointer::is_native_memory_base_candidate.
// It can thus be one of these:
// (1) CastX2P
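
To see why native bases need a runtime check, here is a small standalone demonstration (values assumed): an interior pointer into a malloc'd buffer carries no ObjectAlignmentInBytes guarantee, so "base % aw" can be anything.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

int main() {
  char* buf = static_cast<char*>(malloc(64));
  // An interior "base", e.g. from address arithmetic: no alignment guarantee.
  uintptr_t base = reinterpret_cast<uintptr_t>(buf) + 1;
  printf("base %% 8 = %d\n", static_cast<int>(base % 8)); // typically 1, not 0
  free(buf);
  return 0;
}
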

View File

@@ -139,6 +139,7 @@ class NeverBranchNode;
class Opaque1Node;
class OpaqueLoopInitNode;
class OpaqueLoopStrideNode;
class OpaqueMultiversioningNode;
class OpaqueNotNullNode;
class OpaqueInitializedAssertionPredicateNode;
class OpaqueTemplateAssertionPredicateNode;
@@ -800,6 +801,7 @@ public:
DEFINE_CLASS_ID(Opaque1, Node, 16)
DEFINE_CLASS_ID(OpaqueLoopInit, Opaque1, 0)
DEFINE_CLASS_ID(OpaqueLoopStride, Opaque1, 1)
DEFINE_CLASS_ID(OpaqueMultiversioning, Opaque1, 2)
DEFINE_CLASS_ID(OpaqueNotNull, Node, 17)
DEFINE_CLASS_ID(OpaqueInitializedAssertionPredicate, Node, 18)
DEFINE_CLASS_ID(OpaqueTemplateAssertionPredicate, Node, 19)
@@ -982,6 +984,7 @@ public:
DEFINE_CLASS_QUERY(OpaqueTemplateAssertionPredicate)
DEFINE_CLASS_QUERY(OpaqueLoopInit)
DEFINE_CLASS_QUERY(OpaqueLoopStride)
DEFINE_CLASS_QUERY(OpaqueMultiversioning)
DEFINE_CLASS_QUERY(OuterStripMinedLoop)
DEFINE_CLASS_QUERY(OuterStripMinedLoopEnd)
DEFINE_CLASS_QUERY(Parm)


@@ -91,6 +91,29 @@ public:
IfNode* if_node() const;
};
// This node is used as the condition of the multiversion_if in loop multiversioning.
// At first, the multiversion_if has its condition set to "true" and we always
// take the fast_loop. Since we do not know if the slow_loop is ever going to
// be used, we delay optimizations for it. Once the fast_loop decides to use
// speculative runtime-checks and adds them to the multiversion_if, the slow_loop
// can now resume optimizations, as it is reachable at runtime.
// See PhaseIdealLoop::maybe_multiversion_for_auto_vectorization_runtime_checks
class OpaqueMultiversioningNode : public Opaque1Node {
private:
bool _is_delayed_slow_loop;
public:
OpaqueMultiversioningNode(Compile* C, Node* n) :
Opaque1Node(C, n), _is_delayed_slow_loop(true)
{
init_class_id(Class_OpaqueMultiversioning);
}
virtual int Opcode() const;
virtual const Type* bottom_type() const { return TypeInt::BOOL; }
bool is_delayed_slow_loop() const { return _is_delayed_slow_loop; }
void notify_slow_loop_that_it_can_resume_optimizations() { _is_delayed_slow_loop = false; }
};
// This node is used in the context of intrinsics. We sometimes implicitly know that an object is non-null even though
// the compiler cannot prove it. We therefore add a corresponding cast to propagate this implicit knowledge. However,
// this cast could become top during optimizations (input to cast becomes null) and the data path is folded. To ensure


@@ -64,14 +64,17 @@
flags(AFTER_LOOP_PEELING, "After Loop Peeling") \
flags(BEFORE_LOOP_UNSWITCHING, "Before Loop Unswitching") \
flags(AFTER_LOOP_UNSWITCHING, "After Loop Unswitching") \
flags(BEFORE_LOOP_MULTIVERSIONING, "Before Loop Multiversioning") \
flags(AFTER_LOOP_MULTIVERSIONING, "After Loop Multiversioning") \
flags(BEFORE_RANGE_CHECK_ELIMINATION, "Before Range Check Elimination") \
flags(AFTER_RANGE_CHECK_ELIMINATION, "After Range Check Elimination") \
flags(BEFORE_PRE_MAIN_POST, "Before Pre/Main/Post Loops") \
flags(AFTER_PRE_MAIN_POST, "After Pre/Main/Post Loops") \
flags(AUTO_VECTORIZATION1_BEFORE_APPLY, "AutoVectorization 1, Before Apply") \
flags(AUTO_VECTORIZATION2_AFTER_REORDER, "AutoVectorization 2, After Apply Memop Reordering") \
flags(AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, "AutoVectorization 3, After Adjusting Pre-Loop Limit") \
flags(AUTO_VECTORIZATION4_AFTER_APPLY, "AutoVectorization 4, After Apply") \
flags(AUTO_VECTORIZATION1_BEFORE_APPLY, "AutoVectorization 1, Before Apply") \
flags(AUTO_VECTORIZATION2_AFTER_REORDER, "AutoVectorization 2, After Apply Memop Reordering") \
flags(AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, "AutoVectorization 3, After Adjusting Pre-Loop Limit") \
flags(AUTO_VECTORIZATION4_AFTER_SPECULATIVE_RUNTIME_CHECKS, "AutoVectorization 4, After Adding Speculative Runtime Checks") \
flags(AUTO_VECTORIZATION5_AFTER_APPLY, "AutoVectorization 5, After Apply") \
flags(BEFORE_CLOOPS, "Before CountedLoop") \
flags(AFTER_CLOOPS, "After CountedLoop") \
flags(PHASEIDEAL_BEFORE_EA, "PhaseIdealLoop before EA") \


@@ -120,6 +120,7 @@ bool RuntimePredicate::has_valid_uncommon_trap(const Node* success_proj) {
assert(RegularPredicate::may_be_predicate_if(success_proj), "must have been checked before");
const Deoptimization::DeoptReason deopt_reason = uncommon_trap_reason(success_proj->as_IfProj());
return (deopt_reason == Deoptimization::Reason_loop_limit_check ||
deopt_reason == Deoptimization::Reason_auto_vectorization_check ||
deopt_reason == Deoptimization::Reason_predicate ||
deopt_reason == Deoptimization::Reason_profile_predicate);
}
@@ -893,6 +894,8 @@ void Predicates::dump() const {
tty->print_cr("%d %s:", loop_head->_idx, loop_head->Name());
tty->print_cr("- Loop Limit Check Predicate Block:");
_loop_limit_check_predicate_block.dump(" ");
tty->print_cr("- Auto Vectorization Check Block:");
_auto_vectorization_check_block.dump(" ");
tty->print_cr("- Profiled Loop Predicate Block:");
_profiled_loop_predicate_block.dump(" ");
tty->print_cr("- Loop Predicate Block:");


@@ -734,6 +734,8 @@ class PredicateIterator : public StackObj {
Node* current = _start_node;
PredicateBlockIterator loop_limit_check_predicate_iterator(current, Deoptimization::Reason_loop_limit_check);
current = loop_limit_check_predicate_iterator.for_each(predicate_visitor);
PredicateBlockIterator auto_vectorization_check_iterator(current, Deoptimization::Reason_auto_vectorization_check);
current = auto_vectorization_check_iterator.for_each(predicate_visitor);
if (UseLoopPredicate) {
if (UseProfiledLoopPredicate) {
PredicateBlockIterator profiled_loop_predicate_iterator(current, Deoptimization::Reason_profile_predicate);
@@ -906,6 +908,7 @@ class PredicateBlock : public StackObj {
class Predicates : public StackObj {
Node* const _tail;
const PredicateBlock _loop_limit_check_predicate_block;
const PredicateBlock _auto_vectorization_check_block;
const PredicateBlock _profiled_loop_predicate_block;
const PredicateBlock _loop_predicate_block;
Node* const _entry;
@@ -914,7 +917,9 @@ class Predicates : public StackObj {
explicit Predicates(Node* loop_entry)
: _tail(loop_entry),
_loop_limit_check_predicate_block(loop_entry, Deoptimization::Reason_loop_limit_check),
_profiled_loop_predicate_block(_loop_limit_check_predicate_block.entry(),
_auto_vectorization_check_block(_loop_limit_check_predicate_block.entry(),
Deoptimization::Reason_auto_vectorization_check),
_profiled_loop_predicate_block(_auto_vectorization_check_block.entry(),
Deoptimization::Reason_profile_predicate),
_loop_predicate_block(_profiled_loop_predicate_block.entry(),
Deoptimization::Reason_predicate),
@@ -935,6 +940,10 @@ class Predicates : public StackObj {
return &_profiled_loop_predicate_block;
}
const PredicateBlock* auto_vectorization_check_block() const {
return &_auto_vectorization_check_block;
}
const PredicateBlock* loop_limit_check_predicate_block() const {
return &_loop_limit_check_predicate_block;
}
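
For orientation, a trivial standalone listing (not part of this change) of the resulting block order when walking up from the loop entry; each PredicateBlock is constructed from the previous block's entry(), so the constructor order above corresponds to nearest-to-the-loop first.

#include <cstdio>

int main() {
  // Order from the loop entry upwards, per the Predicates constructor above.
  const char* blocks[] = {
    "Loop Limit Check Predicate Block",
    "Auto Vectorization Check Block",  // newly inserted by this change
    "Profiled Loop Predicate Block",
    "Loop Predicate Block",
  };
  for (const char* b : blocks) { printf("%s\n", b); }
  return 0;
}
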


@@ -1484,7 +1484,8 @@ const AlignmentSolution* SuperWord::pack_alignment_solution(const Node_List* pac
pack->size(),
pre_end->init_trip(),
pre_end->stride_con(),
iv_stride()
iv_stride(),
_vloop.are_speculative_checks_possible()
DEBUG_ONLY(COMMA is_trace_align_vector()));
return solver.solve();
}
@@ -1896,6 +1897,7 @@ bool SuperWord::schedule_and_apply() const {
VTransformTrace trace(_vloop.vtrace(),
is_trace_superword_rejections(),
is_trace_align_vector(),
_vloop.is_trace_speculative_runtime_checks(),
is_trace_superword_info());
#endif
VTransform vtransform(_vloop_analyzer,
@@ -1938,8 +1940,11 @@ void VTransform::apply() {
adjust_pre_loop_limit_to_align_main_loop_vectors();
C->print_method(PHASE_AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, 4, cl());
apply_speculative_runtime_checks();
C->print_method(PHASE_AUTO_VECTORIZATION4_AFTER_SPECULATIVE_RUNTIME_CHECKS, 4, cl());
apply_vectorization();
C->print_method(PHASE_AUTO_VECTORIZATION4_AFTER_APPLY, 4, cl());
C->print_method(PHASE_AUTO_VECTORIZATION5_AFTER_APPLY, 4, cl());
}
// We prepare the memory graph for the replacement of scalar memops with vector memops.


@@ -29,25 +29,26 @@
#include "utilities/stringUtils.hpp"
#define COMPILER_TRACE_AUTO_VECTORIZATION_TAG(flags) \
flags(POINTER_PARSING, "Trace VPointer/MemPointer parsing") \
flags(POINTER_ALIASING, "Trace VPointer/MemPointer aliasing") \
flags(POINTER_ADJACENCY, "Trace VPointer/MemPointer adjacency") \
flags(POINTER_OVERLAP, "Trace VPointer/MemPointer overlap") \
flags(PRECONDITIONS, "Trace VLoop::check_preconditions") \
flags(LOOP_ANALYZER, "Trace VLoopAnalyzer::setup_submodules") \
flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \
flags(BODY, "Trace VLoopBody") \
flags(TYPES, "Trace VLoopTypes") \
flags(POINTERS, "Trace VLoopPointers") \
flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \
flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_memop_pairs") \
flags(SW_REJECTIONS, "Trace SuperWord rejections (non vectorizations)") \
flags(SW_PACKSET, "Trace SuperWord packset at different stages") \
flags(SW_INFO, "Trace SuperWord info (equivalent to TraceSuperWord)") \
flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \
flags(ALIGN_VECTOR, "Trace AlignVector") \
flags(VTRANSFORM, "Trace VTransform Graph") \
flags(ALL, "Trace everything (very verbose)")
flags(POINTER_PARSING, "Trace VPointer/MemPointer parsing") \
flags(POINTER_ALIASING, "Trace VPointer/MemPointer aliasing") \
flags(POINTER_ADJACENCY, "Trace VPointer/MemPointer adjacency") \
flags(POINTER_OVERLAP, "Trace VPointer/MemPointer overlap") \
flags(PRECONDITIONS, "Trace VLoop::check_preconditions") \
flags(LOOP_ANALYZER, "Trace VLoopAnalyzer::setup_submodules") \
flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \
flags(BODY, "Trace VLoopBody") \
flags(TYPES, "Trace VLoopTypes") \
flags(POINTERS, "Trace VLoopPointers") \
flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \
flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_memop_pairs") \
flags(SW_REJECTIONS, "Trace SuperWord rejections (non vectorizations)") \
flags(SW_PACKSET, "Trace SuperWord packset at different stages") \
flags(SW_INFO, "Trace SuperWord info (equivalent to TraceSuperWord)") \
flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \
flags(ALIGN_VECTOR, "Trace AlignVector") \
flags(SPECULATIVE_RUNTIME_CHECKS, "Trace VTransform::apply_speculative_runtime_checks") \
flags(VTRANSFORM, "Trace VTransform Graph") \
flags(ALL, "Trace everything (very verbose)")
#define table_entry(name, description) name,
enum TraceAutoVectorizationTag {


@@ -93,9 +93,9 @@ VStatus VLoop::check_preconditions_helper() {
return VStatus::make_failure(VLoop::FAILURE_BACKEDGE);
}
// To align vector memory accesses in the main-loop, we will have to adjust
// the pre-loop limit.
if (_cl->is_main_loop()) {
// To align vector memory accesses in the main-loop, we will have to adjust
// the pre-loop limit.
CountedLoopEndNode* pre_end = _cl->find_pre_loop_end();
if (pre_end == nullptr) {
return VStatus::make_failure(VLoop::FAILURE_PRE_LOOP_LIMIT);
@@ -105,6 +105,41 @@ VStatus VLoop::check_preconditions_helper() {
return VStatus::make_failure(VLoop::FAILURE_PRE_LOOP_LIMIT);
}
_pre_loop_end = pre_end;
// See if we find the infrastructure for speculative runtime-checks.
// (1) Auto Vectorization Parse Predicate
Node* pre_ctrl = pre_loop_head()->in(LoopNode::EntryControl);
const Predicates predicates(pre_ctrl);
const PredicateBlock* predicate_block = predicates.auto_vectorization_check_block();
if (predicate_block->has_parse_predicate()) {
_auto_vectorization_parse_predicate_proj = predicate_block->parse_predicate_success_proj();
}
// (2) Multiversioning fast-loop projection
IfTrueNode* before_predicates = predicates.entry()->isa_IfTrue();
if (before_predicates != nullptr &&
before_predicates->in(0)->is_If() &&
before_predicates->in(0)->in(1)->is_OpaqueMultiversioning()) {
_multiversioning_fast_proj = before_predicates;
}
#ifndef PRODUCT
if (is_trace_preconditions() || is_trace_speculative_runtime_checks()) {
tty->print_cr(" Infrastructure for speculative runtime-checks:");
if (_auto_vectorization_parse_predicate_proj != nullptr) {
tty->print_cr(" auto_vectorization_parse_predicate_proj: speculate and trap");
_auto_vectorization_parse_predicate_proj->dump_bfs(5,0,"");
} else if (_multiversioning_fast_proj != nullptr) {
tty->print_cr(" multiversioning_fast_proj: speculate and multiversion");
_multiversioning_fast_proj->dump_bfs(5,0,"");
} else {
tty->print_cr(" Not found.");
}
}
#endif
assert(_auto_vectorization_parse_predicate_proj == nullptr ||
_multiversioning_fast_proj == nullptr, "we should only have at most one of these");
assert(_cl->is_multiversion_fast_loop() == (_multiversioning_fast_proj != nullptr),
"must find the multiversion selector IFF loop is a multiversion fast loop");
}
return VStatus::make_success();
@@ -472,15 +507,28 @@ AlignmentSolution* AlignmentSolver::solve() const {
// + con + con + C_const (sum of constant terms)
//
// We describe the 6 terms:
// 1) The "base" of the address is the address of a Java object (e.g. array),
// and as such ObjectAlignmentInBytes (a power of 2) aligned. We have
// defined aw = MIN(vector_width, ObjectAlignmentInBytes), which is also
// 1) The "base" of the address:
// - For heap objects, this is the base of the object, and as such
// ObjectAlignmentInBytes (a power of 2) aligned.
// - For off-heap / native memory, the "base" has no alignment
// guarantees. To ensure alignment we can do either of these:
// - Add a runtime check to verify ObjectAlignmentInBytes alignment,
// i.e. we can speculatively compile with an alignment assumption.
// If we pass the check, we can go into the loop with the alignment
// assumption, if we fail we have to trap/deopt or take the other
// loop version without alignment assumptions.
// - If runtime checks are not possible, then we return an empty
// solution, i.e. we do not vectorize the corresponding pack.
//
// Let us assume we have an object "base", or passed the alignment
// runtime check for native "bases", hence we know:
//
// base % ObjectAlignmentInBytes = 0
//
// We defined aw = MIN(vector_width, ObjectAlignmentInBytes), which is
// a power of 2. Hence we know that "base" is also aw-aligned:
//
// base % ObjectAlignmentInBytes = 0 ==> base % aw = 0
//
// TODO: Note: we have been assuming that this also holds for native memory base
// addresses. This is incorrect, see JDK-8323582.
// base % ObjectAlignmentInBytes = 0 ==> base % aw = 0 (BASE_ALIGNED)
//
// 2) The "C_const" term is the sum of all constant terms. This is "con",
// plus "iv_scale * init" if it is constant.
@@ -505,6 +553,13 @@ AlignmentSolution* AlignmentSolver::solve() const {
// 6) The "C_main * main_iter" term represents how much the iv is increased
// during "main_iter" main-loop iterations.
// For native memory, we must add a runtime-check that "base % ObjectAlignmentInBytes = 0",
// to ensure (BASE_ALIGNED). If we cannot add this runtime-check, we have no guarantee on
// its alignment.
if (!_vpointer.mem_pointer().base().is_object() && !_are_speculative_checks_possible) {
return new EmptyAlignmentSolution("Cannot add speculative check for native memory alignment.");
}
// Attribute init (i.e. _init_node) either to C_const or to C_init term.
const int C_const_init = _init_node->is_ConI() ? _init_node->as_ConI()->get_int() : 0;
const int C_const = _vpointer.con() + C_const_init * iv_scale();
@@ -521,8 +576,7 @@ AlignmentSolution* AlignmentSolver::solve() const {
// We must find a pre_iter, such that adr is aw aligned: adr % aw = 0. Note, that we are defining the
// modulo operator "%" such that the remainder is always positive, see AlignmentSolution::mod(i, q).
//
// TODO: Note: the following assumption is incorrect for native memory bases, see JDK-8323582.
// Since "base % aw = 0", we only need to ensure alignment of the other 5 terms:
// Since "base % aw = 0" (BASE_ALIGNED), we only need to ensure alignment of the other 5 terms:
//
// (C_const + C_invar * var_invar + C_init * var_init + C_pre * pre_iter + C_main * main_iter) % aw = 0 (1)
//
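
A numeric sketch of equation (1) with assumed constants, simplified to the case of no invariant and a constant init (so only C_const and the pre-loop term remain): we search for the pre_iter that makes the address aw-aligned.

#include <cstdio>

// mod with a non-negative remainder, as in AlignmentSolution::mod(i, q).
static int mod(int i, int q) { return ((i % q) + q) % q; }

int main() {
  const int aw      = 16; // assumed: MIN(vector_width, ObjectAlignmentInBytes)
  const int C_const = 12; // assumed constant byte offset of the address
  const int C_pre   = 4;  // assumed: iv_scale * pre_stride, bytes per pre-loop iteration
  for (int pre_iter = 0; pre_iter < aw; pre_iter++) {
    if (mod(C_const + C_pre * pre_iter, aw) == 0) {
      printf("aligned after pre_iter = %d\n", pre_iter); // prints 1: (12 + 4) % 16 == 0
      break;
    }
  }
  return 0;
}
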
@@ -878,8 +932,7 @@ AlignmentSolution* AlignmentSolver::solve() const {
// + iv_scale * pre_stride * pre_iter
// + iv_scale * main_stride * main_iter)) % aw =
//
// -> base aligned: base % aw = 0
// TODO: Note: this assumption is incorrect for native memory bases, see JDK-8323582.
// -> apply (BASE_ALIGNED): base % aw = 0
// -> main-loop iterations aligned (2): C_main % aw = (iv_scale * main_stride) % aw = 0
// (con + invar + iv_scale * init + iv_scale * pre_stride * pre_iter) % aw =
//
@@ -958,7 +1011,7 @@ void AlignmentSolver::trace_start_solve() const {
_pre_stride, _main_stride);
// adr = base + con + invar + iv_scale * iv
tty->print(" adr = base[%d]", base().object_or_native()->_idx);
tty->print(" + invar + iv_scale(%d) * iv + con(%d)", iv_scale(), _vpointer.con());
tty->print_cr(" + invar + iv_scale(%d) * iv + con(%d)", iv_scale(), _vpointer.con());
}
}


@@ -85,6 +85,14 @@ private:
PhiNode* _iv;
CountedLoopEndNode* _pre_loop_end; // cache access to pre-loop for main loops only
// We can add speculative runtime-checks if we have one of these:
// - Auto Vectorization Parse Predicate:
// pass all checks or trap -> recompile without this predicate.
// - Multiversioning fast-loop projection:
// pass all checks or go to slow-path-loop, where we have no speculative assumptions.
ParsePredicateSuccessProj* _auto_vectorization_parse_predicate_proj;
IfTrueNode* _multiversioning_fast_proj;
NOT_PRODUCT(VTrace _vtrace;)
NOT_PRODUCT(TraceMemPointer _mptrace;)
@ -104,7 +112,9 @@ public:
_cl (nullptr),
_cl_exit (nullptr),
_iv (nullptr),
_pre_loop_end (nullptr),
_auto_vectorization_parse_predicate_proj(nullptr),
_multiversioning_fast_proj(nullptr)
#ifndef PRODUCT
COMMA
_mptrace(TraceMemPointer(
@ -138,6 +148,19 @@ public:
return head;
};
ParsePredicateSuccessProj* auto_vectorization_parse_predicate_proj() const {
return _auto_vectorization_parse_predicate_proj;
}
IfTrueNode* multiversioning_fast_proj() const {
return _multiversioning_fast_proj;
}
bool are_speculative_checks_possible() const {
return _auto_vectorization_parse_predicate_proj != nullptr ||
_multiversioning_fast_proj != nullptr;
}
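The two anchors differ in what happens when a speculative check fails at runtime: the Parse Predicate path traps and recompiles without the assumption, while the multiversioning path simply keeps executing in the slow-path loop. A small standalone model of just this control-flow distinction (hypothetical code, not HotSpot's):

#include <cstdio>

enum class Anchor { ParsePredicate, Multiversion };

// What happens when a speculative alignment check fails at runtime:
static void on_check_failed(Anchor anchor) {
  if (anchor == Anchor::ParsePredicate) {
    // Trap: deoptimize, then recompile without the failed speculative assumption.
    std::printf("deopt -> recompile without speculative assumptions\n");
  } else {
    // Multiversioning: stay compiled, but branch into the slow-path loop.
    std::printf("take slow-path loop (no speculative assumptions)\n");
  }
}

int main() {
  on_check_failed(Anchor::ParsePredicate);
  on_check_failed(Anchor::Multiversion);
  return 0;
}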
// Estimate maximum size for data structures, to avoid repeated reallocation
int estimated_body_length() const { return lpt()->_body.size(); };
int estimated_node_count() const { return (int)(1.10 * phase()->C->unique()); };
@ -176,6 +199,10 @@ public:
bool is_trace_vpointers() const {
return _vtrace.is_trace(TraceAutoVectorizationTag::POINTERS);
}
bool is_trace_speculative_runtime_checks() const {
return _vtrace.is_trace(TraceAutoVectorizationTag::SPECULATIVE_RUNTIME_CHECKS);
}
#endif
// Is the node in the basic block of the loop?
@ -1296,6 +1323,14 @@ private:
const int _pre_stride; // address increment per pre-loop iteration
const int _main_stride; // address increment per main-loop iteration
// For native bases, we have no alignment guarantee. This means we cannot, in
// general, guarantee alignment statically. But we can check alignment with a
// speculative runtime check, see VTransform::apply_speculative_runtime_checks.
// For this, we need to find the Predicate for auto vectorization checks, or
// else the multiversion_if. If we cannot find either, then we cannot make
// any speculative runtime checks.
const bool _are_speculative_checks_possible;
DEBUG_ONLY( const bool _is_trace; );
static const MemNode* mem_ref_not_null(const MemNode* mem_ref) {
@ -1309,7 +1344,8 @@ public:
const uint vector_length,
const Node* init_node,
const int pre_stride,
const int main_stride,
const bool are_speculative_checks_possible
DEBUG_ONLY( COMMA const bool is_trace)
) :
_vpointer( vpointer),
@ -1318,7 +1354,8 @@ public:
_aw( MIN2(_vector_width, ObjectAlignmentInBytes)),
_init_node( init_node),
_pre_stride( pre_stride),
_main_stride( main_stride),
_are_speculative_checks_possible(are_speculative_checks_possible)
DEBUG_ONLY( COMMA _is_trace(is_trace) )
{
assert(_mem_ref != nullptr &&

View File

@ -23,6 +23,7 @@
#include "opto/vtransform.hpp"
#include "opto/vectornode.hpp"
#include "opto/castnode.hpp"
#include "opto/convertnode.hpp"
void VTransformGraph::add_vtnode(VTransformNode* vtnode) {
@ -143,6 +144,94 @@ void VTransformApplyResult::trace(VTransformNode* vtnode) const {
}
#endif
void VTransform::apply_speculative_runtime_checks() {
if (VLoop::vectors_should_be_aligned()) {
#ifdef ASSERT
if (_trace._align_vector || _trace._speculative_runtime_checks) {
tty->print_cr("\nVTransform::apply_speculative_runtime_checks: native memory alignment");
}
#endif
const GrowableArray<VTransformNode*>& vtnodes = _graph.vtnodes();
for (int i = 0; i < vtnodes.length(); i++) {
VTransformVectorNode* vtn = vtnodes.at(i)->isa_Vector();
if (vtn == nullptr) { continue; }
MemNode* p0 = vtn->nodes().at(0)->isa_Mem();
if (p0 == nullptr) { continue; }
const VPointer& vp = vpointer(p0);
if (vp.mem_pointer().base().is_object()) { continue; }
assert(vp.mem_pointer().base().is_native(), "VPointer base must be object or native");
// We have a native memory reference. Build a runtime check for it.
// See: AlignmentSolver::solve
// In a future RFE we may be able to speculate on invar alignment as
// well, and allow vectorization of more cases.
add_speculative_alignment_check(vp.mem_pointer().base().native(), ObjectAlignmentInBytes);
}
}
}
#define TRACE_SPECULATIVE_ALIGNMENT_CHECK(node) { \
DEBUG_ONLY( \
if (_trace._align_vector || _trace._speculative_runtime_checks) { \
tty->print(" " #node ": "); \
node->dump(); \
} \
) \
} \
// Check: (node % alignment) == 0.
void VTransform::add_speculative_alignment_check(Node* node, juint alignment) {
TRACE_SPECULATIVE_ALIGNMENT_CHECK(node);
Node* ctrl = phase()->get_ctrl(node);
// Cast adr/long -> int
if (node->bottom_type()->basic_type() == T_ADDRESS) {
// adr -> int/long
node = new CastP2XNode(nullptr, node);
phase()->register_new_node(node, ctrl);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(node);
}
if (node->bottom_type()->basic_type() == T_LONG) {
// long -> int
node = new ConvL2INode(node);
phase()->register_new_node(node, ctrl);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(node);
}
Node* mask_alignment = igvn().intcon(alignment-1);
Node* base_alignment = new AndINode(node, mask_alignment);
phase()->register_new_node(base_alignment, ctrl);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(mask_alignment);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(base_alignment);
Node* zero = igvn().intcon(0);
Node* cmp_alignment = CmpNode::make(base_alignment, zero, T_INT, false);
BoolNode* bol_alignment = new BoolNode(cmp_alignment, BoolTest::eq);
phase()->register_new_node(cmp_alignment, ctrl);
phase()->register_new_node(bol_alignment, ctrl);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(cmp_alignment);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(bol_alignment);
add_speculative_check(bol_alignment);
}
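The node chain built above (CastP2X, optionally ConvL2I, AndI, CmpI against zero, Bool(eq)) is equivalent to the following scalar predicate; truncating the address to 32 bits is harmless here because alignment - 1 only occupies the low bits. A sketch with an invented helper name, not HotSpot code:

#include <cassert>
#include <cstdint>

// Scalar equivalent of the emitted check: (adr % alignment) == 0,
// for a power-of-2 alignment.
static bool passes_alignment_check(const void* adr, uint32_t alignment) {
  uint32_t low_bits = (uint32_t)(uintptr_t)adr;  // CastP2X (+ ConvL2I on 64-bit)
  return (low_bits & (alignment - 1u)) == 0u;    // AndI, CmpI against 0, Bool(eq)
}

int main() {
  alignas(8) char buf[16];
  assert(passes_alignment_check(buf, 8));        // aligned base passes
  assert(!passes_alignment_check(buf + 1, 8));   // base misaligned by 1 byte fails
  return 0;
}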
void VTransform::add_speculative_check(BoolNode* bol) {
assert(_vloop.are_speculative_checks_possible(), "otherwise we cannot make speculative assumptions");
ParsePredicateSuccessProj* parse_predicate_proj = _vloop.auto_vectorization_parse_predicate_proj();
IfTrueNode* new_check_proj = nullptr;
if (parse_predicate_proj != nullptr) {
new_check_proj = phase()->create_new_if_for_predicate(parse_predicate_proj, nullptr,
Deoptimization::Reason_auto_vectorization_check,
Op_If);
} else {
new_check_proj = phase()->create_new_if_for_multiversion(_vloop.multiversioning_fast_proj());
}
Node* iff_speculate = new_check_proj->in(0);
igvn().replace_input_of(iff_speculate, 1, bol);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(iff_speculate);
}
// Helper-class for VTransformGraph::has_store_to_load_forwarding_failure.
// It wraps a VPointer. The VPointer has an iv_offset applied, which
// simulates a virtual unrolling. They represent the memory region:

View File

@ -109,16 +109,19 @@ public:
const bool _verbose;
const bool _rejections;
const bool _align_vector;
const bool _speculative_runtime_checks;
const bool _info;
VTransformTrace(const VTrace& vtrace,
const bool is_trace_rejections,
const bool is_trace_align_vector,
const bool is_trace_speculative_runtime_checks,
const bool is_trace_info) :
_verbose (vtrace.is_trace(TraceAutoVectorizationTag::ALL)),
_rejections (_verbose | is_trace_vtransform(vtrace) | is_trace_rejections),
_align_vector (_verbose | is_trace_vtransform(vtrace) | is_trace_align_vector),
_speculative_runtime_checks(_verbose | is_trace_vtransform(vtrace) | is_trace_speculative_runtime_checks),
_info (_verbose | is_trace_vtransform(vtrace) | is_trace_info) {}
static bool is_trace_vtransform(const VTrace& vtrace) {
return vtrace.is_trace(TraceAutoVectorizationTag::VTRANSFORM);
@ -245,6 +248,10 @@ private:
void determine_mem_ref_and_aw_for_main_loop_alignment();
void adjust_pre_loop_limit_to_align_main_loop_vectors();
void apply_speculative_runtime_checks();
void add_speculative_alignment_check(Node* node, juint alignment);
void add_speculative_check(BoolNode* bol);
void apply_vectorization() const;
};

View File

@ -2717,6 +2717,7 @@ const char* Deoptimization::_trap_reason_name[] = {
"intrinsic" JVMCI_ONLY("_or_type_checked_inlining"),
"bimorphic" JVMCI_ONLY("_or_optimized_type_check"),
"profile_predicate",
"auto_vectorization_check",
"unloaded",
"uninitialized",
"initialized",

View File

@ -98,6 +98,7 @@ class Deoptimization : AllStatic {
#endif
Reason_profile_predicate, // compiler generated predicate moved from frequent branch in a loop failed
Reason_auto_vectorization_check, // compiler generated (speculative) auto vectorization checks failed
// recorded per method
Reason_unloaded, // unloaded class or constant pool entry

View File

@ -2269,6 +2269,7 @@
declare_constant(Deoptimization::Reason_age) \
declare_constant(Deoptimization::Reason_predicate) \
declare_constant(Deoptimization::Reason_loop_limit_check) \
declare_constant(Deoptimization::Reason_auto_vectorization_check) \
declare_constant(Deoptimization::Reason_speculate_class_check) \
declare_constant(Deoptimization::Reason_speculate_null_check) \
declare_constant(Deoptimization::Reason_speculate_null_assert) \

View File

@ -0,0 +1,303 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package compiler.loopopts.superword;
import compiler.lib.ir_framework.*;
import compiler.lib.verify.*;
import jdk.test.lib.Utils;
import java.nio.ByteBuffer;
import java.util.Map;
import java.util.HashMap;
import java.util.Random;
import java.lang.foreign.*;
/*
* @test id=byte-buffer-direct
* @bug 8323582
* @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned.
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress ByteBufferDirect
*/
/*
* @test id=byte-buffer-direct-AlignVector
* @bug 8323582
* @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned.
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress ByteBufferDirect AlignVector
*/
/*
* @test id=byte-buffer-direct-VerifyAlignVector
* @bug 8323582
* @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned.
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress ByteBufferDirect VerifyAlignVector
*/
/*
* @test id=native
* @bug 8323582
* @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned.
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress Native
*/
/*
* @test id=native-AlignVector
* @bug 8323582
* @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned.
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress Native AlignVector
*/
/*
* @test id=native-VerifyAlignVector
* @bug 8323582
* @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned.
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress Native VerifyAlignVector
*/
public class TestMemorySegmentUnalignedAddress {
public static void main(String[] args) {
TestFramework framework = new TestFramework(TestMemorySegmentUnalignedAddressImpl.class);
framework.addFlags("-DmemorySegmentProviderNameForTestVM=" + args[0]);
if (args.length > 1) {
switch (args[1]) {
case "AlignVector" -> { framework.addFlags("-XX:+AlignVector"); }
case "VerifyAlignVector" -> { framework.addFlags("-XX:+AlignVector", "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+VerifyAlignVector"); }
default -> { throw new RuntimeException("unexpected: " + args[1]); }
}
}
framework.setDefaultWarmup(100);
framework.start();
}
}
class TestMemorySegmentUnalignedAddressImpl {
static final int SIZE = 10_000;
static final int BACKING_SIZE = 10_000 + 1;
static final Random RANDOM = Utils.getRandomInstance();
interface TestFunction {
Object run(int i);
}
interface MemorySegmentProvider {
MemorySegment newMemorySegment();
}
static MemorySegmentProvider provider;
static {
String providerName = System.getProperty("memorySegmentProviderNameForTestVM");
provider = switch (providerName) {
case "ByteBufferDirect" -> TestMemorySegmentUnalignedAddressImpl::newMemorySegmentOfByteBufferDirect;
case "Native" -> TestMemorySegmentUnalignedAddressImpl::newMemorySegmentOfNative;
default -> throw new RuntimeException("Test argument not recognized: " + providerName);
};
}
// List of tests
Map<String, TestFunction> tests = new HashMap<>();
// List of gold, the results from the first run before compilation
Map<String, Object> golds = new HashMap<>();
public TestMemorySegmentUnalignedAddressImpl() {
// Generate two MemorySegments as inputs
MemorySegment a = sliceAligned(newMemorySegment());
MemorySegment b = sliceAligned(newMemorySegment());
fillRandom(a);
fillRandom(b);
// Add all tests to list
tests.put("testAlwaysAligned", (int i) -> {
MemorySegment ms = newMemorySegment();
MemorySegment slice = sliceAligned(ms);
copy(a, slice);
return testAlwaysAligned(slice);
});
tests.put("testAlwaysUnaligned", (int i) -> {
MemorySegment ms = newMemorySegment();
MemorySegment slice = sliceUnaligned(ms);
copy(a, slice);
return testAlwaysUnaligned(slice);
});
tests.put("testMixedAlignedAndUnaligned", (int i) -> {
MemorySegment ms = newMemorySegment();
MemorySegment slice = (i % 2 == 0) ? sliceUnaligned(ms) : sliceAligned(ms);
copy(a, slice);
return testMixedAlignedAndUnaligned(slice);
});
// Compute gold value for all test methods before compilation
for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
String name = entry.getKey();
TestFunction test = entry.getValue();
Object gold = test.run(0);
golds.put(name, gold);
}
}
MemorySegment sliceAligned(MemorySegment src) {
return src.asSlice(0, SIZE);
}
MemorySegment sliceUnaligned(MemorySegment src) {
return src.asSlice(1, SIZE);
}
MemorySegment newMemorySegment() {
return provider.newMemorySegment();
}
static void copy(MemorySegment src, MemorySegment dst) {
MemorySegment.copy(src, 0, dst, 0, src.byteSize());
}
static MemorySegment newMemorySegmentOfByteBufferDirect() {
return MemorySegment.ofBuffer(ByteBuffer.allocateDirect(BACKING_SIZE));
}
static MemorySegment newMemorySegmentOfNative() {
// Auto arena: GC decides when there is no reference to the MemorySegment,
// and then it deallocates the backing memory.
return Arena.ofAuto().allocate(BACKING_SIZE, 1);
}
static void fillRandom(MemorySegment data) {
for (int i = 0; i < (int)data.byteSize(); i++) {
data.set(ValueLayout.JAVA_BYTE, i, (byte)RANDOM.nextInt());
}
}
static void verify(String name, Object gold, Object result) {
try {
Verify.checkEQ(gold, result);
} catch (VerifyException e) {
throw new RuntimeException("Verify: wrong result in " + name, e);
}
}
static int runInvocationCounter = 0;
@Run(test = {"testAlwaysAligned",
"testAlwaysUnaligned",
"testMixedAlignedAndUnaligned"})
void runTests() {
runInvocationCounter++;
for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
String name = entry.getKey();
TestFunction test = entry.getValue();
// Recall gold value from before compilation
Object gold = golds.get(name);
// Compute new result
Object result = test.run(runInvocationCounter);
// Compare gold and new result
verify(name, gold, result);
}
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
IRNode.ADD_VI, "> 0",
IRNode.STORE_VECTOR, "> 0",
"multiversion", "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
phase = CompilePhase.PRINT_IDEAL)
// We never fail the alignment check in the auto vectorization Predicate,
// hence we never even create the multiversioned loops.
static Object testAlwaysAligned(MemorySegment ms) {
for (long i = 0; i < ms.byteSize(); i += 4) {
int v = ms.get(ValueLayout.JAVA_INT_UNALIGNED, i);
ms.set(ValueLayout.JAVA_INT_UNALIGNED, i, (int)(v + 1));
}
return new Object[]{ ms };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
IRNode.ADD_VI, "> 0",
IRNode.STORE_VECTOR, "> 0",
"multiversion_fast", "= 4", // pre, main, drain, post
"multiversion_slow", "= 2"}, // main, post
applyIf = {"AlignVector", "true"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
phase = CompilePhase.PRINT_IDEAL)
// We add alignment checks to the auto vectorization Predicate. It fails
// at runtime, deopts, and recompiles with multiversioning.
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
IRNode.ADD_VI, "> 0",
IRNode.STORE_VECTOR, "> 0",
"multiversion_fast", "= 0",
"multiversion_slow", "= 0"},
applyIf = {"AlignVector", "false"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
phase = CompilePhase.PRINT_IDEAL)
// We never add any conditions to the auto vectorization Predicate, so
// we also never deopt and never end up multiversioning.
static Object testAlwaysUnaligned(MemorySegment ms) {
for (long i = 0; i < ms.byteSize(); i += 4) {
int v = ms.get(ValueLayout.JAVA_INT_UNALIGNED, i);
ms.set(ValueLayout.JAVA_INT_UNALIGNED, i, (int)(v + 1));
}
return new Object[]{ ms };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
IRNode.ADD_VI, "> 0",
IRNode.STORE_VECTOR, "> 0",
"multiversion_fast", "= 4", // pre, main, drain, post
"multiversion_slow", "= 2"}, // main, post
applyIf = {"AlignVector", "true"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
phase = CompilePhase.PRINT_IDEAL)
// We add alignment checks to the auto vectorization Predicate. It fails
// at runtime, deopts, and recompiles with multiversioning.
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
IRNode.ADD_VI, "> 0",
IRNode.STORE_VECTOR, "> 0",
"multiversion_fast", "= 0",
"multiversion_slow", "= 0"},
applyIf = {"AlignVector", "false"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
phase = CompilePhase.PRINT_IDEAL)
// We never add any conditions to the auto vectorization Predicate, so
// we also never deopt and never end up multiversioning.
static Object testMixedAlignedAndUnaligned(MemorySegment ms) {
for (long i = 0; i < ms.byteSize(); i += 4) {
int v = ms.get(ValueLayout.JAVA_INT_UNALIGNED, i);
ms.set(ValueLayout.JAVA_INT_UNALIGNED, i, (int)(v + 1));
}
return new Object[]{ ms };
}
}