From 885338b5f38ed05d8b91efc0178b371f2f89310e Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 27 Feb 2025 06:58:43 +0000 Subject: [PATCH] 8323582: C2 SuperWord AlignVector: misaligned vector memory access with unaligned native memory Reviewed-by: roland, kvn --- src/hotspot/share/jvmci/vmStructs_jvmci.cpp | 1 + src/hotspot/share/opto/c2_globals.hpp | 6 + src/hotspot/share/opto/cfgnode.hpp | 2 +- src/hotspot/share/opto/classes.hpp | 1 + src/hotspot/share/opto/graphKit.cpp | 1 + src/hotspot/share/opto/ifnode.cpp | 6 +- src/hotspot/share/opto/loopTransform.cpp | 33 +- src/hotspot/share/opto/loopUnswitch.cpp | 369 +++++++++++++++--- src/hotspot/share/opto/loopnode.cpp | 38 +- src/hotspot/share/opto/loopnode.hpp | 41 +- src/hotspot/share/opto/loopopts.cpp | 60 +++ src/hotspot/share/opto/mempointer.hpp | 5 +- src/hotspot/share/opto/node.hpp | 3 + src/hotspot/share/opto/opaquenode.hpp | 23 ++ src/hotspot/share/opto/phasetype.hpp | 11 +- src/hotspot/share/opto/predicates.cpp | 3 + src/hotspot/share/opto/predicates.hpp | 11 +- src/hotspot/share/opto/superword.cpp | 9 +- .../share/opto/traceAutoVectorizationTag.hpp | 39 +- src/hotspot/share/opto/vectorization.cpp | 81 +++- src/hotspot/share/opto/vectorization.hpp | 43 +- src/hotspot/share/opto/vtransform.cpp | 89 +++++ src/hotspot/share/opto/vtransform.hpp | 15 +- src/hotspot/share/runtime/deoptimization.cpp | 1 + src/hotspot/share/runtime/deoptimization.hpp | 1 + src/hotspot/share/runtime/vmStructs.cpp | 1 + .../TestMemorySegmentUnalignedAddress.java | 303 ++++++++++++++ 27 files changed, 1067 insertions(+), 129 deletions(-) create mode 100644 test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegmentUnalignedAddress.java diff --git a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp index 8c964b56931..93c3449d0ff 100644 --- a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp +++ b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp @@ -708,6 +708,7 @@ declare_constant(Deoptimization::Reason_constraint) \ declare_constant(Deoptimization::Reason_div0_check) \ declare_constant(Deoptimization::Reason_loop_limit_check) \ + declare_constant(Deoptimization::Reason_auto_vectorization_check) \ declare_constant(Deoptimization::Reason_type_checked_inlining) \ declare_constant(Deoptimization::Reason_optimized_type_check) \ declare_constant(Deoptimization::Reason_aliasing) \ diff --git a/src/hotspot/share/opto/c2_globals.hpp b/src/hotspot/share/opto/c2_globals.hpp index caa0474c475..05b34bade43 100644 --- a/src/hotspot/share/opto/c2_globals.hpp +++ b/src/hotspot/share/opto/c2_globals.hpp @@ -346,6 +346,12 @@ develop(bool, TraceLoopUnswitching, false, \ "Trace loop unswitching") \ \ + product(bool, LoopMultiversioning, true, DIAGNOSTIC, \ + "Enable loop multiversioning (for speculative compilation)") \ + \ + develop(bool, TraceLoopMultiversioning, false, \ + "Trace loop multiversioning") \ + \ product(bool, AllowVectorizeOnDemand, true, \ "Globally suppress vectorization set in VectorizeMethod") \ \ diff --git a/src/hotspot/share/opto/cfgnode.hpp b/src/hotspot/share/opto/cfgnode.hpp index 899e3c9bb85..c2cc0161df1 100644 --- a/src/hotspot/share/opto/cfgnode.hpp +++ b/src/hotspot/share/opto/cfgnode.hpp @@ -428,7 +428,7 @@ public: IfNode(Node* control, Node* bol, float p, float fcnt); IfNode(Node* control, Node* bol, float p, float fcnt, AssertionPredicateType assertion_predicate_type); - static IfNode* make_with_same_profile(IfNode* if_node_profile, Node* ctrl, BoolNode* bol); + static IfNode* make_with_same_profile(IfNode* 
if_node_profile, Node* ctrl, Node* bol); virtual int Opcode() const; virtual bool pinned() const { return true; } diff --git a/src/hotspot/share/opto/classes.hpp b/src/hotspot/share/opto/classes.hpp index 918d8156b5f..41b621dfce9 100644 --- a/src/hotspot/share/opto/classes.hpp +++ b/src/hotspot/share/opto/classes.hpp @@ -277,6 +277,7 @@ macro(OnSpinWait) macro(Opaque1) macro(OpaqueLoopInit) macro(OpaqueLoopStride) +macro(OpaqueMultiversioning) macro(OpaqueZeroTripGuard) macro(OpaqueNotNull) macro(OpaqueInitializedAssertionPredicate) diff --git a/src/hotspot/share/opto/graphKit.cpp b/src/hotspot/share/opto/graphKit.cpp index 2af70960f54..bb225eeabf5 100644 --- a/src/hotspot/share/opto/graphKit.cpp +++ b/src/hotspot/share/opto/graphKit.cpp @@ -4086,6 +4086,7 @@ void GraphKit::add_parse_predicates(int nargs) { if (UseProfiledLoopPredicate) { add_parse_predicate(Deoptimization::Reason_profile_predicate, nargs); } + add_parse_predicate(Deoptimization::Reason_auto_vectorization_check, nargs); // Loop Limit Check Predicate should be near the loop. add_parse_predicate(Deoptimization::Reason_loop_limit_check, nargs); } diff --git a/src/hotspot/share/opto/ifnode.cpp b/src/hotspot/share/opto/ifnode.cpp index 56602135560..5da8993306f 100644 --- a/src/hotspot/share/opto/ifnode.cpp +++ b/src/hotspot/share/opto/ifnode.cpp @@ -469,7 +469,7 @@ static Node* split_if(IfNode *iff, PhaseIterGVN *igvn) { return new ConINode(TypeInt::ZERO); } -IfNode* IfNode::make_with_same_profile(IfNode* if_node_profile, Node* ctrl, BoolNode* bol) { +IfNode* IfNode::make_with_same_profile(IfNode* if_node_profile, Node* ctrl, Node* bol) { // Assert here that we only try to create a clone from an If node with the same profiling if that actually makes sense. // Some If node subtypes should not be cloned in this way. In theory, we should not clone BaseCountedLoopEndNodes. // But they can end up being used as normal If nodes when peeling a loop - they serve as zero-trip guard. @@ -2177,6 +2177,7 @@ ParsePredicateNode::ParsePredicateNode(Node* control, Deoptimization::DeoptReaso switch (deopt_reason) { case Deoptimization::Reason_predicate: case Deoptimization::Reason_profile_predicate: + case Deoptimization::Reason_auto_vectorization_check: case Deoptimization::Reason_loop_limit_check: break; default: @@ -2214,6 +2215,9 @@ void ParsePredicateNode::dump_spec(outputStream* st) const { case Deoptimization::DeoptReason::Reason_profile_predicate: st->print("Profiled Loop "); break; + case Deoptimization::DeoptReason::Reason_auto_vectorization_check: + st->print("Auto_Vectorization_Check "); + break; case Deoptimization::DeoptReason::Reason_loop_limit_check: st->print("Loop Limit Check "); break; diff --git a/src/hotspot/share/opto/loopTransform.cpp b/src/hotspot/share/opto/loopTransform.cpp index 03a7bf50e70..436d8758df3 100644 --- a/src/hotspot/share/opto/loopTransform.cpp +++ b/src/hotspot/share/opto/loopTransform.cpp @@ -745,6 +745,11 @@ void PhaseIdealLoop::do_peeling(IdealLoopTree *loop, Node_List &old_new) { cl->set_trip_count(cl->trip_count() - 1); if (cl->is_main_loop()) { cl->set_normal_loop(); + if (cl->is_multiversion()) { + // Peeling also destroys the connection of the main loop + // to the multiversion_if. 
+ cl->set_no_multiversion(); + } #ifndef PRODUCT if (PrintOpto && VerifyLoopOptimizations) { tty->print("Peeling a 'main' loop; resetting to 'normal' "); @@ -1174,8 +1179,9 @@ bool IdealLoopTree::policy_range_check(PhaseIdealLoop* phase, bool provisional, if (!bol->is_Bool()) { assert(bol->is_OpaqueNotNull() || bol->is_OpaqueTemplateAssertionPredicate() || - bol->is_OpaqueInitializedAssertionPredicate(), - "Opaque node of a non-null-check or an Assertion Predicate"); + bol->is_OpaqueInitializedAssertionPredicate() || + bol->is_OpaqueMultiversioning(), + "Opaque node of a non-null-check or an Assertion Predicate or Multiversioning"); continue; } if (bol->as_Bool()->_test._test == BoolTest::ne) { @@ -3354,6 +3360,23 @@ bool IdealLoopTree::iteration_split_impl(PhaseIdealLoop *phase, Node_List &old_n // Do nothing special to pre- and post- loops if (cl->is_pre_loop() || cl->is_post_loop()) return true; + // With multiversioning, we create a fast_loop and a slow_loop, and a multiversion_if that + // decides which loop is taken at runtime. At first, the multiversion_if always takes the + // fast_loop, and we only optimize the fast_loop. Since we are not sure if we will ever use + // the slow_loop, we delay optimizations for it, so we do not waste compile time and code + // size. If we never change the condition of the multiversion_if, the slow_loop is eventually + // folded away after loop-opts. While optimizing the fast_loop, we may want to perform some + // speculative optimization, for which we need a runtime-check. We add this runtime-check + // condition to the multiversion_if. Now, it becomes possible to execute the slow_loop at + // runtime, and we resume optimizations for slow_loop ("un-delay" it). + // TLDR: If the slow_loop is still in "delay" mode, check if the multiversion_if was changed + // and we should now resume optimizations for it. + if (cl->is_multiversion_delayed_slow_loop() && + !phase->try_resume_optimizations_for_delayed_slow_loop(this)) { + // We are still delayed, so wait with further loop-opts. + return true; + } + // Compute loop trip count from profile data compute_profile_trip_cnt(phase); @@ -3413,6 +3436,12 @@ bool IdealLoopTree::iteration_split_impl(PhaseIdealLoop *phase, Node_List &old_n if (!phase->may_require_nodes(estimate)) { return false; } + + // We are going to add pre-loop and post-loop. + // But should we also multi-version for auto-vectorization speculative + // checks, i.e. fast and slow-paths? + phase->maybe_multiversion_for_auto_vectorization_runtime_checks(this, old_new); + phase->insert_pre_post_loops(this, old_new, peel_only); } // Adjust the pre- and main-loop limits to let the pre and post loops run diff --git a/src/hotspot/share/opto/loopUnswitch.cpp b/src/hotspot/share/opto/loopUnswitch.cpp index 05e24e2271e..051aa59ca71 100644 --- a/src/hotspot/share/opto/loopUnswitch.cpp +++ b/src/hotspot/share/opto/loopUnswitch.cpp @@ -32,6 +32,23 @@ #include "opto/predicates.hpp" #include "opto/rootnode.hpp" +// Multiversioning: +// A loop is cloned, and a selector If decides which loop is taken at run-time: the true-path-loop (original) or the +// false-path-loop (cloned). +// +// Use-cases: +// - Speculative compilation: +// The selector If checks some assumptions which allow stronger optimization in the true-path-loop. If the assumptions +// do not hold, we can still execute in the false-path-loop, although with fewer optimizations. 
+// See: PhaseIdealLoop::maybe_multiversion_for_auto_vectorization_runtime_checks +// PhaseIdealLoop::create_new_if_for_multiversion +// +// - Unswitching: +// The selector If has the same (loop invariant) condition as some unswitching candidate If inside the loop. This +// allows us to constant-fold the unswitching candidate If to true in the true-path-loop and to false in the +// false-path-loop, thus eliminating the unswitching candidate If from the loop. +// +// // Loop Unswitching is a loop optimization to move an invariant, non-loop-exiting test in the loop body before the loop. // Such a test is either always true or always false in all loop iterations and could therefore only be executed once. // To achieve that, we duplicate the loop and change the original and cloned loop as follows: @@ -145,14 +162,16 @@ IfNode* PhaseIdealLoop::find_unswitch_candidate(const IdealLoopTree* loop) const return unswitch_candidate; } -// This class creates an If node (i.e. loop selector) that selects if the true-path-loop or the false-path-loop should be -// executed at runtime. This is done by finding an invariant and non-loop-exiting unswitch candidate If node (guaranteed -// to exist at this point) to perform Loop Unswitching on. -class UnswitchedLoopSelector : public StackObj { +// LoopSelector is used for loop multiversioning and unswitching. This class creates an If node (i.e. loop selector) +// that selects if the true-path-loop or the false-path-loop should be executed at runtime. +class LoopSelector : public StackObj { + // Cached fields for construction. PhaseIdealLoop* const _phase; IdealLoopTree* const _outer_loop; Node* const _original_loop_entry; - IfNode* const _unswitch_candidate; + const uint _dom_depth; // of original_loop_entry + + // Constructed selector if with its projections. IfNode* const _selector; IfTrueNode* const _true_path_loop_proj; IfFalseNode* const _false_path_loop_proj; @@ -160,52 +179,59 @@ class UnswitchedLoopSelector : public StackObj { enum PathToLoop { TRUE_PATH, FALSE_PATH }; public: - UnswitchedLoopSelector(IdealLoopTree* loop) + // For multiversioning: create a new selector (multiversion_if) from a bol condition. + LoopSelector(IdealLoopTree* loop, Node* bol, float prob, float fcnt) : _phase(loop->_phase), _outer_loop(loop->skip_strip_mined()->_parent), _original_loop_entry(loop->_head->as_Loop()->skip_strip_mined()->in(LoopNode::EntryControl)), - _unswitch_candidate(find_unswitch_candidate(loop)), - _selector(create_selector_if()), + _dom_depth(_phase->dom_depth(_original_loop_entry)), + _selector(create_multiversioning_if(bol, prob, fcnt)), // multiversioning _true_path_loop_proj(create_proj_to_loop(TRUE_PATH)->as_IfTrue()), _false_path_loop_proj(create_proj_to_loop(FALSE_PATH)->as_IfFalse()) { } - NONCOPYABLE(UnswitchedLoopSelector); - private: - IfNode* find_unswitch_candidate(IdealLoopTree* loop) { - IfNode* unswitch_candidate = _phase->find_unswitch_candidate(loop); - assert(unswitch_candidate != nullptr, "guaranteed to exist by policy_unswitching"); - assert(_phase->is_member(loop, unswitch_candidate), "must be inside original loop"); - return unswitch_candidate; + // For unswitching: create an unswitching if before the loop, from a pre-existing + // unswitching_candidate inside the loop. 
+ LoopSelector(IdealLoopTree* loop, IfNode* unswitch_candidate) + : _phase(loop->_phase), + _outer_loop(loop->skip_strip_mined()->_parent), + _original_loop_entry(loop->_head->as_Loop()->skip_strip_mined()->in(LoopNode::EntryControl)), + _dom_depth(_phase->dom_depth(_original_loop_entry)), + _selector(create_unswitching_if(unswitch_candidate)), // unswitching + _true_path_loop_proj(create_proj_to_loop(TRUE_PATH)->as_IfTrue()), + _false_path_loop_proj(create_proj_to_loop(FALSE_PATH)->as_IfFalse()) { } + NONCOPYABLE(LoopSelector); - IfNode* create_selector_if() const { - const uint dom_depth = _phase->dom_depth(_original_loop_entry); + IfNode* create_multiversioning_if(Node* bol, float prob, float fcnt) { _phase->igvn().rehash_node_delayed(_original_loop_entry); - BoolNode* unswitch_candidate_bool = _unswitch_candidate->in(1)->as_Bool(); - IfNode* selector_if = IfNode::make_with_same_profile(_unswitch_candidate, _original_loop_entry, - unswitch_candidate_bool); - _phase->register_node(selector_if, _outer_loop, _original_loop_entry, dom_depth); + IfNode* selector_if = new IfNode(_original_loop_entry, bol, prob, fcnt); + _phase->register_node(selector_if, _outer_loop, _original_loop_entry, _dom_depth); return selector_if; } + IfNode* create_unswitching_if(IfNode* unswitch_candidate) { + _phase->igvn().rehash_node_delayed(_original_loop_entry); + BoolNode* unswitch_candidate_bool = unswitch_candidate->in(1)->as_Bool(); + IfNode* selector_if = IfNode::make_with_same_profile(unswitch_candidate, _original_loop_entry, + unswitch_candidate_bool); + _phase->register_node(selector_if, _outer_loop, _original_loop_entry, _dom_depth); + return selector_if; + } + + private: IfProjNode* create_proj_to_loop(const PathToLoop path_to_loop) { - const uint dom_depth = _phase->dom_depth(_original_loop_entry); IfProjNode* proj_to_loop; if (path_to_loop == TRUE_PATH) { proj_to_loop = new IfTrueNode(_selector); } else { proj_to_loop = new IfFalseNode(_selector); } - _phase->register_node(proj_to_loop, _outer_loop, _selector, dom_depth); + _phase->register_node(proj_to_loop, _outer_loop, _selector, _dom_depth); return proj_to_loop; } public: - IfNode* unswitch_candidate() const { - return _unswitch_candidate; - } - IfNode* selector() const { return _selector; } @@ -219,6 +245,37 @@ class UnswitchedLoopSelector : public StackObj { } }; +// This class creates an If node (i.e. loop selector) that selects if the true-path-loop or the false-path-loop should be +// executed at runtime. This is done by finding an invariant and non-loop-exiting unswitch candidate If node (guaranteed +// to exist at this point) to perform Loop Unswitching on. 
+class UnswitchedLoopSelector : public StackObj { + IfNode* const _unswitch_candidate; + const LoopSelector _loop_selector; + + public: + UnswitchedLoopSelector(IdealLoopTree* loop) + : _unswitch_candidate(find_unswitch_candidate(loop)), + _loop_selector(loop, _unswitch_candidate) {} + NONCOPYABLE(UnswitchedLoopSelector); + + private: + static IfNode* find_unswitch_candidate(IdealLoopTree* loop) { + IfNode* unswitch_candidate = loop->_phase->find_unswitch_candidate(loop); + assert(unswitch_candidate != nullptr, "guaranteed to exist by policy_unswitching"); + assert(loop->_phase->is_member(loop, unswitch_candidate), "must be inside original loop"); + return unswitch_candidate; + } + + public: + IfNode* unswitch_candidate() const { + return _unswitch_candidate; + } + + const LoopSelector& loop_selector() const { + return _loop_selector; + } +}; + // Class to unswitch the original loop and create Predicates at the new unswitched loop versions. The newly cloned loop // becomes the false-path-loop while original loop becomes the true-path-loop. class OriginalLoop : public StackObj { @@ -238,55 +295,62 @@ class OriginalLoop : public StackObj { // Unswitch the original loop on the invariant loop selector by creating a true-path-loop and a false-path-loop. // Remove the unswitch candidate If from both unswitched loop versions which are now covered by the loop selector If. void unswitch(const UnswitchedLoopSelector& unswitched_loop_selector) { - const uint first_false_path_loop_node_index = _phase->C->unique(); - clone_loop(unswitched_loop_selector); - - move_parse_and_template_assertion_predicates_to_unswitched_loops(unswitched_loop_selector, - first_false_path_loop_node_index); - DEBUG_ONLY(verify_unswitched_loop_versions(_loop->_head->as_Loop(), unswitched_loop_selector);) - - _phase->recompute_dom_depth(); + multiversion(unswitched_loop_selector.loop_selector()); remove_unswitch_candidate_from_loops(unswitched_loop_selector); } - private: - void clone_loop(const UnswitchedLoopSelector& unswitched_loop_selector) { - _phase->clone_loop(_loop, _old_new, _phase->dom_depth(_loop_head), - PhaseIdealLoop::CloneIncludesStripMined, unswitched_loop_selector.selector()); - fix_loop_entries(unswitched_loop_selector); + // Multiversion the original loop. The loop selector if selects between the original loop (true-path-loop), and + // a copy of it (false-path-loop). 
+ void multiversion(const LoopSelector& loop_selector) { + const uint first_false_path_loop_node_index = _phase->C->unique(); + clone_loop(loop_selector); + + move_parse_and_template_assertion_predicates_to_unswitched_loops(loop_selector, + first_false_path_loop_node_index); + DEBUG_ONLY(verify_loop_versions(_loop->_head->as_Loop(), loop_selector);) + + _phase->recompute_dom_depth(); } - void fix_loop_entries(const UnswitchedLoopSelector& unswitched_loop_selector) { - _phase->replace_loop_entry(_loop_head, unswitched_loop_selector.true_path_loop_proj()); + private: + void clone_loop(const LoopSelector& loop_selector) { + _phase->clone_loop(_loop, _old_new, _phase->dom_depth(_loop_head), + PhaseIdealLoop::CloneIncludesStripMined, loop_selector.selector()); + fix_loop_entries(loop_selector); + } + + void fix_loop_entries(const LoopSelector& loop_selector) { + _phase->replace_loop_entry(_loop_head, loop_selector.true_path_loop_proj()); LoopNode* false_path_loop_strip_mined_head = old_to_new(_loop_head)->as_Loop(); _phase->replace_loop_entry(false_path_loop_strip_mined_head, - unswitched_loop_selector.false_path_loop_proj()); + loop_selector.false_path_loop_proj()); } // Moves the Parse And Template Assertion Predicates to the true and false path loop. They are inserted between the // loop heads and the loop selector If projections. The old Parse and Template Assertion Predicates before // the unswitched loop selector are killed. void move_parse_and_template_assertion_predicates_to_unswitched_loops( - const UnswitchedLoopSelector& unswitched_loop_selector, const uint first_false_path_loop_node_index) const { + const LoopSelector& loop_selector, const uint first_false_path_loop_node_index) const { const NodeInOriginalLoopBody node_in_true_path_loop_body(first_false_path_loop_node_index, _old_new); const NodeInClonedLoopBody node_in_false_path_loop_body(first_false_path_loop_node_index); CloneUnswitchedLoopPredicatesVisitor clone_unswitched_loop_predicates_visitor(_loop_head, old_to_new(_loop_head)->as_Loop(), node_in_true_path_loop_body, node_in_false_path_loop_body, _phase); - Node* source_loop_entry = unswitched_loop_selector.selector()->in(0); + Node* source_loop_entry = loop_selector.selector()->in(0); PredicateIterator predicate_iterator(source_loop_entry); predicate_iterator.for_each(clone_unswitched_loop_predicates_visitor); } #ifdef ASSERT - void verify_unswitched_loop_versions(LoopNode* true_path_loop_head, - const UnswitchedLoopSelector& unswitched_loop_selector) const { - verify_unswitched_loop_version(true_path_loop_head, unswitched_loop_selector.true_path_loop_proj()); - verify_unswitched_loop_version(old_to_new(true_path_loop_head)->as_Loop(), - unswitched_loop_selector.false_path_loop_proj()); + void verify_loop_versions(LoopNode* true_path_loop_head, + const LoopSelector& loop_selector) const { + verify_loop_version(true_path_loop_head, + loop_selector.true_path_loop_proj()); + verify_loop_version(old_to_new(true_path_loop_head)->as_Loop(), + loop_selector.false_path_loop_proj()); } - static void verify_unswitched_loop_version(LoopNode* loop_head, IfProjNode* loop_selector_if_proj) { + static void verify_loop_version(LoopNode* loop_head, IfProjNode* loop_selector_if_proj) { Node* entry = loop_head->skip_strip_mined()->in(LoopNode::EntryControl); const Predicates predicates(entry); // When skipping all predicates, we should end up at 'loop_selector_if_proj'. @@ -302,15 +366,15 @@ class OriginalLoop : public StackObj { // If node. 
Keep the true-path-path in the true-path-loop and the false-path-path in the false-path-loop by setting
 // the bool input accordingly. The unswitch candidate If nodes are folded in the next IGVN round.
 void remove_unswitch_candidate_from_loops(const UnswitchedLoopSelector& unswitched_loop_selector) {
-  IfNode* unswitching_candidate = unswitched_loop_selector.unswitch_candidate();
-  _phase->igvn().rehash_node_delayed(unswitching_candidate);
-  _phase->dominated_by(unswitched_loop_selector.true_path_loop_proj(), unswitching_candidate);
+  const LoopSelector& loop_selector = unswitched_loop_selector.loop_selector();
+  IfNode* unswitch_candidate = unswitched_loop_selector.unswitch_candidate();
+  _phase->igvn().rehash_node_delayed(unswitch_candidate);
+  _phase->dominated_by(loop_selector.true_path_loop_proj(), unswitch_candidate);

-  IfNode* unswitching_candidate_clone = _old_new[unswitching_candidate->_idx]->as_If();
-  _phase->igvn().rehash_node_delayed(unswitching_candidate_clone);
-  _phase->dominated_by(unswitched_loop_selector.false_path_loop_proj(), unswitching_candidate_clone);
+  IfNode* unswitch_candidate_clone = _old_new[unswitch_candidate->_idx]->as_If();
+  _phase->igvn().rehash_node_delayed(unswitch_candidate_clone);
+  _phase->dominated_by(loop_selector.false_path_loop_proj(), unswitch_candidate_clone);
 }
-
 };

 // See comments below file header for more information about Loop Unswitching.
@@ -343,6 +407,172 @@ void PhaseIdealLoop::do_unswitching(IdealLoopTree* loop, Node_List& old_new) {
   C->set_major_progress();
 }

+void PhaseIdealLoop::do_multiversioning(IdealLoopTree* lpt, Node_List& old_new) {
+#ifndef PRODUCT
+  if (TraceLoopOpts || TraceLoopMultiversioning) {
+    tty->print("Multiversion ");
+    lpt->dump_head();
+  }
+#endif
+  assert(LoopMultiversioning, "LoopMultiversioning must be enabled");
+
+  CountedLoopNode* original_head = lpt->_head->as_CountedLoop();
+  C->print_method(PHASE_BEFORE_LOOP_MULTIVERSIONING, 4, original_head);
+
+  Node* one = _igvn.intcon(1);
+  set_ctrl(one, C->root());
+  Node* opaque = new OpaqueMultiversioningNode(C, one);
+  set_ctrl(opaque, C->root());
+  _igvn.register_new_node_with_optimizer(opaque);
+  _igvn.set_type(opaque, TypeInt::BOOL);
+
+  const LoopSelector loop_selector(lpt, opaque, PROB_LIKELY_MAG(3), COUNT_UNKNOWN);
+  OriginalLoop original_loop(lpt, old_new);
+  original_loop.multiversion(loop_selector);
+
+  add_unswitched_loop_version_bodies_to_igvn(lpt, old_new);
+
+  CountedLoopNode* new_head = old_new[original_head->_idx]->as_CountedLoop();
+  original_head->set_multiversion_fast_loop();
+  new_head->set_multiversion_delayed_slow_loop();
+
+  NOT_PRODUCT(trace_loop_multiversioning_result(loop_selector, original_head, new_head);)
+  C->print_method(PHASE_AFTER_LOOP_MULTIVERSIONING, 4, new_head);
+  C->set_major_progress();
+}
+
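For illustration, a minimal Java-level sketch of the control flow that do_multiversioning produces at runtime; the names (MultiversionSketch, speculativeChecksHold, and both loop bodies) are illustrative only, since the real selection happens in compiled code through the multiversion_if and its OpaqueMultiversioning condition:

// Conceptual shape of a multiversioned loop at runtime. Illustrative only.
public class MultiversionSketch {
    static int[] data = new int[1024];

    // Stands in for the runtime checks that create_new_if_for_multiversion may
    // later add to the multiversion_if (e.g. a speculative alignment check).
    // Initially the OpaqueMultiversioning is a constant "true".
    static boolean speculativeChecksHold() { return true; }

    static void loop() {
        if (speculativeChecksHold()) {
            // fast_loop: compiled under the speculative assumptions, e.g. vectorized.
            for (int i = 0; i < data.length; i++) { data[i]++; }
        } else {
            // slow_loop: no speculative assumptions; only optimized once a
            // runtime check is added and the loop is "un-delayed".
            for (int i = 0; i < data.length; i++) { data[i]++; }
        }
    }

    public static void main(String[] args) {
        loop();
        System.out.println(data[0]); // 1
    }
}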
+// Create a new if in the multiversioning pattern, adding an additional condition for the
+// multiversioning fast-loop.
+//
+// Before:
+//             entry    opaque
+//               |        |
+//             multiversion_if
+//              |       |
+//   +----------------+ +---------------+
+//   |                                  |
+//   multiversion_fast_proj    multiversion_slow_proj
+//                                      |
+//                             +--------+
+//                             |
+//                          slow_path
+//
+//
+// After:
+//             entry    opaque  <-- to be replaced by caller
+//               |        |
+//              new_if
+//              |    |
+//              |    +-----------------------------+
+//              |                                  |
+//        new_if_true      opaque            new_if_false
+//              |             |                    |
+//             multiversion_if                     |
+//              |       |                          |
+//   +----------------+ +---------------+         |
+//   |                                  |         |
+//   multiversion_fast_proj   new_multiversion_slow_proj
+//                                      |         |
+//                               +------+         |
+//                               |                |
+//                                   region
+//                                      |
+//                                  slow_path
+//
+IfTrueNode* PhaseIdealLoop::create_new_if_for_multiversion(IfTrueNode* multiversioning_fast_proj) {
+  // Give all nodes in the old sub-graph a name.
+  IfNode* multiversion_if = multiversioning_fast_proj->in(0)->as_If();
+  Node* entry = multiversion_if->in(0);
+  OpaqueMultiversioningNode* opaque = multiversion_if->in(1)->as_OpaqueMultiversioning();
+  IfFalseNode* multiversion_slow_proj = multiversion_if->proj_out(0)->as_IfFalse();
+  Node* slow_path = multiversion_slow_proj->unique_ctrl_out();
+
+  // The slow_loop may still be delayed, and waiting for runtime-checks to be added to the
+  // multiversion_if. Now that we have at least one condition for the multiversioning,
+  // we should resume optimizations for the slow loop.
+  opaque->notify_slow_loop_that_it_can_resume_optimizations();
+
+  // Create new_if with its projections.
+  IfNode* new_if = IfNode::make_with_same_profile(multiversion_if, entry, opaque);
+  IdealLoopTree* lp = get_loop(entry);
+  register_control(new_if, lp, entry);
+
+  IfTrueNode* new_if_true = new IfTrueNode(new_if);
+  IfFalseNode* new_if_false = new IfFalseNode(new_if);
+  register_control(new_if_true, lp, new_if);
+  register_control(new_if_false, lp, new_if);
+
+  // Hook new_if_true into multiversion_if.
+  _igvn.replace_input_of(multiversion_if, 0, new_if_true);
+
+  // Clone multiversion_slow_proj - this allows us to easily carry the dependencies to
+  // the new region below.
+  IfFalseNode* new_multiversion_slow_proj = multiversion_slow_proj->clone()->as_IfFalse();
+  register_control(new_multiversion_slow_proj, lp, multiversion_if);
+
+  // Create new Region.
+  RegionNode* region = new RegionNode(1);
+  region->add_req(new_multiversion_slow_proj);
+  region->add_req(new_if_false);
+  register_control(region, lp, new_multiversion_slow_proj);
+
+  // Hook region into slow_path, instead of the multiversion_slow_proj.
+  // This also moves all other dependencies of the multiversion_slow_proj to the region.
+  _igvn.replace_node(multiversion_slow_proj, region);
+
+  return new_if_true;
+}
+
+OpaqueMultiversioningNode* find_multiversion_opaque_from_multiversion_if_false(Node* maybe_multiversion_if_false) {
+  IfFalseNode* multiversion_if_false = maybe_multiversion_if_false->isa_IfFalse();
+  if (multiversion_if_false == nullptr) { return nullptr; }
+  IfNode* multiversion_if = multiversion_if_false->in(0)->isa_If();
+  if (multiversion_if == nullptr) { return nullptr; }
+  return multiversion_if->in(1)->isa_OpaqueMultiversioning();
+}
+
+bool PhaseIdealLoop::try_resume_optimizations_for_delayed_slow_loop(IdealLoopTree* lpt) {
+  CountedLoopNode* cl = lpt->_head->as_CountedLoop();
+  assert(cl->is_multiversion_delayed_slow_loop(), "must currently be delayed");
+
+  // Find multiversion_if.
+  Node* entry = cl->skip_strip_mined()->in(LoopNode::EntryControl);
+  const Predicates predicates(entry);
+
+  Node* slow_path = predicates.entry();
+
+  // Find opaque.
+  OpaqueMultiversioningNode* opaque = nullptr;
+  if (slow_path->is_Region()) {
+    for (uint i = 1; i < slow_path->req(); i++) {
+      Node* n = slow_path->in(i);
+      opaque = find_multiversion_opaque_from_multiversion_if_false(n);
+      if (opaque != nullptr) { break; }
+    }
+  } else {
+    opaque = find_multiversion_opaque_from_multiversion_if_false(slow_path);
+  }
+  assert(opaque != nullptr, "must have found multiversion opaque node");
+  if (opaque == nullptr) { return false; }
+
+  // We may still be delayed, if there were not yet any runtime-checks added
+  // for the multiversioning. We may never add any, and then this loop would
+  // fold away. So we wait until some runtime-checks are added, then we know
+  // that this loop will be reachable and it is worth optimizing further.
+  if (opaque->is_delayed_slow_loop()) { return false; }
+
+  // Clear away the "delayed" status, i.e. resume optimizations.
+  cl->set_no_multiversion();
+  cl->set_multiversion_slow_loop();
+#ifndef PRODUCT
+  if (TraceLoopOpts) {
+    tty->print("Resume Optimizations ");
+    lpt->dump_head();
+  }
+#endif
+  return true;
+}
+
 bool PhaseIdealLoop::has_control_dependencies_from_predicates(LoopNode* head) {
   Node* entry = head->skip_strip_mined()->in(LoopNode::EntryControl);
   const Predicates predicates(entry);
@@ -377,7 +607,7 @@ void PhaseIdealLoop::trace_loop_unswitching_result(const UnswitchedLoopSelector&
                                                    const LoopNode* original_head, const LoopNode* new_head) {
   if (TraceLoopUnswitching) {
     IfNode* unswitch_candidate = unswitched_loop_selector.unswitch_candidate();
-    IfNode* loop_selector = unswitched_loop_selector.selector();
+    IfNode* loop_selector = unswitched_loop_selector.loop_selector().selector();
     tty->print_cr("Loop Unswitching:");
     tty->print_cr("- Unswitch-Candidate-If: %d %s", unswitch_candidate->_idx, unswitch_candidate->Name());
     tty->print_cr("- Loop-Selector-If: %d %s", loop_selector->_idx, loop_selector->Name());
@@ -385,22 +615,33 @@ void PhaseIdealLoop::trace_loop_unswitching_result(const UnswitchedLoopSelector&
     tty->print_cr("- False-Path-Loop (=Clone): %d %s", new_head->_idx, new_head->Name());
   }
 }
+
+void PhaseIdealLoop::trace_loop_multiversioning_result(const LoopSelector& loop_selector,
+                                                       const LoopNode* original_head, const LoopNode* new_head) {
+  if (TraceLoopMultiversioning) {
+    IfNode* selector_if = loop_selector.selector();
+    tty->print_cr("Loop Multiversioning:");
+    tty->print_cr("- Loop-Selector-If: %d %s", selector_if->_idx, selector_if->Name());
+    tty->print_cr("- True-Path-Loop (=Orig / Fast): %d %s", original_head->_idx, original_head->Name());
+    tty->print_cr("- False-Path-Loop (=Clone / Slow): %d %s", new_head->_idx, new_head->Name());
+  }
+}
 #endif

-// When unswitching a counted loop, we need to convert it back to a normal loop since it's not a proper pre, main, or
-// post loop anymore after loop unswitching.
+// When unswitching a counted loop, we need to convert it back to a normal loop since it's not a proper pre, main, or
+// post loop anymore after loop unswitching. We also lose the multiversion structure, and with it the access to the multiversion_if.
 void PhaseIdealLoop::revert_to_normal_loop(const LoopNode* loop_head) {
   CountedLoopNode* cl = loop_head->isa_CountedLoop();
-  if (cl != nullptr && !cl->is_normal_loop()) {
-    cl->set_normal_loop();
-  }
+  if (cl == nullptr) { return; }
+  if (!cl->is_normal_loop()) { cl->set_normal_loop(); }
+  if (cl->is_multiversion()) { cl->set_no_multiversion(); }
 }

 // Hoist invariant CheckCastPPNodes out of each unswitched loop version to the appropriate loop selector If projection.
void PhaseIdealLoop::hoist_invariant_check_casts(const IdealLoopTree* loop, const Node_List& old_new, const UnswitchedLoopSelector& unswitched_loop_selector) { IfNode* unswitch_candidate = unswitched_loop_selector.unswitch_candidate(); - IfNode* loop_selector = unswitched_loop_selector.selector(); + IfNode* loop_selector = unswitched_loop_selector.loop_selector().selector(); ResourceMark rm; GrowableArray loop_invariant_check_casts; for (DUIterator_Fast imax, i = unswitch_candidate->fast_outs(imax); i < imax; i++) { diff --git a/src/hotspot/share/opto/loopnode.cpp b/src/hotspot/share/opto/loopnode.cpp index ed2db13421f..a58fa44f9d6 100644 --- a/src/hotspot/share/opto/loopnode.cpp +++ b/src/hotspot/share/opto/loopnode.cpp @@ -1090,6 +1090,14 @@ bool PhaseIdealLoop::create_loop_nest(IdealLoopTree* loop, Node_List &old_new) { if (UseProfiledLoopPredicate) { add_parse_predicate(Deoptimization::Reason_profile_predicate, inner_head, outer_ilt, cloned_sfpt); } + + // We only want to use the auto-vectorization check as a trap once per bci. And + // PhaseIdealLoop::add_parse_predicate only checks trap limits per method, so + // we do a custom check here. + if (!C->too_many_traps(cloned_sfpt->jvms()->method(), cloned_sfpt->jvms()->bci(), Deoptimization::Reason_auto_vectorization_check)) { + add_parse_predicate(Deoptimization::Reason_auto_vectorization_check, inner_head, outer_ilt, cloned_sfpt); + } + add_parse_predicate(Deoptimization::Reason_loop_limit_check, inner_head, outer_ilt, cloned_sfpt); } @@ -2511,6 +2519,9 @@ void CountedLoopNode::dump_spec(outputStream *st) const { if (is_main_loop()) st->print("main of N%d", _idx); if (is_post_loop()) st->print("post of N%d", _main_idx); if (is_strip_mined()) st->print(" strip mined"); + if (is_multiversion_fast_loop()) { st->print(" multiversion_fast"); } + if (is_multiversion_slow_loop()) { st->print(" multiversion_slow"); } + if (is_multiversion_delayed_slow_loop()) { st->print(" multiversion_delayed_slow"); } } #endif @@ -4303,6 +4314,9 @@ void IdealLoopTree::dump_head() { if (cl->is_post_loop()) tty->print(" post"); if (cl->is_vectorized_loop()) tty->print(" vector"); if (range_checks_present()) tty->print(" rc "); + if (cl->is_multiversion_fast_loop()) { tty->print(" multiversion_fast"); } + if (cl->is_multiversion_slow_loop()) { tty->print(" multiversion_slow"); } + if (cl->is_multiversion_delayed_slow_loop()) { tty->print(" multiversion_delayed_slow"); } } if (_has_call) tty->print(" has_call"); if (_has_sfpt) tty->print(" has_sfpt"); @@ -4948,18 +4962,6 @@ void PhaseIdealLoop::build_and_optimize() { C->set_major_progress(); } - // Keep loop predicates and perform optimizations with them - // until no more loop optimizations could be done. - // After that switch predicates off and do more loop optimizations. - if (!C->major_progress() && (C->parse_predicate_count() > 0)) { - C->mark_parse_predicate_nodes_useless(_igvn); - assert(C->parse_predicate_count() == 0, "should be zero now"); - if (TraceLoopOpts) { - tty->print_cr("PredicatesOff"); - } - C->set_major_progress(); - } - // Auto-vectorize main-loop if (C->do_superword() && C->has_loops() && !C->major_progress()) { Compile::TracePhase tp(_t_autoVectorize); @@ -4992,6 +4994,18 @@ void PhaseIdealLoop::build_and_optimize() { } } } + + // Keep loop predicates and perform optimizations with them + // until no more loop optimizations could be done. + // After that switch predicates off and do more loop optimizations. 
+  if (!C->major_progress() && (C->parse_predicate_count() > 0)) {
+    C->mark_parse_predicate_nodes_useless(_igvn);
+    assert(C->parse_predicate_count() == 0, "should be zero now");
+    if (TraceLoopOpts) {
+      tty->print_cr("PredicatesOff");
+    }
+    C->set_major_progress();
+  }
 }

 #ifndef PRODUCT
diff --git a/src/hotspot/share/opto/loopnode.hpp b/src/hotspot/share/opto/loopnode.hpp
index 4e5a60ee3cd..a9c5d697c6b 100644
--- a/src/hotspot/share/opto/loopnode.hpp
+++ b/src/hotspot/share/opto/loopnode.hpp
@@ -43,6 +43,7 @@ class OuterStripMinedLoopEndNode;
 class PredicateBlock;
 class PathFrequency;
 class PhaseIdealLoop;
+class LoopSelector;
 class UnswitchedLoopSelector;
 class VectorSet;
 class VSharedData;
@@ -79,7 +80,12 @@ protected:
          SubwordLoop = 1<<13,
          ProfileTripFailed = 1<<14,
          LoopNestInnerLoop = 1<<15,
-         LoopNestLongOuterLoop = 1<<16 };
+         LoopNestLongOuterLoop = 1<<16,
+         MultiversionFastLoop = 1<<17,
+         MultiversionSlowLoop = 2<<17,
+         MultiversionDelayedSlowLoop = 3<<17,
+         MultiversionFlagsMask = 3<<17,
+  };
   char _unswitch_count;
   enum { _unswitch_max=3 };

@@ -315,6 +321,32 @@ public:
   void set_slp_max_unroll(int unroll_factor) { _slp_maximum_unroll_factor = unroll_factor; }
   int slp_max_unroll() const { return _slp_maximum_unroll_factor; }
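The three multiversion states introduced in the enum above deliberately share two bits of _loop_flags (1<<17, 2<<17, 3<<17, all under the mask 3<<17) rather than taking one bit each. A self-contained Java sketch of this two-bit encoding, with illustrative names mirroring the C++ flags and the accessors that follow:

// Two-bit multiversion state packed into a flags word. Illustrative only.
public class MultiversionFlags {
    static final int FAST_LOOP         = 1 << 17;
    static final int SLOW_LOOP         = 2 << 17;
    static final int DELAYED_SLOW_LOOP = 3 << 17; // both bits set
    static final int FLAGS_MASK        = 3 << 17;

    private int flags = 0;

    boolean isMultiversion()       { return (flags & FLAGS_MASK) != 0; }
    boolean isDelayedSlowLoop()    { return (flags & FLAGS_MASK) == DELAYED_SLOW_LOOP; }
    void setMultiversionFastLoop() { assert !isMultiversion(); flags |= FAST_LOOP; }
    void setMultiversionSlowLoop() { assert !isMultiversion(); flags |= SLOW_LOOP; }
    void setNoMultiversion()       { assert isMultiversion();  flags &= ~FLAGS_MASK; }

    public static void main(String[] args) {
        MultiversionFlags f = new MultiversionFlags();
        f.setMultiversionFastLoop();
        System.out.println(f.isMultiversion());    // true
        System.out.println(f.isDelayedSlowLoop()); // false
    }
}

Because the delayed state sets both bits, a state change such as the one in try_resume_optimizations_for_delayed_slow_loop must first clear the mask (set_no_multiversion) before setting the new state.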
+  // Multiversioning allows us to duplicate a CountedLoop, and have two versions, and the multiversion_if
+  // decides which one is taken:
+  // (1) fast_loop: We enter this loop by default; the multiversion_if initially has its condition set to
+  //                "true", guarded by an OpaqueMultiversioning. If we want to make a speculative assumption
+  //                for an optimization, we can add the runtime-check to the multiversion_if, and if the
+  //                assumption fails we take the slow_loop instead, where we do not make the same speculative
+  //                assumption.
+  //                We call it the "fast_loop" because it has more optimizations, enabled by the speculative
+  //                runtime-checks at the multiversion_if, and we expect the fast_loop to execute faster.
+  // (2) slow_loop: By default, it is not taken, until a runtime-check is added to the multiversion_if while
+  //                optimizing the fast_loop. If such a runtime-check is never added, then after loop-opts
+  //                the multiversion_if constant folds to true, and the slow_loop is folded away. To save
+  //                compile time, we delay the optimization of the slow_loop until a runtime-check is added
+  //                to the multiversion_if, at which point we resume optimizations for the slow_loop.
+  //                We call it the "slow_loop" because it has fewer optimizations, since this is the fall-back
+  //                loop where we do not make any of the speculative assumptions we make for the fast_loop.
+  //                Hence, we expect the slow_loop to execute slower.
+  bool is_multiversion()                   const { return (_loop_flags & MultiversionFlagsMask) != Normal; }
+  bool is_multiversion_fast_loop()         const { return (_loop_flags & MultiversionFlagsMask) == MultiversionFastLoop; }
+  bool is_multiversion_slow_loop()         const { return (_loop_flags & MultiversionFlagsMask) == MultiversionSlowLoop; }
+  bool is_multiversion_delayed_slow_loop() const { return (_loop_flags & MultiversionFlagsMask) == MultiversionDelayedSlowLoop; }
+  void set_multiversion_fast_loop()         { assert(!is_multiversion(), ""); _loop_flags |= MultiversionFastLoop; }
+  void set_multiversion_slow_loop()         { assert(!is_multiversion(), ""); _loop_flags |= MultiversionSlowLoop; }
+  void set_multiversion_delayed_slow_loop() { assert(!is_multiversion(), ""); _loop_flags |= MultiversionDelayedSlowLoop; }
+  void set_no_multiversion()                { assert( is_multiversion(), ""); _loop_flags &= ~MultiversionFlagsMask; }
+
   virtual LoopNode* skip_strip_mined(int expect_skeleton = 1);
   OuterStripMinedLoopNode* outer_loop() const;
   virtual IfTrueNode* outer_loop_tail() const;
@@ -1457,6 +1489,8 @@ public:
   static void trace_loop_unswitching_impossible(const LoopNode* original_head);
   static void trace_loop_unswitching_result(const UnswitchedLoopSelector& unswitched_loop_selector,
                                             const LoopNode* original_head, const LoopNode* new_head);
+  static void trace_loop_multiversioning_result(const LoopSelector& loop_selector,
+                                                const LoopNode* original_head, const LoopNode* new_head);
 #endif

 public:
@@ -1483,6 +1517,11 @@ public:
   };
   AutoVectorizeStatus auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared);

+  void maybe_multiversion_for_auto_vectorization_runtime_checks(IdealLoopTree* lpt, Node_List& old_new);
+  void do_multiversioning(IdealLoopTree* lpt, Node_List& old_new);
+  IfTrueNode* create_new_if_for_multiversion(IfTrueNode* multiversioning_fast_proj);
+  bool try_resume_optimizations_for_delayed_slow_loop(IdealLoopTree* lpt);
+
   // Move an unordered Reduction out of loop if possible
   void move_unordered_reduction_out_of_loop(IdealLoopTree* loop);

diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp
index 8afce3e86ae..2d564c3c8cf 100644
--- a/src/hotspot/share/opto/loopopts.cpp
+++ b/src/hotspot/share/opto/loopopts.cpp
@@ -4482,6 +4482,66 @@ PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) {
   return AutoVectorizeStatus::Success;
 }

+// Just before insert_pre_post_loops, we can multi-version the loop:
+//
+//      multiversion_if
+//       |          |
+//   fast_loop   slow_loop
+//
+// In the fast_loop we can make speculative assumptions, and put the
+// conditions into the multiversion_if. If the conditions hold at runtime,
+// we enter the fast_loop; if the conditions fail, we take the slow_loop
+// instead, which does not make any of the speculative assumptions.
+//
+// Note: we only multiversion the loop if the loop does not have an
+//       auto vectorization check Predicate. If we have that predicate,
+//       then we can simply add the speculative assumption checks to
+//       that Predicate. This means we do not need to duplicate the
+//       loop - we have a smaller graph and save compile time. Should
+//       the conditions ever fail, then we deopt / trap at the Predicate
+//       and recompile without that Predicate. At that point we will
+//       multiversion the loop, so that we can still have speculative
+//       runtime checks.
+//
+// We perform the multiversioning when the loop is still in its single
+// iteration form, even before we insert pre and post loops. This makes
+// the cloning much simpler. However, this means that both the fast
+// and the slow loop have to be optimized independently (adding pre
+// and post loops, unrolling the main loop, auto-vectorization, etc.). And
+// we may end up not needing any speculative assumptions in the fast_loop
+// and then rejecting the slow_loop by constant folding the multiversion_if.
+//
+// Therefore, we "delay" the optimization of the slow_loop until we add
+// at least one speculative assumption for the fast_loop. If we never
+// add such a speculative runtime check, the OpaqueMultiversioningNode
+// of the multiversion_if constant folds to true after loop opts, and the
+// multiversion_if folds away the "delayed" slow_loop. If we add any
+// speculative assumption, then we notify the OpaqueMultiversioningNode
+// with "notify_slow_loop_that_it_can_resume_optimizations".
+//
+// Note: new runtime checks can be added to the multiversion_if with
+// PhaseIdealLoop::create_new_if_for_multiversion.
+void PhaseIdealLoop::maybe_multiversion_for_auto_vectorization_runtime_checks(IdealLoopTree* lpt, Node_List& old_new) {
+  CountedLoopNode* cl = lpt->_head->as_CountedLoop();
+  LoopNode* outer_loop = cl->skip_strip_mined();
+  Node* entry = outer_loop->in(LoopNode::EntryControl);
+
+  // Check we have multiversioning enabled, and are not already multiversioned.
+  if (!LoopMultiversioning || cl->is_multiversion()) { return; }
+
+  // Check that we do not have a parse-predicate where we can add the runtime checks
+  // during auto-vectorization.
+  const Predicates predicates(entry);
+  const PredicateBlock* predicate_block = predicates.auto_vectorization_check_block();
+  if (predicate_block->has_parse_predicate()) { return; }
+
+  // Check node budget.
+  uint estimate = lpt->est_loop_clone_sz(2);
+  if (!may_require_nodes(estimate)) { return; }
+
+  do_multiversioning(lpt, old_new);
+}
+
 // Returns true if the Reduction node is unordered.
 static bool is_unordered_reduction(Node* n) {
   return n->is_Reduction() && !n->as_Reduction()->requires_strict_order();
diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp
index f1d29f2453f..90216a5e2d3 100644
--- a/src/hotspot/share/opto/mempointer.hpp
+++ b/src/hotspot/share/opto/mempointer.hpp
@@ -229,9 +229,12 @@
 //   Even if we could know that there is some base address to which we add index offsets, we cannot know
 //   if this reference address points to the beginning of a native memory allocation or into the middle,
 //   or outside it. We also have no guarantee for alignment with such a base address.
+//
 //   Still: we would like to find such a base if possible, and if two pointers are similar (i.e. have the
 //   same summands), we would like to find the same base. Further, it is reasonable to speculatively
-//   assume that such base addresses are aligned (TODO: need to add this speculative check in JDK-8323582).
+//   assume that such base addresses are aligned. We perform such a speculative alignment runtime check
+//   in VTransform::add_speculative_alignment_check.
+//
+//   A base pointer must have scale = 1, and be accepted by MemPointer::is_native_memory_base_candidate.
 //   It can thus be one of these:
 //   (1) CastX2P
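As a worked illustration of the linear form into which MemPointer decomposes an address, adr = base + con + invar + iv_scale * iv, here is a small Java sketch; all values are made up:

// Worked example of the MemPointer linear form: adr = base + con + invar + iv_scale * iv.
// Illustrative values only.
public class PointerFormSketch {
    public static void main(String[] args) {
        long base    = 0x10000; // object base or native base candidate (scale = 1)
        long con     = 16;      // constant offset, e.g. an array header
        long invar   = 40;      // loop-invariant offset
        long ivScale = 4;       // element size of an int access
        for (long iv = 0; iv < 4; iv++) {
            long adr = base + con + invar + ivScale * iv;
            System.out.println("iv=" + iv + " -> adr=0x" + Long.toHexString(adr));
        }
    }
}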
diff --git a/src/hotspot/share/opto/node.hpp b/src/hotspot/share/opto/node.hpp
index 2e52c12e4e8..941b816d8e0 100644
--- a/src/hotspot/share/opto/node.hpp
+++ b/src/hotspot/share/opto/node.hpp
@@ -139,6 +139,7 @@ class NeverBranchNode;
 class Opaque1Node;
 class OpaqueLoopInitNode;
 class OpaqueLoopStrideNode;
+class OpaqueMultiversioningNode;
 class OpaqueNotNullNode;
 class OpaqueInitializedAssertionPredicateNode;
 class OpaqueTemplateAssertionPredicateNode;
@@ -800,6 +801,7 @@ public:
     DEFINE_CLASS_ID(Opaque1, Node, 16)
       DEFINE_CLASS_ID(OpaqueLoopInit, Opaque1, 0)
       DEFINE_CLASS_ID(OpaqueLoopStride, Opaque1, 1)
+      DEFINE_CLASS_ID(OpaqueMultiversioning, Opaque1, 2)
     DEFINE_CLASS_ID(OpaqueNotNull, Node, 17)
     DEFINE_CLASS_ID(OpaqueInitializedAssertionPredicate, Node, 18)
     DEFINE_CLASS_ID(OpaqueTemplateAssertionPredicate, Node, 19)
@@ -982,6 +984,7 @@ public:
   DEFINE_CLASS_QUERY(OpaqueTemplateAssertionPredicate)
   DEFINE_CLASS_QUERY(OpaqueLoopInit)
   DEFINE_CLASS_QUERY(OpaqueLoopStride)
+  DEFINE_CLASS_QUERY(OpaqueMultiversioning)
   DEFINE_CLASS_QUERY(OuterStripMinedLoop)
   DEFINE_CLASS_QUERY(OuterStripMinedLoopEnd)
   DEFINE_CLASS_QUERY(Parm)
diff --git a/src/hotspot/share/opto/opaquenode.hpp b/src/hotspot/share/opto/opaquenode.hpp
index c1686d846f9..0e8a53efd34 100644
--- a/src/hotspot/share/opto/opaquenode.hpp
+++ b/src/hotspot/share/opto/opaquenode.hpp
@@ -91,6 +91,29 @@ public:
   IfNode* if_node() const;
 };

+// This node is used as the condition of the multiversion_if in loop multiversioning.
+// At first, the multiversion_if has its condition set to "true" and we always
+// take the fast_loop. Since we do not know if the slow_loop is ever going to
+// be used, we delay optimizations for it. Once the fast_loop decides to use
+// speculative runtime-checks and adds them to the multiversion_if, the slow_loop
+// can now resume optimizations, as it is reachable at runtime.
+// See PhaseIdealLoop::maybe_multiversion_for_auto_vectorization_runtime_checks.
+class OpaqueMultiversioningNode : public Opaque1Node {
+private:
+  bool _is_delayed_slow_loop;
+
+public:
+  OpaqueMultiversioningNode(Compile* C, Node* n) :
+      Opaque1Node(C, n), _is_delayed_slow_loop(true)
+  {
+    init_class_id(Class_OpaqueMultiversioning);
+  }
+  virtual int Opcode() const;
+  virtual const Type* bottom_type() const { return TypeInt::BOOL; }
+
+  bool is_delayed_slow_loop() const { return _is_delayed_slow_loop; }
+  void notify_slow_loop_that_it_can_resume_optimizations() { _is_delayed_slow_loop = false; }
+};
+
 // This node is used in the context of intrinsics. We sometimes implicitly know that an object is non-null even though
 // the compiler cannot prove it. We therefore add a corresponding cast to propagate this implicit knowledge. However,
 // this cast could become top during optimizations (input to cast becomes null) and the data path is folded.
To ensure diff --git a/src/hotspot/share/opto/phasetype.hpp b/src/hotspot/share/opto/phasetype.hpp index dcdf3aa3f86..8015255c03b 100644 --- a/src/hotspot/share/opto/phasetype.hpp +++ b/src/hotspot/share/opto/phasetype.hpp @@ -64,14 +64,17 @@ flags(AFTER_LOOP_PEELING, "After Loop Peeling") \ flags(BEFORE_LOOP_UNSWITCHING, "Before Loop Unswitching") \ flags(AFTER_LOOP_UNSWITCHING, "After Loop Unswitching") \ + flags(BEFORE_LOOP_MULTIVERSIONING, "Before Loop Multiversioning") \ + flags(AFTER_LOOP_MULTIVERSIONING, "After Loop Multiversioning") \ flags(BEFORE_RANGE_CHECK_ELIMINATION, "Before Range Check Elimination") \ flags(AFTER_RANGE_CHECK_ELIMINATION, "After Range Check Elimination") \ flags(BEFORE_PRE_MAIN_POST, "Before Pre/Main/Post Loops") \ flags(AFTER_PRE_MAIN_POST, "After Pre/Main/Post Loops") \ - flags(AUTO_VECTORIZATION1_BEFORE_APPLY, "AutoVectorization 1, Before Apply") \ - flags(AUTO_VECTORIZATION2_AFTER_REORDER, "AutoVectorization 2, After Apply Memop Reordering") \ - flags(AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, "AutoVectorization 3, After Adjusting Pre-Loop Limit") \ - flags(AUTO_VECTORIZATION4_AFTER_APPLY, "AutoVectorization 4, After Apply") \ + flags(AUTO_VECTORIZATION1_BEFORE_APPLY, "AutoVectorization 1, Before Apply") \ + flags(AUTO_VECTORIZATION2_AFTER_REORDER, "AutoVectorization 2, After Apply Memop Reordering") \ + flags(AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, "AutoVectorization 3, After Adjusting Pre-Loop Limit") \ + flags(AUTO_VECTORIZATION4_AFTER_SPECULATIVE_RUNTIME_CHECKS, "AutoVectorization 4, After Adding Speculative Runtime Checks") \ + flags(AUTO_VECTORIZATION5_AFTER_APPLY, "AutoVectorization 5, After Apply") \ flags(BEFORE_CLOOPS, "Before CountedLoop") \ flags(AFTER_CLOOPS, "After CountedLoop") \ flags(PHASEIDEAL_BEFORE_EA, "PhaseIdealLoop before EA") \ diff --git a/src/hotspot/share/opto/predicates.cpp b/src/hotspot/share/opto/predicates.cpp index 6badaa65487..be4e0067c10 100644 --- a/src/hotspot/share/opto/predicates.cpp +++ b/src/hotspot/share/opto/predicates.cpp @@ -120,6 +120,7 @@ bool RuntimePredicate::has_valid_uncommon_trap(const Node* success_proj) { assert(RegularPredicate::may_be_predicate_if(success_proj), "must have been checked before"); const Deoptimization::DeoptReason deopt_reason = uncommon_trap_reason(success_proj->as_IfProj()); return (deopt_reason == Deoptimization::Reason_loop_limit_check || + deopt_reason == Deoptimization::Reason_auto_vectorization_check || deopt_reason == Deoptimization::Reason_predicate || deopt_reason == Deoptimization::Reason_profile_predicate); } @@ -893,6 +894,8 @@ void Predicates::dump() const { tty->print_cr("%d %s:", loop_head->_idx, loop_head->Name()); tty->print_cr("- Loop Limit Check Predicate Block:"); _loop_limit_check_predicate_block.dump(" "); + tty->print_cr("- Auto Vectorization Check Block:"); + _auto_vectorization_check_block.dump(" "); tty->print_cr("- Profiled Loop Predicate Block:"); _profiled_loop_predicate_block.dump(" "); tty->print_cr("- Loop Predicate Block:"); diff --git a/src/hotspot/share/opto/predicates.hpp b/src/hotspot/share/opto/predicates.hpp index bfc1b23115e..ceb8fe7e317 100644 --- a/src/hotspot/share/opto/predicates.hpp +++ b/src/hotspot/share/opto/predicates.hpp @@ -734,6 +734,8 @@ class PredicateIterator : public StackObj { Node* current = _start_node; PredicateBlockIterator loop_limit_check_predicate_iterator(current, Deoptimization::Reason_loop_limit_check); current = loop_limit_check_predicate_iterator.for_each(predicate_visitor); + PredicateBlockIterator 
auto_vectorization_check_iterator(current, Deoptimization::Reason_auto_vectorization_check); + current = auto_vectorization_check_iterator.for_each(predicate_visitor); if (UseLoopPredicate) { if (UseProfiledLoopPredicate) { PredicateBlockIterator profiled_loop_predicate_iterator(current, Deoptimization::Reason_profile_predicate); @@ -906,6 +908,7 @@ class PredicateBlock : public StackObj { class Predicates : public StackObj { Node* const _tail; const PredicateBlock _loop_limit_check_predicate_block; + const PredicateBlock _auto_vectorization_check_block; const PredicateBlock _profiled_loop_predicate_block; const PredicateBlock _loop_predicate_block; Node* const _entry; @@ -914,7 +917,9 @@ class Predicates : public StackObj { explicit Predicates(Node* loop_entry) : _tail(loop_entry), _loop_limit_check_predicate_block(loop_entry, Deoptimization::Reason_loop_limit_check), - _profiled_loop_predicate_block(_loop_limit_check_predicate_block.entry(), + _auto_vectorization_check_block(_loop_limit_check_predicate_block.entry(), + Deoptimization::Reason_auto_vectorization_check), + _profiled_loop_predicate_block(_auto_vectorization_check_block.entry(), Deoptimization::Reason_profile_predicate), _loop_predicate_block(_profiled_loop_predicate_block.entry(), Deoptimization::Reason_predicate), @@ -935,6 +940,10 @@ class Predicates : public StackObj { return &_profiled_loop_predicate_block; } + const PredicateBlock* auto_vectorization_check_block() const { + return &_auto_vectorization_check_block; + } + const PredicateBlock* loop_limit_check_predicate_block() const { return &_loop_limit_check_predicate_block; } diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 31aa1507d23..697e4d842d5 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -1484,7 +1484,8 @@ const AlignmentSolution* SuperWord::pack_alignment_solution(const Node_List* pac pack->size(), pre_end->init_trip(), pre_end->stride_con(), - iv_stride() + iv_stride(), + _vloop.are_speculative_checks_possible() DEBUG_ONLY(COMMA is_trace_align_vector())); return solver.solve(); } @@ -1896,6 +1897,7 @@ bool SuperWord::schedule_and_apply() const { VTransformTrace trace(_vloop.vtrace(), is_trace_superword_rejections(), is_trace_align_vector(), + _vloop.is_trace_speculative_runtime_checks(), is_trace_superword_info()); #endif VTransform vtransform(_vloop_analyzer, @@ -1938,8 +1940,11 @@ void VTransform::apply() { adjust_pre_loop_limit_to_align_main_loop_vectors(); C->print_method(PHASE_AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, 4, cl()); + apply_speculative_runtime_checks(); + C->print_method(PHASE_AUTO_VECTORIZATION4_AFTER_SPECULATIVE_RUNTIME_CHECKS, 4, cl()); + apply_vectorization(); - C->print_method(PHASE_AUTO_VECTORIZATION4_AFTER_APPLY, 4, cl()); + C->print_method(PHASE_AUTO_VECTORIZATION5_AFTER_APPLY, 4, cl()); } // We prepare the memory graph for the replacement of scalar memops with vector memops. 
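For context, the kind of Java loop these speculative runtime checks target looks roughly like the sketch below (compare the new jtreg test TestMemorySegmentUnalignedAddress.java from the header; this reduction is illustrative, not the actual test). The native MemorySegment's base address carries no alignment guarantee, so with -XX:+AlignVector the loop may only be vectorized under a speculative alignment check, or else it must take the slow loop or trap:

import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;

// A loop over native memory whose base address may be unaligned. Illustrative only.
public class NativeSegmentLoop {
    public static void main(String[] args) {
        try (Arena arena = Arena.ofConfined()) {
            MemorySegment s = arena.allocate(1024, 1); // only byte-aligned
            for (long i = 0; i < s.byteSize(); i += 4) {
                int v = s.get(ValueLayout.JAVA_INT_UNALIGNED, i);
                s.set(ValueLayout.JAVA_INT_UNALIGNED, i, v + 1);
            }
        }
    }
}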
diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp index 0c08777c90c..0e14964263d 100644 --- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp +++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp @@ -29,25 +29,26 @@ #include "utilities/stringUtils.hpp" #define COMPILER_TRACE_AUTO_VECTORIZATION_TAG(flags) \ - flags(POINTER_PARSING, "Trace VPointer/MemPointer parsing") \ - flags(POINTER_ALIASING, "Trace VPointer/MemPointer aliasing") \ - flags(POINTER_ADJACENCY, "Trace VPointer/MemPointer adjacency") \ - flags(POINTER_OVERLAP, "Trace VPointer/MemPointer overlap") \ - flags(PRECONDITIONS, "Trace VLoop::check_preconditions") \ - flags(LOOP_ANALYZER, "Trace VLoopAnalyzer::setup_submodules") \ - flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \ - flags(BODY, "Trace VLoopBody") \ - flags(TYPES, "Trace VLoopTypes") \ - flags(POINTERS, "Trace VLoopPointers") \ - flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \ - flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_memop_pairs") \ - flags(SW_REJECTIONS, "Trace SuperWord rejections (non vectorizations)") \ - flags(SW_PACKSET, "Trace SuperWord packset at different stages") \ - flags(SW_INFO, "Trace SuperWord info (equivalent to TraceSuperWord)") \ - flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \ - flags(ALIGN_VECTOR, "Trace AlignVector") \ - flags(VTRANSFORM, "Trace VTransform Graph") \ - flags(ALL, "Trace everything (very verbose)") + flags(POINTER_PARSING, "Trace VPointer/MemPointer parsing") \ + flags(POINTER_ALIASING, "Trace VPointer/MemPointer aliasing") \ + flags(POINTER_ADJACENCY, "Trace VPointer/MemPointer adjacency") \ + flags(POINTER_OVERLAP, "Trace VPointer/MemPointer overlap") \ + flags(PRECONDITIONS, "Trace VLoop::check_preconditions") \ + flags(LOOP_ANALYZER, "Trace VLoopAnalyzer::setup_submodules") \ + flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \ + flags(BODY, "Trace VLoopBody") \ + flags(TYPES, "Trace VLoopTypes") \ + flags(POINTERS, "Trace VLoopPointers") \ + flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \ + flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_memop_pairs") \ + flags(SW_REJECTIONS, "Trace SuperWord rejections (non vectorizations)") \ + flags(SW_PACKSET, "Trace SuperWord packset at different stages") \ + flags(SW_INFO, "Trace SuperWord info (equivalent to TraceSuperWord)") \ + flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \ + flags(ALIGN_VECTOR, "Trace AlignVector") \ + flags(SPECULATIVE_RUNTIME_CHECKS, "Trace VTransform::apply_speculative_runtime_checks") \ + flags(VTRANSFORM, "Trace VTransform Graph") \ + flags(ALL, "Trace everything (very verbose)") #define table_entry(name, description) name, enum TraceAutoVectorizationTag { diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index ffc2314a59b..e607a1065dd 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -93,9 +93,9 @@ VStatus VLoop::check_preconditions_helper() { return VStatus::make_failure(VLoop::FAILURE_BACKEDGE); } - // To align vector memory accesses in the main-loop, we will have to adjust - // the pre-loop limit. if (_cl->is_main_loop()) { + // To align vector memory accesses in the main-loop, we will have to adjust + // the pre-loop limit. 
CountedLoopEndNode* pre_end = _cl->find_pre_loop_end(); if (pre_end == nullptr) { return VStatus::make_failure(VLoop::FAILURE_PRE_LOOP_LIMIT); } @@ -105,6 +105,41 @@ VStatus VLoop::check_preconditions_helper() { return VStatus::make_failure(VLoop::FAILURE_PRE_LOOP_LIMIT); } _pre_loop_end = pre_end; + + // See if we find the infrastructure for speculative runtime-checks. + // (1) Auto Vectorization Parse Predicate + Node* pre_ctrl = pre_loop_head()->in(LoopNode::EntryControl); + const Predicates predicates(pre_ctrl); + const PredicateBlock* predicate_block = predicates.auto_vectorization_check_block(); + if (predicate_block->has_parse_predicate()) { + _auto_vectorization_parse_predicate_proj = predicate_block->parse_predicate_success_proj(); + } + + // (2) Multiversioning fast-loop projection + IfTrueNode* before_predicates = predicates.entry()->isa_IfTrue(); + if (before_predicates != nullptr && + before_predicates->in(0)->is_If() && + before_predicates->in(0)->in(1)->is_OpaqueMultiversioning()) { + _multiversioning_fast_proj = before_predicates; + } +#ifndef PRODUCT + if (is_trace_preconditions() || is_trace_speculative_runtime_checks()) { + tty->print_cr(" Infrastructure for speculative runtime-checks:"); + if (_auto_vectorization_parse_predicate_proj != nullptr) { + tty->print_cr(" auto_vectorization_parse_predicate_proj: speculate and trap"); + _auto_vectorization_parse_predicate_proj->dump_bfs(5,0,""); + } else if (_multiversioning_fast_proj != nullptr) { + tty->print_cr(" multiversioning_fast_proj: speculate and multiversion"); + _multiversioning_fast_proj->dump_bfs(5,0,""); + } else { + tty->print_cr(" Not found."); + } + } +#endif + assert(_auto_vectorization_parse_predicate_proj == nullptr || + _multiversioning_fast_proj == nullptr, "we should only have at most one of these"); + assert(_cl->is_multiversion_fast_loop() == (_multiversioning_fast_proj != nullptr), + "must find the multiversion selector IFF loop is a multiversion fast loop"); } return VStatus::make_success(); @@ -472,15 +507,28 @@ AlignmentSolution* AlignmentSolver::solve() const { // + con + con + C_const (sum of constant terms) // // We describe the 6 terms: - // 1) The "base" of the address is the address of a Java object (e.g. array), - // and as such ObjectAlignmentInBytes (a power of 2) aligned. We have - // defined aw = MIN(vector_width, ObjectAlignmentInBytes), which is also + // 1) The "base" of the address: + // - For heap objects, this is the base of the object, and as such + // ObjectAlignmentInBytes (a power of 2) aligned. + // - For off-heap / native memory, the "base" has no alignment + // guarantees. To ensure alignment we can do either of these: + // - Add a runtime check to verify ObjectAlignmentInBytes alignment, + // i.e. we can speculatively compile with an alignment assumption. + // If we pass the check, we can go into the loop with the alignment + // assumption; if we fail, we have to trap/deopt or take the other + // loop version without alignment assumptions. + // - If runtime checks are not possible, then we return an empty + // solution, i.e. we do not vectorize the corresponding pack. + // + // Let us assume we have an object "base", or passed the alignment + // runtime check for native "bases", hence we know: + // + // base % ObjectAlignmentInBytes = 0 + // + // We defined aw = MIN(vector_width, ObjectAlignmentInBytes), which is + // a power of 2.
And hence we know that "base" is thus also aw-aligned: // - // base % ObjectAlignmentInBytes = 0 ==> base % aw = 0 - // - // TODO: Note: we have been assuming that this also holds for native memory base - // addresses. This is incorrect, see JDK-8323582. + // base % ObjectAlignmentInBytes = 0 ==> base % aw = 0 (BASE_ALIGNED) // // 2) The "C_const" term is the sum of all constant terms. This is "con", // plus "iv_scale * init" if it is constant. @@ -505,6 +553,13 @@ AlignmentSolution* AlignmentSolver::solve() const { // 6) The "C_main * main_iter" term represents how much the iv is increased // during "main_iter" main-loop iterations. + // For native memory, we must add a runtime-check that "base % ObjectAlignmentInBytes = 0", + // to ensure (BASE_ALIGNED). If we cannot add this runtime-check, we have no guarantee on + // its alignment. + if (!_vpointer.mem_pointer().base().is_object() && !_are_speculative_checks_possible) { + return new EmptyAlignmentSolution("Cannot add speculative check for native memory alignment."); + } + // Attribute init (i.e. _init_node) either to C_const or to C_init term. const int C_const_init = _init_node->is_ConI() ? _init_node->as_ConI()->get_int() : 0; const int C_const = _vpointer.con() + C_const_init * iv_scale(); @@ -521,8 +576,7 @@ AlignmentSolution* AlignmentSolver::solve() const { // We must find a pre_iter, such that adr is aw aligned: adr % aw = 0. Note, that we are defining the // modulo operator "%" such that the remainder is always positive, see AlignmentSolution::mod(i, q). // - // TODO: Note: the following assumption is incorrect for native memory bases, see JDK-8323582. - // Since "base % aw = 0", we only need to ensure alignment of the other 5 terms: + // Since "base % aw = 0" (BASE_ALIGNED), we only need to ensure alignment of the other 5 terms: // // (C_const + C_invar * var_invar + C_init * var_init + C_pre * pre_iter + C_main * main_iter) % aw = 0 (1) // @@ -878,8 +932,7 @@ AlignmentSolution* AlignmentSolver::solve() const { // + iv_scale * pre_stride * pre_iter // + iv_scale * main_stride * main_iter)) % aw = // - // -> base aligned: base % aw = 0 - // TODO: Note: this assumption is incorrect for native memory bases, see JDK-8323582. + // -> apply (BASE_ALIGNED): base % aw = 0 // -> main-loop iterations aligned (2): C_main % aw = (iv_scale * main_stride) % aw = 0 // (con + invar + iv_scale * init + iv_scale * pre_stride * pre_iter) % aw = // @@ -958,7 +1011,7 @@ void AlignmentSolver::trace_start_solve() const { _pre_stride, _main_stride); // adr = base + con + invar + iv_scale * iv tty->print(" adr = base[%d]", base().object_or_native()->_idx); - tty->print(" + invar + iv_scale(%d) * iv + con(%d)", iv_scale(), _vpointer.con()); + tty->print_cr(" + invar + iv_scale(%d) * iv + con(%d)", iv_scale(), _vpointer.con()); } } diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index cb1e8c45856..620eac25c9b 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -85,6 +85,14 @@ private: PhiNode* _iv; CountedLoopEndNode* _pre_loop_end; // cache access to pre-loop for main loops only + // We can add speculative runtime-checks if we have one of these: + // - Auto Vectorization Parse Predicate: + // pass all checks or trap -> recompile without this predicate. + // - Multiversioning fast-loop projection: + // pass all checks or go to slow-path-loop, where we have no speculative assumptions.
+ ParsePredicateSuccessProj* _auto_vectorization_parse_predicate_proj; + IfTrueNode* _multiversioning_fast_proj; + NOT_PRODUCT(VTrace _vtrace;) NOT_PRODUCT(TraceMemPointer _mptrace;) @@ -104,7 +112,9 @@ public: _cl (nullptr), _cl_exit (nullptr), _iv (nullptr), - _pre_loop_end (nullptr) + _pre_loop_end (nullptr), + _auto_vectorization_parse_predicate_proj(nullptr), + _multiversioning_fast_proj(nullptr) #ifndef PRODUCT COMMA _mptrace(TraceMemPointer( @@ -138,6 +148,19 @@ public: return head; }; + ParsePredicateSuccessProj* auto_vectorization_parse_predicate_proj() const { + return _auto_vectorization_parse_predicate_proj; + } + + IfTrueNode* multiversioning_fast_proj() const { + return _multiversioning_fast_proj; + } + + bool are_speculative_checks_possible() const { + return _auto_vectorization_parse_predicate_proj != nullptr || + _multiversioning_fast_proj != nullptr; + } + + // Estimate maximum size for data structures, to avoid repeated reallocation int estimated_body_length() const { return lpt()->_body.size(); }; int estimated_node_count() const { return (int)(1.10 * phase()->C->unique()); }; @@ -176,6 +199,10 @@ public: bool is_trace_vpointers() const { return _vtrace.is_trace(TraceAutoVectorizationTag::POINTERS); } + + bool is_trace_speculative_runtime_checks() const { + return _vtrace.is_trace(TraceAutoVectorizationTag::SPECULATIVE_RUNTIME_CHECKS); + } #endif // Is the node in the basic block of the loop? @@ -1296,6 +1323,14 @@ private: const int _pre_stride; // address increment per pre-loop iteration const int _main_stride; // address increment per main-loop iteration + // For native bases, we have no alignment guarantee. This means we cannot in + // general guarantee alignment statically. But we can check alignment with a + // speculative runtime check, see VTransform::apply_speculative_runtime_checks. + // For this, we need to find the Predicate for auto vectorization checks, or else + // we need to find the multiversion_if. If we cannot find either, then we + // cannot make any speculative runtime checks.
+ const bool _are_speculative_checks_possible; + DEBUG_ONLY( const bool _is_trace; ); static const MemNode* mem_ref_not_null(const MemNode* mem_ref) { @@ -1309,7 +1344,8 @@ public: const uint vector_length, const Node* init_node, const int pre_stride, - const int main_stride + const int main_stride, + const bool are_speculative_checks_possible DEBUG_ONLY( COMMA const bool is_trace) ) : _vpointer( vpointer), @@ -1318,7 +1354,8 @@ public: _aw( MIN2(_vector_width, ObjectAlignmentInBytes)), _init_node( init_node), _pre_stride( pre_stride), - _main_stride( main_stride) + _main_stride( main_stride), + _are_speculative_checks_possible(are_speculative_checks_possible) DEBUG_ONLY( COMMA _is_trace(is_trace) ) { assert(_mem_ref != nullptr && diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 4730f3ac134..18b5c09acb8 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -23,6 +23,7 @@ #include "opto/vtransform.hpp" #include "opto/vectornode.hpp" +#include "opto/castnode.hpp" #include "opto/convertnode.hpp" void VTransformGraph::add_vtnode(VTransformNode* vtnode) { @@ -143,6 +144,94 @@ void VTransformApplyResult::trace(VTransformNode* vtnode) const { } #endif +void VTransform::apply_speculative_runtime_checks() { + if (VLoop::vectors_should_be_aligned()) { +#ifdef ASSERT + if (_trace._align_vector || _trace._speculative_runtime_checks) { + tty->print_cr("\nVTransform::apply_speculative_runtime_checks: native memory alignment"); + } +#endif + + const GrowableArray<VTransformNode*>& vtnodes = _graph.vtnodes(); + for (int i = 0; i < vtnodes.length(); i++) { + VTransformVectorNode* vtn = vtnodes.at(i)->isa_Vector(); + if (vtn == nullptr) { continue; } + MemNode* p0 = vtn->nodes().at(0)->isa_Mem(); + if (p0 == nullptr) { continue; } + const VPointer& vp = vpointer(p0); + if (vp.mem_pointer().base().is_object()) { continue; } + assert(vp.mem_pointer().base().is_native(), "VPointer base must be object or native"); + + // We have a native memory reference. Build a runtime check for it. + // See: AlignmentSolver::solve + // In a future RFE we may be able to speculate on invar alignment as + // well, and allow vectorization of more cases. + add_speculative_alignment_check(vp.mem_pointer().base().native(), ObjectAlignmentInBytes); + } + } +} + +#define TRACE_SPECULATIVE_ALIGNMENT_CHECK(node) { \ + DEBUG_ONLY( \ + if (_trace._align_vector || _trace._speculative_runtime_checks) { \ + tty->print(" " #node ": "); \ + node->dump(); \ + } \ + ) \ +} \ + +// Check: (node % alignment) == 0.
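+// Editor's illustration (not from the patch): for a power-of-two "alignment",
+// the modulo test is implemented below as a mask test:
+//   (node % alignment) == 0  <=>  (node & (alignment - 1)) == 0,
+// which is the AndI/CmpI/Bool pattern built by this function.
+// Hypothetical worked example, assuming alignment = ObjectAlignmentInBytes = 8:
+//   node = 0x1000: 0x1000 & 0x7 = 0 -> aligned, the check passes.
+//   node = 0x1001: 0x1001 & 0x7 = 1 -> unaligned, the check fails and we
+//   either trap/deopt (predicate) or enter the slow loop (multiversioning).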
+void VTransform::add_speculative_alignment_check(Node* node, juint alignment) { + TRACE_SPECULATIVE_ALIGNMENT_CHECK(node); + Node* ctrl = phase()->get_ctrl(node); + + // Cast adr/long -> int + if (node->bottom_type()->basic_type() == T_ADDRESS) { + // adr -> int/long + node = new CastP2XNode(nullptr, node); + phase()->register_new_node(node, ctrl); + TRACE_SPECULATIVE_ALIGNMENT_CHECK(node); + } + if (node->bottom_type()->basic_type() == T_LONG) { + // long -> int + node = new ConvL2INode(node); + phase()->register_new_node(node, ctrl); + TRACE_SPECULATIVE_ALIGNMENT_CHECK(node); + } + + Node* mask_alignment = igvn().intcon(alignment-1); + Node* base_alignment = new AndINode(node, mask_alignment); + phase()->register_new_node(base_alignment, ctrl); + TRACE_SPECULATIVE_ALIGNMENT_CHECK(mask_alignment); + TRACE_SPECULATIVE_ALIGNMENT_CHECK(base_alignment); + + Node* zero = igvn().intcon(0); + Node* cmp_alignment = CmpNode::make(base_alignment, zero, T_INT, false); + BoolNode* bol_alignment = new BoolNode(cmp_alignment, BoolTest::eq); + phase()->register_new_node(cmp_alignment, ctrl); + phase()->register_new_node(bol_alignment, ctrl); + TRACE_SPECULATIVE_ALIGNMENT_CHECK(cmp_alignment); + TRACE_SPECULATIVE_ALIGNMENT_CHECK(bol_alignment); + + add_speculative_check(bol_alignment); +} + +void VTransform::add_speculative_check(BoolNode* bol) { + assert(_vloop.are_speculative_checks_possible(), "otherwise we cannot make speculative assumptions"); + ParsePredicateSuccessProj* parse_predicate_proj = _vloop.auto_vectorization_parse_predicate_proj(); + IfTrueNode* new_check_proj = nullptr; + if (parse_predicate_proj != nullptr) { + new_check_proj = phase()->create_new_if_for_predicate(parse_predicate_proj, nullptr, + Deoptimization::Reason_auto_vectorization_check, + Op_If); + } else { + new_check_proj = phase()->create_new_if_for_multiversion(_vloop.multiversioning_fast_proj()); + } + Node* iff_speculate = new_check_proj->in(0); + igvn().replace_input_of(iff_speculate, 1, bol); + TRACE_SPECULATIVE_ALIGNMENT_CHECK(iff_speculate); +} + // Helper-class for VTransformGraph::has_store_to_load_forwarding_failure. // It wraps a VPointer. The VPointer has an iv_offset applied, which // simulates a virtual unrolling. 
They represent the memory region: diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index 4fc68c7b4df..308578ecb57 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -109,16 +109,19 @@ public: const bool _verbose; const bool _rejections; const bool _align_vector; + const bool _speculative_runtime_checks; const bool _info; VTransformTrace(const VTrace& vtrace, const bool is_trace_rejections, const bool is_trace_align_vector, + const bool is_trace_speculative_runtime_checks, const bool is_trace_info) : - _verbose (vtrace.is_trace(TraceAutoVectorizationTag::ALL)), - _rejections (_verbose | is_trace_vtransform(vtrace) | is_trace_rejections), - _align_vector(_verbose | is_trace_vtransform(vtrace) | is_trace_align_vector), - _info (_verbose | is_trace_vtransform(vtrace) | is_trace_info) {} + _verbose (vtrace.is_trace(TraceAutoVectorizationTag::ALL)), + _rejections (_verbose | is_trace_vtransform(vtrace) | is_trace_rejections), + _align_vector (_verbose | is_trace_vtransform(vtrace) | is_trace_align_vector), + _speculative_runtime_checks(_verbose | is_trace_vtransform(vtrace) | is_trace_speculative_runtime_checks), + _info (_verbose | is_trace_vtransform(vtrace) | is_trace_info) {} static bool is_trace_vtransform(const VTrace& vtrace) { return vtrace.is_trace(TraceAutoVectorizationTag::VTRANSFORM); @@ -245,6 +248,10 @@ private: void determine_mem_ref_and_aw_for_main_loop_alignment(); void adjust_pre_loop_limit_to_align_main_loop_vectors(); + void apply_speculative_runtime_checks(); + void add_speculative_alignment_check(Node* node, juint alignment); + void add_speculative_check(BoolNode* bol); + void apply_vectorization() const; }; diff --git a/src/hotspot/share/runtime/deoptimization.cpp b/src/hotspot/share/runtime/deoptimization.cpp index 2945e98bd2b..4a51e2cbd92 100644 --- a/src/hotspot/share/runtime/deoptimization.cpp +++ b/src/hotspot/share/runtime/deoptimization.cpp @@ -2717,6 +2717,7 @@ const char* Deoptimization::_trap_reason_name[] = { "intrinsic" JVMCI_ONLY("_or_type_checked_inlining"), "bimorphic" JVMCI_ONLY("_or_optimized_type_check"), "profile_predicate", + "auto_vectorization_check", "unloaded", "uninitialized", "initialized", diff --git a/src/hotspot/share/runtime/deoptimization.hpp b/src/hotspot/share/runtime/deoptimization.hpp index 0a2bafb3830..42cf25e5162 100644 --- a/src/hotspot/share/runtime/deoptimization.hpp +++ b/src/hotspot/share/runtime/deoptimization.hpp @@ -98,6 +98,7 @@ class Deoptimization : AllStatic { #endif Reason_profile_predicate, // compiler generated predicate moved from frequent branch in a loop failed + Reason_auto_vectorization_check, // compiler generated (speculative) auto vectorization checks failed // recorded per method Reason_unloaded, // unloaded class or constant pool entry diff --git a/src/hotspot/share/runtime/vmStructs.cpp b/src/hotspot/share/runtime/vmStructs.cpp index 1c776f7b8e2..c9d93930d63 100644 --- a/src/hotspot/share/runtime/vmStructs.cpp +++ b/src/hotspot/share/runtime/vmStructs.cpp @@ -2269,6 +2269,7 @@ declare_constant(Deoptimization::Reason_age) \ declare_constant(Deoptimization::Reason_predicate) \ declare_constant(Deoptimization::Reason_loop_limit_check) \ + declare_constant(Deoptimization::Reason_auto_vectorization_check) \ declare_constant(Deoptimization::Reason_speculate_class_check) \ declare_constant(Deoptimization::Reason_speculate_null_check) \ declare_constant(Deoptimization::Reason_speculate_null_assert) \ diff --git 
a/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegmentUnalignedAddress.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegmentUnalignedAddress.java new file mode 100644 index 00000000000..13fd9ff9b17 --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegmentUnalignedAddress.java @@ -0,0 +1,303 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.loopopts.superword; + +import compiler.lib.ir_framework.*; +import compiler.lib.verify.*; +import jdk.test.lib.Utils; +import java.nio.ByteBuffer; +import java.util.Map; +import java.util.HashMap; +import java.util.Random; +import java.lang.foreign.*; + +/* + * @test id=byte-buffer-direct + * @bug 8323582 + * @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned. + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress ByteBufferDirect + */ + +/* + * @test id=byte-buffer-direct-AlignVector + * @bug 8323582 + * @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned. + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress ByteBufferDirect AlignVector + */ + +/* + * @test id=byte-buffer-direct-VerifyAlignVector + * @bug 8323582 + * @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned. + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress ByteBufferDirect VerifyAlignVector + */ + +/* + * @test id=native + * @bug 8323582 + * @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned. + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress Native + */ + +/* + * @test id=native-AlignVector + * @bug 8323582 + * @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned. + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress Native AlignVector + */ + +/* + * @test id=native-VerifyAlignVector + * @bug 8323582 + * @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned. 
+ * @library /test/lib / + * @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress Native VerifyAlignVector + */ + +public class TestMemorySegmentUnalignedAddress { + public static void main(String[] args) { + TestFramework framework = new TestFramework(TestMemorySegmentUnalignedAddressImpl.class); + framework.addFlags("-DmemorySegmentProviderNameForTestVM=" + args[0]); + if (args.length > 1) { + switch (args[1]) { + case "AlignVector" -> { framework.addFlags("-XX:+AlignVector"); } + case "VerifyAlignVector" -> { framework.addFlags("-XX:+AlignVector", "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+VerifyAlignVector"); } + default -> { throw new RuntimeException("unexpected: " + args[1]); } + } + } + framework.setDefaultWarmup(100); + framework.start(); + } +} + +class TestMemorySegmentUnalignedAddressImpl { + static final int SIZE = 10_000; + static final int BACKING_SIZE = 10_000 + 1; + static final Random RANDOM = Utils.getRandomInstance(); + + interface TestFunction { + Object run(int i); + } + + interface MemorySegmentProvider { + MemorySegment newMemorySegment(); + } + + static MemorySegmentProvider provider; + + static { + String providerName = System.getProperty("memorySegmentProviderNameForTestVM"); + provider = switch (providerName) { + case "ByteBufferDirect" -> TestMemorySegmentUnalignedAddressImpl::newMemorySegmentOfByteBufferDirect; + case "Native" -> TestMemorySegmentUnalignedAddressImpl::newMemorySegmentOfNative; + default -> throw new RuntimeException("Test argument not recognized: " + providerName); + }; + } + + // List of tests + Map<String, TestFunction> tests = new HashMap<>(); + + // List of gold, the results from the first run before compilation + Map<String, Object> golds = new HashMap<>(); + + public TestMemorySegmentUnalignedAddressImpl () { + // Generate two MemorySegments as inputs + MemorySegment a = sliceAligned(newMemorySegment()); + MemorySegment b = sliceAligned(newMemorySegment()); + fillRandom(a); + fillRandom(b); + + // Add all tests to list + tests.put("testAlwaysAligned", (int i) -> { + MemorySegment ms = newMemorySegment(); + MemorySegment slice = sliceAligned(ms); + copy(a, slice); + return testAlwaysAligned(slice); + }); + tests.put("testAlwaysUnaligned", (int i) -> { + MemorySegment ms = newMemorySegment(); + MemorySegment slice = sliceUnaligned(ms); + copy(a, slice); + return testAlwaysUnaligned(slice); + }); + tests.put("testMixedAlignedAndUnaligned", (int i) -> { + MemorySegment ms = newMemorySegment(); + MemorySegment slice = (i % 2 == 0) ?
sliceUnaligned(ms) : sliceAligned(ms); + copy(a, slice); + return testMixedAlignedAndUnaligned(slice); + }); + + // Compute gold value for all test methods before compilation + for (Map.Entry<String, TestFunction> entry : tests.entrySet()) { + String name = entry.getKey(); + TestFunction test = entry.getValue(); + Object gold = test.run(0); + golds.put(name, gold); + } + } + + MemorySegment sliceAligned(MemorySegment src) { + return src.asSlice(0, SIZE); + } + + MemorySegment sliceUnaligned(MemorySegment src) { + return src.asSlice(1, SIZE); + } + + MemorySegment newMemorySegment() { + return provider.newMemorySegment(); + } + + static void copy(MemorySegment src, MemorySegment dst) { + MemorySegment.copy(src, 0, dst, 0, src.byteSize()); + } + + static MemorySegment newMemorySegmentOfByteBufferDirect() { + return MemorySegment.ofBuffer(ByteBuffer.allocateDirect(BACKING_SIZE)); + } + + static MemorySegment newMemorySegmentOfNative() { + // Auto arena: GC decides when there is no reference to the MemorySegment, + // and then it deallocates the backing memory. + return Arena.ofAuto().allocate(BACKING_SIZE, 1); + } + + static void fillRandom(MemorySegment data) { + for (int i = 0; i < (int)data.byteSize(); i++) { + data.set(ValueLayout.JAVA_BYTE, i, (byte)RANDOM.nextInt()); + } + } + + static void verify(String name, Object gold, Object result) { + try { + Verify.checkEQ(gold, result); + } catch (VerifyException e) { + throw new RuntimeException("Verify: wrong result in " + name, e); + } + } + + static int runInvocationCounter = 0; + + @Run(test = {"testAlwaysAligned", + "testAlwaysUnaligned", + "testMixedAlignedAndUnaligned"}) + void runTests() { + runInvocationCounter++; + for (Map.Entry<String, TestFunction> entry : tests.entrySet()) { + String name = entry.getKey(); + TestFunction test = entry.getValue(); + // Recall gold value from before compilation + Object gold = golds.get(name); + // Compute new result + Object result = test.run(runInvocationCounter); + // Compare gold and new result + verify(name, gold, result); + } + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_VI, "> 0", + IRNode.STORE_VECTOR, "> 0", + "multiversion", "= 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + phase = CompilePhase.PRINT_IDEAL) + // We never fail the alignment check in the auto vectorization Predicate, + // hence we never even create the multiversioned loops. + static Object testAlwaysAligned(MemorySegment ms) { + for (long i = 0; i < ms.byteSize(); i += 4) { + int v = ms.get(ValueLayout.JAVA_INT_UNALIGNED, i); + ms.set(ValueLayout.JAVA_INT_UNALIGNED, i, (int)(v + 1)); + } + return new Object[]{ ms }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_VI, "> 0", + IRNode.STORE_VECTOR, "> 0", + "multiversion_fast", "= 4", // pre, main, drain, post + "multiversion_slow", "= 2"}, // main, post + applyIf = {"AlignVector", "true"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + phase = CompilePhase.PRINT_IDEAL) + // We add alignment checks to the auto vectorization Predicate. It fails + // at runtime, deopts, and recompiles with multiversioning.
+ @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_VI, "> 0", + IRNode.STORE_VECTOR, "> 0", + "multiversion_fast", "= 0", + "multiversion_slow", "= 0"}, + applyIf = {"AlignVector", "false"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + phase = CompilePhase.PRINT_IDEAL) + // We never add any conditions to the auto vectorization Predicate, so + // we also never deopt and never end up multiversioning. + static Object testAlwaysUnaligned(MemorySegment ms) { + for (long i = 0; i < ms.byteSize(); i += 4) { + int v = ms.get(ValueLayout.JAVA_INT_UNALIGNED, i); + ms.set(ValueLayout.JAVA_INT_UNALIGNED, i, (int)(v + 1)); + } + return new Object[]{ ms }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_VI, "> 0", + IRNode.STORE_VECTOR, "> 0", + "multiversion_fast", "= 4", // pre, main, drain, post + "multiversion_slow", "= 2"}, // main, post + applyIf = {"AlignVector", "true"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + phase = CompilePhase.PRINT_IDEAL) + // We add alignment checks to the auto vectorization Predicate. It fails + // at runtime, deopts, and recompiles with multiversioning. + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_VI, "> 0", + IRNode.STORE_VECTOR, "> 0", + "multiversion_fast", "= 0", + "multiversion_slow", "= 0"}, + applyIf = {"AlignVector", "false"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + phase = CompilePhase.PRINT_IDEAL) + // We never add any conditions to the auto vectorization Predicate, so + // we also never deopt and never end up multiversioning. + static Object testMixedAlignedAndUnaligned(MemorySegment ms) { + for (long i = 0; i < ms.byteSize(); i += 4) { + int v = ms.get(ValueLayout.JAVA_INT_UNALIGNED, i); + ms.set(ValueLayout.JAVA_INT_UNALIGNED, i, (int)(v + 1)); + } + return new Object[]{ ms }; + } +}
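As a closing illustration of the (BASE_ALIGNED) implication that the AlignmentSolver relies on (base % ObjectAlignmentInBytes = 0 ==> base % aw = 0, with aw = MIN(vector_width, ObjectAlignmentInBytes)), here is a small standalone Java sketch. The class name and all concrete values (vector width 16, alignment 8, the example addresses) are assumed for illustration and do not come from the patch.

// Hypothetical worked example of (BASE_ALIGNED) from AlignmentSolver::solve.
// All concrete values are assumed for illustration.
public class BaseAlignedExample {
    public static void main(String[] args) {
        final long vectorWidth = 16;     // assumed vector width in bytes
        final long objectAlignment = 8;  // assumed ObjectAlignmentInBytes
        final long aw = Math.min(vectorWidth, objectAlignment); // aw = 8

        // Heap base: guaranteed ObjectAlignmentInBytes-aligned. Since aw is a
        // power of 2 with aw <= objectAlignment, aw divides objectAlignment,
        // so base % objectAlignment == 0 implies base % aw == 0.
        long heapBase = 0x7f00_0000_0000L;
        System.out.println("heap base % aw = " + (heapBase % aw));     // 0

        // Native base: no such guarantee. Without the speculative runtime
        // check the solver must give up (EmptyAlignmentSolution) and the
        // pack is not vectorized.
        long nativeBase = heapBase + 1;
        System.out.println("native base % aw = " + (nativeBase % aw)); // 1
    }
}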