8323582: C2 SuperWord AlignVector: misaligned vector memory access with unaligned native memory

Reviewed-by: roland, kvn
Emanuel Peter 2025-02-27 06:58:43 +00:00
parent bb48b7319c
commit 885338b5f3
27 changed files with 1067 additions and 129 deletions


@@ -708,6 +708,7 @@
declare_constant(Deoptimization::Reason_constraint) \
declare_constant(Deoptimization::Reason_div0_check) \
declare_constant(Deoptimization::Reason_loop_limit_check) \
declare_constant(Deoptimization::Reason_auto_vectorization_check) \
declare_constant(Deoptimization::Reason_type_checked_inlining) \
declare_constant(Deoptimization::Reason_optimized_type_check) \
declare_constant(Deoptimization::Reason_aliasing) \


@@ -346,6 +346,12 @@
develop(bool, TraceLoopUnswitching, false, \
"Trace loop unswitching") \
\
product(bool, LoopMultiversioning, true, DIAGNOSTIC, \
"Enable loop multiversioning (for speculative compilation)") \
\
develop(bool, TraceLoopMultiversioning, false, \
"Trace loop multiversioning") \
\
product(bool, AllowVectorizeOnDemand, true, \
"Globally suppress vectorization set in VectorizeMethod") \
\


@@ -428,7 +428,7 @@ public:
IfNode(Node* control, Node* bol, float p, float fcnt);
IfNode(Node* control, Node* bol, float p, float fcnt, AssertionPredicateType assertion_predicate_type);
static IfNode* make_with_same_profile(IfNode* if_node_profile, Node* ctrl, BoolNode* bol);
static IfNode* make_with_same_profile(IfNode* if_node_profile, Node* ctrl, Node* bol);
virtual int Opcode() const;
virtual bool pinned() const { return true; }


@@ -277,6 +277,7 @@ macro(OnSpinWait)
macro(Opaque1)
macro(OpaqueLoopInit)
macro(OpaqueLoopStride)
macro(OpaqueMultiversioning)
macro(OpaqueZeroTripGuard)
macro(OpaqueNotNull)
macro(OpaqueInitializedAssertionPredicate)


@@ -4086,6 +4086,7 @@ void GraphKit::add_parse_predicates(int nargs) {
if (UseProfiledLoopPredicate) {
add_parse_predicate(Deoptimization::Reason_profile_predicate, nargs);
}
add_parse_predicate(Deoptimization::Reason_auto_vectorization_check, nargs);
// Loop Limit Check Predicate should be near the loop.
add_parse_predicate(Deoptimization::Reason_loop_limit_check, nargs);
}


@@ -469,7 +469,7 @@ static Node* split_if(IfNode *iff, PhaseIterGVN *igvn) {
return new ConINode(TypeInt::ZERO);
}
IfNode* IfNode::make_with_same_profile(IfNode* if_node_profile, Node* ctrl, BoolNode* bol) {
IfNode* IfNode::make_with_same_profile(IfNode* if_node_profile, Node* ctrl, Node* bol) {
// Assert here that we only try to create a clone from an If node with the same profiling if that actually makes sense.
// Some If node subtypes should not be cloned in this way. In theory, we should not clone BaseCountedLoopEndNodes.
// But they can end up being used as normal If nodes when peeling a loop - they serve as zero-trip guard.
@@ -2177,6 +2177,7 @@ ParsePredicateNode::ParsePredicateNode(Node* control, Deoptimization::DeoptReaso
switch (deopt_reason) {
case Deoptimization::Reason_predicate:
case Deoptimization::Reason_profile_predicate:
case Deoptimization::Reason_auto_vectorization_check:
case Deoptimization::Reason_loop_limit_check:
break;
default:
@@ -2214,6 +2215,9 @@ void ParsePredicateNode::dump_spec(outputStream* st) const {
case Deoptimization::DeoptReason::Reason_profile_predicate:
st->print("Profiled Loop ");
break;
case Deoptimization::DeoptReason::Reason_auto_vectorization_check:
st->print("Auto_Vectorization_Check ");
break;
case Deoptimization::DeoptReason::Reason_loop_limit_check:
st->print("Loop Limit Check ");
break;


@@ -745,6 +745,11 @@ void PhaseIdealLoop::do_peeling(IdealLoopTree *loop, Node_List &old_new) {
cl->set_trip_count(cl->trip_count() - 1);
if (cl->is_main_loop()) {
cl->set_normal_loop();
if (cl->is_multiversion()) {
// Peeling also destroys the connection of the main loop
// to the multiversion_if.
cl->set_no_multiversion();
}
#ifndef PRODUCT
if (PrintOpto && VerifyLoopOptimizations) {
tty->print("Peeling a 'main' loop; resetting to 'normal' ");
@@ -1174,8 +1179,9 @@ bool IdealLoopTree::policy_range_check(PhaseIdealLoop* phase, bool provisional,
if (!bol->is_Bool()) {
assert(bol->is_OpaqueNotNull() ||
bol->is_OpaqueTemplateAssertionPredicate() ||
bol->is_OpaqueInitializedAssertionPredicate(),
"Opaque node of a non-null-check or an Assertion Predicate");
bol->is_OpaqueInitializedAssertionPredicate() ||
bol->is_OpaqueMultiversioning(),
"Opaque node of a non-null-check or an Assertion Predicate or Multiversioning");
continue;
}
if (bol->as_Bool()->_test._test == BoolTest::ne) {
@@ -3354,6 +3360,23 @@ bool IdealLoopTree::iteration_split_impl(PhaseIdealLoop *phase, Node_List &old_n
// Do nothing special to pre- and post- loops
if (cl->is_pre_loop() || cl->is_post_loop()) return true;
// With multiversioning, we create a fast_loop and a slow_loop, and a multiversion_if that
// decides which loop is taken at runtime. At first, the multiversion_if always takes the
// fast_loop, and we only optimize the fast_loop. Since we are not sure if we will ever use
// the slow_loop, we delay optimizations for it, so we do not waste compile time and code
// size. If we never change the condition of the multiversion_if, the slow_loop is eventually
// folded away after loop-opts. While optimizing the fast_loop, we may want to perform some
// speculative optimization, for which we need a runtime-check. We add this runtime-check
// condition to the multiversion_if. Now, it becomes possible to execute the slow_loop at
// runtime, and we resume optimizations for slow_loop ("un-delay" it).
// TLDR: If the slow_loop is still in "delay" mode, check if the multiversion_if was changed
// and we should now resume optimizations for it.
if (cl->is_multiversion_delayed_slow_loop() &&
!phase->try_resume_optimizations_for_delayed_slow_loop(this)) {
// We are still delayed, so wait with further loop-opts.
return true;
}
// Compute loop trip count from profile data
compute_profile_trip_cnt(phase);
@@ -3413,6 +3436,12 @@ bool IdealLoopTree::iteration_split_impl(PhaseIdealLoop *phase, Node_List &old_n
if (!phase->may_require_nodes(estimate)) {
return false;
}
// We are going to add pre-loop and post-loop.
// But should we also multi-version for auto-vectorization speculative
// checks, i.e. fast and slow-paths?
phase->maybe_multiversion_for_auto_vectorization_runtime_checks(this, old_new);
phase->insert_pre_post_loops(this, old_new, peel_only);
}
// Adjust the pre- and main-loop limits to let the pre and post loops run


@@ -32,6 +32,23 @@
#include "opto/predicates.hpp"
#include "opto/rootnode.hpp"
// Multiversioning:
// A loop is cloned, and a selector If decides which loop is taken at run-time: the true-path-loop (original) or the
// false-path-loop (cloned).
//
// Use-cases:
// - Speculative compilation:
// The selector If checks some assumptions which allow stronger optimization in the true-path-loop. If the assumptions
// do not hold, we can still execute in the false-path-loop, although with fewer optimizations.
// See: PhaseIdealLoop::maybe_multiversion_for_auto_vectorization_runtime_checks
// PhaseIdealLoop::create_new_if_for_multiversion
//
// - Unswitching:
// The selector If has the same (loop invariant) condition as some unswitching candidate If inside the loop. This
// allows us to constant-fold the unswitching candidate If to true in the true-path-loop and to false in the
// false-path-loop, thus eliminating the unswitching candidate If from the loop.
//
//
// Loop Unswitching is a loop optimization to move an invariant, non-loop-exiting test in the loop body before the loop.
// Such a test is either always true or always false in all loop iterations and could therefore only be executed once.
// To achieve that, we duplicate the loop and change the original and cloned loop as follows:
@@ -145,14 +162,16 @@ IfNode* PhaseIdealLoop::find_unswitch_candidate(const IdealLoopTree* loop) const
return unswitch_candidate;
}
// This class creates an If node (i.e. loop selector) that selects if the true-path-loop or the false-path-loop should be
// executed at runtime. This is done by finding an invariant and non-loop-exiting unswitch candidate If node (guaranteed
// to exist at this point) to perform Loop Unswitching on.
class UnswitchedLoopSelector : public StackObj {
// LoopSelector is used for loop multiversioning and unswitching. This class creates an If node (i.e. loop selector)
// that selects if the true-path-loop or the false-path-loop should be executed at runtime.
class LoopSelector : public StackObj {
// Cached fields for construction.
PhaseIdealLoop* const _phase;
IdealLoopTree* const _outer_loop;
Node* const _original_loop_entry;
IfNode* const _unswitch_candidate;
const uint _dom_depth; // of original_loop_entry
// Constructed selector if with its projections.
IfNode* const _selector;
IfTrueNode* const _true_path_loop_proj;
IfFalseNode* const _false_path_loop_proj;
@@ -160,52 +179,59 @@ class UnswitchedLoopSelector : public StackObj {
enum PathToLoop { TRUE_PATH, FALSE_PATH };
public:
UnswitchedLoopSelector(IdealLoopTree* loop)
// For multiversioning: create a new selector (multiversion_if) from a bol condition.
LoopSelector(IdealLoopTree* loop, Node* bol, float prob, float fcnt)
: _phase(loop->_phase),
_outer_loop(loop->skip_strip_mined()->_parent),
_original_loop_entry(loop->_head->as_Loop()->skip_strip_mined()->in(LoopNode::EntryControl)),
_unswitch_candidate(find_unswitch_candidate(loop)),
_selector(create_selector_if()),
_dom_depth(_phase->dom_depth(_original_loop_entry)),
_selector(create_multiversioning_if(bol, prob, fcnt)), // multiversioning
_true_path_loop_proj(create_proj_to_loop(TRUE_PATH)->as_IfTrue()),
_false_path_loop_proj(create_proj_to_loop(FALSE_PATH)->as_IfFalse()) {
}
NONCOPYABLE(UnswitchedLoopSelector);
private:
IfNode* find_unswitch_candidate(IdealLoopTree* loop) {
IfNode* unswitch_candidate = _phase->find_unswitch_candidate(loop);
assert(unswitch_candidate != nullptr, "guaranteed to exist by policy_unswitching");
assert(_phase->is_member(loop, unswitch_candidate), "must be inside original loop");
return unswitch_candidate;
// For unswitching: create an unswitching if before the loop, from a pre-existing
// unswitch_candidate inside the loop.
LoopSelector(IdealLoopTree* loop, IfNode* unswitch_candidate)
: _phase(loop->_phase),
_outer_loop(loop->skip_strip_mined()->_parent),
_original_loop_entry(loop->_head->as_Loop()->skip_strip_mined()->in(LoopNode::EntryControl)),
_dom_depth(_phase->dom_depth(_original_loop_entry)),
_selector(create_unswitching_if(unswitch_candidate)), // unswitching
_true_path_loop_proj(create_proj_to_loop(TRUE_PATH)->as_IfTrue()),
_false_path_loop_proj(create_proj_to_loop(FALSE_PATH)->as_IfFalse()) {
}
NONCOPYABLE(LoopSelector);
IfNode* create_selector_if() const {
const uint dom_depth = _phase->dom_depth(_original_loop_entry);
IfNode* create_multiversioning_if(Node* bol, float prob, float fcnt) {
_phase->igvn().rehash_node_delayed(_original_loop_entry);
BoolNode* unswitch_candidate_bool = _unswitch_candidate->in(1)->as_Bool();
IfNode* selector_if = IfNode::make_with_same_profile(_unswitch_candidate, _original_loop_entry,
unswitch_candidate_bool);
_phase->register_node(selector_if, _outer_loop, _original_loop_entry, dom_depth);
IfNode* selector_if = new IfNode(_original_loop_entry, bol, prob, fcnt);
_phase->register_node(selector_if, _outer_loop, _original_loop_entry, _dom_depth);
return selector_if;
}
IfNode* create_unswitching_if(IfNode* unswitch_candidate) {
_phase->igvn().rehash_node_delayed(_original_loop_entry);
BoolNode* unswitch_candidate_bool = unswitch_candidate->in(1)->as_Bool();
IfNode* selector_if = IfNode::make_with_same_profile(unswitch_candidate, _original_loop_entry,
unswitch_candidate_bool);
_phase->register_node(selector_if, _outer_loop, _original_loop_entry, _dom_depth);
return selector_if;
}
private:
IfProjNode* create_proj_to_loop(const PathToLoop path_to_loop) {
const uint dom_depth = _phase->dom_depth(_original_loop_entry);
IfProjNode* proj_to_loop;
if (path_to_loop == TRUE_PATH) {
proj_to_loop = new IfTrueNode(_selector);
} else {
proj_to_loop = new IfFalseNode(_selector);
}
_phase->register_node(proj_to_loop, _outer_loop, _selector, dom_depth);
_phase->register_node(proj_to_loop, _outer_loop, _selector, _dom_depth);
return proj_to_loop;
}
public:
IfNode* unswitch_candidate() const {
return _unswitch_candidate;
}
IfNode* selector() const {
return _selector;
}
@@ -219,6 +245,37 @@ class UnswitchedLoopSelector : public StackObj {
}
};
// This class creates an If node (i.e. loop selector) that selects if the true-path-loop or the false-path-loop should be
// executed at runtime. This is done by finding an invariant and non-loop-exiting unswitch candidate If node (guaranteed
// to exist at this point) to perform Loop Unswitching on.
class UnswitchedLoopSelector : public StackObj {
IfNode* const _unswitch_candidate;
const LoopSelector _loop_selector;
public:
UnswitchedLoopSelector(IdealLoopTree* loop)
: _unswitch_candidate(find_unswitch_candidate(loop)),
_loop_selector(loop, _unswitch_candidate) {}
NONCOPYABLE(UnswitchedLoopSelector);
private:
static IfNode* find_unswitch_candidate(IdealLoopTree* loop) {
IfNode* unswitch_candidate = loop->_phase->find_unswitch_candidate(loop);
assert(unswitch_candidate != nullptr, "guaranteed to exist by policy_unswitching");
assert(loop->_phase->is_member(loop, unswitch_candidate), "must be inside original loop");
return unswitch_candidate;
}
public:
IfNode* unswitch_candidate() const {
return _unswitch_candidate;
}
const LoopSelector& loop_selector() const {
return _loop_selector;
}
};
// Class to unswitch the original loop and create Predicates at the new unswitched loop versions. The newly cloned loop
// becomes the false-path-loop while original loop becomes the true-path-loop.
class OriginalLoop : public StackObj {
@@ -238,55 +295,62 @@ class OriginalLoop : public StackObj {
// Unswitch the original loop on the invariant loop selector by creating a true-path-loop and a false-path-loop.
// Remove the unswitch candidate If from both unswitched loop versions which are now covered by the loop selector If.
void unswitch(const UnswitchedLoopSelector& unswitched_loop_selector) {
const uint first_false_path_loop_node_index = _phase->C->unique();
clone_loop(unswitched_loop_selector);
move_parse_and_template_assertion_predicates_to_unswitched_loops(unswitched_loop_selector,
first_false_path_loop_node_index);
DEBUG_ONLY(verify_unswitched_loop_versions(_loop->_head->as_Loop(), unswitched_loop_selector);)
_phase->recompute_dom_depth();
multiversion(unswitched_loop_selector.loop_selector());
remove_unswitch_candidate_from_loops(unswitched_loop_selector);
}
private:
void clone_loop(const UnswitchedLoopSelector& unswitched_loop_selector) {
_phase->clone_loop(_loop, _old_new, _phase->dom_depth(_loop_head),
PhaseIdealLoop::CloneIncludesStripMined, unswitched_loop_selector.selector());
fix_loop_entries(unswitched_loop_selector);
// Multiversion the original loop. The loop selector if selects between the original loop (true-path-loop), and
// a copy of it (false-path-loop).
void multiversion(const LoopSelector& loop_selector) {
const uint first_false_path_loop_node_index = _phase->C->unique();
clone_loop(loop_selector);
move_parse_and_template_assertion_predicates_to_unswitched_loops(loop_selector,
first_false_path_loop_node_index);
DEBUG_ONLY(verify_loop_versions(_loop->_head->as_Loop(), loop_selector);)
_phase->recompute_dom_depth();
}
void fix_loop_entries(const UnswitchedLoopSelector& unswitched_loop_selector) {
_phase->replace_loop_entry(_loop_head, unswitched_loop_selector.true_path_loop_proj());
private:
void clone_loop(const LoopSelector& loop_selector) {
_phase->clone_loop(_loop, _old_new, _phase->dom_depth(_loop_head),
PhaseIdealLoop::CloneIncludesStripMined, loop_selector.selector());
fix_loop_entries(loop_selector);
}
void fix_loop_entries(const LoopSelector& loop_selector) {
_phase->replace_loop_entry(_loop_head, loop_selector.true_path_loop_proj());
LoopNode* false_path_loop_strip_mined_head = old_to_new(_loop_head)->as_Loop();
_phase->replace_loop_entry(false_path_loop_strip_mined_head,
unswitched_loop_selector.false_path_loop_proj());
loop_selector.false_path_loop_proj());
}
// Moves the Parse And Template Assertion Predicates to the true and false path loop. They are inserted between the
// loop heads and the loop selector If projections. The old Parse and Template Assertion Predicates before
// the unswitched loop selector are killed.
void move_parse_and_template_assertion_predicates_to_unswitched_loops(
const UnswitchedLoopSelector& unswitched_loop_selector, const uint first_false_path_loop_node_index) const {
const LoopSelector& loop_selector, const uint first_false_path_loop_node_index) const {
const NodeInOriginalLoopBody node_in_true_path_loop_body(first_false_path_loop_node_index, _old_new);
const NodeInClonedLoopBody node_in_false_path_loop_body(first_false_path_loop_node_index);
CloneUnswitchedLoopPredicatesVisitor
clone_unswitched_loop_predicates_visitor(_loop_head, old_to_new(_loop_head)->as_Loop(), node_in_true_path_loop_body,
node_in_false_path_loop_body, _phase);
Node* source_loop_entry = unswitched_loop_selector.selector()->in(0);
Node* source_loop_entry = loop_selector.selector()->in(0);
PredicateIterator predicate_iterator(source_loop_entry);
predicate_iterator.for_each(clone_unswitched_loop_predicates_visitor);
}
#ifdef ASSERT
void verify_unswitched_loop_versions(LoopNode* true_path_loop_head,
const UnswitchedLoopSelector& unswitched_loop_selector) const {
verify_unswitched_loop_version(true_path_loop_head, unswitched_loop_selector.true_path_loop_proj());
verify_unswitched_loop_version(old_to_new(true_path_loop_head)->as_Loop(),
unswitched_loop_selector.false_path_loop_proj());
void verify_loop_versions(LoopNode* true_path_loop_head,
const LoopSelector& loop_selector) const {
verify_loop_version(true_path_loop_head,
loop_selector.true_path_loop_proj());
verify_loop_version(old_to_new(true_path_loop_head)->as_Loop(),
loop_selector.false_path_loop_proj());
}
static void verify_unswitched_loop_version(LoopNode* loop_head, IfProjNode* loop_selector_if_proj) {
static void verify_loop_version(LoopNode* loop_head, IfProjNode* loop_selector_if_proj) {
Node* entry = loop_head->skip_strip_mined()->in(LoopNode::EntryControl);
const Predicates predicates(entry);
// When skipping all predicates, we should end up at 'loop_selector_if_proj'.
@@ -302,15 +366,15 @@ class OriginalLoop : public StackObj {
// If node. Keep the true-path-path in the true-path-loop and the false-path-path in the false-path-loop by setting
// the bool input accordingly. The unswitch candidate If nodes are folded in the next IGVN round.
void remove_unswitch_candidate_from_loops(const UnswitchedLoopSelector& unswitched_loop_selector) {
IfNode* unswitching_candidate = unswitched_loop_selector.unswitch_candidate();
_phase->igvn().rehash_node_delayed(unswitching_candidate);
_phase->dominated_by(unswitched_loop_selector.true_path_loop_proj(), unswitching_candidate);
const LoopSelector& loop_selector = unswitched_loop_selector.loop_selector();
IfNode* unswitch_candidate = unswitched_loop_selector.unswitch_candidate();
_phase->igvn().rehash_node_delayed(unswitch_candidate);
_phase->dominated_by(loop_selector.true_path_loop_proj(), unswitch_candidate);
IfNode* unswitching_candidate_clone = _old_new[unswitching_candidate->_idx]->as_If();
_phase->igvn().rehash_node_delayed(unswitching_candidate_clone);
_phase->dominated_by(unswitched_loop_selector.false_path_loop_proj(), unswitching_candidate_clone);
IfNode* unswitch_candidate_clone = _old_new[unswitch_candidate->_idx]->as_If();
_phase->igvn().rehash_node_delayed(unswitch_candidate_clone);
_phase->dominated_by(loop_selector.false_path_loop_proj(), unswitch_candidate_clone);
}
};
// See comments below file header for more information about Loop Unswitching.
@@ -343,6 +407,172 @@ void PhaseIdealLoop::do_unswitching(IdealLoopTree* loop, Node_List& old_new) {
C->set_major_progress();
}
void PhaseIdealLoop::do_multiversioning(IdealLoopTree* lpt, Node_List& old_new) {
#ifndef PRODUCT
if (TraceLoopOpts || TraceLoopMultiversioning) {
tty->print("Multiversion ");
lpt->dump_head();
}
#endif
assert(LoopMultiversioning, "LoopMultiversioning must be enabled");
CountedLoopNode* original_head = lpt->_head->as_CountedLoop();
C->print_method(PHASE_BEFORE_LOOP_MULTIVERSIONING, 4, original_head);
Node* one = _igvn.intcon(1);
set_ctrl(one, C->root());
Node* opaque = new OpaqueMultiversioningNode(C, one);
set_ctrl(opaque, C->root());
_igvn.register_new_node_with_optimizer(opaque);
_igvn.set_type(opaque, TypeInt::BOOL);
const LoopSelector loop_selector(lpt, opaque, PROB_LIKELY_MAG(3), COUNT_UNKNOWN);
OriginalLoop original_loop(lpt, old_new);
original_loop.multiversion(loop_selector);
add_unswitched_loop_version_bodies_to_igvn(lpt, old_new);
CountedLoopNode* new_head = old_new[original_head->_idx]->as_CountedLoop();
original_head->set_multiversion_fast_loop();
new_head->set_multiversion_delayed_slow_loop();
NOT_PRODUCT(trace_loop_multiversioning_result(loop_selector, original_head, new_head);)
C->print_method(PHASE_AFTER_LOOP_MULTIVERSIONING, 4, new_head);
C->set_major_progress();
}
// Create a new if in the multiversioning pattern, adding an additional condition for the
// multiversioning fast-loop.
//
// Before:
// entry opaque
// | |
// multiversion_if
// | |
// +----------------+ +---------------+
// | |
// multiversion_fast_proj multiversion_slow_proj
// |
// +--------+
// |
// slow_path
//
//
// After:
// entry opaque <-- to be replaced by caller
// | |
// new_if
// | |
// | +-----------------------------+
// | |
// new_if_true opaque new_if_false
// | | |
// multiversion_if |
// | | |
// +----------------+ +---------------+ |
// | | |
// multiversion_fast_proj new_multiversion_slow_proj |
// | |
// +------+ |
// | |
// region
// |
// slow_path
//
IfTrueNode* PhaseIdealLoop::create_new_if_for_multiversion(IfTrueNode* multiversioning_fast_proj) {
// Give all nodes in the old sub-graph a name.
IfNode* multiversion_if = multiversioning_fast_proj->in(0)->as_If();
Node* entry = multiversion_if->in(0);
OpaqueMultiversioningNode* opaque = multiversion_if->in(1)->as_OpaqueMultiversioning();
IfFalseNode* multiversion_slow_proj = multiversion_if->proj_out(0)->as_IfFalse();
Node* slow_path = multiversion_slow_proj->unique_ctrl_out();
// The slow_loop may still be delayed, and waiting for runtime-checks to be added to the
// multiversion_if. Now that we have at least one condition for the multiversioning,
// we should resume optimizations for the slow loop.
opaque->notify_slow_loop_that_it_can_resume_optimizations();
// Create new_if with its projections.
IfNode* new_if = IfNode::make_with_same_profile(multiversion_if, entry, opaque);
IdealLoopTree* lp = get_loop(entry);
register_control(new_if, lp, entry);
IfTrueNode* new_if_true = new IfTrueNode(new_if);
IfFalseNode* new_if_false = new IfFalseNode(new_if);
register_control(new_if_true, lp, new_if);
register_control(new_if_false, lp, new_if);
// Hook new_if_true into multiversion_if.
_igvn.replace_input_of(multiversion_if, 0, new_if_true);
// Clone multiversion_slow_proj - this allows us to easily carry the dependencies to
// the new region below.
IfFalseNode* new_multiversion_slow_proj = multiversion_slow_proj->clone()->as_IfFalse();
register_control(new_multiversion_slow_proj, lp, multiversion_if);
// Create new Region.
RegionNode* region = new RegionNode(1);
region->add_req(new_multiversion_slow_proj);
region->add_req(new_if_false);
register_control(region, lp, new_multiversion_slow_proj);
// Hook region into slow_path, instead of the multiversion_slow_proj.
// This also moves all other dependencies of the multiversion_slow_proj to the region.
_igvn.replace_node(multiversion_slow_proj, region);
return new_if_true;
}
OpaqueMultiversioningNode* find_multiversion_opaque_from_multiversion_if_false(Node* maybe_multiversion_if_false) {
IfFalseNode* multiversion_if_false = maybe_multiversion_if_false->isa_IfFalse();
if (multiversion_if_false == nullptr) { return nullptr; }
IfNode* multiversion_if = multiversion_if_false->in(0)->isa_If();
if (multiversion_if == nullptr) { return nullptr; }
return multiversion_if->in(1)->isa_OpaqueMultiversioning();
}
bool PhaseIdealLoop::try_resume_optimizations_for_delayed_slow_loop(IdealLoopTree* lpt) {
CountedLoopNode* cl = lpt->_head->as_CountedLoop();
assert(cl->is_multiversion_delayed_slow_loop(), "must currently be delayed");
// Find multiversion_if.
Node* entry = cl->skip_strip_mined()->in(LoopNode::EntryControl);
const Predicates predicates(entry);
Node* slow_path = predicates.entry();
// Find opaque.
OpaqueMultiversioningNode* opaque = nullptr;
if (slow_path->is_Region()) {
for (uint i = 1; i < slow_path->req(); i++) {
Node* n = slow_path->in(i);
opaque = find_multiversion_opaque_from_multiversion_if_false(n);
if (opaque != nullptr) { break; }
}
} else {
opaque = find_multiversion_opaque_from_multiversion_if_false(slow_path);
}
assert(opaque != nullptr, "must have found multiversion opaque node");
if (opaque == nullptr) { return false; }
// We may still be delayed, if there were not yet any runtime-checks added
// for the multiversioning. We may never add any, and then this loop would
// fold away. So we wait until some runtime-checks are added, then we know
// that this loop will be reachable and it is worth optimizing further.
if (opaque->is_delayed_slow_loop()) { return false; }
// Clear away the "delayed" status, i.e. resume optimizations.
cl->set_no_multiversion();
cl->set_multiversion_slow_loop();
#ifndef PRODUCT
if (TraceLoopOpts) {
tty->print("Resume Optimizations ");
lpt->dump_head();
}
#endif
return true;
}
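
As an illustration (not part of this change): a minimal standalone C++ model of the delay/resume handshake above, assuming all state reduces to two booleans. The slow loop stays delayed until a speculative runtime-check is attached to the OpaqueMultiversioningNode via notify_slow_loop_that_it_can_resume_optimizations, mirrored here by notify().

#include <cassert>

// Stand-ins for OpaqueMultiversioningNode and the delayed slow loop.
struct OpaqueMV {
  bool delayed = true;               // is_delayed_slow_loop()
  void notify() { delayed = false; } // notify_slow_loop_that_it_can_resume_optimizations()
};
struct SlowLoop {
  bool delayed_slow = true;
  // Mirrors try_resume_optimizations_for_delayed_slow_loop: bail out while no
  // runtime-check was added yet, otherwise clear the delayed status.
  bool try_resume(const OpaqueMV& o) {
    if (o.delayed) { return false; }
    delayed_slow = false;
    return true;
  }
};

int main() {
  OpaqueMV opaque;
  SlowLoop slow;
  assert(!slow.try_resume(opaque)); // still delayed: wait with loop-opts
  opaque.notify();                  // a speculative check was added
  assert(slow.try_resume(opaque));  // optimizations resume
  return 0;
}
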
bool PhaseIdealLoop::has_control_dependencies_from_predicates(LoopNode* head) {
Node* entry = head->skip_strip_mined()->in(LoopNode::EntryControl);
const Predicates predicates(entry);
@@ -377,7 +607,7 @@ void PhaseIdealLoop::trace_loop_unswitching_result(const UnswitchedLoopSelector&
const LoopNode* original_head, const LoopNode* new_head) {
if (TraceLoopUnswitching) {
IfNode* unswitch_candidate = unswitched_loop_selector.unswitch_candidate();
IfNode* loop_selector = unswitched_loop_selector.selector();
IfNode* loop_selector = unswitched_loop_selector.loop_selector().selector();
tty->print_cr("Loop Unswitching:");
tty->print_cr("- Unswitch-Candidate-If: %d %s", unswitch_candidate->_idx, unswitch_candidate->Name());
tty->print_cr("- Loop-Selector-If: %d %s", loop_selector->_idx, loop_selector->Name());
@@ -385,22 +615,33 @@ void PhaseIdealLoop::trace_loop_unswitching_result(const UnswitchedLoopSelector&
tty->print_cr("- False-Path-Loop (=Clone): %d %s", new_head->_idx, new_head->Name());
}
}
void PhaseIdealLoop::trace_loop_multiversioning_result(const LoopSelector& loop_selector,
const LoopNode* original_head, const LoopNode* new_head) {
if (TraceLoopMultiversioning) {
IfNode* selector_if = loop_selector.selector();
tty->print_cr("Loop Multiversioning:");
tty->print_cr("- Loop-Selector-If: %d %s", selector_if->_idx, selector_if->Name());
tty->print_cr("- True-Path-Loop (=Orig / Fast): %d %s", original_head->_idx, original_head->Name());
tty->print_cr("- False-Path-Loop (=Clone / Slow): %d %s", new_head->_idx, new_head->Name());
}
}
#endif
// When unswitching a counted loop, we need to convert it back to a normal loop since it's not a proper pre, main or,
// post loop anymore after loop unswitching.
// post loop anymore after loop unswitching. We also lose the multiversion structure, including the connection to the multiversion_if.
void PhaseIdealLoop::revert_to_normal_loop(const LoopNode* loop_head) {
CountedLoopNode* cl = loop_head->isa_CountedLoop();
if (cl != nullptr && !cl->is_normal_loop()) {
cl->set_normal_loop();
}
if (cl == nullptr) { return; }
if (!cl->is_normal_loop()) { cl->set_normal_loop(); }
if (cl->is_multiversion()) { cl->set_no_multiversion(); }
}
// Hoist invariant CheckCastPPNodes out of each unswitched loop version to the appropriate loop selector If projection.
void PhaseIdealLoop::hoist_invariant_check_casts(const IdealLoopTree* loop, const Node_List& old_new,
const UnswitchedLoopSelector& unswitched_loop_selector) {
IfNode* unswitch_candidate = unswitched_loop_selector.unswitch_candidate();
IfNode* loop_selector = unswitched_loop_selector.selector();
IfNode* loop_selector = unswitched_loop_selector.loop_selector().selector();
ResourceMark rm;
GrowableArray<CheckCastPPNode*> loop_invariant_check_casts;
for (DUIterator_Fast imax, i = unswitch_candidate->fast_outs(imax); i < imax; i++) {


@@ -1090,6 +1090,14 @@ bool PhaseIdealLoop::create_loop_nest(IdealLoopTree* loop, Node_List &old_new) {
if (UseProfiledLoopPredicate) {
add_parse_predicate(Deoptimization::Reason_profile_predicate, inner_head, outer_ilt, cloned_sfpt);
}
// We only want to use the auto-vectorization check as a trap once per bci. And
// PhaseIdealLoop::add_parse_predicate only checks trap limits per method, so
// we do a custom check here.
if (!C->too_many_traps(cloned_sfpt->jvms()->method(), cloned_sfpt->jvms()->bci(), Deoptimization::Reason_auto_vectorization_check)) {
add_parse_predicate(Deoptimization::Reason_auto_vectorization_check, inner_head, outer_ilt, cloned_sfpt);
}
add_parse_predicate(Deoptimization::Reason_loop_limit_check, inner_head, outer_ilt, cloned_sfpt);
}
@@ -2511,6 +2519,9 @@ void CountedLoopNode::dump_spec(outputStream *st) const {
if (is_main_loop()) st->print("main of N%d", _idx);
if (is_post_loop()) st->print("post of N%d", _main_idx);
if (is_strip_mined()) st->print(" strip mined");
if (is_multiversion_fast_loop()) { st->print(" multiversion_fast"); }
if (is_multiversion_slow_loop()) { st->print(" multiversion_slow"); }
if (is_multiversion_delayed_slow_loop()) { st->print(" multiversion_delayed_slow"); }
}
#endif
@@ -4303,6 +4314,9 @@ void IdealLoopTree::dump_head() {
if (cl->is_post_loop()) tty->print(" post");
if (cl->is_vectorized_loop()) tty->print(" vector");
if (range_checks_present()) tty->print(" rc ");
if (cl->is_multiversion_fast_loop()) { tty->print(" multiversion_fast"); }
if (cl->is_multiversion_slow_loop()) { tty->print(" multiversion_slow"); }
if (cl->is_multiversion_delayed_slow_loop()) { tty->print(" multiversion_delayed_slow"); }
}
if (_has_call) tty->print(" has_call");
if (_has_sfpt) tty->print(" has_sfpt");
@@ -4948,18 +4962,6 @@ void PhaseIdealLoop::build_and_optimize() {
C->set_major_progress();
}
// Keep loop predicates and perform optimizations with them
// until no more loop optimizations could be done.
// After that switch predicates off and do more loop optimizations.
if (!C->major_progress() && (C->parse_predicate_count() > 0)) {
C->mark_parse_predicate_nodes_useless(_igvn);
assert(C->parse_predicate_count() == 0, "should be zero now");
if (TraceLoopOpts) {
tty->print_cr("PredicatesOff");
}
C->set_major_progress();
}
// Auto-vectorize main-loop
if (C->do_superword() && C->has_loops() && !C->major_progress()) {
Compile::TracePhase tp(_t_autoVectorize);
@@ -4992,6 +4994,18 @@ void PhaseIdealLoop::build_and_optimize() {
}
}
}
// Keep loop predicates and perform optimizations with them
// until no more loop optimizations could be done.
// After that switch predicates off and do more loop optimizations.
if (!C->major_progress() && (C->parse_predicate_count() > 0)) {
C->mark_parse_predicate_nodes_useless(_igvn);
assert(C->parse_predicate_count() == 0, "should be zero now");
if (TraceLoopOpts) {
tty->print_cr("PredicatesOff");
}
C->set_major_progress();
}
}
#ifndef PRODUCT


@@ -43,6 +43,7 @@ class OuterStripMinedLoopEndNode;
class PredicateBlock;
class PathFrequency;
class PhaseIdealLoop;
class LoopSelector;
class UnswitchedLoopSelector;
class VectorSet;
class VSharedData;
@@ -79,7 +80,12 @@ protected:
SubwordLoop = 1<<13,
ProfileTripFailed = 1<<14,
LoopNestInnerLoop = 1<<15,
LoopNestLongOuterLoop = 1<<16 };
LoopNestLongOuterLoop = 1<<16,
MultiversionFastLoop = 1<<17,
MultiversionSlowLoop = 2<<17,
MultiversionDelayedSlowLoop = 3<<17,
MultiversionFlagsMask = 3<<17,
};
char _unswitch_count;
enum { _unswitch_max=3 };
@@ -315,6 +321,32 @@ public:
void set_slp_max_unroll(int unroll_factor) { _slp_maximum_unroll_factor = unroll_factor; }
int slp_max_unroll() const { return _slp_maximum_unroll_factor; }
// Multiversioning allows us to duplicate a CountedLoop, and have two versions, and the multiversion_if
// decides which one is taken:
// (1) fast_loop: We enter this loop by default; initially the multiversion_if has its condition set to
// "true", guarded by an OpaqueMultiversioning. If we want to make a speculative assumption
// for an optimization, we can add the runtime-check to the multiversion_if, and if the
// assumption fails we take the slow_loop instead, where we do not make the same speculative
// assumption.
// We call it the "fast_loop" because it has more optimizations, enabled by the speculative
// runtime-checks at the multiversion_if, and we expect the fast_loop to execute faster.
// (2) slow_loop: By default, it is not taken, until a runtime-check is added to the multiversion_if while
// optimizing the fast_loop. If such a runtime-check is never added, then after loop-opts
// the multiversion_if constant folds to true, and the slow_loop is folded away. To save
// compile time, we delay the optimization of the slow_loop until a runtime-check is added
// to the multiversion_if, at which point we resume optimizations for the slow_loop.
// We call it the "slow_loop" because it has fewer optimizations, since this is the fall-back
// loop where we do not make any of the speculative assumptions we make for the fast_loop.
// Hence, we expect the slow_loop to execute slower.
bool is_multiversion() const { return (_loop_flags & MultiversionFlagsMask) != Normal; }
bool is_multiversion_fast_loop() const { return (_loop_flags & MultiversionFlagsMask) == MultiversionFastLoop; }
bool is_multiversion_slow_loop() const { return (_loop_flags & MultiversionFlagsMask) == MultiversionSlowLoop; }
bool is_multiversion_delayed_slow_loop() const { return (_loop_flags & MultiversionFlagsMask) == MultiversionDelayedSlowLoop; }
void set_multiversion_fast_loop() { assert(!is_multiversion(), ""); _loop_flags |= MultiversionFastLoop; }
void set_multiversion_slow_loop() { assert(!is_multiversion(), ""); _loop_flags |= MultiversionSlowLoop; }
void set_multiversion_delayed_slow_loop() { assert(!is_multiversion(), ""); _loop_flags |= MultiversionDelayedSlowLoop; }
void set_no_multiversion() { assert( is_multiversion(), ""); _loop_flags &= ~MultiversionFlagsMask; }
virtual LoopNode* skip_strip_mined(int expect_skeleton = 1);
OuterStripMinedLoopNode* outer_loop() const;
virtual IfTrueNode* outer_loop_tail() const;
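
For illustration only (not part of this change): a standalone sketch of the two-bit multiversion state encoding above. The three states share the mask 3<<17, so set_no_multiversion must clear both bits before a different state can be set, as in the resume path of try_resume_optimizations_for_delayed_slow_loop.

#include <cassert>
#include <cstdint>

enum : uint32_t {
  MultiversionFastLoop        = 1u << 17,
  MultiversionSlowLoop        = 2u << 17,
  MultiversionDelayedSlowLoop = 3u << 17,
  MultiversionFlagsMask       = 3u << 17
};

int main() {
  uint32_t flags = 0;                   // normal loop: both bits clear
  flags |= MultiversionDelayedSlowLoop; // set_multiversion_delayed_slow_loop()
  assert((flags & MultiversionFlagsMask) == MultiversionDelayedSlowLoop);
  flags &= ~MultiversionFlagsMask;      // set_no_multiversion()
  flags |= MultiversionSlowLoop;        // set_multiversion_slow_loop()
  assert((flags & MultiversionFlagsMask) == MultiversionSlowLoop);
  return 0;
}
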
@@ -1457,6 +1489,8 @@ public:
static void trace_loop_unswitching_impossible(const LoopNode* original_head);
static void trace_loop_unswitching_result(const UnswitchedLoopSelector& unswitched_loop_selector,
const LoopNode* original_head, const LoopNode* new_head);
static void trace_loop_multiversioning_result(const LoopSelector& loop_selector,
const LoopNode* original_head, const LoopNode* new_head);
#endif
public:
@@ -1483,6 +1517,11 @@ public:
};
AutoVectorizeStatus auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared);
void maybe_multiversion_for_auto_vectorization_runtime_checks(IdealLoopTree* lpt, Node_List& old_new);
void do_multiversioning(IdealLoopTree* lpt, Node_List& old_new);
IfTrueNode* create_new_if_for_multiversion(IfTrueNode* multiversioning_fast_proj);
bool try_resume_optimizations_for_delayed_slow_loop(IdealLoopTree* lpt);
// Move an unordered Reduction out of loop if possible
void move_unordered_reduction_out_of_loop(IdealLoopTree* loop);


@@ -4482,6 +4482,66 @@ PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) {
return AutoVectorizeStatus::Success;
}
// Just before insert_pre_post_loops, we can multi-version the loop:
//
// multiversion_if
// | |
// fast_loop slow_loop
//
// In the fast_loop we can make speculative assumptions, and put the
// conditions into the multiversion_if. If the conditions hold at runtime,
// we enter the fast_loop, if the conditions fail, we take the slow_loop
// instead which does not make any of the speculative assumptions.
//
// Note: we only multiversion the loop if the loop does not have any
// auto vectorization check Predicate. If we have that predicate,
// then we can simply add the speculative assumption checks to
// that Predicate. This means we do not need to duplicate the
// loop - we have a smaller graph and save compile time. Should
// the conditions ever fail, then we deopt / trap at the Predicate
// and recompile without that Predicate. At that point we will
// multiversion the loop, so that we can still have speculative
// runtime checks.
//
// We perform the multiversioning when the loop is still in its single
// iteration form, even before we insert pre and post loops. This makes
// the cloning much simpler. However, this means that both the fast
// and the slow loop have to be optimized independently (adding pre
// and post loops, unrolling the main loop, auto-vectorize etc.). And
// we may end up not needing any speculative assumptions in the fast_loop
// and then rejecting the slow_loop by constant folding the multiversion_if.
//
// Therefore, we "delay" the optimization of the slow_loop until we add
// at least one speculative assumption for the fast_loop. If we never
// add such a speculative runtime check, the OpaqueMultiversioningNode
// of the multiversion_if constant folds to true after loop opts, and the
// multiversion_if folds away the "delayed" slow_loop. If we add any
// speculative assumption, then we notify the OpaqueMultiversioningNode
// with "notify_slow_loop_that_it_can_resume_optimizations".
//
// Note: new runtime checks can be added to the multiversion_if with
// PhaseIdealLoop::create_new_if_for_multiversion
void PhaseIdealLoop::maybe_multiversion_for_auto_vectorization_runtime_checks(IdealLoopTree* lpt, Node_List& old_new) {
CountedLoopNode* cl = lpt->_head->as_CountedLoop();
LoopNode* outer_loop = cl->skip_strip_mined();
Node* entry = outer_loop->in(LoopNode::EntryControl);
// Check we have multiversioning enabled, and are not already multiversioned.
if (!LoopMultiversioning || cl->is_multiversion()) { return; }
// Check that we do not have a parse-predicate where we can add the runtime checks
// during auto-vectorization.
const Predicates predicates(entry);
const PredicateBlock* predicate_block = predicates.auto_vectorization_check_block();
if (predicate_block->has_parse_predicate()) { return; }
// Check node budget.
uint estimate = lpt->est_loop_clone_sz(2);
if (!may_require_nodes(estimate)) { return; }
do_multiversioning(lpt, old_new);
}
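
A standalone sketch (an assumed simplification, not part of this change) of the decision logic above: prefer the trapping parse predicate while it is still available, fall back to multiversioning when it is enabled, not yet done, and within the node budget, and otherwise skip speculation entirely.

#include <cstdio>

enum class Speculate { Predicate, Multiversion, None };

static Speculate pick(bool has_parse_predicate, bool multiversioning_enabled,
                      bool already_multiversioned, bool within_node_budget) {
  if (has_parse_predicate) { return Speculate::Predicate; } // trap and recompile
  if (multiversioning_enabled && !already_multiversioned && within_node_budget) {
    return Speculate::Multiversion;                         // fast/slow loops
  }
  return Speculate::None;                                   // no speculative checks
}

int main() {
  // No parse predicate left (e.g. after a trap), so we multiversion.
  printf("%d\n", static_cast<int>(pick(false, true, false, true))); // prints 1
  return 0;
}
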
// Returns true if the Reduction node is unordered.
static bool is_unordered_reduction(Node* n) {
return n->is_Reduction() && !n->as_Reduction()->requires_strict_order();


@@ -229,9 +229,12 @@
// Even if we could know that there is some base address to which we add index offsets, we cannot know
// if this reference address points to the beginning of a native memory allocation or into the middle,
// or outside it. We also have no guarantee for alignment with such a base address.
//
// Still: we would like to find such a base if possible, and if two pointers are similar (i.e. have the
// same summands), we would like to find the same base. Further, it is reasonable to speculatively
// assume that such base addresses are aligned (TODO: need to add this speculative check in JDK-8323582).
// assume that such base addresses are aligned. We perform such a speculative alignment runtime check
// in VTransform::add_speculative_alignment_check.
//
// A base pointer must have scale = 1, and be accepted by MemPointer::is_native_memory_base_candidate.
// It can thus be one of these:
// (1) CastX2P
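
To see why native bases need a runtime check, here is a small standalone demonstration (values assumed): an interior pointer into a malloc'd buffer carries no ObjectAlignmentInBytes guarantee, so "base % aw" can be anything.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

int main() {
  char* buf = static_cast<char*>(malloc(64));
  // An interior "base", e.g. from address arithmetic: no alignment guarantee.
  uintptr_t base = reinterpret_cast<uintptr_t>(buf) + 1;
  printf("base %% 8 = %d\n", static_cast<int>(base % 8)); // typically 1, not 0
  free(buf);
  return 0;
}
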

View File

@@ -139,6 +139,7 @@ class NeverBranchNode;
class Opaque1Node;
class OpaqueLoopInitNode;
class OpaqueLoopStrideNode;
class OpaqueMultiversioningNode;
class OpaqueNotNullNode;
class OpaqueInitializedAssertionPredicateNode;
class OpaqueTemplateAssertionPredicateNode;
@@ -800,6 +801,7 @@ public:
DEFINE_CLASS_ID(Opaque1, Node, 16)
DEFINE_CLASS_ID(OpaqueLoopInit, Opaque1, 0)
DEFINE_CLASS_ID(OpaqueLoopStride, Opaque1, 1)
DEFINE_CLASS_ID(OpaqueMultiversioning, Opaque1, 2)
DEFINE_CLASS_ID(OpaqueNotNull, Node, 17)
DEFINE_CLASS_ID(OpaqueInitializedAssertionPredicate, Node, 18)
DEFINE_CLASS_ID(OpaqueTemplateAssertionPredicate, Node, 19)
@@ -982,6 +984,7 @@ public:
DEFINE_CLASS_QUERY(OpaqueTemplateAssertionPredicate)
DEFINE_CLASS_QUERY(OpaqueLoopInit)
DEFINE_CLASS_QUERY(OpaqueLoopStride)
DEFINE_CLASS_QUERY(OpaqueMultiversioning)
DEFINE_CLASS_QUERY(OuterStripMinedLoop)
DEFINE_CLASS_QUERY(OuterStripMinedLoopEnd)
DEFINE_CLASS_QUERY(Parm)


@@ -91,6 +91,29 @@ public:
IfNode* if_node() const;
};
// This node is used as the condition of the multiversion_if in loop multiversioning.
// At first, the multiversion_if has its condition set to "true" and we always
// take the fast_loop. Since we do not know if the slow_loop is ever going to
// be used, we delay optimizations for it. Once the fast_loop decides to use
// speculative runtime-checks and adds them to the multiversion_if, the slow_loop
// can now resume optimizations, as it is reachable at runtime.
// See PhaseIdealLoop::maybe_multiversion_for_auto_vectorization_runtime_checks
class OpaqueMultiversioningNode : public Opaque1Node {
private:
bool _is_delayed_slow_loop;
public:
OpaqueMultiversioningNode(Compile* C, Node* n) :
Opaque1Node(C, n), _is_delayed_slow_loop(true)
{
init_class_id(Class_OpaqueMultiversioning);
}
virtual int Opcode() const;
virtual const Type* bottom_type() const { return TypeInt::BOOL; }
bool is_delayed_slow_loop() const { return _is_delayed_slow_loop; }
void notify_slow_loop_that_it_can_resume_optimizations() { _is_delayed_slow_loop = false; }
};
// This node is used in the context of intrinsics. We sometimes implicitly know that an object is non-null even though
// the compiler cannot prove it. We therefore add a corresponding cast to propagate this implicit knowledge. However,
// this cast could become top during optimizations (input to cast becomes null) and the data path is folded. To ensure


@@ -64,14 +64,17 @@
flags(AFTER_LOOP_PEELING, "After Loop Peeling") \
flags(BEFORE_LOOP_UNSWITCHING, "Before Loop Unswitching") \
flags(AFTER_LOOP_UNSWITCHING, "After Loop Unswitching") \
flags(BEFORE_LOOP_MULTIVERSIONING, "Before Loop Multiversioning") \
flags(AFTER_LOOP_MULTIVERSIONING, "After Loop Multiversioning") \
flags(BEFORE_RANGE_CHECK_ELIMINATION, "Before Range Check Elimination") \
flags(AFTER_RANGE_CHECK_ELIMINATION, "After Range Check Elimination") \
flags(BEFORE_PRE_MAIN_POST, "Before Pre/Main/Post Loops") \
flags(AFTER_PRE_MAIN_POST, "After Pre/Main/Post Loops") \
flags(AUTO_VECTORIZATION1_BEFORE_APPLY, "AutoVectorization 1, Before Apply") \
flags(AUTO_VECTORIZATION2_AFTER_REORDER, "AutoVectorization 2, After Apply Memop Reordering") \
flags(AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, "AutoVectorization 3, After Adjusting Pre-Loop Limit") \
flags(AUTO_VECTORIZATION4_AFTER_APPLY, "AutoVectorization 4, After Apply") \
flags(AUTO_VECTORIZATION1_BEFORE_APPLY, "AutoVectorization 1, Before Apply") \
flags(AUTO_VECTORIZATION2_AFTER_REORDER, "AutoVectorization 2, After Apply Memop Reordering") \
flags(AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, "AutoVectorization 3, After Adjusting Pre-Loop Limit") \
flags(AUTO_VECTORIZATION4_AFTER_SPECULATIVE_RUNTIME_CHECKS, "AutoVectorization 4, After Adding Speculative Runtime Checks") \
flags(AUTO_VECTORIZATION5_AFTER_APPLY, "AutoVectorization 5, After Apply") \
flags(BEFORE_CLOOPS, "Before CountedLoop") \
flags(AFTER_CLOOPS, "After CountedLoop") \
flags(PHASEIDEAL_BEFORE_EA, "PhaseIdealLoop before EA") \


@@ -120,6 +120,7 @@ bool RuntimePredicate::has_valid_uncommon_trap(const Node* success_proj) {
assert(RegularPredicate::may_be_predicate_if(success_proj), "must have been checked before");
const Deoptimization::DeoptReason deopt_reason = uncommon_trap_reason(success_proj->as_IfProj());
return (deopt_reason == Deoptimization::Reason_loop_limit_check ||
deopt_reason == Deoptimization::Reason_auto_vectorization_check ||
deopt_reason == Deoptimization::Reason_predicate ||
deopt_reason == Deoptimization::Reason_profile_predicate);
}
@@ -893,6 +894,8 @@ void Predicates::dump() const {
tty->print_cr("%d %s:", loop_head->_idx, loop_head->Name());
tty->print_cr("- Loop Limit Check Predicate Block:");
_loop_limit_check_predicate_block.dump(" ");
tty->print_cr("- Auto Vectorization Check Block:");
_auto_vectorization_check_block.dump(" ");
tty->print_cr("- Profiled Loop Predicate Block:");
_profiled_loop_predicate_block.dump(" ");
tty->print_cr("- Loop Predicate Block:");


@@ -734,6 +734,8 @@ class PredicateIterator : public StackObj {
Node* current = _start_node;
PredicateBlockIterator loop_limit_check_predicate_iterator(current, Deoptimization::Reason_loop_limit_check);
current = loop_limit_check_predicate_iterator.for_each(predicate_visitor);
PredicateBlockIterator auto_vectorization_check_iterator(current, Deoptimization::Reason_auto_vectorization_check);
current = auto_vectorization_check_iterator.for_each(predicate_visitor);
if (UseLoopPredicate) {
if (UseProfiledLoopPredicate) {
PredicateBlockIterator profiled_loop_predicate_iterator(current, Deoptimization::Reason_profile_predicate);
@@ -906,6 +908,7 @@ class PredicateBlock : public StackObj {
class Predicates : public StackObj {
Node* const _tail;
const PredicateBlock _loop_limit_check_predicate_block;
const PredicateBlock _auto_vectorization_check_block;
const PredicateBlock _profiled_loop_predicate_block;
const PredicateBlock _loop_predicate_block;
Node* const _entry;
@@ -914,7 +917,9 @@ class Predicates : public StackObj {
explicit Predicates(Node* loop_entry)
: _tail(loop_entry),
_loop_limit_check_predicate_block(loop_entry, Deoptimization::Reason_loop_limit_check),
_profiled_loop_predicate_block(_loop_limit_check_predicate_block.entry(),
_auto_vectorization_check_block(_loop_limit_check_predicate_block.entry(),
Deoptimization::Reason_auto_vectorization_check),
_profiled_loop_predicate_block(_auto_vectorization_check_block.entry(),
Deoptimization::Reason_profile_predicate),
_loop_predicate_block(_profiled_loop_predicate_block.entry(),
Deoptimization::Reason_predicate),
@@ -935,6 +940,10 @@ class Predicates : public StackObj {
return &_profiled_loop_predicate_block;
}
const PredicateBlock* auto_vectorization_check_block() const {
return &_auto_vectorization_check_block;
}
const PredicateBlock* loop_limit_check_predicate_block() const {
return &_loop_limit_check_predicate_block;
}
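
For orientation, a trivial standalone listing (not part of this change) of the resulting block order when walking up from the loop entry; each PredicateBlock is constructed from the previous block's entry(), so the constructor order above corresponds to nearest-to-the-loop first.

#include <cstdio>

int main() {
  // Order from the loop entry upwards, per the Predicates constructor above.
  const char* blocks[] = {
    "Loop Limit Check Predicate Block",
    "Auto Vectorization Check Block",  // newly inserted by this change
    "Profiled Loop Predicate Block",
    "Loop Predicate Block",
  };
  for (const char* b : blocks) { printf("%s\n", b); }
  return 0;
}
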


@@ -1484,7 +1484,8 @@ const AlignmentSolution* SuperWord::pack_alignment_solution(const Node_List* pac
pack->size(),
pre_end->init_trip(),
pre_end->stride_con(),
iv_stride()
iv_stride(),
_vloop.are_speculative_checks_possible()
DEBUG_ONLY(COMMA is_trace_align_vector()));
return solver.solve();
}
@@ -1896,6 +1897,7 @@ bool SuperWord::schedule_and_apply() const {
VTransformTrace trace(_vloop.vtrace(),
is_trace_superword_rejections(),
is_trace_align_vector(),
_vloop.is_trace_speculative_runtime_checks(),
is_trace_superword_info());
#endif
VTransform vtransform(_vloop_analyzer,
@@ -1938,8 +1940,11 @@ void VTransform::apply() {
adjust_pre_loop_limit_to_align_main_loop_vectors();
C->print_method(PHASE_AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, 4, cl());
apply_speculative_runtime_checks();
C->print_method(PHASE_AUTO_VECTORIZATION4_AFTER_SPECULATIVE_RUNTIME_CHECKS, 4, cl());
apply_vectorization();
C->print_method(PHASE_AUTO_VECTORIZATION4_AFTER_APPLY, 4, cl());
C->print_method(PHASE_AUTO_VECTORIZATION5_AFTER_APPLY, 4, cl());
}
// We prepare the memory graph for the replacement of scalar memops with vector memops.


@@ -29,25 +29,26 @@
#include "utilities/stringUtils.hpp"
#define COMPILER_TRACE_AUTO_VECTORIZATION_TAG(flags) \
flags(POINTER_PARSING, "Trace VPointer/MemPointer parsing") \
flags(POINTER_ALIASING, "Trace VPointer/MemPointer aliasing") \
flags(POINTER_ADJACENCY, "Trace VPointer/MemPointer adjacency") \
flags(POINTER_OVERLAP, "Trace VPointer/MemPointer overlap") \
flags(PRECONDITIONS, "Trace VLoop::check_preconditions") \
flags(LOOP_ANALYZER, "Trace VLoopAnalyzer::setup_submodules") \
flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \
flags(BODY, "Trace VLoopBody") \
flags(TYPES, "Trace VLoopTypes") \
flags(POINTERS, "Trace VLoopPointers") \
flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \
flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_memop_pairs") \
flags(SW_REJECTIONS, "Trace SuperWord rejections (non vectorizations)") \
flags(SW_PACKSET, "Trace SuperWord packset at different stages") \
flags(SW_INFO, "Trace SuperWord info (equivalent to TraceSuperWord)") \
flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \
flags(ALIGN_VECTOR, "Trace AlignVector") \
flags(VTRANSFORM, "Trace VTransform Graph") \
flags(ALL, "Trace everything (very verbose)")
flags(POINTER_PARSING, "Trace VPointer/MemPointer parsing") \
flags(POINTER_ALIASING, "Trace VPointer/MemPointer aliasing") \
flags(POINTER_ADJACENCY, "Trace VPointer/MemPointer adjacency") \
flags(POINTER_OVERLAP, "Trace VPointer/MemPointer overlap") \
flags(PRECONDITIONS, "Trace VLoop::check_preconditions") \
flags(LOOP_ANALYZER, "Trace VLoopAnalyzer::setup_submodules") \
flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \
flags(BODY, "Trace VLoopBody") \
flags(TYPES, "Trace VLoopTypes") \
flags(POINTERS, "Trace VLoopPointers") \
flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \
flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_memop_pairs") \
flags(SW_REJECTIONS, "Trace SuperWord rejections (non vectorizations)") \
flags(SW_PACKSET, "Trace SuperWord packset at different stages") \
flags(SW_INFO, "Trace SuperWord info (equivalent to TraceSuperWord)") \
flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \
flags(ALIGN_VECTOR, "Trace AlignVector") \
flags(SPECULATIVE_RUNTIME_CHECKS, "Trace VTransform::apply_speculative_runtime_checks") \
flags(VTRANSFORM, "Trace VTransform Graph") \
flags(ALL, "Trace everything (very verbose)")
#define table_entry(name, description) name,
enum TraceAutoVectorizationTag {


@@ -93,9 +93,9 @@ VStatus VLoop::check_preconditions_helper() {
return VStatus::make_failure(VLoop::FAILURE_BACKEDGE);
}
// To align vector memory accesses in the main-loop, we will have to adjust
// the pre-loop limit.
if (_cl->is_main_loop()) {
// To align vector memory accesses in the main-loop, we will have to adjust
// the pre-loop limit.
CountedLoopEndNode* pre_end = _cl->find_pre_loop_end();
if (pre_end == nullptr) {
return VStatus::make_failure(VLoop::FAILURE_PRE_LOOP_LIMIT);
@@ -105,6 +105,41 @@ VStatus VLoop::check_preconditions_helper() {
return VStatus::make_failure(VLoop::FAILURE_PRE_LOOP_LIMIT);
}
_pre_loop_end = pre_end;
// See if we find the infrastructure for speculative runtime-checks.
// (1) Auto Vectorization Parse Predicate
Node* pre_ctrl = pre_loop_head()->in(LoopNode::EntryControl);
const Predicates predicates(pre_ctrl);
const PredicateBlock* predicate_block = predicates.auto_vectorization_check_block();
if (predicate_block->has_parse_predicate()) {
_auto_vectorization_parse_predicate_proj = predicate_block->parse_predicate_success_proj();
}
// (2) Multiversioning fast-loop projection
IfTrueNode* before_predicates = predicates.entry()->isa_IfTrue();
if (before_predicates != nullptr &&
before_predicates->in(0)->is_If() &&
before_predicates->in(0)->in(1)->is_OpaqueMultiversioning()) {
_multiversioning_fast_proj = before_predicates;
}
#ifndef PRODUCT
if (is_trace_preconditions() || is_trace_speculative_runtime_checks()) {
tty->print_cr(" Infrastructure for speculative runtime-checks:");
if (_auto_vectorization_parse_predicate_proj != nullptr) {
tty->print_cr(" auto_vectorization_parse_predicate_proj: speculate and trap");
_auto_vectorization_parse_predicate_proj->dump_bfs(5,0,"");
} else if (_multiversioning_fast_proj != nullptr) {
tty->print_cr(" multiversioning_fast_proj: speculate and multiversion");
_multiversioning_fast_proj->dump_bfs(5,0,"");
} else {
tty->print_cr(" Not found.");
}
}
#endif
assert(_auto_vectorization_parse_predicate_proj == nullptr ||
_multiversioning_fast_proj == nullptr, "we should only have at most one of these");
assert(_cl->is_multiversion_fast_loop() == (_multiversioning_fast_proj != nullptr),
"must find the multiversion selector IFF loop is a multiversion fast loop");
}
return VStatus::make_success();
@@ -472,15 +507,28 @@ AlignmentSolution* AlignmentSolver::solve() const {
// + con + con + C_const (sum of constant terms)
//
// We describe the 6 terms:
// 1) The "base" of the address is the address of a Java object (e.g. array),
// and as such ObjectAlignmentInBytes (a power of 2) aligned. We have
// defined aw = MIN(vector_width, ObjectAlignmentInBytes), which is also
// 1) The "base" of the address:
// - For heap objects, this is the base of the object, and as such
// ObjectAlignmentInBytes (a power of 2) aligned.
// - For off-heap / native memory, the "base" has no alignment
// guarantees. To ensure alignment we can do either of these:
// - Add a runtime check to verify ObjectAlignmentInBytes alignment,
// i.e. we can speculatively compile with an alignment assumption.
// If we pass the check, we can go into the loop with the alignment
// assumption, if we fail we have to trap/deopt or take the other
// loop version without alignment assumptions.
// - If runtime checks are not possible, then we return an empty
// solution, i.e. we do not vectorize the corresponding pack.
//
// Let us assume we have an object "base", or passed the alignment
// runtime check for native "bases", hence we know:
//
// base % ObjectAlignmentInBytes = 0
//
// We defined aw = MIN(vector_width, ObjectAlignmentInBytes), which is
// a power of 2. Hence we know that "base" is also aw-aligned:
//
// base % ObjectAlignmentInBytes = 0 ==> base % aw = 0
//
// TODO: Note: we have been assuming that this also holds for native memory base
// addresses. This is incorrect, see JDK-8323582.
// base % ObjectAlignmentInBytes = 0 ==> base % aw = 0 (BASE_ALIGNED)
//
// 2) The "C_const" term is the sum of all constant terms. This is "con",
// plus "iv_scale * init" if it is constant.
@@ -505,6 +553,13 @@ AlignmentSolution* AlignmentSolver::solve() const {
// 6) The "C_main * main_iter" term represents how much the iv is increased
// during "main_iter" main-loop iterations.
// For native memory, we must add a runtime-check that "base % ObjectAlignmentInBytes = 0",
// to ensure (BASE_ALIGNED). If we cannot add this runtime-check, we have no guarantee on
// its alignment.
if (!_vpointer.mem_pointer().base().is_object() && !_are_speculative_checks_possible) {
return new EmptyAlignmentSolution("Cannot add speculative check for native memory alignment.");
}
// Attribute init (i.e. _init_node) either to C_const or to C_init term.
const int C_const_init = _init_node->is_ConI() ? _init_node->as_ConI()->get_int() : 0;
const int C_const = _vpointer.con() + C_const_init * iv_scale();
@@ -521,8 +576,7 @@ AlignmentSolution* AlignmentSolver::solve() const {
// We must find a pre_iter, such that adr is aw aligned: adr % aw = 0. Note, that we are defining the
// modulo operator "%" such that the remainder is always positive, see AlignmentSolution::mod(i, q).
//
// TODO: Note: the following assumption is incorrect for native memory bases, see JDK-8323582.
// Since "base % aw = 0", we only need to ensure alignment of the other 5 terms:
// Since "base % aw = 0" (BASE_ALIGNED), we only need to ensure alignment of the other 5 terms:
//
// (C_const + C_invar * var_invar + C_init * var_init + C_pre * pre_iter + C_main * main_iter) % aw = 0 (1)
//
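
A numeric sketch of equation (1) with assumed constants, simplified to the case of no invariant and a constant init (so only C_const and the pre-loop term remain): we search for the pre_iter that makes the address aw-aligned.

#include <cstdio>

// mod with a non-negative remainder, as in AlignmentSolution::mod(i, q).
static int mod(int i, int q) { return ((i % q) + q) % q; }

int main() {
  const int aw      = 16; // assumed: MIN(vector_width, ObjectAlignmentInBytes)
  const int C_const = 12; // assumed constant byte offset of the address
  const int C_pre   = 4;  // assumed: iv_scale * pre_stride, bytes per pre-loop iteration
  for (int pre_iter = 0; pre_iter < aw; pre_iter++) {
    if (mod(C_const + C_pre * pre_iter, aw) == 0) {
      printf("aligned after pre_iter = %d\n", pre_iter); // prints 1: (12 + 4) % 16 == 0
      break;
    }
  }
  return 0;
}
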
@@ -878,8 +932,7 @@ AlignmentSolution* AlignmentSolver::solve() const {
// + iv_scale * pre_stride * pre_iter
// + iv_scale * main_stride * main_iter)) % aw =
//
// -> base aligned: base % aw = 0
// TODO: Note: this assumption is incorrect for native memory bases, see JDK-8323582.
// -> apply (BASE_ALIGNED): base % aw = 0
// -> main-loop iterations aligned (2): C_main % aw = (iv_scale * main_stride) % aw = 0
// (con + invar + iv_scale * init + iv_scale * pre_stride * pre_iter) % aw =
//
@@ -958,7 +1011,7 @@ void AlignmentSolver::trace_start_solve() const {
_pre_stride, _main_stride);
// adr = base + con + invar + iv_scale * iv
tty->print(" adr = base[%d]", base().object_or_native()->_idx);
tty->print(" + invar + iv_scale(%d) * iv + con(%d)", iv_scale(), _vpointer.con());
tty->print_cr(" + invar + iv_scale(%d) * iv + con(%d)", iv_scale(), _vpointer.con());
}
}


@@ -85,6 +85,14 @@ private:
PhiNode* _iv;
CountedLoopEndNode* _pre_loop_end; // cache access to pre-loop for main loops only
// We can add speculative runtime-checks if we have one of these:
// - Auto Vectorization Parse Predicate:
// pass all checks or trap -> recompile without this predicate.
// - Multiversioning fast-loop projection:
// pass all checks or go to slow-path-loop, where we have no speculative assumptions.
ParsePredicateSuccessProj* _auto_vectorization_parse_predicate_proj;
IfTrueNode* _multiversioning_fast_proj;
NOT_PRODUCT(VTrace _vtrace;)
NOT_PRODUCT(TraceMemPointer _mptrace;)
@ -104,7 +112,9 @@ public:
_cl (nullptr),
_cl_exit (nullptr),
_iv (nullptr),
_pre_loop_end (nullptr),
_auto_vectorization_parse_predicate_proj(nullptr),
_multiversioning_fast_proj(nullptr)
#ifndef PRODUCT
COMMA
_mptrace(TraceMemPointer(
@ -138,6 +148,19 @@ public:
return head;
};
ParsePredicateSuccessProj* auto_vectorization_parse_predicate_proj() const {
return _auto_vectorization_parse_predicate_proj;
}
IfTrueNode* multiversioning_fast_proj() const {
return _multiversioning_fast_proj;
}
bool are_speculative_checks_possible() const {
return _auto_vectorization_parse_predicate_proj != nullptr ||
_multiversioning_fast_proj != nullptr;
}
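The two anchors differ in what happens when a speculative check fails at runtime: the Parse Predicate path traps and recompiles without the assumption, while the multiversioning path simply keeps executing in the slow-path loop. A small standalone model of just this control-flow distinction (hypothetical code, not HotSpot's):

#include <cstdio>

enum class Anchor { ParsePredicate, Multiversion };

// What happens when a speculative alignment check fails at runtime:
static void on_check_failed(Anchor anchor) {
  if (anchor == Anchor::ParsePredicate) {
    // Trap: deoptimize, then recompile without the failed speculative assumption.
    std::printf("deopt -> recompile without speculative assumptions\n");
  } else {
    // Multiversioning: stay compiled, but branch into the slow-path loop.
    std::printf("take slow-path loop (no speculative assumptions)\n");
  }
}

int main() {
  on_check_failed(Anchor::ParsePredicate);
  on_check_failed(Anchor::Multiversion);
  return 0;
}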
// Estimate maximum size for data structures, to avoid repeated reallocation
int estimated_body_length() const { return lpt()->_body.size(); };
int estimated_node_count() const { return (int)(1.10 * phase()->C->unique()); };
@ -176,6 +199,10 @@ public:
bool is_trace_vpointers() const {
return _vtrace.is_trace(TraceAutoVectorizationTag::POINTERS);
}
bool is_trace_speculative_runtime_checks() const {
return _vtrace.is_trace(TraceAutoVectorizationTag::SPECULATIVE_RUNTIME_CHECKS);
}
#endif
// Is the node in the basic block of the loop?
@ -1296,6 +1323,14 @@ private:
const int _pre_stride; // address increment per pre-loop iteration
const int _main_stride; // address increment per main-loop iteration
// For native bases, we have no alignment guarantee. This means we cannot, in
// general, guarantee alignment statically. But we can check alignment with a
// speculative runtime check, see VTransform::apply_speculative_runtime_checks.
// For this, we need to find the Predicate for auto vectorization checks, or
// else the multiversion_if. If we cannot find either, then we cannot make
// any speculative runtime checks.
const bool _are_speculative_checks_possible;
DEBUG_ONLY( const bool _is_trace; );
static const MemNode* mem_ref_not_null(const MemNode* mem_ref) {
@ -1309,7 +1344,8 @@ public:
const uint vector_length,
const Node* init_node,
const int pre_stride,
const int main_stride,
const bool are_speculative_checks_possible
DEBUG_ONLY( COMMA const bool is_trace)
) :
_vpointer( vpointer),
@ -1318,7 +1354,8 @@ public:
_aw( MIN2(_vector_width, ObjectAlignmentInBytes)),
_init_node( init_node),
_pre_stride( pre_stride),
_main_stride( main_stride),
_are_speculative_checks_possible(are_speculative_checks_possible)
DEBUG_ONLY( COMMA _is_trace(is_trace) )
{
assert(_mem_ref != nullptr &&

View File

@ -23,6 +23,7 @@
#include "opto/vtransform.hpp"
#include "opto/vectornode.hpp"
#include "opto/castnode.hpp"
#include "opto/convertnode.hpp"
void VTransformGraph::add_vtnode(VTransformNode* vtnode) {
@ -143,6 +144,94 @@ void VTransformApplyResult::trace(VTransformNode* vtnode) const {
}
#endif
void VTransform::apply_speculative_runtime_checks() {
if (VLoop::vectors_should_be_aligned()) {
#ifdef ASSERT
if (_trace._align_vector || _trace._speculative_runtime_checks) {
tty->print_cr("\nVTransform::apply_speculative_runtime_checks: native memory alignment");
}
#endif
const GrowableArray<VTransformNode*>& vtnodes = _graph.vtnodes();
for (int i = 0; i < vtnodes.length(); i++) {
VTransformVectorNode* vtn = vtnodes.at(i)->isa_Vector();
if (vtn == nullptr) { continue; }
MemNode* p0 = vtn->nodes().at(0)->isa_Mem();
if (p0 == nullptr) { continue; }
const VPointer& vp = vpointer(p0);
if (vp.mem_pointer().base().is_object()) { continue; }
assert(vp.mem_pointer().base().is_native(), "VPointer base must be object or native");
// We have a native memory reference. Build a runtime check for it.
// See: AlignmentSolver::solve
// In a future RFE we may be able to speculate on invar alignment as
// well, and allow vectorization of more cases.
add_speculative_alignment_check(vp.mem_pointer().base().native(), ObjectAlignmentInBytes);
}
}
}
#define TRACE_SPECULATIVE_ALIGNMENT_CHECK(node) { \
DEBUG_ONLY( \
if (_trace._align_vector || _trace._speculative_runtime_checks) { \
tty->print(" " #node ": "); \
node->dump(); \
} \
) \
} \
// Check: (node % alignment) == 0.
void VTransform::add_speculative_alignment_check(Node* node, juint alignment) {
TRACE_SPECULATIVE_ALIGNMENT_CHECK(node);
Node* ctrl = phase()->get_ctrl(node);
// Cast adr/long -> int
if (node->bottom_type()->basic_type() == T_ADDRESS) {
// adr -> int/long
node = new CastP2XNode(nullptr, node);
phase()->register_new_node(node, ctrl);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(node);
}
if (node->bottom_type()->basic_type() == T_LONG) {
// long -> int
node = new ConvL2INode(node);
phase()->register_new_node(node, ctrl);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(node);
}
Node* mask_alignment = igvn().intcon(alignment-1);
Node* base_alignment = new AndINode(node, mask_alignment);
phase()->register_new_node(base_alignment, ctrl);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(mask_alignment);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(base_alignment);
Node* zero = igvn().intcon(0);
Node* cmp_alignment = CmpNode::make(base_alignment, zero, T_INT, false);
BoolNode* bol_alignment = new BoolNode(cmp_alignment, BoolTest::eq);
phase()->register_new_node(cmp_alignment, ctrl);
phase()->register_new_node(bol_alignment, ctrl);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(cmp_alignment);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(bol_alignment);
add_speculative_check(bol_alignment);
}
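The node chain built above (CastP2X, optionally ConvL2I, AndI, CmpI against zero, Bool(eq)) is equivalent to the following scalar predicate; truncating the address to 32 bits is harmless here because alignment - 1 only occupies the low bits. A sketch with an invented helper name, not HotSpot code:

#include <cassert>
#include <cstdint>

// Scalar equivalent of the emitted check: (adr % alignment) == 0,
// for a power-of-2 alignment.
static bool passes_alignment_check(const void* adr, uint32_t alignment) {
  uint32_t low_bits = (uint32_t)(uintptr_t)adr;  // CastP2X (+ ConvL2I on 64-bit)
  return (low_bits & (alignment - 1u)) == 0u;    // AndI, CmpI against 0, Bool(eq)
}

int main() {
  alignas(8) char buf[16];
  assert(passes_alignment_check(buf, 8));        // aligned base passes
  assert(!passes_alignment_check(buf + 1, 8));   // base misaligned by 1 byte fails
  return 0;
}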
void VTransform::add_speculative_check(BoolNode* bol) {
assert(_vloop.are_speculative_checks_possible(), "otherwise we cannot make speculative assumptions");
ParsePredicateSuccessProj* parse_predicate_proj = _vloop.auto_vectorization_parse_predicate_proj();
IfTrueNode* new_check_proj = nullptr;
if (parse_predicate_proj != nullptr) {
new_check_proj = phase()->create_new_if_for_predicate(parse_predicate_proj, nullptr,
Deoptimization::Reason_auto_vectorization_check,
Op_If);
} else {
new_check_proj = phase()->create_new_if_for_multiversion(_vloop.multiversioning_fast_proj());
}
Node* iff_speculate = new_check_proj->in(0);
igvn().replace_input_of(iff_speculate, 1, bol);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(iff_speculate);
}
// Helper-class for VTransformGraph::has_store_to_load_forwarding_failure.
// It wraps a VPointer. The VPointer has an iv_offset applied, which
// simulates a virtual unrolling. They represent the memory region:

View File

@ -109,16 +109,19 @@ public:
const bool _verbose;
const bool _rejections;
const bool _align_vector;
const bool _speculative_runtime_checks;
const bool _info;
VTransformTrace(const VTrace& vtrace,
const bool is_trace_rejections,
const bool is_trace_align_vector,
const bool is_trace_speculative_runtime_checks,
const bool is_trace_info) :
_verbose (vtrace.is_trace(TraceAutoVectorizationTag::ALL)),
_rejections (_verbose | is_trace_vtransform(vtrace) | is_trace_rejections),
_align_vector (_verbose | is_trace_vtransform(vtrace) | is_trace_align_vector),
_speculative_runtime_checks(_verbose | is_trace_vtransform(vtrace) | is_trace_speculative_runtime_checks),
_info (_verbose | is_trace_vtransform(vtrace) | is_trace_info) {}
static bool is_trace_vtransform(const VTrace& vtrace) {
return vtrace.is_trace(TraceAutoVectorizationTag::VTRANSFORM);
@ -245,6 +248,10 @@ private:
void determine_mem_ref_and_aw_for_main_loop_alignment();
void adjust_pre_loop_limit_to_align_main_loop_vectors();
void apply_speculative_runtime_checks();
void add_speculative_alignment_check(Node* node, juint alignment);
void add_speculative_check(BoolNode* bol);
void apply_vectorization() const;
};

View File

@ -2717,6 +2717,7 @@ const char* Deoptimization::_trap_reason_name[] = {
"intrinsic" JVMCI_ONLY("_or_type_checked_inlining"),
"bimorphic" JVMCI_ONLY("_or_optimized_type_check"),
"profile_predicate",
"auto_vectorization_check",
"unloaded",
"uninitialized",
"initialized",

View File

@ -98,6 +98,7 @@ class Deoptimization : AllStatic {
#endif
Reason_profile_predicate, // compiler generated predicate moved from frequent branch in a loop failed
Reason_auto_vectorization_check, // compiler generated (speculative) auto vectorization checks failed
// recorded per method
Reason_unloaded, // unloaded class or constant pool entry

View File

@ -2269,6 +2269,7 @@
declare_constant(Deoptimization::Reason_age) \
declare_constant(Deoptimization::Reason_predicate) \
declare_constant(Deoptimization::Reason_loop_limit_check) \
declare_constant(Deoptimization::Reason_auto_vectorization_check) \
declare_constant(Deoptimization::Reason_speculate_class_check) \
declare_constant(Deoptimization::Reason_speculate_null_check) \
declare_constant(Deoptimization::Reason_speculate_null_assert) \

View File

@ -0,0 +1,303 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package compiler.loopopts.superword;
import compiler.lib.ir_framework.*;
import compiler.lib.verify.*;
import jdk.test.lib.Utils;
import java.nio.ByteBuffer;
import java.util.Map;
import java.util.HashMap;
import java.util.Random;
import java.lang.foreign.*;
/*
* @test id=byte-buffer-direct
* @bug 8323582
* @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned.
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress ByteBufferDirect
*/
/*
* @test id=byte-buffer-direct-AlignVector
* @bug 8323582
* @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned.
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress ByteBufferDirect AlignVector
*/
/*
* @test id=byte-buffer-direct-VerifyAlignVector
* @bug 8323582
* @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned.
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress ByteBufferDirect VerifyAlignVector
*/
/*
* @test id=native
* @bug 8323582
* @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned.
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress Native
*/
/*
* @test id=native-AlignVector
* @bug 8323582
* @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned.
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress Native AlignVector
*/
/*
* @test id=native-VerifyAlignVector
* @bug 8323582
* @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned.
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress Native VerifyAlignVector
*/
public class TestMemorySegmentUnalignedAddress {
public static void main(String[] args) {
TestFramework framework = new TestFramework(TestMemorySegmentUnalignedAddressImpl.class);
framework.addFlags("-DmemorySegmentProviderNameForTestVM=" + args[0]);
if (args.length > 1) {
switch (args[1]) {
case "AlignVector" -> { framework.addFlags("-XX:+AlignVector"); }
case "VerifyAlignVector" -> { framework.addFlags("-XX:+AlignVector", "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+VerifyAlignVector"); }
default -> { throw new RuntimeException("unexpected: " + args[1]); }
}
}
framework.setDefaultWarmup(100);
framework.start();
}
}
class TestMemorySegmentUnalignedAddressImpl {
static final int SIZE = 10_000;
static final int BACKING_SIZE = 10_000 + 1;
static final Random RANDOM = Utils.getRandomInstance();
interface TestFunction {
Object run(int i);
}
interface MemorySegmentProvider {
MemorySegment newMemorySegment();
}
static MemorySegmentProvider provider;
static {
String providerName = System.getProperty("memorySegmentProviderNameForTestVM");
provider = switch (providerName) {
case "ByteBufferDirect" -> TestMemorySegmentUnalignedAddressImpl::newMemorySegmentOfByteBufferDirect;
case "Native" -> TestMemorySegmentUnalignedAddressImpl::newMemorySegmentOfNative;
default -> throw new RuntimeException("Test argument not recognized: " + providerName);
};
}
// List of tests
Map<String, TestFunction> tests = new HashMap<>();
// List of gold, the results from the first run before compilation
Map<String, Object> golds = new HashMap<>();
public TestMemorySegmentUnalignedAddressImpl() {
// Generate two MemorySegments as inputs
MemorySegment a = sliceAligned(newMemorySegment());
MemorySegment b = sliceAligned(newMemorySegment());
fillRandom(a);
fillRandom(b);
// Add all tests to list
tests.put("testAlwaysAligned", (int i) -> {
MemorySegment ms = newMemorySegment();
MemorySegment slice = sliceAligned(ms);
copy(a, slice);
return testAlwaysAligned(slice);
});
tests.put("testAlwaysUnaligned", (int i) -> {
MemorySegment ms = newMemorySegment();
MemorySegment slice = sliceUnaligned(ms);
copy(a, slice);
return testAlwaysUnaligned(slice);
});
tests.put("testMixedAlignedAndUnaligned", (int i) -> {
MemorySegment ms = newMemorySegment();
MemorySegment slice = (i % 2 == 0) ? sliceUnaligned(ms) : sliceAligned(ms);
copy(a, slice);
return testMixedAlignedAndUnaligned(slice);
});
// Compute gold value for all test methods before compilation
for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
String name = entry.getKey();
TestFunction test = entry.getValue();
Object gold = test.run(0);
golds.put(name, gold);
}
}
MemorySegment sliceAligned(MemorySegment src) {
return src.asSlice(0, SIZE);
}
MemorySegment sliceUnaligned(MemorySegment src) {
return src.asSlice(1, SIZE);
}
MemorySegment newMemorySegment() {
return provider.newMemorySegment();
}
static void copy(MemorySegment src, MemorySegment dst) {
MemorySegment.copy(src, 0, dst, 0, src.byteSize());
}
static MemorySegment newMemorySegmentOfByteBufferDirect() {
return MemorySegment.ofBuffer(ByteBuffer.allocateDirect(BACKING_SIZE));
}
static MemorySegment newMemorySegmentOfNative() {
// Auto arena: GC decides when there is no reference to the MemorySegment,
// and then it deallocates the backing memory.
return Arena.ofAuto().allocate(BACKING_SIZE, 1);
}
static void fillRandom(MemorySegment data) {
for (int i = 0; i < (int)data.byteSize(); i++) {
data.set(ValueLayout.JAVA_BYTE, i, (byte)RANDOM.nextInt());
}
}
static void verify(String name, Object gold, Object result) {
try {
Verify.checkEQ(gold, result);
} catch (VerifyException e) {
throw new RuntimeException("Verify: wrong result in " + name, e);
}
}
static int runInvocationCounter = 0;
@Run(test = {"testAlwaysAligned",
"testAlwaysUnaligned",
"testMixedAlignedAndUnaligned"})
void runTests() {
runInvocationCounter++;
for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
String name = entry.getKey();
TestFunction test = entry.getValue();
// Recall gold value from before compilation
Object gold = golds.get(name);
// Compute new result
Object result = test.run(runInvocationCounter);
// Compare gold and new result
verify(name, gold, result);
}
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
IRNode.ADD_VI, "> 0",
IRNode.STORE_VECTOR, "> 0",
"multiversion", "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
phase = CompilePhase.PRINT_IDEAL)
// We never fail the alignment check in the auto vectorization Predicate,
// hence we never even create the multiversioned loops.
static Object testAlwaysAligned(MemorySegment ms) {
for (long i = 0; i < ms.byteSize(); i += 4) {
int v = ms.get(ValueLayout.JAVA_INT_UNALIGNED, i);
ms.set(ValueLayout.JAVA_INT_UNALIGNED, i, (int)(v + 1));
}
return new Object[]{ ms };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
IRNode.ADD_VI, "> 0",
IRNode.STORE_VECTOR, "> 0",
"multiversion_fast", "= 4", // pre, main, drain, post
"multiversion_slow", "= 2"}, // main, post
applyIf = {"AlignVector", "true"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
phase = CompilePhase.PRINT_IDEAL)
// We add alignment checks to the auto vectorization Predicate. It fails
// at runtime, deopts, and recompiles with multiversioning.
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
IRNode.ADD_VI, "> 0",
IRNode.STORE_VECTOR, "> 0",
"multiversion_fast", "= 0",
"multiversion_slow", "= 0"},
applyIf = {"AlignVector", "false"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
phase = CompilePhase.PRINT_IDEAL)
// We never add any conditions to the auto vectorization Predicate, so
// we also never deopt and never end up multiversioning.
static Object testAlwaysUnaligned(MemorySegment ms) {
for (long i = 0; i < ms.byteSize(); i += 4) {
int v = ms.get(ValueLayout.JAVA_INT_UNALIGNED, i);
ms.set(ValueLayout.JAVA_INT_UNALIGNED, i, (int)(v + 1));
}
return new Object[]{ ms };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
IRNode.ADD_VI, "> 0",
IRNode.STORE_VECTOR, "> 0",
"multiversion_fast", "= 4", // pre, main, drain, post
"multiversion_slow", "= 2"}, // main, post
applyIf = {"AlignVector", "true"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
phase = CompilePhase.PRINT_IDEAL)
// We add alignment checks to the auto vectorization Predicate. It fails
// at runtime, deopts, and recompiles with multiversioning.
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
IRNode.ADD_VI, "> 0",
IRNode.STORE_VECTOR, "> 0",
"multiversion_fast", "= 0",
"multiversion_slow", "= 0"},
applyIf = {"AlignVector", "false"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
phase = CompilePhase.PRINT_IDEAL)
// We never add any conditions to the auto vectorization Predicate, so
// we also never deopt and never end up multiversioning.
static Object testMixedAlignedAndUnaligned(MemorySegment ms) {
for (long i = 0; i < ms.byteSize(); i += 4) {
int v = ms.get(ValueLayout.JAVA_INT_UNALIGNED, i);
ms.set(ValueLayout.JAVA_INT_UNALIGNED, i, (int)(v + 1));
}
return new Object[]{ ms };
}
}