mirror of
https://github.com/openjdk/jdk.git
synced 2026-01-28 12:09:14 +00:00
8323582: C2 SuperWord AlignVector: misaligned vector memory access with unaligned native memory
Reviewed-by: roland, kvn
This commit is contained in:
parent
bb48b7319c
commit
885338b5f3
@ -708,6 +708,7 @@
|
||||
declare_constant(Deoptimization::Reason_constraint) \
|
||||
declare_constant(Deoptimization::Reason_div0_check) \
|
||||
declare_constant(Deoptimization::Reason_loop_limit_check) \
|
||||
declare_constant(Deoptimization::Reason_auto_vectorization_check) \
|
||||
declare_constant(Deoptimization::Reason_type_checked_inlining) \
|
||||
declare_constant(Deoptimization::Reason_optimized_type_check) \
|
||||
declare_constant(Deoptimization::Reason_aliasing) \
|
||||
|
||||
@ -346,6 +346,12 @@
|
||||
develop(bool, TraceLoopUnswitching, false, \
|
||||
"Trace loop unswitching") \
|
||||
\
|
||||
product(bool, LoopMultiversioning, true, DIAGNOSTIC, \
|
||||
"Enable loop multiversioning (for speculative compilation)") \
|
||||
\
|
||||
develop(bool, TraceLoopMultiversioning, false, \
|
||||
"Trace loop multiversioning") \
|
||||
\
|
||||
product(bool, AllowVectorizeOnDemand, true, \
|
||||
"Globally suppress vectorization set in VectorizeMethod") \
|
||||
\
|
||||
|
||||
@ -428,7 +428,7 @@ public:
|
||||
IfNode(Node* control, Node* bol, float p, float fcnt);
|
||||
IfNode(Node* control, Node* bol, float p, float fcnt, AssertionPredicateType assertion_predicate_type);
|
||||
|
||||
static IfNode* make_with_same_profile(IfNode* if_node_profile, Node* ctrl, BoolNode* bol);
|
||||
static IfNode* make_with_same_profile(IfNode* if_node_profile, Node* ctrl, Node* bol);
|
||||
|
||||
virtual int Opcode() const;
|
||||
virtual bool pinned() const { return true; }
|
||||
|
||||
@ -277,6 +277,7 @@ macro(OnSpinWait)
|
||||
macro(Opaque1)
|
||||
macro(OpaqueLoopInit)
|
||||
macro(OpaqueLoopStride)
|
||||
macro(OpaqueMultiversioning)
|
||||
macro(OpaqueZeroTripGuard)
|
||||
macro(OpaqueNotNull)
|
||||
macro(OpaqueInitializedAssertionPredicate)
|
||||
|
||||
@ -4086,6 +4086,7 @@ void GraphKit::add_parse_predicates(int nargs) {
|
||||
if (UseProfiledLoopPredicate) {
|
||||
add_parse_predicate(Deoptimization::Reason_profile_predicate, nargs);
|
||||
}
|
||||
add_parse_predicate(Deoptimization::Reason_auto_vectorization_check, nargs);
|
||||
// Loop Limit Check Predicate should be near the loop.
|
||||
add_parse_predicate(Deoptimization::Reason_loop_limit_check, nargs);
|
||||
}
|
||||
|
||||
@ -469,7 +469,7 @@ static Node* split_if(IfNode *iff, PhaseIterGVN *igvn) {
|
||||
return new ConINode(TypeInt::ZERO);
|
||||
}
|
||||
|
||||
IfNode* IfNode::make_with_same_profile(IfNode* if_node_profile, Node* ctrl, BoolNode* bol) {
|
||||
IfNode* IfNode::make_with_same_profile(IfNode* if_node_profile, Node* ctrl, Node* bol) {
|
||||
// Assert here that we only try to create a clone from an If node with the same profiling if that actually makes sense.
|
||||
// Some If node subtypes should not be cloned in this way. In theory, we should not clone BaseCountedLoopEndNodes.
|
||||
// But they can end up being used as normal If nodes when peeling a loop - they serve as zero-trip guard.
|
||||
@ -2177,6 +2177,7 @@ ParsePredicateNode::ParsePredicateNode(Node* control, Deoptimization::DeoptReaso
|
||||
switch (deopt_reason) {
|
||||
case Deoptimization::Reason_predicate:
|
||||
case Deoptimization::Reason_profile_predicate:
|
||||
case Deoptimization::Reason_auto_vectorization_check:
|
||||
case Deoptimization::Reason_loop_limit_check:
|
||||
break;
|
||||
default:
|
||||
@ -2214,6 +2215,9 @@ void ParsePredicateNode::dump_spec(outputStream* st) const {
|
||||
case Deoptimization::DeoptReason::Reason_profile_predicate:
|
||||
st->print("Profiled Loop ");
|
||||
break;
|
||||
case Deoptimization::DeoptReason::Reason_auto_vectorization_check:
|
||||
st->print("Auto_Vectorization_Check ");
|
||||
break;
|
||||
case Deoptimization::DeoptReason::Reason_loop_limit_check:
|
||||
st->print("Loop Limit Check ");
|
||||
break;
|
||||
|
||||
@ -745,6 +745,11 @@ void PhaseIdealLoop::do_peeling(IdealLoopTree *loop, Node_List &old_new) {
|
||||
cl->set_trip_count(cl->trip_count() - 1);
|
||||
if (cl->is_main_loop()) {
|
||||
cl->set_normal_loop();
|
||||
if (cl->is_multiversion()) {
|
||||
// Peeling also destroys the connection of the main loop
|
||||
// to the multiversion_if.
|
||||
cl->set_no_multiversion();
|
||||
}
|
||||
#ifndef PRODUCT
|
||||
if (PrintOpto && VerifyLoopOptimizations) {
|
||||
tty->print("Peeling a 'main' loop; resetting to 'normal' ");
|
||||
@ -1174,8 +1179,9 @@ bool IdealLoopTree::policy_range_check(PhaseIdealLoop* phase, bool provisional,
|
||||
if (!bol->is_Bool()) {
|
||||
assert(bol->is_OpaqueNotNull() ||
|
||||
bol->is_OpaqueTemplateAssertionPredicate() ||
|
||||
bol->is_OpaqueInitializedAssertionPredicate(),
|
||||
"Opaque node of a non-null-check or an Assertion Predicate");
|
||||
bol->is_OpaqueInitializedAssertionPredicate() ||
|
||||
bol->is_OpaqueMultiversioning(),
|
||||
"Opaque node of a non-null-check or an Assertion Predicate or Multiversioning");
|
||||
continue;
|
||||
}
|
||||
if (bol->as_Bool()->_test._test == BoolTest::ne) {
|
||||
@ -3354,6 +3360,23 @@ bool IdealLoopTree::iteration_split_impl(PhaseIdealLoop *phase, Node_List &old_n
|
||||
// Do nothing special to pre- and post- loops
|
||||
if (cl->is_pre_loop() || cl->is_post_loop()) return true;
|
||||
|
||||
// With multiversioning, we create a fast_loop and a slow_loop, and a multiversion_if that
|
||||
// decides which loop is taken at runtime. At first, the multiversion_if always takes the
|
||||
// fast_loop, and we only optimize the fast_loop. Since we are not sure if we will ever use
|
||||
// the slow_loop, we delay optimizations for it, so we do not waste compile time and code
|
||||
// size. If we never change the condition of the multiversion_if, the slow_loop is eventually
|
||||
// folded away after loop-opts. While optimizing the fast_loop, we may want to perform some
|
||||
// speculative optimization, for which we need a runtime-check. We add this runtime-check
|
||||
// condition to the multiversion_if. Now, it becomes possible to execute the slow_loop at
|
||||
// runtime, and we resume optimizations for slow_loop ("un-delay" it).
|
||||
// TLDR: If the slow_loop is still in "delay" mode, check if the multiversion_if was changed
|
||||
// and we should now resume optimizations for it.
|
||||
if (cl->is_multiversion_delayed_slow_loop() &&
|
||||
!phase->try_resume_optimizations_for_delayed_slow_loop(this)) {
|
||||
// We are still delayed, so wait with further loop-opts.
|
||||
return true;
|
||||
}
|
||||
|
||||
// Compute loop trip count from profile data
|
||||
compute_profile_trip_cnt(phase);
|
||||
|
||||
@ -3413,6 +3436,12 @@ bool IdealLoopTree::iteration_split_impl(PhaseIdealLoop *phase, Node_List &old_n
|
||||
if (!phase->may_require_nodes(estimate)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// We are going to add pre-loop and post-loop.
|
||||
// But should we also multi-version for auto-vectorization speculative
|
||||
// checks, i.e. fast and slow-paths?
|
||||
phase->maybe_multiversion_for_auto_vectorization_runtime_checks(this, old_new);
|
||||
|
||||
phase->insert_pre_post_loops(this, old_new, peel_only);
|
||||
}
|
||||
// Adjust the pre- and main-loop limits to let the pre and post loops run
|
||||
|
||||
@ -32,6 +32,23 @@
|
||||
#include "opto/predicates.hpp"
|
||||
#include "opto/rootnode.hpp"
|
||||
|
||||
// Multiversioning:
|
||||
// A loop is cloned, and a selector If decides which loop is taken at run-time: the true-path-loop (original) or the
|
||||
// false-path-loop (cloned).
|
||||
//
|
||||
// Use-cases:
|
||||
// - Speculative compilation:
|
||||
// The selector If checks some assumptions which allow stronger optimization in the true-path-loop. If the assumptions
|
||||
// do not hold, we can still execute in the false-path-loop, although with fewer optimizations.
|
||||
// See: PhaseIdealLoop::maybe_multiversion_for_auto_vectorization_runtime_checks
|
||||
// PhaseIdealLoop::create_new_if_for_multiversion
|
||||
//
|
||||
// - Unswitching:
|
||||
// The selector If has the same (loop invariant) condition as some unswitching candidate If inside the loop. This
|
||||
// allows us to constant-fold the unswitching candidate If to true in the true-path-loop and to false in the
|
||||
// false-path-loop, thus eliminating the unswitching candidate If from the loop.
|
||||
//
|
||||
//
|
||||
// Loop Unswitching is a loop optimization to move an invariant, non-loop-exiting test in the loop body before the loop.
|
||||
// Such a test is either always true or always false in all loop iterations and could therefore only be executed once.
|
||||
// To achieve that, we duplicate the loop and change the original and cloned loop as follows:
|
||||
@ -145,14 +162,16 @@ IfNode* PhaseIdealLoop::find_unswitch_candidate(const IdealLoopTree* loop) const
|
||||
return unswitch_candidate;
|
||||
}
|
||||
|
||||
// This class creates an If node (i.e. loop selector) that selects if the true-path-loop or the false-path-loop should be
|
||||
// executed at runtime. This is done by finding an invariant and non-loop-exiting unswitch candidate If node (guaranteed
|
||||
// to exist at this point) to perform Loop Unswitching on.
|
||||
class UnswitchedLoopSelector : public StackObj {
|
||||
// LoopSelector is used for loop multiversioning and unswitching. This class creates an If node (i.e. loop selector)
|
||||
// that selects if the true-path-loop or the false-path-loop should be executed at runtime.
|
||||
class LoopSelector : public StackObj {
|
||||
// Cached fields for construction.
|
||||
PhaseIdealLoop* const _phase;
|
||||
IdealLoopTree* const _outer_loop;
|
||||
Node* const _original_loop_entry;
|
||||
IfNode* const _unswitch_candidate;
|
||||
const uint _dom_depth; // of original_loop_entry
|
||||
|
||||
// Constructed selector if with its projections.
|
||||
IfNode* const _selector;
|
||||
IfTrueNode* const _true_path_loop_proj;
|
||||
IfFalseNode* const _false_path_loop_proj;
|
||||
@ -160,52 +179,59 @@ class UnswitchedLoopSelector : public StackObj {
|
||||
enum PathToLoop { TRUE_PATH, FALSE_PATH };
|
||||
|
||||
public:
|
||||
UnswitchedLoopSelector(IdealLoopTree* loop)
|
||||
// For multiversioning: create a new selector (multiversion_if) from a bol condition.
|
||||
LoopSelector(IdealLoopTree* loop, Node* bol, float prob, float fcnt)
|
||||
: _phase(loop->_phase),
|
||||
_outer_loop(loop->skip_strip_mined()->_parent),
|
||||
_original_loop_entry(loop->_head->as_Loop()->skip_strip_mined()->in(LoopNode::EntryControl)),
|
||||
_unswitch_candidate(find_unswitch_candidate(loop)),
|
||||
_selector(create_selector_if()),
|
||||
_dom_depth(_phase->dom_depth(_original_loop_entry)),
|
||||
_selector(create_multiversioning_if(bol, prob, fcnt)), // multiversioning
|
||||
_true_path_loop_proj(create_proj_to_loop(TRUE_PATH)->as_IfTrue()),
|
||||
_false_path_loop_proj(create_proj_to_loop(FALSE_PATH)->as_IfFalse()) {
|
||||
}
|
||||
NONCOPYABLE(UnswitchedLoopSelector);
|
||||
|
||||
private:
|
||||
IfNode* find_unswitch_candidate(IdealLoopTree* loop) {
|
||||
IfNode* unswitch_candidate = _phase->find_unswitch_candidate(loop);
|
||||
assert(unswitch_candidate != nullptr, "guaranteed to exist by policy_unswitching");
|
||||
assert(_phase->is_member(loop, unswitch_candidate), "must be inside original loop");
|
||||
return unswitch_candidate;
|
||||
// For unswitching: create an unswitching if before the loop, from a pre-existing
|
||||
// unswitching_candidate inside the loop.
|
||||
LoopSelector(IdealLoopTree* loop, IfNode* unswitch_candidate)
|
||||
: _phase(loop->_phase),
|
||||
_outer_loop(loop->skip_strip_mined()->_parent),
|
||||
_original_loop_entry(loop->_head->as_Loop()->skip_strip_mined()->in(LoopNode::EntryControl)),
|
||||
_dom_depth(_phase->dom_depth(_original_loop_entry)),
|
||||
_selector(create_unswitching_if(unswitch_candidate)), // unswitching
|
||||
_true_path_loop_proj(create_proj_to_loop(TRUE_PATH)->as_IfTrue()),
|
||||
_false_path_loop_proj(create_proj_to_loop(FALSE_PATH)->as_IfFalse()) {
|
||||
}
|
||||
NONCOPYABLE(LoopSelector);
|
||||
|
||||
IfNode* create_selector_if() const {
|
||||
const uint dom_depth = _phase->dom_depth(_original_loop_entry);
|
||||
IfNode* create_multiversioning_if(Node* bol, float prob, float fcnt) {
|
||||
_phase->igvn().rehash_node_delayed(_original_loop_entry);
|
||||
BoolNode* unswitch_candidate_bool = _unswitch_candidate->in(1)->as_Bool();
|
||||
IfNode* selector_if = IfNode::make_with_same_profile(_unswitch_candidate, _original_loop_entry,
|
||||
unswitch_candidate_bool);
|
||||
_phase->register_node(selector_if, _outer_loop, _original_loop_entry, dom_depth);
|
||||
IfNode* selector_if = new IfNode(_original_loop_entry, bol, prob, fcnt);
|
||||
_phase->register_node(selector_if, _outer_loop, _original_loop_entry, _dom_depth);
|
||||
return selector_if;
|
||||
}
|
||||
|
||||
IfNode* create_unswitching_if(IfNode* unswitch_candidate) {
|
||||
_phase->igvn().rehash_node_delayed(_original_loop_entry);
|
||||
BoolNode* unswitch_candidate_bool = unswitch_candidate->in(1)->as_Bool();
|
||||
IfNode* selector_if = IfNode::make_with_same_profile(unswitch_candidate, _original_loop_entry,
|
||||
unswitch_candidate_bool);
|
||||
_phase->register_node(selector_if, _outer_loop, _original_loop_entry, _dom_depth);
|
||||
return selector_if;
|
||||
}
|
||||
|
||||
private:
|
||||
IfProjNode* create_proj_to_loop(const PathToLoop path_to_loop) {
|
||||
const uint dom_depth = _phase->dom_depth(_original_loop_entry);
|
||||
IfProjNode* proj_to_loop;
|
||||
if (path_to_loop == TRUE_PATH) {
|
||||
proj_to_loop = new IfTrueNode(_selector);
|
||||
} else {
|
||||
proj_to_loop = new IfFalseNode(_selector);
|
||||
}
|
||||
_phase->register_node(proj_to_loop, _outer_loop, _selector, dom_depth);
|
||||
_phase->register_node(proj_to_loop, _outer_loop, _selector, _dom_depth);
|
||||
return proj_to_loop;
|
||||
}
|
||||
|
||||
public:
|
||||
IfNode* unswitch_candidate() const {
|
||||
return _unswitch_candidate;
|
||||
}
|
||||
|
||||
IfNode* selector() const {
|
||||
return _selector;
|
||||
}
|
||||
@ -219,6 +245,37 @@ class UnswitchedLoopSelector : public StackObj {
|
||||
}
|
||||
};
|
||||
|
||||
// This class creates an If node (i.e. loop selector) that selects if the true-path-loop or the false-path-loop should be
|
||||
// executed at runtime. This is done by finding an invariant and non-loop-exiting unswitch candidate If node (guaranteed
|
||||
// to exist at this point) to perform Loop Unswitching on.
|
||||
class UnswitchedLoopSelector : public StackObj {
|
||||
IfNode* const _unswitch_candidate;
|
||||
const LoopSelector _loop_selector;
|
||||
|
||||
public:
|
||||
UnswitchedLoopSelector(IdealLoopTree* loop)
|
||||
: _unswitch_candidate(find_unswitch_candidate(loop)),
|
||||
_loop_selector(loop, _unswitch_candidate) {}
|
||||
NONCOPYABLE(UnswitchedLoopSelector);
|
||||
|
||||
private:
|
||||
static IfNode* find_unswitch_candidate(IdealLoopTree* loop) {
|
||||
IfNode* unswitch_candidate = loop->_phase->find_unswitch_candidate(loop);
|
||||
assert(unswitch_candidate != nullptr, "guaranteed to exist by policy_unswitching");
|
||||
assert(loop->_phase->is_member(loop, unswitch_candidate), "must be inside original loop");
|
||||
return unswitch_candidate;
|
||||
}
|
||||
|
||||
public:
|
||||
IfNode* unswitch_candidate() const {
|
||||
return _unswitch_candidate;
|
||||
}
|
||||
|
||||
const LoopSelector& loop_selector() const {
|
||||
return _loop_selector;
|
||||
}
|
||||
};
|
||||
|
||||
// Class to unswitch the original loop and create Predicates at the new unswitched loop versions. The newly cloned loop
|
||||
// becomes the false-path-loop while original loop becomes the true-path-loop.
|
||||
class OriginalLoop : public StackObj {
|
||||
@ -238,55 +295,62 @@ class OriginalLoop : public StackObj {
|
||||
// Unswitch the original loop on the invariant loop selector by creating a true-path-loop and a false-path-loop.
|
||||
// Remove the unswitch candidate If from both unswitched loop versions which are now covered by the loop selector If.
|
||||
void unswitch(const UnswitchedLoopSelector& unswitched_loop_selector) {
|
||||
const uint first_false_path_loop_node_index = _phase->C->unique();
|
||||
clone_loop(unswitched_loop_selector);
|
||||
|
||||
move_parse_and_template_assertion_predicates_to_unswitched_loops(unswitched_loop_selector,
|
||||
first_false_path_loop_node_index);
|
||||
DEBUG_ONLY(verify_unswitched_loop_versions(_loop->_head->as_Loop(), unswitched_loop_selector);)
|
||||
|
||||
_phase->recompute_dom_depth();
|
||||
multiversion(unswitched_loop_selector.loop_selector());
|
||||
remove_unswitch_candidate_from_loops(unswitched_loop_selector);
|
||||
}
|
||||
|
||||
private:
|
||||
void clone_loop(const UnswitchedLoopSelector& unswitched_loop_selector) {
|
||||
_phase->clone_loop(_loop, _old_new, _phase->dom_depth(_loop_head),
|
||||
PhaseIdealLoop::CloneIncludesStripMined, unswitched_loop_selector.selector());
|
||||
fix_loop_entries(unswitched_loop_selector);
|
||||
// Multiversion the original loop. The loop selector if selects between the original loop (true-path-loop), and
|
||||
// a copy of it (false-path-loop).
|
||||
void multiversion(const LoopSelector& loop_selector) {
|
||||
const uint first_false_path_loop_node_index = _phase->C->unique();
|
||||
clone_loop(loop_selector);
|
||||
|
||||
move_parse_and_template_assertion_predicates_to_unswitched_loops(loop_selector,
|
||||
first_false_path_loop_node_index);
|
||||
DEBUG_ONLY(verify_loop_versions(_loop->_head->as_Loop(), loop_selector);)
|
||||
|
||||
_phase->recompute_dom_depth();
|
||||
}
|
||||
|
||||
void fix_loop_entries(const UnswitchedLoopSelector& unswitched_loop_selector) {
|
||||
_phase->replace_loop_entry(_loop_head, unswitched_loop_selector.true_path_loop_proj());
|
||||
private:
|
||||
void clone_loop(const LoopSelector& loop_selector) {
|
||||
_phase->clone_loop(_loop, _old_new, _phase->dom_depth(_loop_head),
|
||||
PhaseIdealLoop::CloneIncludesStripMined, loop_selector.selector());
|
||||
fix_loop_entries(loop_selector);
|
||||
}
|
||||
|
||||
void fix_loop_entries(const LoopSelector& loop_selector) {
|
||||
_phase->replace_loop_entry(_loop_head, loop_selector.true_path_loop_proj());
|
||||
LoopNode* false_path_loop_strip_mined_head = old_to_new(_loop_head)->as_Loop();
|
||||
_phase->replace_loop_entry(false_path_loop_strip_mined_head,
|
||||
unswitched_loop_selector.false_path_loop_proj());
|
||||
loop_selector.false_path_loop_proj());
|
||||
}
|
||||
|
||||
// Moves the Parse And Template Assertion Predicates to the true and false path loop. They are inserted between the
|
||||
// loop heads and the loop selector If projections. The old Parse and Template Assertion Predicates before
|
||||
// the unswitched loop selector are killed.
|
||||
void move_parse_and_template_assertion_predicates_to_unswitched_loops(
|
||||
const UnswitchedLoopSelector& unswitched_loop_selector, const uint first_false_path_loop_node_index) const {
|
||||
const LoopSelector& loop_selector, const uint first_false_path_loop_node_index) const {
|
||||
const NodeInOriginalLoopBody node_in_true_path_loop_body(first_false_path_loop_node_index, _old_new);
|
||||
const NodeInClonedLoopBody node_in_false_path_loop_body(first_false_path_loop_node_index);
|
||||
CloneUnswitchedLoopPredicatesVisitor
|
||||
clone_unswitched_loop_predicates_visitor(_loop_head, old_to_new(_loop_head)->as_Loop(), node_in_true_path_loop_body,
|
||||
node_in_false_path_loop_body, _phase);
|
||||
Node* source_loop_entry = unswitched_loop_selector.selector()->in(0);
|
||||
Node* source_loop_entry = loop_selector.selector()->in(0);
|
||||
PredicateIterator predicate_iterator(source_loop_entry);
|
||||
predicate_iterator.for_each(clone_unswitched_loop_predicates_visitor);
|
||||
}
|
||||
|
||||
#ifdef ASSERT
|
||||
void verify_unswitched_loop_versions(LoopNode* true_path_loop_head,
|
||||
const UnswitchedLoopSelector& unswitched_loop_selector) const {
|
||||
verify_unswitched_loop_version(true_path_loop_head, unswitched_loop_selector.true_path_loop_proj());
|
||||
verify_unswitched_loop_version(old_to_new(true_path_loop_head)->as_Loop(),
|
||||
unswitched_loop_selector.false_path_loop_proj());
|
||||
void verify_loop_versions(LoopNode* true_path_loop_head,
|
||||
const LoopSelector& loop_selector) const {
|
||||
verify_loop_version(true_path_loop_head,
|
||||
loop_selector.true_path_loop_proj());
|
||||
verify_loop_version(old_to_new(true_path_loop_head)->as_Loop(),
|
||||
loop_selector.false_path_loop_proj());
|
||||
}
|
||||
|
||||
static void verify_unswitched_loop_version(LoopNode* loop_head, IfProjNode* loop_selector_if_proj) {
|
||||
static void verify_loop_version(LoopNode* loop_head, IfProjNode* loop_selector_if_proj) {
|
||||
Node* entry = loop_head->skip_strip_mined()->in(LoopNode::EntryControl);
|
||||
const Predicates predicates(entry);
|
||||
// When skipping all predicates, we should end up at 'loop_selector_if_proj'.
|
||||
@ -302,15 +366,15 @@ class OriginalLoop : public StackObj {
|
||||
// If node. Keep the true-path-path in the true-path-loop and the false-path-path in the false-path-loop by setting
|
||||
// the bool input accordingly. The unswitch candidate If nodes are folded in the next IGVN round.
|
||||
void remove_unswitch_candidate_from_loops(const UnswitchedLoopSelector& unswitched_loop_selector) {
|
||||
IfNode* unswitching_candidate = unswitched_loop_selector.unswitch_candidate();
|
||||
_phase->igvn().rehash_node_delayed(unswitching_candidate);
|
||||
_phase->dominated_by(unswitched_loop_selector.true_path_loop_proj(), unswitching_candidate);
|
||||
const LoopSelector& loop_selector = unswitched_loop_selector.loop_selector();;
|
||||
IfNode* unswitch_candidate = unswitched_loop_selector.unswitch_candidate();
|
||||
_phase->igvn().rehash_node_delayed(unswitch_candidate);
|
||||
_phase->dominated_by(loop_selector.true_path_loop_proj(), unswitch_candidate);
|
||||
|
||||
IfNode* unswitching_candidate_clone = _old_new[unswitching_candidate->_idx]->as_If();
|
||||
_phase->igvn().rehash_node_delayed(unswitching_candidate_clone);
|
||||
_phase->dominated_by(unswitched_loop_selector.false_path_loop_proj(), unswitching_candidate_clone);
|
||||
IfNode* unswitch_candidate_clone = _old_new[unswitch_candidate->_idx]->as_If();
|
||||
_phase->igvn().rehash_node_delayed(unswitch_candidate_clone);
|
||||
_phase->dominated_by(loop_selector.false_path_loop_proj(), unswitch_candidate_clone);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
// See comments below file header for more information about Loop Unswitching.
|
||||
@ -343,6 +407,172 @@ void PhaseIdealLoop::do_unswitching(IdealLoopTree* loop, Node_List& old_new) {
|
||||
C->set_major_progress();
|
||||
}
|
||||
|
||||
// Multiversion the counted loop 'lpt': build a LoopSelector (the multiversion_if) in front of the loop, let
// OriginalLoop::multiversion() clone the loop behind it, and then mark the original head as the multiversion
// fast loop and the cloned head as the delayed multiversion slow loop.
//   lpt     - loop to multiversion; its head must be a CountedLoop (as_CountedLoop() below).
//   old_new - handed to OriginalLoop and afterwards indexed by the original head's _idx to find the cloned head.
void PhaseIdealLoop::do_multiversioning(IdealLoopTree* lpt, Node_List& old_new) {
|
||||
#ifndef PRODUCT
|
||||
if (TraceLoopOpts || TraceLoopMultiversioning) {
|
||||
tty->print("Multiversion ");
|
||||
lpt->dump_head();
|
||||
}
|
||||
#endif
|
||||
assert(LoopMultiversioning, "LoopMultiversioning must be enabled");
|
||||
|
||||
CountedLoopNode* original_head = lpt->_head->as_CountedLoop();
|
||||
C->print_method(PHASE_BEFORE_LOOP_MULTIVERSIONING, 4, original_head);
|
||||
|
||||
// The selector condition starts out as the int constant 1 wrapped in an OpaqueMultiversioning node.
// Both the constant and the opaque node get their control set to the root node.
Node* one = _igvn.intcon(1);
|
||||
set_ctrl(one, C->root());
|
||||
Node* opaque = new OpaqueMultiversioningNode(C, one);
|
||||
set_ctrl(opaque, C->root());
|
||||
_igvn.register_new_node_with_optimizer(opaque);
|
||||
_igvn.set_type(opaque, TypeInt::BOOL);
|
||||
|
||||
// Constructing the LoopSelector creates the selector If (with the opaque node as condition) and its two
// projections; OriginalLoop::multiversion() then clones the loop using that selector.
const LoopSelector loop_selector(lpt, opaque, PROB_LIKELY_MAG(3), COUNT_UNKNOWN);
|
||||
OriginalLoop original_loop(lpt, old_new);
|
||||
original_loop.multiversion(loop_selector);
|
||||
|
||||
add_unswitched_loop_version_bodies_to_igvn(lpt, old_new);
|
||||
|
||||
// The clone of the original head is looked up through old_new: original = fast loop, clone = delayed slow loop.
CountedLoopNode* new_head = old_new[original_head->_idx]->as_CountedLoop();
|
||||
original_head->set_multiversion_fast_loop();
|
||||
new_head->set_multiversion_delayed_slow_loop();
|
||||
|
||||
NOT_PRODUCT(trace_loop_multiversioning_result(loop_selector, original_head, new_head);)
|
||||
C->print_method(PHASE_AFTER_LOOP_MULTIVERSIONING, 4, new_head);
|
||||
C->set_major_progress();
|
||||
}
|
||||
|
||||
// Create a new if in the multiversioning pattern, adding an additional condition for the
|
||||
// multiversioning fast-loop.
|
||||
//
|
||||
// Before:
|
||||
// entry opaque
|
||||
// | |
|
||||
// multiversion_if
|
||||
// | |
|
||||
// +----------------+ +---------------+
|
||||
// | |
|
||||
// multiversion_fast_proj multiversion_slow_proj
|
||||
// |
|
||||
// +--------+
|
||||
// |
|
||||
// slow_path
|
||||
//
|
||||
//
|
||||
// After:
|
||||
// entry opaque <-- to be replaced by caller
|
||||
// | |
|
||||
// new_if
|
||||
// | |
|
||||
// | +-----------------------------+
|
||||
// | |
|
||||
// new_if_true opaque new_if_false
|
||||
// | | |
|
||||
// multiversion_if |
|
||||
// | | |
|
||||
// +----------------+ +---------------+ |
|
||||
// | | |
|
||||
// multiversion_fast_proj new_multiversion_slow_proj |
|
||||
// | |
|
||||
// +------+ |
|
||||
// | |
|
||||
// region
|
||||
// |
|
||||
// slow_path
|
||||
//
|
||||
// Inserts a new If between 'entry' and the multiversion_if that 'multiversioning_fast_proj' projects from.
// The new If is created with the multiversion_if's opaque node as its condition. Its true projection becomes
// the new control input of multiversion_if; its false projection is merged with a clone of the old slow
// projection in a new Region, which replaces the old slow projection.
// Returns the true projection of the new If (new_if_true).
IfTrueNode* PhaseIdealLoop::create_new_if_for_multiversion(IfTrueNode* multiversioning_fast_proj) {
|
||||
// Give all nodes in the old sub-graph a name.
|
||||
IfNode* multiversion_if = multiversioning_fast_proj->in(0)->as_If();
|
||||
Node* entry = multiversion_if->in(0);
|
||||
OpaqueMultiversioningNode* opaque = multiversion_if->in(1)->as_OpaqueMultiversioning();
|
||||
IfFalseNode* multiversion_slow_proj = multiversion_if->proj_out(0)->as_IfFalse();
|
||||
// NOTE(review): slow_path is not referenced again in this function.
Node* slow_path = multiversion_slow_proj->unique_ctrl_out();
|
||||
|
||||
// The slow_loop may still be delayed, and waiting for runtime-checks to be added to the
|
||||
// multiversion_if. Now that we have at least one condition for the multiversioning,
|
||||
// we should resume optimizations for the slow loop.
|
||||
opaque->notify_slow_loop_that_it_can_resume_optimizations();
|
||||
|
||||
// Create new_if with its projections.
|
||||
IfNode* new_if = IfNode::make_with_same_profile(multiversion_if, entry, opaque);
|
||||
IdealLoopTree* lp = get_loop(entry);
|
||||
register_control(new_if, lp, entry);
|
||||
|
||||
IfTrueNode* new_if_true = new IfTrueNode(new_if);
|
||||
IfFalseNode* new_if_false = new IfFalseNode(new_if);
|
||||
register_control(new_if_true, lp, new_if);
|
||||
register_control(new_if_false, lp, new_if);
|
||||
|
||||
// Hook new_if_true into multiversion_if.
|
||||
_igvn.replace_input_of(multiversion_if, 0, new_if_true);
|
||||
|
||||
// Clone multiversion_slow_proj - this allows us to easily carry the dependencies to
|
||||
// the new region below.
|
||||
IfFalseNode* new_multiversion_slow_proj = multiversion_slow_proj->clone()->as_IfFalse();
|
||||
register_control(new_multiversion_slow_proj, lp, multiversion_if);
|
||||
|
||||
// Create new Region.
|
||||
RegionNode* region = new RegionNode(1);
|
||||
region->add_req(new_multiversion_slow_proj);
|
||||
region->add_req(new_if_false);
|
||||
register_control(region, lp, new_multiversion_slow_proj);
|
||||
|
||||
// Hook region into slow_path, instead of the multiversion_slow_proj.
|
||||
// This also moves all other dependencies of the multiversion_slow_proj to the region.
|
||||
_igvn.replace_node(multiversion_slow_proj, region);
|
||||
|
||||
return new_if_true;
|
||||
}
|
||||
|
||||
// Walks from a candidate IfFalse projection up to its If and returns that If's condition input (in(1)) as an
// OpaqueMultiversioningNode. Returns nullptr if the candidate is not an IfFalse or its control input is not an
// If; otherwise returns whatever isa_OpaqueMultiversioning() yields for the condition (presumably nullptr when
// the condition is some other node, matching the isa_ checks above).
OpaqueMultiversioningNode* find_multiversion_opaque_from_multiversion_if_false(Node* maybe_multiversion_if_false) {
|
||||
IfFalseNode* multiversion_if_false = maybe_multiversion_if_false->isa_IfFalse();
|
||||
if (multiversion_if_false == nullptr) { return nullptr; }
|
||||
IfNode* multiversion_if = multiversion_if_false->in(0)->isa_If();
|
||||
if (multiversion_if == nullptr) { return nullptr; }
|
||||
return multiversion_if->in(1)->isa_OpaqueMultiversioning();
|
||||
}
|
||||
|
||||
// For a loop whose head is currently marked as a delayed multiversion slow loop: locate the OpaqueMultiversioning
// node of its multiversion_if and, if that opaque node is no longer flagged as delayed, switch the loop head from
// "delayed slow loop" to "slow loop".
// Returns true if the delayed status was cleared. Returns false if the opaque node still reports
// is_delayed_slow_loop(), or if no opaque node was found (only reachable when the assert below is not active).
bool PhaseIdealLoop::try_resume_optimizations_for_delayed_slow_loop(IdealLoopTree* lpt) {
|
||||
CountedLoopNode* cl = lpt->_head->as_CountedLoop();
|
||||
assert(cl->is_multiversion_delayed_slow_loop(), "must currently be delayed");
|
||||
|
||||
// Find multiversion_if.
|
||||
Node* entry = cl->skip_strip_mined()->in(LoopNode::EntryControl);
|
||||
const Predicates predicates(entry);
|
||||
|
||||
// NOTE(review): predicates.entry() is presumably the control above all predicates at the loop entry - confirm
// against the Predicates class.
Node* slow_path = predicates.entry();
|
||||
|
||||
// Find opaque.
|
||||
OpaqueMultiversioningNode* opaque = nullptr;
|
||||
// slow_path is either a Region (scan its inputs from index 1 and take the first one that leads to an
// opaque node) or it is checked directly as the candidate IfFalse projection.
if (slow_path->is_Region()) {
|
||||
for (uint i = 1; i < slow_path->req(); i++) {
|
||||
Node* n = slow_path->in(i);
|
||||
opaque = find_multiversion_opaque_from_multiversion_if_false(n);
|
||||
if (opaque != nullptr) { break; }
|
||||
}
|
||||
} else {
|
||||
opaque = find_multiversion_opaque_from_multiversion_if_false(slow_path);
|
||||
}
|
||||
assert(opaque != nullptr, "must have found multiversion opaque node");
|
||||
if (opaque == nullptr) { return false; }
|
||||
|
||||
// We may still be delayed, if there were not yet any runtime-checks added
|
||||
// for the multiversioning. We may never add any, and then this loop would
|
||||
// fold away. So we wait until some runtime-checks are added, then we know
|
||||
// that this loop will be reachable and it is worth optimizing further.
|
||||
if (opaque->is_delayed_slow_loop()) { return false; }
|
||||
|
||||
// Clear away the "delayed" status, i.e. resume optimizations.
|
||||
cl->set_no_multiversion();
|
||||
cl->set_multiversion_slow_loop();
|
||||
#ifndef PRODUCT
|
||||
if (TraceLoopOpts) {
|
||||
tty->print("Resume Optimizations ");
|
||||
lpt->dump_head();
|
||||
}
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
bool PhaseIdealLoop::has_control_dependencies_from_predicates(LoopNode* head) {
|
||||
Node* entry = head->skip_strip_mined()->in(LoopNode::EntryControl);
|
||||
const Predicates predicates(entry);
|
||||
@ -377,7 +607,7 @@ void PhaseIdealLoop::trace_loop_unswitching_result(const UnswitchedLoopSelector&
|
||||
const LoopNode* original_head, const LoopNode* new_head) {
|
||||
if (TraceLoopUnswitching) {
|
||||
IfNode* unswitch_candidate = unswitched_loop_selector.unswitch_candidate();
|
||||
IfNode* loop_selector = unswitched_loop_selector.selector();
|
||||
IfNode* loop_selector = unswitched_loop_selector.loop_selector().selector();
|
||||
tty->print_cr("Loop Unswitching:");
|
||||
tty->print_cr("- Unswitch-Candidate-If: %d %s", unswitch_candidate->_idx, unswitch_candidate->Name());
|
||||
tty->print_cr("- Loop-Selector-If: %d %s", loop_selector->_idx, loop_selector->Name());
|
||||
@ -385,22 +615,33 @@ void PhaseIdealLoop::trace_loop_unswitching_result(const UnswitchedLoopSelector&
|
||||
tty->print_cr("- False-Path-Loop (=Clone): %d %s", new_head->_idx, new_head->Name());
|
||||
}
|
||||
}
|
||||
|
||||
// When TraceLoopMultiversioning is set, prints the node index and name of the selector If, of the original loop
// head (true-path / fast loop) and of the cloned loop head (false-path / slow loop) to tty. Does nothing otherwise.
void PhaseIdealLoop::trace_loop_multiversioning_result(const LoopSelector& loop_selector,
|
||||
const LoopNode* original_head, const LoopNode* new_head) {
|
||||
if (TraceLoopMultiversioning) {
|
||||
IfNode* selector_if = loop_selector.selector();
|
||||
tty->print_cr("Loop Multiversioning:");
|
||||
tty->print_cr("- Loop-Selector-If: %d %s", selector_if->_idx, selector_if->Name());
|
||||
tty->print_cr("- True-Path-Loop (=Orig / Fast): %d %s", original_head->_idx, original_head->Name());
|
||||
tty->print_cr("- False-Path-Loop (=Clone / Slow): %d %s", new_head->_idx, new_head->Name());
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// When unswitching a counted loop, we need to convert it back to a normal loop since it's not a proper pre, main, or
|
||||
// post loop anymore after loop unswitching.
|
||||
// post loop anymore after loop unswitching. We also lose the multiversion structure, with access to the multiversion_if.
|
||||
void PhaseIdealLoop::revert_to_normal_loop(const LoopNode* loop_head) {
|
||||
CountedLoopNode* cl = loop_head->isa_CountedLoop();
|
||||
if (cl != nullptr && !cl->is_normal_loop()) {
|
||||
cl->set_normal_loop();
|
||||
}
|
||||
if (cl == nullptr) { return; }
|
||||
if (!cl->is_normal_loop()) { cl->set_normal_loop(); }
|
||||
if (cl->is_multiversion()) { cl->set_no_multiversion(); }
|
||||
}
|
||||
|
||||
// Hoist invariant CheckCastPPNodes out of each unswitched loop version to the appropriate loop selector If projection.
|
||||
void PhaseIdealLoop::hoist_invariant_check_casts(const IdealLoopTree* loop, const Node_List& old_new,
|
||||
const UnswitchedLoopSelector& unswitched_loop_selector) {
|
||||
IfNode* unswitch_candidate = unswitched_loop_selector.unswitch_candidate();
|
||||
IfNode* loop_selector = unswitched_loop_selector.selector();
|
||||
IfNode* loop_selector = unswitched_loop_selector.loop_selector().selector();
|
||||
ResourceMark rm;
|
||||
GrowableArray<CheckCastPPNode*> loop_invariant_check_casts;
|
||||
for (DUIterator_Fast imax, i = unswitch_candidate->fast_outs(imax); i < imax; i++) {
|
||||
|
||||
@ -1090,6 +1090,14 @@ bool PhaseIdealLoop::create_loop_nest(IdealLoopTree* loop, Node_List &old_new) {
|
||||
if (UseProfiledLoopPredicate) {
|
||||
add_parse_predicate(Deoptimization::Reason_profile_predicate, inner_head, outer_ilt, cloned_sfpt);
|
||||
}
|
||||
|
||||
// We only want to use the auto-vectorization check as a trap once per bci. And
|
||||
// PhaseIdealLoop::add_parse_predicate only checks trap limits per method, so
|
||||
// we do a custom check here.
|
||||
if (!C->too_many_traps(cloned_sfpt->jvms()->method(), cloned_sfpt->jvms()->bci(), Deoptimization::Reason_auto_vectorization_check)) {
|
||||
add_parse_predicate(Deoptimization::Reason_auto_vectorization_check, inner_head, outer_ilt, cloned_sfpt);
|
||||
}
|
||||
|
||||
add_parse_predicate(Deoptimization::Reason_loop_limit_check, inner_head, outer_ilt, cloned_sfpt);
|
||||
}
|
||||
|
||||
@ -2511,6 +2519,9 @@ void CountedLoopNode::dump_spec(outputStream *st) const {
|
||||
if (is_main_loop()) st->print("main of N%d", _idx);
|
||||
if (is_post_loop()) st->print("post of N%d", _main_idx);
|
||||
if (is_strip_mined()) st->print(" strip mined");
|
||||
if (is_multiversion_fast_loop()) { st->print(" multiversion_fast"); }
|
||||
if (is_multiversion_slow_loop()) { st->print(" multiversion_slow"); }
|
||||
if (is_multiversion_delayed_slow_loop()) { st->print(" multiversion_delayed_slow"); }
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -4303,6 +4314,9 @@ void IdealLoopTree::dump_head() {
|
||||
if (cl->is_post_loop()) tty->print(" post");
|
||||
if (cl->is_vectorized_loop()) tty->print(" vector");
|
||||
if (range_checks_present()) tty->print(" rc ");
|
||||
if (cl->is_multiversion_fast_loop()) { tty->print(" multiversion_fast"); }
|
||||
if (cl->is_multiversion_slow_loop()) { tty->print(" multiversion_slow"); }
|
||||
if (cl->is_multiversion_delayed_slow_loop()) { tty->print(" multiversion_delayed_slow"); }
|
||||
}
|
||||
if (_has_call) tty->print(" has_call");
|
||||
if (_has_sfpt) tty->print(" has_sfpt");
|
||||
@ -4948,18 +4962,6 @@ void PhaseIdealLoop::build_and_optimize() {
|
||||
C->set_major_progress();
|
||||
}
|
||||
|
||||
// Keep loop predicates and perform optimizations with them
|
||||
// until no more loop optimizations could be done.
|
||||
// After that switch predicates off and do more loop optimizations.
|
||||
if (!C->major_progress() && (C->parse_predicate_count() > 0)) {
|
||||
C->mark_parse_predicate_nodes_useless(_igvn);
|
||||
assert(C->parse_predicate_count() == 0, "should be zero now");
|
||||
if (TraceLoopOpts) {
|
||||
tty->print_cr("PredicatesOff");
|
||||
}
|
||||
C->set_major_progress();
|
||||
}
|
||||
|
||||
// Auto-vectorize main-loop
|
||||
if (C->do_superword() && C->has_loops() && !C->major_progress()) {
|
||||
Compile::TracePhase tp(_t_autoVectorize);
|
||||
@ -4992,6 +4994,18 @@ void PhaseIdealLoop::build_and_optimize() {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Keep loop predicates and perform optimizations with them
|
||||
// until no more loop optimizations could be done.
|
||||
// After that switch predicates off and do more loop optimizations.
|
||||
if (!C->major_progress() && (C->parse_predicate_count() > 0)) {
|
||||
C->mark_parse_predicate_nodes_useless(_igvn);
|
||||
assert(C->parse_predicate_count() == 0, "should be zero now");
|
||||
if (TraceLoopOpts) {
|
||||
tty->print_cr("PredicatesOff");
|
||||
}
|
||||
C->set_major_progress();
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef PRODUCT
|
||||
|
||||
@ -43,6 +43,7 @@ class OuterStripMinedLoopEndNode;
|
||||
class PredicateBlock;
|
||||
class PathFrequency;
|
||||
class PhaseIdealLoop;
|
||||
class LoopSelector;
|
||||
class UnswitchedLoopSelector;
|
||||
class VectorSet;
|
||||
class VSharedData;
|
||||
@ -79,7 +80,12 @@ protected:
|
||||
SubwordLoop = 1<<13,
|
||||
ProfileTripFailed = 1<<14,
|
||||
LoopNestInnerLoop = 1<<15,
|
||||
LoopNestLongOuterLoop = 1<<16 };
|
||||
LoopNestLongOuterLoop = 1<<16,
|
||||
MultiversionFastLoop = 1<<17,
|
||||
MultiversionSlowLoop = 2<<17,
|
||||
MultiversionDelayedSlowLoop = 3<<17,
|
||||
MultiversionFlagsMask = 3<<17,
|
||||
};
|
||||
char _unswitch_count;
|
||||
enum { _unswitch_max=3 };
|
||||
|
||||
@ -315,6 +321,32 @@ public:
|
||||
void set_slp_max_unroll(int unroll_factor) { _slp_maximum_unroll_factor = unroll_factor; }
|
||||
int slp_max_unroll() const { return _slp_maximum_unroll_factor; }
|
||||
|
||||
// Multiversioning allows us to duplicate a CountedLoop, and have two versions, and the multiversion_if
|
||||
// decides which one is taken:
|
||||
// (1) fast_loop: We enter this loop by default, by default the multiversion_if has its condition set to
|
||||
// "true", guarded by an OpaqueMultiversioning. If we want to make a speculative assumption
|
||||
// for an optimization, we can add the runtime-check to the multiversion_if, and if the
|
||||
// assumption fails we take the slow_loop instead, where we do not make the same speculative
|
||||
// assumption.
|
||||
// We call it the "fast_loop" because it has more optimizations, enabled by the speculative
|
||||
// runtime-checks at the multiversion_if, and we expect the fast_loop to execute faster.
|
||||
// (2) slow_loop: By default, it is not taken, until a runtime-check is added to the multiversion_if while
|
||||
// optimizing the fast_loop. If such a runtime-check is never added, then after loop-opts
|
||||
// the multiversion_if constant folds to true, and the slow_loop is folded away. To save
|
||||
// compile time, we delay the optimization of the slow_loop until a runtime-check is added
|
||||
// to the multiversion_if, at which point we resume optimizations for the slow_loop.
|
||||
// We call it the "slow_loop" because it has fewer optimizations, since this is the fall-back
|
||||
// loop where we do not make any of the speculative assumptions we make for the fast_loop.
|
||||
// Hence, we expect the slow_loop to execute slower.
|
||||
bool is_multiversion() const { return (_loop_flags & MultiversionFlagsMask) != Normal; }
|
||||
bool is_multiversion_fast_loop() const { return (_loop_flags & MultiversionFlagsMask) == MultiversionFastLoop; }
|
||||
bool is_multiversion_slow_loop() const { return (_loop_flags & MultiversionFlagsMask) == MultiversionSlowLoop; }
|
||||
bool is_multiversion_delayed_slow_loop() const { return (_loop_flags & MultiversionFlagsMask) == MultiversionDelayedSlowLoop; }
|
||||
void set_multiversion_fast_loop() { assert(!is_multiversion(), ""); _loop_flags |= MultiversionFastLoop; }
|
||||
void set_multiversion_slow_loop() { assert(!is_multiversion(), ""); _loop_flags |= MultiversionSlowLoop; }
|
||||
void set_multiversion_delayed_slow_loop() { assert(!is_multiversion(), ""); _loop_flags |= MultiversionDelayedSlowLoop; }
|
||||
void set_no_multiversion() { assert( is_multiversion(), ""); _loop_flags &= ~MultiversionFlagsMask; }
|
||||
|
||||
virtual LoopNode* skip_strip_mined(int expect_skeleton = 1);
|
||||
OuterStripMinedLoopNode* outer_loop() const;
|
||||
virtual IfTrueNode* outer_loop_tail() const;
|
||||
@ -1457,6 +1489,8 @@ public:
|
||||
static void trace_loop_unswitching_impossible(const LoopNode* original_head);
|
||||
static void trace_loop_unswitching_result(const UnswitchedLoopSelector& unswitched_loop_selector,
|
||||
const LoopNode* original_head, const LoopNode* new_head);
|
||||
static void trace_loop_multiversioning_result(const LoopSelector& loop_selector,
|
||||
const LoopNode* original_head, const LoopNode* new_head);
|
||||
#endif
|
||||
|
||||
public:
|
||||
@ -1483,6 +1517,11 @@ public:
|
||||
};
|
||||
AutoVectorizeStatus auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared);
|
||||
|
||||
void maybe_multiversion_for_auto_vectorization_runtime_checks(IdealLoopTree* lpt, Node_List& old_new);
|
||||
void do_multiversioning(IdealLoopTree* lpt, Node_List& old_new);
|
||||
IfTrueNode* create_new_if_for_multiversion(IfTrueNode* multiversioning_fast_proj);
|
||||
bool try_resume_optimizations_for_delayed_slow_loop(IdealLoopTree* lpt);
|
||||
|
||||
// Move an unordered Reduction out of loop if possible
|
||||
void move_unordered_reduction_out_of_loop(IdealLoopTree* loop);
|
||||
|
||||
|
||||
@ -4482,6 +4482,66 @@ PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) {
|
||||
return AutoVectorizeStatus::Success;
|
||||
}
|
||||
|
||||
// Just before insert_pre_post_loops, we can multi-version the loop:
|
||||
//
|
||||
// multiversion_if
|
||||
// | |
|
||||
// fast_loop slow_loop
|
||||
//
|
||||
// In the fast_loop we can make speculative assumptions, and put the
|
||||
// conditions into the multiversion_if. If the conditions hold at runtime,
|
||||
// we enter the fast_loop, if the conditions fail, we take the slow_loop
|
||||
// instead which does not make any of the speculative assumptions.
|
||||
//
|
||||
// Note: we only multiversion the loop if the loop does not have any
|
||||
// auto vectorization check Predicate. If we have that predicate,
|
||||
// then we can simply add the speculative assumption checks to
|
||||
// that Predicate. This means we do not need to duplicate the
|
||||
// loop - we have a smaller graph and save compile time. Should
|
||||
// the conditions ever fail, then we deopt / trap at the Predicate
|
||||
// and recompile without that Predicate. At that point we will
|
||||
// multiversion the loop, so that we can still have speculative
|
||||
// runtime checks.
|
||||
//
|
||||
// We perform the multiversioning when the loop is still in its single
|
||||
// iteration form, even before we insert pre and post loops. This makes
|
||||
// the cloning much simpler. However, this means that both the fast
|
||||
// and the slow loop have to be optimized independently (adding pre
|
||||
// and post loops, unrolling the main loop, auto-vectorize etc.). And
|
||||
// we may end up not needing any speculative assumptions in the fast_loop
|
||||
// and then rejecting the slow_loop by constant folding the multiversion_if.
|
||||
//
|
||||
// Therefore, we "delay" the optimization of the slow_loop until we add
|
||||
// at least one speculative assumption for the fast_loop. If we never
|
||||
// add such a speculative runtime check, the OpaqueMultiversioningNode
|
||||
// of the multiversion_if constant folds to true after loop opts, and the
|
||||
// multiversion_if folds away the "delayed" slow_loop. If we add any
|
||||
// speculative assumption, then we notify the OpaqueMultiversioningNode
|
||||
// with "notify_slow_loop_that_it_can_resume_optimizations".
|
||||
//
|
||||
// Note: new runtime checks can be added to the multiversion_if with
|
||||
// PhaseIdealLoop::create_new_if_for_multiversion
|
||||
void PhaseIdealLoop::maybe_multiversion_for_auto_vectorization_runtime_checks(IdealLoopTree* lpt, Node_List& old_new) {
  CountedLoopNode* counted_loop = lpt->_head->as_CountedLoop();
  // Entry control of the loop head returned by skip_strip_mined().
  Node* loop_entry = counted_loop->skip_strip_mined()->in(LoopNode::EntryControl);

  // Multiversioning must be enabled.
  if (!LoopMultiversioning) {
    return;
  }
  // The loop must not already be multiversioned.
  if (counted_loop->is_multiversion()) {
    return;
  }

  // If there is an auto vectorization check parse-predicate, the runtime checks can be
  // added there during auto-vectorization, and we do not need to multiversion the loop.
  const Predicates predicates(loop_entry);
  if (predicates.auto_vectorization_check_block()->has_parse_predicate()) {
    return;
  }

  // Only multiversion if the node budget allows for the estimated clone size.
  if (!may_require_nodes(lpt->est_loop_clone_sz(2))) {
    return;
  }

  do_multiversioning(lpt, old_new);
}
|
||||
|
||||
// Returns true if the Reduction node is unordered.
|
||||
static bool is_unordered_reduction(Node* n) {
|
||||
return n->is_Reduction() && !n->as_Reduction()->requires_strict_order();
|
||||
|
||||
@ -229,9 +229,12 @@
|
||||
// Even if we could know that there is some base address to which we add index offsets, we cannot know
|
||||
// if this reference address points to the beginning of a native memory allocation or into the middle,
|
||||
// or outside it. We also have no guarantee for alignment with such a base address.
|
||||
//
|
||||
// Still: we would like to find such a base if possible, and if two pointers are similar (i.e. have the
|
||||
// same summands), we would like to find the same base. Further, it is reasonable to speculatively
|
||||
// assume that such base addresses are aligned (TODO: need to add this speculative check in JDK-8323582).
|
||||
// assume that such base addresses are aligned. We perform such a speculative alignment runtime check
|
||||
// in VTransform::add_speculative_alignment_check.
|
||||
//
|
||||
// A base pointer must have scale = 1, and be accepted by MemPointer::is_native_memory_base_candidate.
|
||||
// It can thus be one of these:
|
||||
// (1) CastX2P
|
||||
|
||||
@ -139,6 +139,7 @@ class NeverBranchNode;
|
||||
class Opaque1Node;
|
||||
class OpaqueLoopInitNode;
|
||||
class OpaqueLoopStrideNode;
|
||||
class OpaqueMultiversioningNode;
|
||||
class OpaqueNotNullNode;
|
||||
class OpaqueInitializedAssertionPredicateNode;
|
||||
class OpaqueTemplateAssertionPredicateNode;
|
||||
@ -800,6 +801,7 @@ public:
|
||||
DEFINE_CLASS_ID(Opaque1, Node, 16)
|
||||
DEFINE_CLASS_ID(OpaqueLoopInit, Opaque1, 0)
|
||||
DEFINE_CLASS_ID(OpaqueLoopStride, Opaque1, 1)
|
||||
DEFINE_CLASS_ID(OpaqueMultiversioning, Opaque1, 2)
|
||||
DEFINE_CLASS_ID(OpaqueNotNull, Node, 17)
|
||||
DEFINE_CLASS_ID(OpaqueInitializedAssertionPredicate, Node, 18)
|
||||
DEFINE_CLASS_ID(OpaqueTemplateAssertionPredicate, Node, 19)
|
||||
@ -982,6 +984,7 @@ public:
|
||||
DEFINE_CLASS_QUERY(OpaqueTemplateAssertionPredicate)
|
||||
DEFINE_CLASS_QUERY(OpaqueLoopInit)
|
||||
DEFINE_CLASS_QUERY(OpaqueLoopStride)
|
||||
DEFINE_CLASS_QUERY(OpaqueMultiversioning)
|
||||
DEFINE_CLASS_QUERY(OuterStripMinedLoop)
|
||||
DEFINE_CLASS_QUERY(OuterStripMinedLoopEnd)
|
||||
DEFINE_CLASS_QUERY(Parm)
|
||||
|
||||
@ -91,6 +91,29 @@ public:
|
||||
IfNode* if_node() const;
|
||||
};
|
||||
|
||||
// This node is used to mark the auto vectorization Predicate.
|
||||
// At first, the multiversion_if has its condition set to "true" and we always
|
||||
// take the fast_loop. Since we do not know if the slow_loop is ever going to
|
||||
// be used, we delay optimizations for it. Once the fast_loop decides to use
|
||||
// speculative runtime-checks and adds them to the multiversion_if, the slow_loop
|
||||
// can now resume optimizations, as it is reachable at runtime.
|
||||
// See PhaseIdealLoop::maybe_multiversion_for_auto_vectorization_runtime_checks
|
||||
class OpaqueMultiversioningNode : public Opaque1Node {
|
||||
private:
|
||||
bool _is_delayed_slow_loop;
|
||||
|
||||
public:
|
||||
OpaqueMultiversioningNode(Compile* C, Node* n) :
|
||||
Opaque1Node(C, n), _is_delayed_slow_loop(true)
|
||||
{
|
||||
init_class_id(Class_OpaqueMultiversioning);
|
||||
}
|
||||
virtual int Opcode() const;
|
||||
virtual const Type* bottom_type() const { return TypeInt::BOOL; }
|
||||
bool is_delayed_slow_loop() const { return _is_delayed_slow_loop; }
|
||||
void notify_slow_loop_that_it_can_resume_optimizations() { _is_delayed_slow_loop = false; }
|
||||
};
|
||||
|
||||
// This node is used in the context of intrinsics. We sometimes implicitly know that an object is non-null even though
|
||||
// the compiler cannot prove it. We therefore add a corresponding cast to propagate this implicit knowledge. However,
|
||||
// this cast could become top during optimizations (input to cast becomes null) and the data path is folded. To ensure
|
||||
|
||||
@ -64,14 +64,17 @@
|
||||
flags(AFTER_LOOP_PEELING, "After Loop Peeling") \
|
||||
flags(BEFORE_LOOP_UNSWITCHING, "Before Loop Unswitching") \
|
||||
flags(AFTER_LOOP_UNSWITCHING, "After Loop Unswitching") \
|
||||
flags(BEFORE_LOOP_MULTIVERSIONING, "Before Loop Multiversioning") \
|
||||
flags(AFTER_LOOP_MULTIVERSIONING, "After Loop Multiversioning") \
|
||||
flags(BEFORE_RANGE_CHECK_ELIMINATION, "Before Range Check Elimination") \
|
||||
flags(AFTER_RANGE_CHECK_ELIMINATION, "After Range Check Elimination") \
|
||||
flags(BEFORE_PRE_MAIN_POST, "Before Pre/Main/Post Loops") \
|
||||
flags(AFTER_PRE_MAIN_POST, "After Pre/Main/Post Loops") \
|
||||
flags(AUTO_VECTORIZATION1_BEFORE_APPLY, "AutoVectorization 1, Before Apply") \
|
||||
flags(AUTO_VECTORIZATION2_AFTER_REORDER, "AutoVectorization 2, After Apply Memop Reordering") \
|
||||
flags(AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, "AutoVectorization 3, After Adjusting Pre-Loop Limit") \
|
||||
flags(AUTO_VECTORIZATION4_AFTER_APPLY, "AutoVectorization 4, After Apply") \
|
||||
flags(AUTO_VECTORIZATION1_BEFORE_APPLY, "AutoVectorization 1, Before Apply") \
|
||||
flags(AUTO_VECTORIZATION2_AFTER_REORDER, "AutoVectorization 2, After Apply Memop Reordering") \
|
||||
flags(AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, "AutoVectorization 3, After Adjusting Pre-Loop Limit") \
|
||||
flags(AUTO_VECTORIZATION4_AFTER_SPECULATIVE_RUNTIME_CHECKS, "AutoVectorization 4, After Adding Speculative Runtime Checks") \
|
||||
flags(AUTO_VECTORIZATION5_AFTER_APPLY, "AutoVectorization 5, After Apply") \
|
||||
flags(BEFORE_CLOOPS, "Before CountedLoop") \
|
||||
flags(AFTER_CLOOPS, "After CountedLoop") \
|
||||
flags(PHASEIDEAL_BEFORE_EA, "PhaseIdealLoop before EA") \
|
||||
|
||||
@ -120,6 +120,7 @@ bool RuntimePredicate::has_valid_uncommon_trap(const Node* success_proj) {
|
||||
assert(RegularPredicate::may_be_predicate_if(success_proj), "must have been checked before");
|
||||
const Deoptimization::DeoptReason deopt_reason = uncommon_trap_reason(success_proj->as_IfProj());
|
||||
return (deopt_reason == Deoptimization::Reason_loop_limit_check ||
|
||||
deopt_reason == Deoptimization::Reason_auto_vectorization_check ||
|
||||
deopt_reason == Deoptimization::Reason_predicate ||
|
||||
deopt_reason == Deoptimization::Reason_profile_predicate);
|
||||
}
|
||||
@ -893,6 +894,8 @@ void Predicates::dump() const {
|
||||
tty->print_cr("%d %s:", loop_head->_idx, loop_head->Name());
|
||||
tty->print_cr("- Loop Limit Check Predicate Block:");
|
||||
_loop_limit_check_predicate_block.dump(" ");
|
||||
tty->print_cr("- Auto Vectorization Check Block:");
|
||||
_auto_vectorization_check_block.dump(" ");
|
||||
tty->print_cr("- Profiled Loop Predicate Block:");
|
||||
_profiled_loop_predicate_block.dump(" ");
|
||||
tty->print_cr("- Loop Predicate Block:");
|
||||
|
||||
@ -734,6 +734,8 @@ class PredicateIterator : public StackObj {
|
||||
Node* current = _start_node;
|
||||
PredicateBlockIterator loop_limit_check_predicate_iterator(current, Deoptimization::Reason_loop_limit_check);
|
||||
current = loop_limit_check_predicate_iterator.for_each(predicate_visitor);
|
||||
PredicateBlockIterator auto_vectorization_check_iterator(current, Deoptimization::Reason_auto_vectorization_check);
|
||||
current = auto_vectorization_check_iterator.for_each(predicate_visitor);
|
||||
if (UseLoopPredicate) {
|
||||
if (UseProfiledLoopPredicate) {
|
||||
PredicateBlockIterator profiled_loop_predicate_iterator(current, Deoptimization::Reason_profile_predicate);
|
||||
@ -906,6 +908,7 @@ class PredicateBlock : public StackObj {
|
||||
class Predicates : public StackObj {
|
||||
Node* const _tail;
|
||||
const PredicateBlock _loop_limit_check_predicate_block;
|
||||
const PredicateBlock _auto_vectorization_check_block;
|
||||
const PredicateBlock _profiled_loop_predicate_block;
|
||||
const PredicateBlock _loop_predicate_block;
|
||||
Node* const _entry;
|
||||
@ -914,7 +917,9 @@ class Predicates : public StackObj {
|
||||
explicit Predicates(Node* loop_entry)
|
||||
: _tail(loop_entry),
|
||||
_loop_limit_check_predicate_block(loop_entry, Deoptimization::Reason_loop_limit_check),
|
||||
_profiled_loop_predicate_block(_loop_limit_check_predicate_block.entry(),
|
||||
_auto_vectorization_check_block(_loop_limit_check_predicate_block.entry(),
|
||||
Deoptimization::Reason_auto_vectorization_check),
|
||||
_profiled_loop_predicate_block(_auto_vectorization_check_block.entry(),
|
||||
Deoptimization::Reason_profile_predicate),
|
||||
_loop_predicate_block(_profiled_loop_predicate_block.entry(),
|
||||
Deoptimization::Reason_predicate),
|
||||
@ -935,6 +940,10 @@ class Predicates : public StackObj {
|
||||
return &_profiled_loop_predicate_block;
|
||||
}
|
||||
|
||||
const PredicateBlock* auto_vectorization_check_block() const {
|
||||
return &_auto_vectorization_check_block;
|
||||
}
|
||||
|
||||
const PredicateBlock* loop_limit_check_predicate_block() const {
|
||||
return &_loop_limit_check_predicate_block;
|
||||
}
|
||||
|
||||
@ -1484,7 +1484,8 @@ const AlignmentSolution* SuperWord::pack_alignment_solution(const Node_List* pac
|
||||
pack->size(),
|
||||
pre_end->init_trip(),
|
||||
pre_end->stride_con(),
|
||||
iv_stride()
|
||||
iv_stride(),
|
||||
_vloop.are_speculative_checks_possible()
|
||||
DEBUG_ONLY(COMMA is_trace_align_vector()));
|
||||
return solver.solve();
|
||||
}
|
||||
@ -1896,6 +1897,7 @@ bool SuperWord::schedule_and_apply() const {
|
||||
VTransformTrace trace(_vloop.vtrace(),
|
||||
is_trace_superword_rejections(),
|
||||
is_trace_align_vector(),
|
||||
_vloop.is_trace_speculative_runtime_checks(),
|
||||
is_trace_superword_info());
|
||||
#endif
|
||||
VTransform vtransform(_vloop_analyzer,
|
||||
@ -1938,8 +1940,11 @@ void VTransform::apply() {
|
||||
adjust_pre_loop_limit_to_align_main_loop_vectors();
|
||||
C->print_method(PHASE_AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, 4, cl());
|
||||
|
||||
apply_speculative_runtime_checks();
|
||||
C->print_method(PHASE_AUTO_VECTORIZATION4_AFTER_SPECULATIVE_RUNTIME_CHECKS, 4, cl());
|
||||
|
||||
apply_vectorization();
|
||||
C->print_method(PHASE_AUTO_VECTORIZATION4_AFTER_APPLY, 4, cl());
|
||||
C->print_method(PHASE_AUTO_VECTORIZATION5_AFTER_APPLY, 4, cl());
|
||||
}
|
||||
|
||||
// We prepare the memory graph for the replacement of scalar memops with vector memops.
|
||||
|
||||
@ -29,25 +29,26 @@
|
||||
#include "utilities/stringUtils.hpp"
|
||||
|
||||
// List of all TraceAutoVectorization tags. The macro applies "flags(name, description)"
// to every entry, in the order given here. Each tag must be listed exactly once, because
// the list is expanded into enumerators (see table_entry / TraceAutoVectorizationTag).
#define COMPILER_TRACE_AUTO_VECTORIZATION_TAG(flags) \
  flags(POINTER_PARSING,            "Trace VPointer/MemPointer parsing") \
  flags(POINTER_ALIASING,           "Trace VPointer/MemPointer aliasing") \
  flags(POINTER_ADJACENCY,          "Trace VPointer/MemPointer adjacency") \
  flags(POINTER_OVERLAP,            "Trace VPointer/MemPointer overlap") \
  flags(PRECONDITIONS,              "Trace VLoop::check_preconditions") \
  flags(LOOP_ANALYZER,              "Trace VLoopAnalyzer::setup_submodules") \
  flags(MEMORY_SLICES,              "Trace VLoopMemorySlices") \
  flags(BODY,                       "Trace VLoopBody") \
  flags(TYPES,                      "Trace VLoopTypes") \
  flags(POINTERS,                   "Trace VLoopPointers") \
  flags(DEPENDENCY_GRAPH,           "Trace VLoopDependencyGraph") \
  flags(SW_ADJACENT_MEMOPS,         "Trace SuperWord::find_adjacent_memop_pairs") \
  flags(SW_REJECTIONS,              "Trace SuperWord rejections (non vectorizations)") \
  flags(SW_PACKSET,                 "Trace SuperWord packset at different stages") \
  flags(SW_INFO,                    "Trace SuperWord info (equivalent to TraceSuperWord)") \
  flags(SW_VERBOSE,                 "Trace SuperWord verbose (all SW tags enabled)") \
  flags(ALIGN_VECTOR,               "Trace AlignVector") \
  flags(SPECULATIVE_RUNTIME_CHECKS, "Trace VTransform::apply_speculative_runtime_checks") \
  flags(VTRANSFORM,                 "Trace VTransform Graph") \
  flags(ALL,                        "Trace everything (very verbose)")
|
||||
|
||||
#define table_entry(name, description) name,
|
||||
enum TraceAutoVectorizationTag {
|
||||
|
||||
@ -93,9 +93,9 @@ VStatus VLoop::check_preconditions_helper() {
|
||||
return VStatus::make_failure(VLoop::FAILURE_BACKEDGE);
|
||||
}
|
||||
|
||||
// To align vector memory accesses in the main-loop, we will have to adjust
|
||||
// the pre-loop limit.
|
||||
if (_cl->is_main_loop()) {
|
||||
// To align vector memory accesses in the main-loop, we will have to adjust
|
||||
// the pre-loop limit.
|
||||
CountedLoopEndNode* pre_end = _cl->find_pre_loop_end();
|
||||
if (pre_end == nullptr) {
|
||||
return VStatus::make_failure(VLoop::FAILURE_PRE_LOOP_LIMIT);
|
||||
@ -105,6 +105,41 @@ VStatus VLoop::check_preconditions_helper() {
|
||||
return VStatus::make_failure(VLoop::FAILURE_PRE_LOOP_LIMIT);
|
||||
}
|
||||
_pre_loop_end = pre_end;
|
||||
|
||||
// See if we find the infrastructure for speculative runtime-checks.
|
||||
// (1) Auto Vectorization Parse Predicate
|
||||
Node* pre_ctrl = pre_loop_head()->in(LoopNode::EntryControl);
|
||||
const Predicates predicates(pre_ctrl);
|
||||
const PredicateBlock* predicate_block = predicates.auto_vectorization_check_block();
|
||||
if (predicate_block->has_parse_predicate()) {
|
||||
_auto_vectorization_parse_predicate_proj = predicate_block->parse_predicate_success_proj();
|
||||
}
|
||||
|
||||
// (2) Multiversioning fast-loop projection
|
||||
IfTrueNode* before_predicates = predicates.entry()->isa_IfTrue();
|
||||
if (before_predicates != nullptr &&
|
||||
before_predicates->in(0)->is_If() &&
|
||||
before_predicates->in(0)->in(1)->is_OpaqueMultiversioning()) {
|
||||
_multiversioning_fast_proj = before_predicates;
|
||||
}
|
||||
#ifndef PRODUCT
|
||||
if (is_trace_preconditions() || is_trace_speculative_runtime_checks()) {
|
||||
tty->print_cr(" Infrastructure for speculative runtime-checks:");
|
||||
if (_auto_vectorization_parse_predicate_proj != nullptr) {
|
||||
tty->print_cr(" auto_vectorization_parse_predicate_proj: speculate and trap");
|
||||
_auto_vectorization_parse_predicate_proj->dump_bfs(5,0,"");
|
||||
} else if (_multiversioning_fast_proj != nullptr) {
|
||||
tty->print_cr(" multiversioning_fast_proj: speculate and multiversion");
|
||||
_multiversioning_fast_proj->dump_bfs(5,0,"");
|
||||
} else {
|
||||
tty->print_cr(" Not found.");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
assert(_auto_vectorization_parse_predicate_proj == nullptr ||
|
||||
_multiversioning_fast_proj == nullptr, "we should only have at most one of these");
|
||||
assert(_cl->is_multiversion_fast_loop() == (_multiversioning_fast_proj != nullptr),
|
||||
"must find the multiversion selector IFF loop is a multiversion fast loop");
|
||||
}
|
||||
|
||||
return VStatus::make_success();
|
||||
@ -472,15 +507,28 @@ AlignmentSolution* AlignmentSolver::solve() const {
|
||||
// + con + con + C_const (sum of constant terms)
|
||||
//
|
||||
// We describe the 6 terms:
|
||||
// 1) The "base" of the address is the address of a Java object (e.g. array),
|
||||
// and as such ObjectAlignmentInBytes (a power of 2) aligned. We have
|
||||
// defined aw = MIN(vector_width, ObjectAlignmentInBytes), which is also
|
||||
// 1) The "base" of the address:
|
||||
// - For heap objects, this is the base of the object, and as such
|
||||
// ObjectAlignmentInBytes (a power of 2) aligned.
|
||||
// - For off-heap / native memory, the "base" has no alignment
|
||||
// guarantees. To ensure alignment we can do either of these:
|
||||
// - Add a runtime check to verify ObjectAlignmentInBytes alignment,
|
||||
// i.e. we can speculatively compile with an alignment assumption.
|
||||
// If we pass the check, we can go into the loop with the alignment
|
||||
// assumption, if we fail we have to trap/deopt or take the other
|
||||
// loop version without alignment assumptions.
|
||||
// - If runtime checks are not possible, then we return an empty
|
||||
// solution, i.e. we do not vectorize the corresponding pack.
|
||||
//
|
||||
// Let us assume we have an object "base", or passed the alignment
|
||||
// runtime check for native "bases", hence we know:
|
||||
//
|
||||
// base % ObjectAlignmentInBytes = 0
|
||||
//
|
||||
// We defined aw = MIN(vector_width, ObjectAlignmentInBytes), which is
|
||||
// a power of 2. And hence we know that "base" is thus also aw-aligned:
|
||||
//
|
||||
// base % ObjectAlignmentInBytes = 0 ==> base % aw = 0
|
||||
//
|
||||
// TODO: Note: we have been assuming that this also holds for native memory base
|
||||
// addresses. This is incorrect, see JDK-8323582.
|
||||
// base % ObjectAlignmentInBytes = 0 ==> base % aw = 0 (BASE_ALIGNED)
|
||||
//
|
||||
// 2) The "C_const" term is the sum of all constant terms. This is "con",
|
||||
// plus "iv_scale * init" if it is constant.
|
||||
@ -505,6 +553,13 @@ AlignmentSolution* AlignmentSolver::solve() const {
|
||||
// 6) The "C_main * main_iter" term represents how much the iv is increased
|
||||
// during "main_iter" main-loop iterations.
|
||||
|
||||
// For native memory, we must add a runtime-check that "base % ObjectAlignmentInBytes = 0",
|
||||
// to ensure (BASE_ALIGNED). If we cannot add this runtime-check, we have no guarantee on
|
||||
// its alignment.
|
||||
if (!_vpointer.mem_pointer().base().is_object() && !_are_speculative_checks_possible) {
|
||||
return new EmptyAlignmentSolution("Cannot add speculative check for native memory alignment.");
|
||||
}
|
||||
|
||||
// Attribute init (i.e. _init_node) either to C_const or to C_init term.
|
||||
const int C_const_init = _init_node->is_ConI() ? _init_node->as_ConI()->get_int() : 0;
|
||||
const int C_const = _vpointer.con() + C_const_init * iv_scale();
|
||||
@ -521,8 +576,7 @@ AlignmentSolution* AlignmentSolver::solve() const {
|
||||
// We must find a pre_iter, such that adr is aw aligned: adr % aw = 0. Note, that we are defining the
|
||||
// modulo operator "%" such that the remainder is always positive, see AlignmentSolution::mod(i, q).
|
||||
//
|
||||
// TODO: Note: the following assumption is incorrect for native memory bases, see JDK-8323582.
|
||||
// Since "base % aw = 0", we only need to ensure alignment of the other 5 terms:
|
||||
// Since "base % aw = 0" (BASE_ALIGNED), we only need to ensure alignment of the other 5 terms:
|
||||
//
|
||||
// (C_const + C_invar * var_invar + C_init * var_init + C_pre * pre_iter + C_main * main_iter) % aw = 0 (1)
|
||||
//
|
||||
@ -878,8 +932,7 @@ AlignmentSolution* AlignmentSolver::solve() const {
|
||||
// + iv_scale * pre_stride * pre_iter
|
||||
// + iv_scale * main_stride * main_iter)) % aw =
|
||||
//
|
||||
// -> base aligned: base % aw = 0
|
||||
// TODO: Note: this assumption is incorrect for native memory bases, see JDK-8323582.
|
||||
// -> apply (BASE_ALIGNED): base % aw = 0
|
||||
// -> main-loop iterations aligned (2): C_main % aw = (iv_scale * main_stride) % aw = 0
|
||||
// (con + invar + iv_scale * init + iv_scale * pre_stride * pre_iter) % aw =
|
||||
//
|
||||
@ -958,7 +1011,7 @@ void AlignmentSolver::trace_start_solve() const {
|
||||
_pre_stride, _main_stride);
|
||||
// adr = base + con + invar + iv_scale * iv
|
||||
tty->print(" adr = base[%d]", base().object_or_native()->_idx);
|
||||
tty->print(" + invar + iv_scale(%d) * iv + con(%d)", iv_scale(), _vpointer.con());
|
||||
tty->print_cr(" + invar + iv_scale(%d) * iv + con(%d)", iv_scale(), _vpointer.con());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -85,6 +85,14 @@ private:
|
||||
PhiNode* _iv;
|
||||
CountedLoopEndNode* _pre_loop_end; // cache access to pre-loop for main loops only
|
||||
|
||||
// We can add speculative runtime-checks if we have one of these:
|
||||
// - Auto Vectorization Parse Predicate:
|
||||
// pass all checks or trap -> recompile without this predicate.
|
||||
// - Multiversioning fast-loop projection:
|
||||
// pass all checks or go to slow-path-loop, where we have no speculative assumptions.
|
||||
ParsePredicateSuccessProj* _auto_vectorization_parse_predicate_proj;
|
||||
IfTrueNode* _multiversioning_fast_proj;
|
||||
|
||||
NOT_PRODUCT(VTrace _vtrace;)
|
||||
NOT_PRODUCT(TraceMemPointer _mptrace;)
|
||||
|
||||
@ -104,7 +112,9 @@ public:
|
||||
_cl (nullptr),
|
||||
_cl_exit (nullptr),
|
||||
_iv (nullptr),
|
||||
_pre_loop_end (nullptr)
|
||||
_pre_loop_end (nullptr),
|
||||
_auto_vectorization_parse_predicate_proj(nullptr),
|
||||
_multiversioning_fast_proj(nullptr)
|
||||
#ifndef PRODUCT
|
||||
COMMA
|
||||
_mptrace(TraceMemPointer(
|
||||
@ -138,6 +148,19 @@ public:
|
||||
return head;
|
||||
};
|
||||
|
||||
ParsePredicateSuccessProj* auto_vectorization_parse_predicate_proj() const {
|
||||
return _auto_vectorization_parse_predicate_proj;
|
||||
}
|
||||
|
||||
IfTrueNode* multiversioning_fast_proj() const {
|
||||
return _multiversioning_fast_proj;
|
||||
}
|
||||
|
||||
bool are_speculative_checks_possible() const {
|
||||
return _auto_vectorization_parse_predicate_proj != nullptr ||
|
||||
_multiversioning_fast_proj != nullptr;
|
||||
}
|
||||
|
||||
// Estimate maximum size for data structures, to avoid repeated reallocation
|
||||
int estimated_body_length() const { return lpt()->_body.size(); };
|
||||
int estimated_node_count() const { return (int)(1.10 * phase()->C->unique()); };
|
||||
@ -176,6 +199,10 @@ public:
|
||||
bool is_trace_vpointers() const {
|
||||
return _vtrace.is_trace(TraceAutoVectorizationTag::POINTERS);
|
||||
}
|
||||
|
||||
bool is_trace_speculative_runtime_checks() const {
|
||||
return _vtrace.is_trace(TraceAutoVectorizationTag::SPECULATIVE_RUNTIME_CHECKS);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Is the node in the basic block of the loop?
|
||||
@ -1296,6 +1323,14 @@ private:
|
||||
const int _pre_stride; // address increment per pre-loop iteration
|
||||
const int _main_stride; // address increment per main-loop iteration
|
||||
|
||||
// For native bases, we have no alignment guarantee. This means we cannot in
|
||||
// general guarantee alignment statically. But we can check alignment with a
|
||||
// speculative runtime check, see VTransform::apply_speculative_runtime_checks.
|
||||
// For this, we need to find the Predicate for auto vectorization checks, or else
|
||||
// we need to find the multiversion_if. If we cannot find either, then we
|
||||
// cannot make any speculative runtime checks.
|
||||
const bool _are_speculative_checks_possible;
|
||||
|
||||
DEBUG_ONLY( const bool _is_trace; );
|
||||
|
||||
static const MemNode* mem_ref_not_null(const MemNode* mem_ref) {
|
||||
@ -1309,7 +1344,8 @@ public:
|
||||
const uint vector_length,
|
||||
const Node* init_node,
|
||||
const int pre_stride,
|
||||
const int main_stride
|
||||
const int main_stride,
|
||||
const bool are_speculative_checks_possible
|
||||
DEBUG_ONLY( COMMA const bool is_trace)
|
||||
) :
|
||||
_vpointer( vpointer),
|
||||
@ -1318,7 +1354,8 @@ public:
|
||||
_aw( MIN2(_vector_width, ObjectAlignmentInBytes)),
|
||||
_init_node( init_node),
|
||||
_pre_stride( pre_stride),
|
||||
_main_stride( main_stride)
|
||||
_main_stride( main_stride),
|
||||
_are_speculative_checks_possible(are_speculative_checks_possible)
|
||||
DEBUG_ONLY( COMMA _is_trace(is_trace) )
|
||||
{
|
||||
assert(_mem_ref != nullptr &&
|
||||
|
||||
@ -23,6 +23,7 @@
|
||||
|
||||
#include "opto/vtransform.hpp"
|
||||
#include "opto/vectornode.hpp"
|
||||
#include "opto/castnode.hpp"
|
||||
#include "opto/convertnode.hpp"
|
||||
|
||||
void VTransformGraph::add_vtnode(VTransformNode* vtnode) {
|
||||
@ -143,6 +144,94 @@ void VTransformApplyResult::trace(VTransformNode* vtnode) const {
|
||||
}
|
||||
#endif
|
||||
|
||||
// Emit the speculative runtime checks for this transform. Work is only done
// when VLoop::vectors_should_be_aligned() holds: in that case every vector
// vtnode whose first node is a memory op with a native (non-object) base gets
// a check that the base address is ObjectAlignmentInBytes-aligned.
void VTransform::apply_speculative_runtime_checks() {
|
||||
if (VLoop::vectors_should_be_aligned()) {
|
||||
#ifdef ASSERT
|
||||
if (_trace._align_vector || _trace._speculative_runtime_checks) {
|
||||
tty->print_cr("\nVTransform::apply_speculative_runtime_checks: native memory alignment");
|
||||
}
|
||||
#endif
|
||||
|
||||
const GrowableArray<VTransformNode*>& vtnodes = _graph.vtnodes();
|
||||
for (int i = 0; i < vtnodes.length(); i++) {
|
||||
// Skip everything that is not a vector vtnode.
VTransformVectorNode* vtn = vtnodes.at(i)->isa_Vector();
|
||||
if (vtn == nullptr) { continue; }
|
||||
// Skip vectors whose first packed node is not a memory operation.
MemNode* p0 = vtn->nodes().at(0)->isa_Mem();
|
||||
if (p0 == nullptr) { continue; }
|
||||
// Object bases need no runtime check here; only native bases continue.
const VPointer& vp = vpointer(p0);
|
||||
if (vp.mem_pointer().base().is_object()) { continue; }
|
||||
assert(vp.mem_pointer().base().is_native(), "VPointer base must be object or native");
|
||||
|
||||
// We have a native memory reference. Build a runtime check for it.
|
||||
// See: AlignmentSolver::solve
|
||||
// In a future RFE we may be able to speculate on invar alignment as
|
||||
// well, and allow vectorization of more cases.
|
||||
add_speculative_alignment_check(vp.mem_pointer().base().native(), ObjectAlignmentInBytes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Tracing helper for the speculative-check builders below. Inside DEBUG_ONLY,
// and only if align-vector or speculative-runtime-check tracing is enabled, it
// prints the stringified name of the argument followed by the node's dump().
#define TRACE_SPECULATIVE_ALIGNMENT_CHECK(node) { \
|
||||
DEBUG_ONLY( \
|
||||
if (_trace._align_vector || _trace._speculative_runtime_checks) { \
|
||||
tty->print(" " #node ": "); \
|
||||
node->dump(); \
|
||||
} \
|
||||
) \
|
||||
} \
|
||||
|
||||
// Check: (node % alignment) == 0.
// The check is built as "(int(node) & (alignment - 1)) == 0" and handed to
// add_speculative_check. The mask form equals the modulo form only when
// alignment is a power of 2 (assumed here; the visible caller passes
// ObjectAlignmentInBytes). All new nodes are registered at the ctrl of the
// incoming node.
|
||||
void VTransform::add_speculative_alignment_check(Node* node, juint alignment) {
|
||||
TRACE_SPECULATIVE_ALIGNMENT_CHECK(node);
|
||||
Node* ctrl = phase()->get_ctrl(node);
|
||||
|
||||
// Cast adr/long -> int
|
||||
if (node->bottom_type()->basic_type() == T_ADDRESS) {
|
||||
// adr -> int/long
|
||||
node = new CastP2XNode(nullptr, node);
|
||||
phase()->register_new_node(node, ctrl);
|
||||
TRACE_SPECULATIVE_ALIGNMENT_CHECK(node);
|
||||
}
|
||||
if (node->bottom_type()->basic_type() == T_LONG) {
|
||||
// long -> int
|
||||
node = new ConvL2INode(node);
|
||||
phase()->register_new_node(node, ctrl);
|
||||
TRACE_SPECULATIVE_ALIGNMENT_CHECK(node);
|
||||
}
|
||||
|
||||
// Keep only the low bits selected by the mask (alignment - 1).
Node* mask_alignment = igvn().intcon(alignment-1);
|
||||
Node* base_alignment = new AndINode(node, mask_alignment);
|
||||
phase()->register_new_node(base_alignment, ctrl);
|
||||
TRACE_SPECULATIVE_ALIGNMENT_CHECK(mask_alignment);
|
||||
TRACE_SPECULATIVE_ALIGNMENT_CHECK(base_alignment);
|
||||
|
||||
// The check passes iff the masked bits are all zero.
Node* zero = igvn().intcon(0);
|
||||
Node* cmp_alignment = CmpNode::make(base_alignment, zero, T_INT, false);
|
||||
BoolNode* bol_alignment = new BoolNode(cmp_alignment, BoolTest::eq);
|
||||
phase()->register_new_node(cmp_alignment, ctrl);
|
||||
phase()->register_new_node(bol_alignment, ctrl);
|
||||
TRACE_SPECULATIVE_ALIGNMENT_CHECK(cmp_alignment);
|
||||
TRACE_SPECULATIVE_ALIGNMENT_CHECK(bol_alignment);
|
||||
|
||||
add_speculative_check(bol_alignment);
|
||||
}
|
||||
|
||||
// Install "bol" as the condition of a newly created speculative check.
// If the loop has an auto vectorization Parse Predicate projection, the new If
// is created for that predicate with Reason_auto_vectorization_check;
// otherwise it is created for the multiversioning fast-loop projection.
// Requires that at least one of the two exists (asserted below).
void VTransform::add_speculative_check(BoolNode* bol) {
|
||||
assert(_vloop.are_speculative_checks_possible(), "otherwise we cannot make speculative assumptions");
|
||||
ParsePredicateSuccessProj* parse_predicate_proj = _vloop.auto_vectorization_parse_predicate_proj();
|
||||
IfTrueNode* new_check_proj = nullptr;
|
||||
// The Parse Predicate is used whenever it is available; multiversioning is the fallback.
if (parse_predicate_proj != nullptr) {
|
||||
new_check_proj = phase()->create_new_if_for_predicate(parse_predicate_proj, nullptr,
|
||||
Deoptimization::Reason_auto_vectorization_check,
|
||||
Op_If);
|
||||
} else {
|
||||
new_check_proj = phase()->create_new_if_for_multiversion(_vloop.multiversioning_fast_proj());
|
||||
}
|
||||
// The new If is the control input of the returned projection; input 1 of
// that If is replaced with the speculative condition.
Node* iff_speculate = new_check_proj->in(0);
|
||||
igvn().replace_input_of(iff_speculate, 1, bol);
|
||||
TRACE_SPECULATIVE_ALIGNMENT_CHECK(iff_speculate);
|
||||
}
|
||||
|
||||
// Helper-class for VTransformGraph::has_store_to_load_forwarding_failure.
|
||||
// It wraps a VPointer. The VPointer has an iv_offset applied, which
|
||||
// simulates a virtual unrolling. They represent the memory region:
|
||||
|
||||
@ -109,16 +109,19 @@ public:
|
||||
const bool _verbose;
|
||||
const bool _rejections;
|
||||
const bool _align_vector;
|
||||
const bool _speculative_runtime_checks;
|
||||
const bool _info;
|
||||
|
||||
VTransformTrace(const VTrace& vtrace,
|
||||
const bool is_trace_rejections,
|
||||
const bool is_trace_align_vector,
|
||||
const bool is_trace_speculative_runtime_checks,
|
||||
const bool is_trace_info) :
|
||||
_verbose (vtrace.is_trace(TraceAutoVectorizationTag::ALL)),
|
||||
_rejections (_verbose | is_trace_vtransform(vtrace) | is_trace_rejections),
|
||||
_align_vector(_verbose | is_trace_vtransform(vtrace) | is_trace_align_vector),
|
||||
_info (_verbose | is_trace_vtransform(vtrace) | is_trace_info) {}
|
||||
_verbose (vtrace.is_trace(TraceAutoVectorizationTag::ALL)),
|
||||
_rejections (_verbose | is_trace_vtransform(vtrace) | is_trace_rejections),
|
||||
_align_vector (_verbose | is_trace_vtransform(vtrace) | is_trace_align_vector),
|
||||
_speculative_runtime_checks(_verbose | is_trace_vtransform(vtrace) | is_trace_speculative_runtime_checks),
|
||||
_info (_verbose | is_trace_vtransform(vtrace) | is_trace_info) {}
|
||||
|
||||
static bool is_trace_vtransform(const VTrace& vtrace) {
|
||||
return vtrace.is_trace(TraceAutoVectorizationTag::VTRANSFORM);
|
||||
@ -245,6 +248,10 @@ private:
|
||||
void determine_mem_ref_and_aw_for_main_loop_alignment();
|
||||
void adjust_pre_loop_limit_to_align_main_loop_vectors();
|
||||
|
||||
void apply_speculative_runtime_checks();
|
||||
void add_speculative_alignment_check(Node* node, juint alignment);
|
||||
void add_speculative_check(BoolNode* bol);
|
||||
|
||||
void apply_vectorization() const;
|
||||
};
|
||||
|
||||
|
||||
@ -2717,6 +2717,7 @@ const char* Deoptimization::_trap_reason_name[] = {
|
||||
"intrinsic" JVMCI_ONLY("_or_type_checked_inlining"),
|
||||
"bimorphic" JVMCI_ONLY("_or_optimized_type_check"),
|
||||
"profile_predicate",
|
||||
"auto_vectorization_check",
|
||||
"unloaded",
|
||||
"uninitialized",
|
||||
"initialized",
|
||||
|
||||
@ -98,6 +98,7 @@ class Deoptimization : AllStatic {
|
||||
#endif
|
||||
|
||||
Reason_profile_predicate, // compiler generated predicate moved from frequent branch in a loop failed
|
||||
Reason_auto_vectorization_check, // compiler generated (speculative) auto vectorization checks failed
|
||||
|
||||
// recorded per method
|
||||
Reason_unloaded, // unloaded class or constant pool entry
|
||||
|
||||
@ -2269,6 +2269,7 @@
|
||||
declare_constant(Deoptimization::Reason_age) \
|
||||
declare_constant(Deoptimization::Reason_predicate) \
|
||||
declare_constant(Deoptimization::Reason_loop_limit_check) \
|
||||
declare_constant(Deoptimization::Reason_auto_vectorization_check) \
|
||||
declare_constant(Deoptimization::Reason_speculate_class_check) \
|
||||
declare_constant(Deoptimization::Reason_speculate_null_check) \
|
||||
declare_constant(Deoptimization::Reason_speculate_null_assert) \
|
||||
|
||||
@ -0,0 +1,303 @@
|
||||
/*
|
||||
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package compiler.loopopts.superword;
|
||||
|
||||
import compiler.lib.ir_framework.*;
|
||||
import compiler.lib.verify.*;
|
||||
import jdk.test.lib.Utils;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
import java.util.Random;
|
||||
import java.lang.foreign.*;
|
||||
|
||||
/*
|
||||
* @test id=byte-buffer-direct
|
||||
* @bug 8323582
|
||||
* @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned.
|
||||
* @library /test/lib /
|
||||
* @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress ByteBufferDirect
|
||||
*/
|
||||
|
||||
/*
|
||||
* @test id=byte-buffer-direct-AlignVector
|
||||
* @bug 8323582
|
||||
* @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned.
|
||||
* @library /test/lib /
|
||||
* @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress ByteBufferDirect AlignVector
|
||||
*/
|
||||
|
||||
/*
|
||||
* @test id=byte-buffer-direct-VerifyAlignVector
|
||||
* @bug 8323582
|
||||
* @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned.
|
||||
* @library /test/lib /
|
||||
* @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress ByteBufferDirect VerifyAlignVector
|
||||
*/
|
||||
|
||||
/*
|
||||
* @test id=native
|
||||
* @bug 8323582
|
||||
* @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned.
|
||||
* @library /test/lib /
|
||||
* @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress Native
|
||||
*/
|
||||
|
||||
/*
|
||||
* @test id=native-AlignVector
|
||||
* @bug 8323582
|
||||
* @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned.
|
||||
* @library /test/lib /
|
||||
* @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress Native AlignVector
|
||||
*/
|
||||
|
||||
/*
|
||||
* @test id=native-VerifyAlignVector
|
||||
* @bug 8323582
|
||||
* @summary Test vectorization of loops over MemorySegment, with native memory where the address is not always aligned.
|
||||
* @library /test/lib /
|
||||
* @run driver compiler.loopopts.superword.TestMemorySegmentUnalignedAddress Native VerifyAlignVector
|
||||
*/
|
||||
|
||||
// Driver: configures and starts the IR TestFramework for
// TestMemorySegmentUnalignedAddressImpl.
//   args[0]: memory segment provider name, forwarded to the test VM through the
//            system property "memorySegmentProviderNameForTestVM".
//   args[1]: optional; "AlignVector" or "VerifyAlignVector" add the matching
//            -XX flags, any other value throws a RuntimeException.
public class TestMemorySegmentUnalignedAddress {
|
||||
public static void main(String[] args) {
|
||||
TestFramework framework = new TestFramework(TestMemorySegmentUnalignedAddressImpl.class);
|
||||
framework.addFlags("-DmemorySegmentProviderNameForTestVM=" + args[0]);
|
||||
if (args.length > 1) {
|
||||
switch (args[1]) {
|
||||
case "AlignVector" -> { framework.addFlags("-XX:+AlignVector"); }
|
||||
case "VerifyAlignVector" -> { framework.addFlags("-XX:+AlignVector", "-XX:+IgnoreUnrecognizedVMOptions", "-XX:+VerifyAlignVector"); }
|
||||
default -> { throw new RuntimeException("unexpected: " + args[1]); }
|
||||
}
|
||||
}
|
||||
framework.setDefaultWarmup(100);
|
||||
framework.start();
|
||||
}
|
||||
}
|
||||
|
||||
class TestMemorySegmentUnalignedAddressImpl {
|
||||
static final int SIZE = 10_000;
|
||||
static final int BACKING_SIZE = 10_000 + 1;
|
||||
static final Random RANDOM = Utils.getRandomInstance();
|
||||
|
||||
interface TestFunction {
|
||||
Object run(int i);
|
||||
}
|
||||
|
||||
interface MemorySegmentProvider {
|
||||
MemorySegment newMemorySegment();
|
||||
}
|
||||
|
||||
static MemorySegmentProvider provider;
|
||||
|
||||
static {
|
||||
String providerName = System.getProperty("memorySegmentProviderNameForTestVM");
|
||||
provider = switch (providerName) {
|
||||
case "ByteBufferDirect" -> TestMemorySegmentUnalignedAddressImpl::newMemorySegmentOfByteBufferDirect;
|
||||
case "Native" -> TestMemorySegmentUnalignedAddressImpl::newMemorySegmentOfNative;
|
||||
default -> throw new RuntimeException("Test argument not recognized: " + providerName);
|
||||
};
|
||||
}
|
||||
|
||||
// Map from test name to test function
|
||||
Map<String, TestFunction> tests = new HashMap<>();
|
||||
|
||||
// Map from test name to gold value, i.e. the result from the first run before compilation
|
||||
Map<String, Object> golds = new HashMap<>();
|
||||
|
||||
public TestMemorySegmentUnalignedAddressImpl () {
|
||||
// Generate two MemorySegments as inputs
|
||||
MemorySegment a = sliceAligned(newMemorySegment());
|
||||
MemorySegment b = sliceAligned(newMemorySegment());
|
||||
fillRandom(a);
|
||||
fillRandom(b);
|
||||
|
||||
// Add all tests to list
|
||||
tests.put("testAlwaysAligned", (int i) -> {
|
||||
MemorySegment ms = newMemorySegment();
|
||||
MemorySegment slice = sliceAligned(ms);
|
||||
copy(a, slice);
|
||||
return testAlwaysAligned(slice);
|
||||
});
|
||||
tests.put("testAlwaysUnaligned", (int i) -> {
|
||||
MemorySegment ms = newMemorySegment();
|
||||
MemorySegment slice = sliceUnaligned(ms);
|
||||
copy(a, slice);
|
||||
return testAlwaysUnaligned(slice);
|
||||
});
|
||||
tests.put("testMixedAlignedAndUnaligned", (int i) -> {
|
||||
MemorySegment ms = newMemorySegment();
|
||||
MemorySegment slice = (i % 2 == 0) ? sliceUnaligned(ms) : sliceAligned(ms);
|
||||
copy(a, slice);
|
||||
return testMixedAlignedAndUnaligned(slice);
|
||||
});
|
||||
|
||||
// Compute gold value for all test methods before compilation
|
||||
for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
|
||||
String name = entry.getKey();
|
||||
TestFunction test = entry.getValue();
|
||||
Object gold = test.run(0);
|
||||
golds.put(name, gold);
|
||||
}
|
||||
}
|
||||
|
||||
MemorySegment sliceAligned(MemorySegment src) {
|
||||
return src.asSlice(0, SIZE);
|
||||
}
|
||||
|
||||
MemorySegment sliceUnaligned(MemorySegment src) {
|
||||
return src.asSlice(1, SIZE);
|
||||
}
|
||||
|
||||
MemorySegment newMemorySegment() {
|
||||
return provider.newMemorySegment();
|
||||
}
|
||||
|
||||
static void copy(MemorySegment src, MemorySegment dst) {
|
||||
MemorySegment.copy(src, 0, dst, 0, src.byteSize());
|
||||
}
|
||||
|
||||
static MemorySegment newMemorySegmentOfByteBufferDirect() {
|
||||
return MemorySegment.ofBuffer(ByteBuffer.allocateDirect(BACKING_SIZE));
|
||||
}
|
||||
|
||||
static MemorySegment newMemorySegmentOfNative() {
|
||||
// Auto arena: GC decides when there is no reference to the MemorySegment,
|
||||
// and then it deallocates the backing memory.
|
||||
return Arena.ofAuto().allocate(BACKING_SIZE, 1);
|
||||
}
|
||||
|
||||
static void fillRandom(MemorySegment data) {
|
||||
for (int i = 0; i < (int)data.byteSize(); i++) {
|
||||
data.set(ValueLayout.JAVA_BYTE, i, (byte)RANDOM.nextInt());
|
||||
}
|
||||
}
|
||||
|
||||
static void verify(String name, Object gold, Object result) {
|
||||
try {
|
||||
Verify.checkEQ(gold, result);
|
||||
} catch (VerifyException e) {
|
||||
throw new RuntimeException("Verify: wrong result in " + name, e);
|
||||
}
|
||||
}
|
||||
|
||||
static int runInvocationCounter = 0;
|
||||
|
||||
@Run(test = {"testAlwaysAligned",
|
||||
"testAlwaysUnaligned",
|
||||
"testMixedAlignedAndUnaligned"})
|
||||
void runTests() {
|
||||
runInvocationCounter++;
|
||||
for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
|
||||
String name = entry.getKey();
|
||||
TestFunction test = entry.getValue();
|
||||
// Recall gold value from before compilation
|
||||
Object gold = golds.get(name);
|
||||
// Compute new result
|
||||
Object result = test.run(runInvocationCounter);
|
||||
// Compare gold and new result
|
||||
verify(name, gold, result);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
|
||||
IRNode.ADD_VI, "> 0",
|
||||
IRNode.STORE_VECTOR, "> 0",
|
||||
"multiversion", "= 0"},
|
||||
applyIfPlatform = {"64-bit", "true"},
|
||||
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
|
||||
phase = CompilePhase.PRINT_IDEAL)
|
||||
// We never fail the alignment check in the auto vectorization Predicate,
|
||||
// hence we never even create the multiversioned loops.
|
||||
static Object testAlwaysAligned(MemorySegment ms) {
|
||||
for (long i = 0; i < ms.byteSize(); i += 4) {
|
||||
int v = ms.get(ValueLayout.JAVA_INT_UNALIGNED, i);
|
||||
ms.set(ValueLayout.JAVA_INT_UNALIGNED, i, (int)(v + 1));
|
||||
}
|
||||
return new Object[]{ ms };
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
|
||||
IRNode.ADD_VI, "> 0",
|
||||
IRNode.STORE_VECTOR, "> 0",
|
||||
"multiversion_fast", "= 4", // pre, main, drain, post
|
||||
"multiversion_slow", "= 2"}, // main, post
|
||||
applyIf = {"AlignVector", "true"},
|
||||
applyIfPlatform = {"64-bit", "true"},
|
||||
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
|
||||
phase = CompilePhase.PRINT_IDEAL)
|
||||
// We add alignment checks to the auto vectorization Predicate. It fails
|
||||
// at runtime, deopts, and recompiles with multiversioning.
|
||||
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
|
||||
IRNode.ADD_VI, "> 0",
|
||||
IRNode.STORE_VECTOR, "> 0",
|
||||
"multiversion_fast", "= 0",
|
||||
"multiversion_slow", "= 0"},
|
||||
applyIf = {"AlignVector", "false"},
|
||||
applyIfPlatform = {"64-bit", "true"},
|
||||
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
|
||||
phase = CompilePhase.PRINT_IDEAL)
|
||||
// We never add any conditions to the auto vectorization Predicate, so
|
||||
// we also never deopt and never end up multiversioning.
|
||||
static Object testAlwaysUnaligned(MemorySegment ms) {
|
||||
for (long i = 0; i < ms.byteSize(); i += 4) {
|
||||
int v = ms.get(ValueLayout.JAVA_INT_UNALIGNED, i);
|
||||
ms.set(ValueLayout.JAVA_INT_UNALIGNED, i, (int)(v + 1));
|
||||
}
|
||||
return new Object[]{ ms };
|
||||
}
|
||||
|
||||
@Test
|
||||
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
|
||||
IRNode.ADD_VI, "> 0",
|
||||
IRNode.STORE_VECTOR, "> 0",
|
||||
"multiversion_fast", "= 4", // pre, main, drain, post
|
||||
"multiversion_slow", "= 2"}, // main, post
|
||||
applyIf = {"AlignVector", "true"},
|
||||
applyIfPlatform = {"64-bit", "true"},
|
||||
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
|
||||
phase = CompilePhase.PRINT_IDEAL)
|
||||
// We add alignment checks to the auto vectorization Predicate. It fails
|
||||
// at runtime, deopts, and recompiles with multiversioning.
|
||||
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
|
||||
IRNode.ADD_VI, "> 0",
|
||||
IRNode.STORE_VECTOR, "> 0",
|
||||
"multiversion_fast", "= 0",
|
||||
"multiversion_slow", "= 0"},
|
||||
applyIf = {"AlignVector", "false"},
|
||||
applyIfPlatform = {"64-bit", "true"},
|
||||
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
|
||||
phase = CompilePhase.PRINT_IDEAL)
|
||||
// We never add any conditions to the auto vectorization Predicate, so
|
||||
// we also never deopt and never end up multiversioning.
|
||||
static Object testMixedAlignedAndUnaligned(MemorySegment ms) {
|
||||
for (long i = 0; i < ms.byteSize(); i += 4) {
|
||||
int v = ms.get(ValueLayout.JAVA_INT_UNALIGNED, i);
|
||||
ms.set(ValueLayout.JAVA_INT_UNALIGNED, i, (int)(v + 1));
|
||||
}
|
||||
return new Object[]{ ms };
|
||||
}
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user