8300256: C2: vectorization is sometimes skipped on loops where it would succeed

Reviewed-by: kvn, thartmann
This commit is contained in:
Roland Westrelin 2023-02-01 09:48:54 +00:00
parent ef0d0a7092
commit 2a8ae2ff1c
5 changed files with 115 additions and 72 deletions

View File

@ -1042,7 +1042,7 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
}
// Only attempt slp analysis when user controls do not prohibit it
if (!cl->range_checks_present() && (LoopMaxUnroll > _local_loop_unroll_factor)) {
if (!range_checks_present() && (LoopMaxUnroll > _local_loop_unroll_factor)) {
// Once policy_slp_analysis succeeds, mark the loop with the
// maximal unroll factor so that we minimize analysis passes
if (future_unroll_cnt >= _local_loop_unroll_factor) {
@ -1916,7 +1916,7 @@ void PhaseIdealLoop::insert_scalar_rced_post_loop(IdealLoopTree *loop, Node_List
CountedLoopNode *cl = loop->_head->as_CountedLoop();
// only process RCE'd main loops
if (!cl->is_main_loop() || cl->range_checks_present()) return;
if (!cl->is_main_loop() || loop->range_checks_present()) return;
#ifndef PRODUCT
if (TraceLoopOpts) {
@ -3003,7 +3003,7 @@ Node* PhaseIdealLoop::add_range_check_predicate(IdealLoopTree* loop, CountedLoop
//------------------------------do_range_check---------------------------------
// Eliminate range-checks and other trip-counter vs loop-invariant tests.
int PhaseIdealLoop::do_range_check(IdealLoopTree *loop, Node_List &old_new) {
void PhaseIdealLoop::do_range_check(IdealLoopTree *loop, Node_List &old_new) {
#ifndef PRODUCT
if (PrintOpto && VerifyLoopOptimizations) {
tty->print("Range Check Elimination ");
@ -3016,12 +3016,10 @@ int PhaseIdealLoop::do_range_check(IdealLoopTree *loop, Node_List &old_new) {
assert(RangeCheckElimination, "");
CountedLoopNode *cl = loop->_head->as_CountedLoop();
// If we fail before trying to eliminate range checks, set multiversion state
int closed_range_checks = 1;
// protect against stride not being a constant
if (!cl->stride_is_con()) {
return closed_range_checks;
return;
}
// Find the trip counter; we are iteration splitting based on it
Node *trip_counter = cl->phi();
@ -3033,7 +3031,7 @@ int PhaseIdealLoop::do_range_check(IdealLoopTree *loop, Node_List &old_new) {
// Opaque1 node is optimized away and then another round
// of loop opts attempted.
if (cl->is_canonical_loop_entry() == NULL) {
return closed_range_checks;
return;
}
// Need to find the main-loop zero-trip guard
@ -3047,7 +3045,7 @@ int PhaseIdealLoop::do_range_check(IdealLoopTree *loop, Node_List &old_new) {
Node *p_f = iffm->in(0);
// pre loop may have been optimized out
if (p_f->Opcode() != Op_IfFalse) {
return closed_range_checks;
return;
}
CountedLoopEndNode *pre_end = p_f->in(0)->as_CountedLoopEnd();
assert(pre_end->loopnode()->is_pre_loop(), "");
@ -3056,7 +3054,7 @@ int PhaseIdealLoop::do_range_check(IdealLoopTree *loop, Node_List &old_new) {
// optimized away and then another round of loop opts attempted.
// We can not optimize this particular loop in that case.
if (pre_opaq1->Opcode() != Op_Opaque1) {
return closed_range_checks;
return;
}
Opaque1Node *pre_opaq = (Opaque1Node*)pre_opaq1;
Node *pre_limit = pre_opaq->in(1);
@ -3068,7 +3066,7 @@ int PhaseIdealLoop::do_range_check(IdealLoopTree *loop, Node_List &old_new) {
// pre-loop Opaque1 node.
Node *orig_limit = pre_opaq->original_loop_limit();
if (orig_limit == NULL || _igvn.type(orig_limit) == Type::TOP) {
return closed_range_checks;
return;
}
// Must know if its a count-up or count-down loop
@ -3081,10 +3079,6 @@ int PhaseIdealLoop::do_range_check(IdealLoopTree *loop, Node_List &old_new) {
set_ctrl(one, C->root());
set_ctrl(mini, C->root());
// Count number of range checks and reduce by load range limits, if zero,
// the loop is in canonical form to multiversion.
closed_range_checks = 0;
Node* predicate_proj = cl->skip_strip_mined()->in(LoopNode::EntryControl);
assert(predicate_proj->is_Proj() && predicate_proj->in(0)->is_If(), "if projection only");
@ -3095,7 +3089,6 @@ int PhaseIdealLoop::do_range_check(IdealLoopTree *loop, Node_List &old_new) {
iff->Opcode() == Op_RangeCheck) { // Test?
// Test is an IfNode, has 2 projections. If BOTH are in the loop
// we need loop unswitching instead of iteration splitting.
closed_range_checks++;
Node *exit = loop->is_loop_exit(iff);
if (!exit) continue;
int flip = (exit->Opcode() == Op_IfTrue) ? 1 : 0;
@ -3264,9 +3257,6 @@ int PhaseIdealLoop::do_range_check(IdealLoopTree *loop, Node_List &old_new) {
--imax;
}
}
if (int_limit->Opcode() == Op_LoadRange) {
closed_range_checks--;
}
} // End of is IF
}
if (predicate_proj != cl->skip_strip_mined()->in(LoopNode::EntryControl)) {
@ -3316,32 +3306,19 @@ int PhaseIdealLoop::do_range_check(IdealLoopTree *loop, Node_List &old_new) {
assert(opqzm->outcnt() == 1, "cannot hack shared node");
_igvn.replace_input_of(opqzm, 1, main_limit);
return closed_range_checks;
return;
}
//------------------------------has_range_checks-------------------------------
// Check to see if RCE cleaned the current loop of range-checks.
void PhaseIdealLoop::has_range_checks(IdealLoopTree *loop) {
assert(RangeCheckElimination, "");
// skip if not a counted loop
if (!loop->is_counted()) return;
CountedLoopNode *cl = loop->_head->as_CountedLoop();
// skip this loop if it is already checked
if (cl->has_been_range_checked()) return;
// Now check for existence of range checks
for (uint i = 0; i < loop->_body.size(); i++) {
Node *iff = loop->_body[i];
bool IdealLoopTree::compute_has_range_checks() const {
assert(_head->is_CountedLoop(), "");
for (uint i = 0; i < _body.size(); i++) {
Node *iff = _body[i];
int iff_opc = iff->Opcode();
if (iff_opc == Op_If || iff_opc == Op_RangeCheck) {
cl->mark_has_range_checks();
break;
return true;
}
}
cl->set_has_been_range_checked();
return false;
}
//-------------------------multi_version_post_loops----------------------------
@ -4007,13 +3984,7 @@ bool IdealLoopTree::iteration_split_impl(PhaseIdealLoop *phase, Node_List &old_n
// with full checks, but the main-loop with no checks. Remove said checks
// from the main body.
if (should_rce) {
if (phase->do_range_check(this, old_new) != 0) {
cl->mark_has_range_checks();
} else {
cl->clear_has_range_checks();
}
} else if (PostLoopMultiversioning) {
phase->has_range_checks(this);
phase->do_range_check(this, old_new);
}
if (should_unroll && !should_peel && PostLoopMultiversioning &&

View File

@ -3942,7 +3942,7 @@ uint IdealLoopTree::est_loop_flow_merge_sz() const {
#ifndef PRODUCT
//------------------------------dump_head--------------------------------------
// Dump 1 liner for loop header info
void IdealLoopTree::dump_head() const {
void IdealLoopTree::dump_head() {
tty->sp(2 * _nest);
tty->print("Loop: N%d/N%d ", _head->_idx, _tail->_idx);
if (_irreducible) tty->print(" IRREDUCIBLE");
@ -3990,7 +3990,7 @@ void IdealLoopTree::dump_head() const {
if (cl->is_post_loop()) tty->print(" post");
if (cl->is_reduction_loop()) tty->print(" reduction");
if (cl->is_vectorized_loop()) tty->print(" vector");
if (cl->range_checks_present()) tty->print(" rc ");
if (range_checks_present()) tty->print(" rc ");
if (cl->is_multiversioned()) tty->print(" multi ");
}
if (_has_call) tty->print(" has_call");
@ -4013,7 +4013,7 @@ void IdealLoopTree::dump_head() const {
//------------------------------dump-------------------------------------------
// Dump loops by loop tree
void IdealLoopTree::dump() const {
void IdealLoopTree::dump() {
dump_head();
if (_child) _child->dump();
if (_next) _next ->dump();
@ -4600,8 +4600,7 @@ void PhaseIdealLoop::build_and_optimize() {
IdealLoopTree *lpt_next = lpt->_next;
if (lpt_next && lpt_next->is_counted()) {
CountedLoopNode *cl = lpt_next->_head->as_CountedLoop();
has_range_checks(lpt_next);
if (cl->is_post_loop() && cl->range_checks_present()) {
if (cl->is_post_loop() && lpt_next->range_checks_present()) {
if (!cl->is_multiversioned()) {
if (multi_version_post_loops(lpt, lpt_next) == false) {
// Cause the rce loop to be optimized away if we fail

View File

@ -72,17 +72,16 @@ protected:
DoUnrollOnly = 1<<10,
VectorizedLoop = 1<<11,
HasAtomicPostLoop = 1<<12,
HasRangeChecks = 1<<13,
IsMultiversioned = 1<<14,
StripMined = 1<<15,
SubwordLoop = 1<<16,
ProfileTripFailed = 1<<17,
LoopNestInnerLoop = 1 << 18,
LoopNestLongOuterLoop = 1 << 19};
IsMultiversioned = 1<<13,
StripMined = 1<<14,
SubwordLoop = 1<<15,
ProfileTripFailed = 1<<16,
LoopNestInnerLoop = 1 << 17,
LoopNestLongOuterLoop = 1 << 18};
char _unswitch_count;
enum { _unswitch_max=3 };
char _postloop_flags;
enum { LoopNotRCEChecked = 0, LoopRCEChecked = 1, RCEPostLoop = 2 };
enum { RCEPostLoop = 1 };
// Expected trip count from profile data
float _profile_trip_cnt;
@ -94,7 +93,6 @@ public:
bool is_inner_loop() const { return _loop_flags & InnerLoop; }
void set_inner_loop() { _loop_flags |= InnerLoop; }
bool range_checks_present() const { return _loop_flags & HasRangeChecks; }
bool is_multiversioned() const { return _loop_flags & IsMultiversioned; }
bool is_vectorized_loop() const { return _loop_flags & VectorizedLoop; }
bool is_partial_peel_loop() const { return _loop_flags & PartialPeelLoop; }
@ -113,8 +111,6 @@ public:
void mark_do_unroll_only() { _loop_flags |= DoUnrollOnly; }
void mark_loop_vectorized() { _loop_flags |= VectorizedLoop; }
void mark_has_atomic_post_loop() { _loop_flags |= HasAtomicPostLoop; }
void mark_has_range_checks() { _loop_flags |= HasRangeChecks; }
void clear_has_range_checks() { _loop_flags &= ~HasRangeChecks; }
void mark_is_multiversioned() { _loop_flags |= IsMultiversioned; }
void mark_strip_mined() { _loop_flags |= StripMined; }
void clear_strip_mined() { _loop_flags &= ~StripMined; }
@ -126,8 +122,6 @@ public:
int unswitch_max() { return _unswitch_max; }
int unswitch_count() { return _unswitch_count; }
int has_been_range_checked() const { return _postloop_flags & LoopRCEChecked; }
void set_has_been_range_checked() { _postloop_flags |= LoopRCEChecked; }
int is_rce_post_loop() const { return _postloop_flags & RCEPostLoop; }
void set_is_rce_post_loop() { _postloop_flags |= RCEPostLoop; }
@ -621,7 +615,9 @@ public:
uint8_t _irreducible:1, // True if irreducible
_has_call:1, // True if has call safepoint
_has_sfpt:1, // True if has non-call safepoint
_rce_candidate:1; // True if candidate for range check elimination
_rce_candidate:1, // True if candidate for range check elimination
_has_range_checks:1,
_has_range_checks_computed:1;
Node_List* _safepts; // List of safepoints in this loop
Node_List* _required_safept; // A inner loop cannot delete these safepts;
@ -633,6 +629,7 @@ public:
_phase(phase),
_local_loop_unroll_limit(0), _local_loop_unroll_factor(0),
_nest(0), _irreducible(0), _has_call(0), _has_sfpt(0), _rce_candidate(0),
_has_range_checks(0), _has_range_checks_computed(0),
_safepts(NULL),
_required_safept(NULL),
_allow_optimizations(true)
@ -780,9 +777,20 @@ public:
void remove_main_post_loops(CountedLoopNode *cl, PhaseIdealLoop *phase);
bool compute_has_range_checks() const;
// Reports whether this loop's body contains range checks. The answer is
// obtained from compute_has_range_checks() on the first call and cached in
// the _has_range_checks / _has_range_checks_computed bits, so later calls
// return the cached bit without rescanning.
bool range_checks_present() {
  // Fast path: a previous call already recorded the answer.
  if (_has_range_checks_computed) {
    return _has_range_checks;
  }
  const bool found_checks = compute_has_range_checks();
  if (found_checks) {
    _has_range_checks = 1;
  }
  // Remember that the scan was done, whatever its outcome.
  _has_range_checks_computed = 1;
  return _has_range_checks;
}
#ifndef PRODUCT
void dump_head() const; // Dump loop head only
void dump() const; // Dump this loop recursively
void dump_head(); // Dump loop head only
void dump(); // Dump this loop recursively
void verify_tree(IdealLoopTree *loop, const IdealLoopTree *parent) const;
#endif
@ -1424,10 +1432,7 @@ public:
}
// Eliminate range-checks and other trip-counter vs loop-invariant tests.
int do_range_check( IdealLoopTree *loop, Node_List &old_new );
// Check to see if do_range_check(...) cleaned the main loop of range-checks
void has_range_checks(IdealLoopTree *loop);
void do_range_check(IdealLoopTree *loop, Node_List &old_new);
// Process post loops which have range checks and try to build a multi-version
// guard to safely determine if we can execute the post loop which was RCE'd.

View File

@ -180,11 +180,10 @@ bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
if (cl->is_vectorized_loop() && cl->is_main_loop() && !cl->is_reduction_loop()) {
IdealLoopTree *lpt_next = cl->is_strip_mined() ? lpt->_parent->_next : lpt->_next;
CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop();
_phase->has_range_checks(lpt_next);
// Main loop SLP works well for manually unrolled loops. But post loop
// vectorization doesn't work for these. To bail out the optimization
// earlier, we have range check and loop stride conditions below.
if (cl_next->is_post_loop() && !cl_next->range_checks_present() &&
if (cl_next->is_post_loop() && !lpt_next->range_checks_present() &&
cl_next->stride_is_con() && abs(cl_next->stride_con()) == 1) {
if (!cl_next->is_vectorized_loop()) {
// Propagate some main loop attributes to its corresponding scalar

View File

@ -0,0 +1,69 @@
/*
* Copyright (c) 2023, Red Hat, Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package compiler.c2.irTests;
import compiler.lib.ir_framework.*;
import jdk.test.lib.Utils;
import jdk.internal.misc.Unsafe;
import java.util.Objects;
import java.util.Random;
/*
* @test
* @bug 8300256
* @requires (os.simpleArch == "x64") | (os.simpleArch == "aarch64")
* @modules java.base/jdk.internal.misc
* @library /test/lib /
* @run driver compiler.c2.irTests.TestVectorizationNotRun
*/
/**
 * IR test attached to bug 8300256 (see the jtreg tags above). It compiles a
 * loop that copies a long[] into a byte[] through Unsafe and, via the @IR
 * rule on test(), requires the compiled code to contain at least one
 * LOAD_VECTOR and at least one STORE_VECTOR node.
 */
public class TestVectorizationNotRun {
// Unsafe instance used for the unaligned long stores in test().
private static final Unsafe UNSAFE = Unsafe.getUnsafe();
/**
 * Entry point: hands control to the IR test framework, passing the flags
 * that add java.base and export jdk.internal.misc to the unnamed module
 * (presumably so the test VM can use Unsafe; confirm against the framework).
 */
public static void main(String[] args) {
TestFramework.runWithFlags("--add-modules", "java.base", "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED");
}
// Number of long elements in the source array.
static int size = 1024;
// Size of the destination array in bytes: 8 bytes per long element.
static int sizeBytes = 8 * size;
// Destination of the copy, sizeBytes bytes long.
static byte[] byteArray = new byte[sizeBytes];
// Source of the copy, size elements long.
static long[] longArray = new long[size];
/**
 * Stores every element of src into dest as an unaligned 8-byte value at
 * byte offset ARRAY_BYTE_BASE_OFFSET + i * 8.
 *
 * @param dest byte array written through Unsafe
 * @param src  long array read element by element
 * @throws IndexOutOfBoundsException if the hand-written check in the loop fails
 */
@Test
@IR(counts = { IRNode.LOAD_VECTOR, ">=1", IRNode.STORE_VECTOR, ">=1" })
public static void test(byte[] dest, long[] src) {
for (int i = 0; i < src.length; i++) {
// Hand-written check against the static sizeBytes field rather than
// dest.length. NOTE(review): it compares against i, not the byte offset
// i * 8, and never fires for the arrays passed by test_runner; it looks
// like it exists only to put a range-check-like test in the loop body.
if ((i < 0) || (8 > sizeBytes - i)) {
throw new IndexOutOfBoundsException();
}
UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + i * 8, src[i]);
}
}
/**
 * Runner for test(): invokes it with the statically allocated arrays.
 */
@Run(test = "test")
public static void test_runner() {
test(byteArray, longArray);
}
}