8350756: C2 SuperWord Multiversioning: remove useless slow loop when the fast loop disappears

Reviewed-by: kvn, chagedorn
This commit is contained in:
Emanuel Peter 2025-03-06 06:48:42 +00:00
parent 3626ac35b3
commit e82031ec1a
7 changed files with 234 additions and 3 deletions

View File

@ -41,6 +41,7 @@
#include "opto/movenode.hpp"
#include "opto/mulnode.hpp"
#include "opto/opaquenode.hpp"
#include "opto/opcodes.hpp"
#include "opto/predicates.hpp"
#include "opto/rootnode.hpp"
#include "opto/runtime.hpp"
@ -2731,6 +2732,20 @@ Node* CountedLoopNode::match_incr_with_optional_truncation(Node* expr, Node** tr
return nullptr;
}
IfNode* CountedLoopNode::find_multiversion_if_from_multiversion_fast_main_loop() {
assert(is_main_loop() && is_multiversion_fast_loop(), "must be multiversion fast main loop");
CountedLoopEndNode* pre_end = find_pre_loop_end();
if (pre_end == nullptr) { return nullptr; }
Node* pre_entry = pre_end->loopnode()->in(LoopNode::EntryControl);
const Predicates predicates(pre_entry);
IfTrueNode* before_predicates = predicates.entry()->isa_IfTrue();
if (before_predicates != nullptr &&
before_predicates->in(0)->in(1)->is_OpaqueMultiversioning()) {
return before_predicates->in(0)->as_If();
}
return nullptr;
}
LoopNode* CountedLoopNode::skip_strip_mined(int expect_skeleton) {
if (is_strip_mined() && in(EntryControl) != nullptr && in(EntryControl)->is_OuterStripMinedLoop()) {
verify_strip_mined(expect_skeleton);
@ -4536,6 +4551,49 @@ void PhaseIdealLoop::eliminate_useless_zero_trip_guard() {
}
}
void PhaseIdealLoop::eliminate_useless_multiversion_if() {
if (_multiversion_opaque_nodes.size() == 0) {
return;
}
ResourceMark rm;
Unique_Node_List useful_multiversioning_opaque_nodes;
// The OpaqueMultiversioning is only used from the fast main loop in AutoVectorization, to add
// speculative runtime-checks to the multiversion_if. Thus, a OpaqueMultiversioning is only
// useful if it can be found from a fast main loop. If it can not be found from a fast main loop,
// then we cannot ever use that multiversion_if to add more speculative runtime-checks, and hence
// it is useless. If it is still in delayed mode, i.e. has not yet had any runtime-checks added,
// then we can let it constant fold towards the fast loop.
for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
IdealLoopTree* lpt = iter.current();
if (lpt->_child == nullptr && lpt->is_counted()) {
CountedLoopNode* head = lpt->_head->as_CountedLoop();
if (head->is_main_loop() && head->is_multiversion_fast_loop()) {
// There are fast_loop pre/main/post loops, but the finding traversal starts at the main
// loop, and traverses via the fast pre loop to the multiversion_if.
IfNode* multiversion_if = head->find_multiversion_if_from_multiversion_fast_main_loop();
if (multiversion_if != nullptr) {
useful_multiversioning_opaque_nodes.push(multiversion_if->in(1)->as_OpaqueMultiversioning());
}
}
}
}
for (uint i = 0; i < _multiversion_opaque_nodes.size(); i++) {
OpaqueMultiversioningNode* opaque = _multiversion_opaque_nodes.at(i)->as_OpaqueMultiversioning();
if (!useful_multiversioning_opaque_nodes.member(opaque)) {
if (opaque->is_delayed_slow_loop()) {
// We cannot hack the node directly, otherwise the slow_loop will complain that it cannot
// find the multiversioning opaque node. Instead, we mark the opaque node as useless, and
// it can be constant folded during IGVN.
opaque->mark_useless();
_igvn._worklist.push(opaque);
}
}
}
}
//------------------------process_expensive_nodes-----------------------------
// Expensive nodes have their control input set to prevent the GVN
// from commoning them and as a result forcing the resulting node to
@ -4805,6 +4863,7 @@ void PhaseIdealLoop::build_and_optimize() {
}
eliminate_useless_zero_trip_guard();
eliminate_useless_multiversion_if();
if (stop_early) {
assert(do_expensive_nodes, "why are we here?");
@ -6596,6 +6655,9 @@ void PhaseIdealLoop::build_loop_late_post_work(Node *n, bool pinned) {
_zero_trip_guard_opaque_nodes.push(n);
}
if (!_verify_only && n->Opcode() == Op_OpaqueMultiversioning) {
_multiversion_opaque_nodes.push(n);
}
}
#ifdef ASSERT

View File

@ -290,6 +290,8 @@ public:
bool has_atomic_post_loop () const { return (_loop_flags & HasAtomicPostLoop) == HasAtomicPostLoop; }
void set_main_no_pre_loop() { _loop_flags |= MainHasNoPreLoop; }
IfNode* find_multiversion_if_from_multiversion_fast_main_loop();
int main_idx() const { return _main_idx; }
@ -932,6 +934,7 @@ private:
// clear out dead code after build_loop_late
Node_List _deadlist;
Node_List _zero_trip_guard_opaque_nodes;
Node_List _multiversion_opaque_nodes;
// Support for faster execution of get_late_ctrl()/dom_lca()
// when a node has many uses and dominator depth is deep.
@ -1453,6 +1456,7 @@ public:
void eliminate_useless_template_assertion_predicates(Unique_Node_List& useful_predicates);
void eliminate_useless_zero_trip_guard();
void eliminate_useless_multiversion_if();
public:
// Change the control input of expensive nodes to allow commoning by

View File

@ -791,7 +791,10 @@ Node *PhaseIdealLoop::conditional_move( Node *region ) {
// Ignore Template Assertion Predicates with OpaqueTemplateAssertionPredicate nodes.
return nullptr;
}
assert(bol->Opcode() == Op_Bool, "Unexpected node");
if (!bol->is_Bool()) {
assert(false, "Expected Bool, but got %s", NodeClassNames[bol->Opcode()]);
return nullptr;
}
int cmp_op = bol->in(1)->Opcode();
if (cmp_op == Op_SubTypeCheck) { // SubTypeCheck expansion expects an IfNode
return nullptr;

View File

@ -82,6 +82,26 @@ IfNode* OpaqueZeroTripGuardNode::if_node() const {
return iff->as_If();
}
Node* OpaqueMultiversioningNode::Identity(PhaseGVN* phase) {
// Constant fold the multiversion_if. Since the slow_loop is still delayed,
// i.e. we have not yet added any possibly failing condition, we can just
// take the true branch in all cases.
if (_useless) {
assert(_is_delayed_slow_loop, "the slow_loop should still be delayed");
return in(1);
}
return Opaque1Node::Identity(phase);
}
#ifndef PRODUCT
void OpaqueMultiversioningNode::dump_spec(outputStream *st) const {
Opaque1Node::dump_spec(st);
if (_useless) {
st->print(" #useless");
}
}
#endif
const Type* OpaqueNotNullNode::Value(PhaseGVN* phase) const {
return phase->type(in(1));
}

View File

@ -101,17 +101,30 @@ public:
class OpaqueMultiversioningNode : public Opaque1Node {
private:
bool _is_delayed_slow_loop;
bool _useless;
public:
OpaqueMultiversioningNode(Compile* C, Node* n) :
Opaque1Node(C, n), _is_delayed_slow_loop(true)
Opaque1Node(C, n), _is_delayed_slow_loop(true), _useless(false)
{
init_class_id(Class_OpaqueMultiversioning);
}
virtual int Opcode() const;
virtual const Type* bottom_type() const { return TypeInt::BOOL; }
bool is_delayed_slow_loop() const { return _is_delayed_slow_loop; }
void notify_slow_loop_that_it_can_resume_optimizations() { _is_delayed_slow_loop = false; }
void notify_slow_loop_that_it_can_resume_optimizations() {
assert(!_useless, "must still be useful");
_is_delayed_slow_loop = false;
}
void mark_useless() {
assert(_is_delayed_slow_loop, "must still be delayed");
_useless = true;
}
virtual Node* Identity(PhaseGVN* phase);
NOT_PRODUCT(virtual void dump_spec(outputStream* st) const;)
};
// This node is used in the context of intrinsics. We sometimes implicitly know that an object is non-null even though

View File

@ -329,6 +329,11 @@ public class IRNode {
superWordNodes(ADD_REDUCTION_VL, "AddReductionVL");
}
public static final String OPAQUE_MULTIVERSIONING = PREFIX + "OPAQUE_MULTIVERSIONING" + POSTFIX;
static {
beforeMatchingNameRegex(OPAQUE_MULTIVERSIONING, "OpaqueMultiversioning");
}
public static final String ADD_P_OF = COMPOSITE_PREFIX + "ADD_P_OF" + POSTFIX;
static {
String regex = START + "addP_" + IS_REPLACED + MID + ".*" + END;

View File

@ -0,0 +1,124 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package compiler.loopopts.superword;
import compiler.lib.ir_framework.*;
/*
* @test
* @bug 8350756
* @summary Test case where the multiversion fast_loop disappears, and we should
* constant fold the multiversion_if, to remove the slow_loop.
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestMultiversionRemoveUselessSlowLoop
*/
public class TestMultiversionRemoveUselessSlowLoop {
public static void main(String[] args) {
TestFramework framework = new TestFramework(TestMultiversionRemoveUselessSlowLoop.class);
// No traps means we cannot use the predicates version for SuperWord / AutoVectorization,
// and instead use multiversioning directly.
framework.addFlags("-XX:-TieredCompilation", "-XX:PerMethodTrapLimit=0");
framework.setDefaultWarmup(0); // simulates Xcomp
framework.start();
}
public static final int SIZE = 20;
public static final int[] a = new int[SIZE];
public static final int[] b = new int[SIZE];
public static final int SIZE2 = 10_000;
public static final int[] a2 = new int[SIZE2];
public static final int[] b2 = new int[SIZE2];
@Test
@IR(counts = {"pre .* multiversion_fast", "= 2", // regular pre-main-post for both loops
"main .* multiversion_fast", "= 2",
"post .* multiversion_fast", "= 2",
"multiversion_delayed_slow", "= 2", // both have the delayed slow_loop
"multiversion", "= 8", // nothing unexpected
IRNode.OPAQUE_MULTIVERSIONING, "= 2"}, // Both multiversion_if are still here
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
phase = CompilePhase.PHASEIDEALLOOP1)
@IR(counts = {"pre .* multiversion_fast", "= 2",
"main .* multiversion_fast", "= 1", // The first main loop is fully unrolled
"post .* multiversion_fast", "= 3", // the second loop is vectorized, and has a vectorized post loop
"multiversion_delayed_slow", "= 1", // As a consequence of the first main loop being removed, we constant fold the multiversion_if
"multiversion", "= 7", // nothing unexpected
IRNode.OPAQUE_MULTIVERSIONING, "= 1"}, // The multiversion_if of the first loop was constant folded, because the main loop disappeared.
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
phase = CompilePhase.PHASEIDEALLOOP_ITERATIONS)
@IR(counts = {"pre .* multiversion_fast.*", ">= 1", // In some cases, the pre loop of the first loop also disappears because it only has a single iteration
"pre .* multiversion_fast.*", "<= 2", // but not in other cases the pre loop of the first loop remains.
"main .* multiversion_fast", "= 1",
"post .* multiversion_fast", "= 3",
"multiversion_delayed_slow", "= 0", // The second loop's multiversion_if was also not used, so it is constant folded after loop opts.
"multiversion", ">= 5", // nothing unexpected
"multiversion", "<= 6", // nothing unexpected
IRNode.OPAQUE_MULTIVERSIONING, "= 0"}, // After loop-opts, we also constant fold the multiversion_if of the second loop, as it is unused.
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"},
phase = CompilePhase.PRINT_IDEAL)
public static void testIR() {
// This loop is short, and the multiversion_fast main loop eventuall is fully unrolled.
for (int i = 0; i < SIZE; i++) {
a[i] = b[i];
}
// We take this second loop with a larger limit so that loop opts keeps going once the loop
// above is fully optimized. It also gives us a reference where the main loop of the
// multiverion fast_loop does not disappear.
for (int i = 0; i < SIZE2; i++) {
a2[i] = b2[i];
}
}
static long instanceCount;
static int iFld;
static int iFld1;
// The inner loop is Multiversioned, then PreMainPost and Unroll.
// Eventually, both the fast and slow loops (pre main and post) disappear,
// and leave us with a simple if-diamond using the multiversion_if.
//
// Verification code in PhaseIdealLoop::conditional_move finds this diamond
// and expects a Bool but gets an OpaqueMultiversioning instead.
//
// If we let the multiversion_if constant fold soon after the main fast loop
// disappears, then this issue does not occur any more.
@Test
public static void testCrash() {
boolean b2 = true;
for (int i = 0; i < 1000; i++) {
for (int i21 = 82; i21 > 9; --i21) {
if (b2)
break;
iFld1 = iFld;
b2 = true;
}
instanceCount = iFld1;
}
}
}