diff --git a/src/hotspot/share/opto/loopnode.cpp b/src/hotspot/share/opto/loopnode.cpp index 8be412412ea..1ccc132fd38 100644 --- a/src/hotspot/share/opto/loopnode.cpp +++ b/src/hotspot/share/opto/loopnode.cpp @@ -41,6 +41,7 @@ #include "opto/movenode.hpp" #include "opto/mulnode.hpp" #include "opto/opaquenode.hpp" +#include "opto/opcodes.hpp" #include "opto/predicates.hpp" #include "opto/rootnode.hpp" #include "opto/runtime.hpp" @@ -2731,6 +2732,20 @@ Node* CountedLoopNode::match_incr_with_optional_truncation(Node* expr, Node** tr return nullptr; } +IfNode* CountedLoopNode::find_multiversion_if_from_multiversion_fast_main_loop() { + assert(is_main_loop() && is_multiversion_fast_loop(), "must be multiversion fast main loop"); + CountedLoopEndNode* pre_end = find_pre_loop_end(); + if (pre_end == nullptr) { return nullptr; } + Node* pre_entry = pre_end->loopnode()->in(LoopNode::EntryControl); + const Predicates predicates(pre_entry); + IfTrueNode* before_predicates = predicates.entry()->isa_IfTrue(); + if (before_predicates != nullptr && + before_predicates->in(0)->in(1)->is_OpaqueMultiversioning()) { + return before_predicates->in(0)->as_If(); + } + return nullptr; +} + LoopNode* CountedLoopNode::skip_strip_mined(int expect_skeleton) { if (is_strip_mined() && in(EntryControl) != nullptr && in(EntryControl)->is_OuterStripMinedLoop()) { verify_strip_mined(expect_skeleton); @@ -4536,6 +4551,49 @@ void PhaseIdealLoop::eliminate_useless_zero_trip_guard() { } } +void PhaseIdealLoop::eliminate_useless_multiversion_if() { + if (_multiversion_opaque_nodes.size() == 0) { + return; + } + + ResourceMark rm; + Unique_Node_List useful_multiversioning_opaque_nodes; + + // The OpaqueMultiversioning is only used from the fast main loop in AutoVectorization, to add + // speculative runtime-checks to the multiversion_if. Thus, a OpaqueMultiversioning is only + // useful if it can be found from a fast main loop. If it can not be found from a fast main loop, + // then we cannot ever use that multiversion_if to add more speculative runtime-checks, and hence + // it is useless. If it is still in delayed mode, i.e. has not yet had any runtime-checks added, + // then we can let it constant fold towards the fast loop. + for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) { + IdealLoopTree* lpt = iter.current(); + if (lpt->_child == nullptr && lpt->is_counted()) { + CountedLoopNode* head = lpt->_head->as_CountedLoop(); + if (head->is_main_loop() && head->is_multiversion_fast_loop()) { + // There are fast_loop pre/main/post loops, but the finding traversal starts at the main + // loop, and traverses via the fast pre loop to the multiversion_if. + IfNode* multiversion_if = head->find_multiversion_if_from_multiversion_fast_main_loop(); + if (multiversion_if != nullptr) { + useful_multiversioning_opaque_nodes.push(multiversion_if->in(1)->as_OpaqueMultiversioning()); + } + } + } + } + + for (uint i = 0; i < _multiversion_opaque_nodes.size(); i++) { + OpaqueMultiversioningNode* opaque = _multiversion_opaque_nodes.at(i)->as_OpaqueMultiversioning(); + if (!useful_multiversioning_opaque_nodes.member(opaque)) { + if (opaque->is_delayed_slow_loop()) { + // We cannot hack the node directly, otherwise the slow_loop will complain that it cannot + // find the multiversioning opaque node. Instead, we mark the opaque node as useless, and + // it can be constant folded during IGVN. + opaque->mark_useless(); + _igvn._worklist.push(opaque); + } + } + } +} + //------------------------process_expensive_nodes----------------------------- // Expensive nodes have their control input set to prevent the GVN // from commoning them and as a result forcing the resulting node to @@ -4805,6 +4863,7 @@ void PhaseIdealLoop::build_and_optimize() { } eliminate_useless_zero_trip_guard(); + eliminate_useless_multiversion_if(); if (stop_early) { assert(do_expensive_nodes, "why are we here?"); @@ -6596,6 +6655,9 @@ void PhaseIdealLoop::build_loop_late_post_work(Node *n, bool pinned) { _zero_trip_guard_opaque_nodes.push(n); } + if (!_verify_only && n->Opcode() == Op_OpaqueMultiversioning) { + _multiversion_opaque_nodes.push(n); + } } #ifdef ASSERT diff --git a/src/hotspot/share/opto/loopnode.hpp b/src/hotspot/share/opto/loopnode.hpp index a9c5d697c6b..dd310e769d6 100644 --- a/src/hotspot/share/opto/loopnode.hpp +++ b/src/hotspot/share/opto/loopnode.hpp @@ -290,6 +290,8 @@ public: bool has_atomic_post_loop () const { return (_loop_flags & HasAtomicPostLoop) == HasAtomicPostLoop; } void set_main_no_pre_loop() { _loop_flags |= MainHasNoPreLoop; } + IfNode* find_multiversion_if_from_multiversion_fast_main_loop(); + int main_idx() const { return _main_idx; } @@ -932,6 +934,7 @@ private: // clear out dead code after build_loop_late Node_List _deadlist; Node_List _zero_trip_guard_opaque_nodes; + Node_List _multiversion_opaque_nodes; // Support for faster execution of get_late_ctrl()/dom_lca() // when a node has many uses and dominator depth is deep. @@ -1453,6 +1456,7 @@ public: void eliminate_useless_template_assertion_predicates(Unique_Node_List& useful_predicates); void eliminate_useless_zero_trip_guard(); + void eliminate_useless_multiversion_if(); public: // Change the control input of expensive nodes to allow commoning by diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp index 2d564c3c8cf..125399ea11a 100644 --- a/src/hotspot/share/opto/loopopts.cpp +++ b/src/hotspot/share/opto/loopopts.cpp @@ -791,7 +791,10 @@ Node *PhaseIdealLoop::conditional_move( Node *region ) { // Ignore Template Assertion Predicates with OpaqueTemplateAssertionPredicate nodes. return nullptr; } - assert(bol->Opcode() == Op_Bool, "Unexpected node"); + if (!bol->is_Bool()) { + assert(false, "Expected Bool, but got %s", NodeClassNames[bol->Opcode()]); + return nullptr; + } int cmp_op = bol->in(1)->Opcode(); if (cmp_op == Op_SubTypeCheck) { // SubTypeCheck expansion expects an IfNode return nullptr; diff --git a/src/hotspot/share/opto/opaquenode.cpp b/src/hotspot/share/opto/opaquenode.cpp index 672ea13e507..949bdfbc839 100644 --- a/src/hotspot/share/opto/opaquenode.cpp +++ b/src/hotspot/share/opto/opaquenode.cpp @@ -82,6 +82,26 @@ IfNode* OpaqueZeroTripGuardNode::if_node() const { return iff->as_If(); } +Node* OpaqueMultiversioningNode::Identity(PhaseGVN* phase) { + // Constant fold the multiversion_if. Since the slow_loop is still delayed, + // i.e. we have not yet added any possibly failing condition, we can just + // take the true branch in all cases. + if (_useless) { + assert(_is_delayed_slow_loop, "the slow_loop should still be delayed"); + return in(1); + } + return Opaque1Node::Identity(phase); +} + +#ifndef PRODUCT +void OpaqueMultiversioningNode::dump_spec(outputStream *st) const { + Opaque1Node::dump_spec(st); + if (_useless) { + st->print(" #useless"); + } +} +#endif + const Type* OpaqueNotNullNode::Value(PhaseGVN* phase) const { return phase->type(in(1)); } diff --git a/src/hotspot/share/opto/opaquenode.hpp b/src/hotspot/share/opto/opaquenode.hpp index 0e8a53efd34..80d5ae6cd3e 100644 --- a/src/hotspot/share/opto/opaquenode.hpp +++ b/src/hotspot/share/opto/opaquenode.hpp @@ -101,17 +101,30 @@ public: class OpaqueMultiversioningNode : public Opaque1Node { private: bool _is_delayed_slow_loop; + bool _useless; public: OpaqueMultiversioningNode(Compile* C, Node* n) : - Opaque1Node(C, n), _is_delayed_slow_loop(true) + Opaque1Node(C, n), _is_delayed_slow_loop(true), _useless(false) { init_class_id(Class_OpaqueMultiversioning); } virtual int Opcode() const; virtual const Type* bottom_type() const { return TypeInt::BOOL; } bool is_delayed_slow_loop() const { return _is_delayed_slow_loop; } - void notify_slow_loop_that_it_can_resume_optimizations() { _is_delayed_slow_loop = false; } + + void notify_slow_loop_that_it_can_resume_optimizations() { + assert(!_useless, "must still be useful"); + _is_delayed_slow_loop = false; + } + + void mark_useless() { + assert(_is_delayed_slow_loop, "must still be delayed"); + _useless = true; + } + + virtual Node* Identity(PhaseGVN* phase); + NOT_PRODUCT(virtual void dump_spec(outputStream* st) const;) }; // This node is used in the context of intrinsics. We sometimes implicitly know that an object is non-null even though diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java index 6e373207d9c..20a0af1f111 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java @@ -329,6 +329,11 @@ public class IRNode { superWordNodes(ADD_REDUCTION_VL, "AddReductionVL"); } + public static final String OPAQUE_MULTIVERSIONING = PREFIX + "OPAQUE_MULTIVERSIONING" + POSTFIX; + static { + beforeMatchingNameRegex(OPAQUE_MULTIVERSIONING, "OpaqueMultiversioning"); + } + public static final String ADD_P_OF = COMPOSITE_PREFIX + "ADD_P_OF" + POSTFIX; static { String regex = START + "addP_" + IS_REPLACED + MID + ".*" + END; diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestMultiversionRemoveUselessSlowLoop.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestMultiversionRemoveUselessSlowLoop.java new file mode 100644 index 00000000000..7d2f56c66f3 --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestMultiversionRemoveUselessSlowLoop.java @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.loopopts.superword; + +import compiler.lib.ir_framework.*; + +/* + * @test + * @bug 8350756 + * @summary Test case where the multiversion fast_loop disappears, and we should + * constant fold the multiversion_if, to remove the slow_loop. + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestMultiversionRemoveUselessSlowLoop + */ + +public class TestMultiversionRemoveUselessSlowLoop { + + public static void main(String[] args) { + TestFramework framework = new TestFramework(TestMultiversionRemoveUselessSlowLoop.class); + // No traps means we cannot use the predicates version for SuperWord / AutoVectorization, + // and instead use multiversioning directly. + framework.addFlags("-XX:-TieredCompilation", "-XX:PerMethodTrapLimit=0"); + framework.setDefaultWarmup(0); // simulates Xcomp + framework.start(); + } + + public static final int SIZE = 20; + public static final int[] a = new int[SIZE]; + public static final int[] b = new int[SIZE]; + public static final int SIZE2 = 10_000; + public static final int[] a2 = new int[SIZE2]; + public static final int[] b2 = new int[SIZE2]; + + @Test + @IR(counts = {"pre .* multiversion_fast", "= 2", // regular pre-main-post for both loops + "main .* multiversion_fast", "= 2", + "post .* multiversion_fast", "= 2", + "multiversion_delayed_slow", "= 2", // both have the delayed slow_loop + "multiversion", "= 8", // nothing unexpected + IRNode.OPAQUE_MULTIVERSIONING, "= 2"}, // Both multiversion_if are still here + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + phase = CompilePhase.PHASEIDEALLOOP1) + @IR(counts = {"pre .* multiversion_fast", "= 2", + "main .* multiversion_fast", "= 1", // The first main loop is fully unrolled + "post .* multiversion_fast", "= 3", // the second loop is vectorized, and has a vectorized post loop + "multiversion_delayed_slow", "= 1", // As a consequence of the first main loop being removed, we constant fold the multiversion_if + "multiversion", "= 7", // nothing unexpected + IRNode.OPAQUE_MULTIVERSIONING, "= 1"}, // The multiversion_if of the first loop was constant folded, because the main loop disappeared. + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + phase = CompilePhase.PHASEIDEALLOOP_ITERATIONS) + @IR(counts = {"pre .* multiversion_fast.*", ">= 1", // In some cases, the pre loop of the first loop also disappears because it only has a single iteration + "pre .* multiversion_fast.*", "<= 2", // but not in other cases the pre loop of the first loop remains. + "main .* multiversion_fast", "= 1", + "post .* multiversion_fast", "= 3", + "multiversion_delayed_slow", "= 0", // The second loop's multiversion_if was also not used, so it is constant folded after loop opts. + "multiversion", ">= 5", // nothing unexpected + "multiversion", "<= 6", // nothing unexpected + IRNode.OPAQUE_MULTIVERSIONING, "= 0"}, // After loop-opts, we also constant fold the multiversion_if of the second loop, as it is unused. + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + phase = CompilePhase.PRINT_IDEAL) + public static void testIR() { + // This loop is short, and the multiversion_fast main loop eventuall is fully unrolled. + for (int i = 0; i < SIZE; i++) { + a[i] = b[i]; + } + // We take this second loop with a larger limit so that loop opts keeps going once the loop + // above is fully optimized. It also gives us a reference where the main loop of the + // multiverion fast_loop does not disappear. + for (int i = 0; i < SIZE2; i++) { + a2[i] = b2[i]; + } + } + + static long instanceCount; + static int iFld; + static int iFld1; + + // The inner loop is Multiversioned, then PreMainPost and Unroll. + // Eventually, both the fast and slow loops (pre main and post) disappear, + // and leave us with a simple if-diamond using the multiversion_if. + // + // Verification code in PhaseIdealLoop::conditional_move finds this diamond + // and expects a Bool but gets an OpaqueMultiversioning instead. + // + // If we let the multiversion_if constant fold soon after the main fast loop + // disappears, then this issue does not occur any more. + @Test + public static void testCrash() { + boolean b2 = true; + for (int i = 0; i < 1000; i++) { + for (int i21 = 82; i21 > 9; --i21) { + if (b2) + break; + iFld1 = iFld; + b2 = true; + } + instanceCount = iFld1; + } + } +}