8373453: C2 SuperWord: must handle load slices that have loads with different memory inputs

Reviewed-by: kvn, thartmann, qamai
This commit is contained in:
Emanuel Peter 2026-01-07 12:37:52 +00:00
parent 929864b1a4
commit da14813a5b
3 changed files with 116 additions and 7 deletions

View File

@ -187,7 +187,10 @@ VStatus VLoopAnalyzer::setup_submodules_helper() {
return body_status;
}
_memory_slices.find_memory_slices();
VStatus slices_status = _memory_slices.find_memory_slices();
if (!slices_status.is_success()) {
return slices_status;
}
// If there is no memory slice detected, it means there is no store.
// If there is no reduction and no store, then we give up, because
@ -207,9 +210,11 @@ VStatus VLoopAnalyzer::setup_submodules_helper() {
}
// There are 2 kinds of slices:
// - No memory phi: only loads. All have the same input memory state from before the loop.
// - No memory phi: only loads.
// - Usually, all loads have the same input memory state from before the loop.
// - Only rarely this is not the case, and we just bail out for now.
// - With memory phi. Chain of memory operations inside the loop.
void VLoopMemorySlices::find_memory_slices() {
VStatus VLoopMemorySlices::find_memory_slices() {
Compile* C = _vloop.phase()->C;
// We iterate over the body, which is topologically sorted. Hence, if there is a phi
// in a slice, we will find it first, and the loads and stores afterwards.
@ -228,8 +233,15 @@ void VLoopMemorySlices::find_memory_slices() {
PhiNode* head = _heads.at(alias_idx);
if (head == nullptr) {
// We did not find a phi on this slice yet -> must be a slice with only loads.
assert(_inputs.at(alias_idx) == nullptr || _inputs.at(alias_idx) == load->in(1),
"not yet touched or the same input");
// For now, we can only handle slices with a single memory input before the loop,
// so if we find multiple, we bail out of auto vectorization. If this becomes
// too restrictive in the fututure, we could consider tracking multiple inputs.
// Different memory inputs can for example happen if one load has its memory state
// optimized, and the other load fails to have it optimized, for example because
// it does not end up on the IGVN worklist any more.
if (_inputs.at(alias_idx) != nullptr && _inputs.at(alias_idx) != load->in(1)) {
return VStatus::make_failure(FAILURE_DIFFERENT_MEMORY_INPUT);
}
_inputs.at_put(alias_idx, load->in(1));
} // else: the load belongs to a slice with a phi that already set heads and inputs.
#ifdef ASSERT
@ -243,6 +255,7 @@ void VLoopMemorySlices::find_memory_slices() {
}
}
NOT_PRODUCT( if (_vloop.is_trace_memory_slices()) { print(); } )
return VStatus::make_success();
}
#ifndef PRODUCT

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2023, 2026, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2023, Arm Limited. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -504,6 +504,8 @@ private:
//
class VLoopMemorySlices : public StackObj {
private:
static constexpr char const* FAILURE_DIFFERENT_MEMORY_INPUT = "Load only slice has multiple memory inputs";
const VLoop& _vloop;
const VLoopBody& _body;
@ -521,7 +523,7 @@ public:
const GrowableArray<Node*>& inputs() const { return _inputs; }
const GrowableArray<PhiNode*>& heads() const { return _heads; }
void find_memory_slices();
VStatus find_memory_slices();
void get_slice_in_reverse_order(PhiNode* head, MemNode* tail, GrowableArray<MemNode*>& slice) const;
bool same_memory_slice(MemNode* m1, MemNode* m2) const;

View File

@ -0,0 +1,94 @@
/*
* Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* @test id=all-flags
* @summary Test a case where we can have one memory slice that has only loads,
* but the loads from the slice do not have all the same input memory
* state from before the loop. This is rather rare but it can happen.
* @bug 8373453
* @run main/othervm
* -XX:CompileCommand=compileonly,${test.main.class}::test
* -Xbatch -XX:-TieredCompilation
* ${test.main.class}
*/
/*
* @test id=fewer-flags
* @bug 8373453
* @run main/othervm
* -XX:CompileCommand=compileonly,${test.main.class}::test
* ${test.main.class}
*/
/*
* @test id=vanilla
* @bug 8373453
* @run main ${test.main.class}
*/
package compiler.loopopts.superword;
public class TestLoadSliceWithMultipleMemoryInputStates {
static void test() {
// The relevant slice is the value field of the Byte Objects.
Byte x = 1;
for (int i = 0; i < 2; i++) {
if ((i & 1) == 0) {
// Not sure what this loop is needed for, but it is very sensitive,
// I cannot even replace N with 32.
int N = 32;
for (int j = 0; j < N; j++) {
if (j == 1) {
x = (byte) x;
}
}
for (int j = 0; j < 32; j++) {
// The call below has an effect on the memory state
// If we optimize the Load for Byte::value, we can bypass
// this call, since we know that Byte::value cannot be
// modified during the call.
Object o = 1;
o.toString();
for (int k = 0; k < 32; k++) { // OSR around here
// Loads of x byte field have different memory input states
// This is because some loads can split their memory state
// through a phi further up, and others are not put back on
// the IGVN worklist and are thus not optimized and keep
// the old memory state. Both are correct though.
x = (byte) (x + 1);
}
}
}
}
}
public static void main(String[] args) {
for (int i = 0; i < 10_000; i++) {
test();
}
}
}