From da14813a5bdadaf0a1f81fa57ff6e1b103eaf113 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 7 Jan 2026 12:37:52 +0000 Subject: [PATCH] 8373453: C2 SuperWord: must handle load slices that have loads with different memory inputs Reviewed-by: kvn, thartmann, qamai --- src/hotspot/share/opto/vectorization.cpp | 23 ++++- src/hotspot/share/opto/vectorization.hpp | 6 +- ...oadSliceWithMultipleMemoryInputStates.java | 94 +++++++++++++++++++ 3 files changed, 116 insertions(+), 7 deletions(-) create mode 100644 test/hotspot/jtreg/compiler/loopopts/superword/TestLoadSliceWithMultipleMemoryInputStates.java diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index f81ee1b7ddb..1755b0453eb 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -187,7 +187,10 @@ VStatus VLoopAnalyzer::setup_submodules_helper() { return body_status; } - _memory_slices.find_memory_slices(); + VStatus slices_status = _memory_slices.find_memory_slices(); + if (!slices_status.is_success()) { + return slices_status; + } // If there is no memory slice detected, it means there is no store. // If there is no reduction and no store, then we give up, because @@ -207,9 +210,11 @@ VStatus VLoopAnalyzer::setup_submodules_helper() { } // There are 2 kinds of slices: -// - No memory phi: only loads. All have the same input memory state from before the loop. +// - No memory phi: only loads. +// - Usually, all loads have the same input memory state from before the loop. +// - Only rarely this is not the case, and we just bail out for now. // - With memory phi. Chain of memory operations inside the loop. -void VLoopMemorySlices::find_memory_slices() { +VStatus VLoopMemorySlices::find_memory_slices() { Compile* C = _vloop.phase()->C; // We iterate over the body, which is topologically sorted. Hence, if there is a phi // in a slice, we will find it first, and the loads and stores afterwards. @@ -228,8 +233,15 @@ void VLoopMemorySlices::find_memory_slices() { PhiNode* head = _heads.at(alias_idx); if (head == nullptr) { // We did not find a phi on this slice yet -> must be a slice with only loads. - assert(_inputs.at(alias_idx) == nullptr || _inputs.at(alias_idx) == load->in(1), - "not yet touched or the same input"); + // For now, we can only handle slices with a single memory input before the loop, + // so if we find multiple, we bail out of auto vectorization. If this becomes + // too restrictive in the fututure, we could consider tracking multiple inputs. + // Different memory inputs can for example happen if one load has its memory state + // optimized, and the other load fails to have it optimized, for example because + // it does not end up on the IGVN worklist any more. + if (_inputs.at(alias_idx) != nullptr && _inputs.at(alias_idx) != load->in(1)) { + return VStatus::make_failure(FAILURE_DIFFERENT_MEMORY_INPUT); + } _inputs.at_put(alias_idx, load->in(1)); } // else: the load belongs to a slice with a phi that already set heads and inputs. #ifdef ASSERT @@ -243,6 +255,7 @@ void VLoopMemorySlices::find_memory_slices() { } } NOT_PRODUCT( if (_vloop.is_trace_memory_slices()) { print(); } ) + return VStatus::make_success(); } #ifndef PRODUCT diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 9308712f78a..b187435c04d 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2023, 2026, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2023, Arm Limited. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * @@ -504,6 +504,8 @@ private: // class VLoopMemorySlices : public StackObj { private: + static constexpr char const* FAILURE_DIFFERENT_MEMORY_INPUT = "Load only slice has multiple memory inputs"; + const VLoop& _vloop; const VLoopBody& _body; @@ -521,7 +523,7 @@ public: const GrowableArray& inputs() const { return _inputs; } const GrowableArray& heads() const { return _heads; } - void find_memory_slices(); + VStatus find_memory_slices(); void get_slice_in_reverse_order(PhiNode* head, MemNode* tail, GrowableArray& slice) const; bool same_memory_slice(MemNode* m1, MemNode* m2) const; diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestLoadSliceWithMultipleMemoryInputStates.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestLoadSliceWithMultipleMemoryInputStates.java new file mode 100644 index 00000000000..7bd85d343cd --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestLoadSliceWithMultipleMemoryInputStates.java @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * @test id=all-flags + * @summary Test a case where we can have one memory slice that has only loads, + * but the loads from the slice do not have all the same input memory + * state from before the loop. This is rather rare but it can happen. + * @bug 8373453 + * @run main/othervm + * -XX:CompileCommand=compileonly,${test.main.class}::test + * -Xbatch -XX:-TieredCompilation + * ${test.main.class} + */ + +/* + * @test id=fewer-flags + * @bug 8373453 + * @run main/othervm + * -XX:CompileCommand=compileonly,${test.main.class}::test + * ${test.main.class} + */ + +/* + * @test id=vanilla + * @bug 8373453 + * @run main ${test.main.class} + */ + +package compiler.loopopts.superword; + +public class TestLoadSliceWithMultipleMemoryInputStates { + static void test() { + // The relevant slice is the value field of the Byte Objects. + Byte x = 1; + + for (int i = 0; i < 2; i++) { + if ((i & 1) == 0) { + // Not sure what this loop is needed for, but it is very sensitive, + // I cannot even replace N with 32. + int N = 32; + for (int j = 0; j < N; j++) { + if (j == 1) { + x = (byte) x; + } + } + + for (int j = 0; j < 32; j++) { + // The call below has an effect on the memory state + // If we optimize the Load for Byte::value, we can bypass + // this call, since we know that Byte::value cannot be + // modified during the call. + Object o = 1; + o.toString(); + + for (int k = 0; k < 32; k++) { // OSR around here + // Loads of x byte field have different memory input states + // This is because some loads can split their memory state + // through a phi further up, and others are not put back on + // the IGVN worklist and are thus not optimized and keep + // the old memory state. Both are correct though. + x = (byte) (x + 1); + } + } + } + } + } + + public static void main(String[] args) { + for (int i = 0; i < 10_000; i++) { + test(); + } + } +}