jdk/src/hotspot/share/opto/vtransform.cpp

/*
* Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
#include "opto/castnode.hpp"
#include "opto/convertnode.hpp"
#include "opto/rootnode.hpp"
#include "opto/vectorization.hpp"
#include "opto/vectornode.hpp"
#include "opto/vtransform.hpp"
void VTransformGraph::add_vtnode(VTransformNode* vtnode) {
assert(vtnode->_idx == _vtnodes.length(), "position must match idx");
_vtnodes.push(vtnode);
}
#define TRACE_OPTIMIZE(code) \
NOT_PRODUCT( \
if (vtransform.vloop().is_trace_optimization()) { \
code \
} \
)
// This is similar to IGVN optimization. But we are a bit lazy, and don't care about
// notification / worklist, since the list of nodes is rather small, and we don't
// expect optimizations that trickle over the whole graph.
void VTransformGraph::optimize(VTransform& vtransform) {
TRACE_OPTIMIZE( tty->print_cr("\nVTransformGraph::optimize"); )
bool progress = true;
DEBUG_ONLY(int pass_count = 0;)
while (progress) {
progress = false;
assert(++pass_count < 10, "ensure we do not have endless loops");
for (int i = 0; i < _vtnodes.length(); i++) {
VTransformNode* vtn = _vtnodes.at(i);
if (!vtn->is_alive()) { continue; }
progress |= vtn->optimize(_vloop_analyzer, vtransform);
// Nodes that have no use any more are dead.
if (vtn->out_strong_edges() == 0 &&
// There are some exceptions:
// 1. Memory phi uses are not modeled, so they appear to have no use here, but must be kept alive.
// 2. Similarly, some stores may not have their memory uses modeled, but need to be kept alive.
// 3. An Outer node with strong inputs is a use after the loop that we must keep alive.
!(vtn->isa_PhiScalar() != nullptr ||
vtn->is_load_or_store_in_loop() ||
(vtn->isa_Outer() != nullptr && vtn->has_strong_in_edge()))) {
vtn->mark_dead();
progress = true;
}
}
}
}
// Compute a linearization of the graph. We do this with a reverse-post-order of a DFS.
// This only works if the graph is a directed acyclic graph (DAG). The C2 graph, and
// the VLoopDependencyGraph are both DAGs, but after introduction of vectors/packs, the
// graph has additional constraints which can introduce cycles. Example:
//
//                                  +--------+
//  A -> X                          |        v
//  Pack [A,B] and [X,Y]          [A,B]    [X,Y]
//  Y -> B                          ^        |
//                                  +--------+
//
// We return "true" IFF we find no cycle, i.e. if the linearization succeeds.
bool VTransformGraph::schedule() {
assert(!is_scheduled(), "not yet scheduled");
#ifndef PRODUCT
if (_trace._verbose) {
print_vtnodes();
}
#endif
ResourceMark rm;
GrowableArray<VTransformNode*> stack;
VectorSet pre_visited;
VectorSet post_visited;
collect_nodes_without_strong_in_edges(stack);
const int num_alive_nodes = count_alive_vtnodes();
// We create a reverse-post-visit order. This gives us a linearization, if there are
// no cycles. Then, we simply reverse the order, and we have a schedule.
int rpo_idx = num_alive_nodes - 1;
while (!stack.is_empty()) {
VTransformNode* vtn = stack.top();
if (!pre_visited.test_set(vtn->_idx)) {
// Forward arc in graph (pre-visit).
} else if (!post_visited.test(vtn->_idx)) {
// Forward arc in graph. Check if all uses were already visited:
// Yes -> post-visit.
// No -> we are mid-visit.
bool all_uses_already_visited = true;
// We only need to respect the strong edges (data edges and strong memory edges).
// Violated weak memory edges are allowed, but require a speculative aliasing
// runtime check, see VTransform::apply_speculative_aliasing_runtime_checks.
for (uint i = 0; i < vtn->out_strong_edges(); i++) {
VTransformNode* use = vtn->out_strong_edge(i);
// Skip dead nodes
if (!use->is_alive()) { continue; }
// Skip backedges.
if ((use->is_loop_head_phi() || use->isa_CountedLoop() != nullptr) && use->in_req(2) == vtn) {
continue;
}
if (post_visited.test(use->_idx)) { continue; }
if (pre_visited.test(use->_idx)) {
// Cycle detected!
// The nodes that are pre_visited but not yet post_visited form a path from
// the "root" to the current vtn. Now, we are looking at an edge (vtn, use),
// and discover that use is also pre_visited but not post_visited. Thus, use
// lies on that path from "root" to vtn, and the edge (vtn, use) closes a
// cycle.
NOT_PRODUCT(if (_trace._rejections) { trace_schedule_cycle(stack, pre_visited, post_visited); } )
return false;
}
stack.push(use);
all_uses_already_visited = false;
}
if (all_uses_already_visited) {
stack.pop();
post_visited.set(vtn->_idx); // post-visit
_schedule.at_put_grow(rpo_idx--, vtn); // assign rpo_idx
}
} else {
stack.pop(); // Already post-visited. Ignore secondary edge.
}
}
#ifndef PRODUCT
if (_trace._info) {
print_schedule();
}
#endif
assert(rpo_idx == -1, "used up all rpo_idx, rpo_idx=%d", rpo_idx);
return true;
}
// Push all "root" nodes, i.e. those that have no strong input edges (data edges and strong memory edges):
void VTransformGraph::collect_nodes_without_strong_in_edges(GrowableArray<VTransformNode*>& stack) const {
for (int i = 0; i < _vtnodes.length(); i++) {
VTransformNode* vtn = _vtnodes.at(i);
if (!vtn->is_alive()) { continue; }
if (!vtn->has_strong_in_edge()) {
stack.push(vtn);
}
// If an Outer node has both inputs and outputs, we will most likely have cycles in the final graph.
// This is not a correctness problem, but it will just prevent vectorization. If this ever happens,
// try to find a way to avoid the cycle somehow.
assert(vtn->isa_Outer() == nullptr || (vtn->has_strong_in_edge() != (vtn->out_strong_edges() > 0)),
"Outer nodes should either be inputs or outputs, but not both, otherwise we may get cycles");
}
}
int VTransformGraph::count_alive_vtnodes() const {
int count = 0;
for (int i = 0; i < _vtnodes.length(); i++) {
VTransformNode* vtn = _vtnodes.at(i);
if (vtn->is_alive()) { count++; }
}
return count;
}
// Find all nodes that are in the loop, in a 2-phase process:
//  - First, find all nodes that are not before the loop:
//    - loop-phis
//    - loads and stores that are in the loop
//    - and all their transitive uses.
//  - Second, find all nodes that are not after the loop:
//    - backedges
//    - loads and stores that are in the loop
//    - and all their transitive defs.
//
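// For example (a minimal sketch): in
//
//   for (int i = 0; i < SIZE; i++) { aI[i] = x * y; }
//
// the invariant "x * y" is computed before the loop (phase 1 never marks it),
// while a reduction that was moved out of the loop (see
// optimize_move_non_strict_order_reductions_out_of_loop) comes after the loop
// (phase 2 does not mark it), even though both are connected to in-loop nodes.
//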
// in_loop: vtn->_idx -> bool
void VTransformGraph::mark_vtnodes_in_loop(VectorSet& in_loop) const {
assert(is_scheduled(), "must already be scheduled");
// Phase 1: find all nodes that are not before the loop.
VectorSet is_not_before_loop;
for (int i = 0; i < _schedule.length(); i++) {
VTransformNode* vtn = _schedule.at(i);
// Is vtn a loop-phi?
if (vtn->is_loop_head_phi() ||
vtn->is_load_or_store_in_loop()) {
is_not_before_loop.set(vtn->_idx);
continue;
}
// Or one of its transitive uses?
for (uint j = 0; j < vtn->req(); j++) {
VTransformNode* def = vtn->in_req(j);
if (def != nullptr && is_not_before_loop.test(def->_idx)) {
is_not_before_loop.set(vtn->_idx);
break;
}
}
}
// Phase 2: find all nodes that are not after the loop.
for (int i = _schedule.length()-1; i >= 0; i--) {
VTransformNode* vtn = _schedule.at(i);
if (!is_not_before_loop.test(vtn->_idx)) { continue; }
// Is load or store?
if (vtn->is_load_or_store_in_loop()) {
in_loop.set(vtn->_idx);
continue;
}
for (uint j = 0; j < vtn->out_strong_edges(); j++) {
VTransformNode* use = vtn->out_strong_edge(j);
// Or is vtn a backedge or one of its transitive defs?
if (in_loop.test(use->_idx) || use->is_loop_head_phi()) {
in_loop.set(vtn->_idx);
break;
}
}
}
}
float VTransformGraph::cost_for_vector_loop() const {
assert(is_scheduled(), "must already be scheduled");
#ifndef PRODUCT
if (_vloop.is_trace_cost()) {
tty->print_cr("\nVTransformGraph::cost_for_vector_loop:");
}
#endif
// We only want to count the cost of nodes that are in the loop.
// This is especially important for cases where we were able to move
// some nodes outside the loop during VTransform::optimize, e.g.:
// VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
ResourceMark rm;
VectorSet in_loop; // vtn->_idx -> bool
mark_vtnodes_in_loop(in_loop);
float sum = 0;
for (int i = 0; i < _schedule.length(); i++) {
VTransformNode* vtn = _schedule.at(i);
if (!in_loop.test(vtn->_idx)) { continue; }
float c = vtn->cost(_vloop_analyzer);
sum += c;
#ifndef PRODUCT
if (c != 0 && _vloop.is_trace_cost_verbose()) {
tty->print(" -> cost = %.2f for ", c);
vtn->print();
}
#endif
}
#ifndef PRODUCT
if (_vloop.is_trace_cost()) {
tty->print_cr(" total_cost = %.2f", sum);
}
#endif
return sum;
}
#ifndef PRODUCT
void VTransformGraph::trace_schedule_cycle(const GrowableArray<VTransformNode*>& stack,
const VectorSet& pre_visited,
const VectorSet& post_visited) const {
tty->print_cr("\nVTransform::schedule found a cycle on path (P), vectorization attempt fails.");
for (int j = 0; j < stack.length(); j++) {
VTransformNode* n = stack.at(j);
bool on_path = pre_visited.test(n->_idx) && !post_visited.test(n->_idx);
tty->print(" %s ", on_path ? "P" : "_");
n->print();
}
}
void VTransformApplyResult::trace(VTransformNode* vtnode) const {
tty->print(" apply: ");
vtnode->print();
tty->print(" -> ");
if (_node == nullptr) {
tty->print_cr("nullptr");
} else {
_node->dump();
}
}
#endif
void VTransform::apply_speculative_alignment_runtime_checks() {
if (VLoop::vectors_should_be_aligned()) {
#ifdef ASSERT
if (_trace._align_vector || _trace._speculative_runtime_checks) {
tty->print_cr("\nVTransform::apply_speculative_alignment_runtime_checks: native memory alignment");
}
#endif
const GrowableArray<VTransformNode*>& vtnodes = _graph.vtnodes();
for (int i = 0; i < vtnodes.length(); i++) {
VTransformMemVectorNode* vtn = vtnodes.at(i)->isa_MemVector();
if (vtn == nullptr) { continue; }
const VPointer& vp = vtn->vpointer();
if (vp.mem_pointer().base().is_object()) { continue; }
assert(vp.mem_pointer().base().is_native(), "VPointer base must be object or native");
// We have a native memory reference. Build a runtime check for it.
// See: AlignmentSolver::solve
// In a future RFE we may be able to speculate on invar alignment as
// well, and allow vectorization of more cases.
add_speculative_alignment_check(vp.mem_pointer().base().native(), ObjectAlignmentInBytes);
}
}
}
#define TRACE_SPECULATIVE_ALIGNMENT_CHECK(node) { \
DEBUG_ONLY( \
if (_trace._align_vector || _trace._speculative_runtime_checks) { \
tty->print(" " #node ": "); \
node->dump(); \
} \
) \
}
// Check: (node % alignment) == 0.
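// A worked example (a sketch, assuming alignment = 8, i.e. mask = 7):
//   adr = 0x1000 -> (0x1000 & 7) == 0 -> check passes.
//   adr = 0x1004 -> (0x1004 & 7) == 4 -> check fails, i.e. we deoptimize or take the scalar loop.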
void VTransform::add_speculative_alignment_check(Node* node, juint alignment) {
TRACE_SPECULATIVE_ALIGNMENT_CHECK(node);
Node* ctrl = phase()->get_ctrl(node);
// Cast adr/long -> int
if (node->bottom_type()->basic_type() == T_ADDRESS) {
// adr -> int/long
node = new CastP2XNode(nullptr, node);
phase()->register_new_node(node, ctrl);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(node);
}
if (node->bottom_type()->basic_type() == T_LONG) {
// long -> int
node = new ConvL2INode(node);
phase()->register_new_node(node, ctrl);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(node);
}
Node* mask_alignment = phase()->intcon(alignment-1);
Node* base_alignment = new AndINode(node, mask_alignment);
phase()->register_new_node(base_alignment, ctrl);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(mask_alignment);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(base_alignment);
Node* zero = phase()->intcon(0);
Node* cmp_alignment = CmpNode::make(base_alignment, zero, T_INT, false);
BoolNode* bol_alignment = new BoolNode(cmp_alignment, BoolTest::eq);
phase()->register_new_node(cmp_alignment, ctrl);
phase()->register_new_node(bol_alignment, ctrl);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(cmp_alignment);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(bol_alignment);
add_speculative_check([&] (Node* ctrl) { return bol_alignment; });
}
class VPointerWeakAliasingPair : public StackObj {
private:
// Using references instead of pointers would be preferable, but GrowableArray
// requires a default constructor, and we do not have a default constructor for
// VPointer.
const VPointer* _vp1 = nullptr;
const VPointer* _vp2 = nullptr;
VPointerWeakAliasingPair(const VPointer& vp1, const VPointer& vp2) : _vp1(&vp1), _vp2(&vp2) {
assert(vp1.is_valid(), "sanity");
assert(vp2.is_valid(), "sanity");
assert(!vp1.never_overlaps_with(vp2), "otherwise no aliasing");
assert(!vp1.always_overlaps_with(vp2), "otherwise must be strong");
assert(VPointer::cmp_summands_and_con(vp1, vp2) <= 0, "must be sorted");
}
public:
// Default constructor to make GrowableArray happy.
VPointerWeakAliasingPair() : _vp1(nullptr), _vp2(nullptr) {}
static VPointerWeakAliasingPair make(const VPointer& vp1, const VPointer& vp2) {
if (VPointer::cmp_summands_and_con(vp1, vp2) <= 0) {
return VPointerWeakAliasingPair(vp1, vp2);
} else {
return VPointerWeakAliasingPair(vp2, vp1);
}
}
const VPointer& vp1() const { return *_vp1; }
const VPointer& vp2() const { return *_vp2; }
// Sort by summands, so that pairs with the same summands (summands1, summands2) are adjacent.
static int cmp_for_sort(VPointerWeakAliasingPair* pair1, VPointerWeakAliasingPair* pair2) {
int cmp_summands1 = VPointer::cmp_summands(pair1->vp1(), pair2->vp1());
if (cmp_summands1 != 0) { return cmp_summands1; }
return VPointer::cmp_summands(pair1->vp2(), pair2->vp2());
}
};
void VTransform::apply_speculative_aliasing_runtime_checks() {
if (_vloop.use_speculative_aliasing_checks()) {
#ifdef ASSERT
if (_trace._speculative_aliasing_analysis || _trace._speculative_runtime_checks) {
tty->print_cr("\nVTransform::apply_speculative_aliasing_runtime_checks: speculative aliasing analysis runtime checks");
}
#endif
// It would be nice to add a ResourceMark here. But it would collide with resource allocation
// in PhaseIdealLoop::set_idom for _idom and _dom_depth. See also JDK-8337015.
VectorSet visited;
GrowableArray<VPointerWeakAliasingPair> weak_aliasing_pairs;
const GrowableArray<VTransformNode*>& schedule = _graph.get_schedule();
for (int i = 0; i < schedule.length(); i++) {
VTransformNode* vtn = schedule.at(i);
for (uint i = 0; i < vtn->out_weak_edges(); i++) {
VTransformNode* use = vtn->out_weak_edge(i);
if (visited.test(use->_idx)) {
// The use node was already visited, i.e. is higher up in the schedule.
// The "out" edge thus points backward, i.e. it is violated.
const VPointer& vp1 = vtn->vpointer();
const VPointer& vp2 = use->vpointer();
#ifdef ASSERT
if (_trace._speculative_aliasing_analysis || _trace._speculative_runtime_checks) {
tty->print_cr("\nViolated Weak Edge:");
vtn->print();
vp1.print_on(tty);
use->print();
vp2.print_on(tty);
}
#endif
// We could generate checks for the pair (vp1, vp2) directly. But in
// some graphs, this generates quadratically many checks. Example:
//
// set1: a[i+0] a[i+1] a[i+2] a[i+3]
// set2: b[i+0] b[i+1] b[i+2] b[i+3]
//
// We may have a weak memory edge between every memory access from
// set1 to every memory access from set2. In this example, this would
// be 4 * 4 = 16 checks. But instead, we can create a union VPointer
// for set1 and set2 each, and only create a single check.
//
// set1: a[i+0, size = 4]
// set2: b[i+0, size = 4]
//
// For this, we add all pairs to an array, and process it below.
weak_aliasing_pairs.push(VPointerWeakAliasingPair::make(vp1, vp2));
}
}
visited.set(vtn->_idx);
}
// Sort so that all pairs with the same summands (summands1, summands2)
// are consecutive, i.e. in the same group. This allows us to do a linear
// walk over all pairs of a group and create the union VPointers.
weak_aliasing_pairs.sort(VPointerWeakAliasingPair::cmp_for_sort);
int group_start = 0;
while (group_start < weak_aliasing_pairs.length()) {
// New group: pick the first pair as the reference.
const VPointer* vp1 = &weak_aliasing_pairs.at(group_start).vp1();
const VPointer* vp2 = &weak_aliasing_pairs.at(group_start).vp2();
jint size1 = vp1->size();
jint size2 = vp2->size();
int group_end = group_start + 1;
while (group_end < weak_aliasing_pairs.length()) {
const VPointer* vp1_next = &weak_aliasing_pairs.at(group_end).vp1();
const VPointer* vp2_next = &weak_aliasing_pairs.at(group_end).vp2();
jint size1_next = vp1_next->size();
jint size2_next = vp2_next->size();
// Different summands -> different group.
if (VPointer::cmp_summands(*vp1, *vp1_next) != 0) { break; }
if (VPointer::cmp_summands(*vp2, *vp2_next) != 0) { break; }
// Pick the one with the lower con as the reference.
if (vp1->con() > vp1_next->con()) {
swap(vp1, vp1_next);
swap(size1, size1_next);
}
if (vp2->con() > vp2_next->con()) {
swap(vp2, vp2_next);
swap(size2, size2_next);
}
// Compute the distance from vp1 to vp1_next + size, to get a size that would include vp1_next.
NoOverflowInt new_size1 = NoOverflowInt(vp1_next->con()) + NoOverflowInt(size1_next) - NoOverflowInt(vp1->con());
NoOverflowInt new_size2 = NoOverflowInt(vp2_next->con()) + NoOverflowInt(size2_next) - NoOverflowInt(vp2->con());
if (new_size1.is_NaN() || new_size2.is_NaN()) { break; /* overflow -> new group */ }
// The "next" VPointer indeed belongs to the group.
//
// vp1: |-------------->
// vp1_next: |---------------->
// result: |-------------------------->
//
// vp1: |-------------------------->
// vp1_next: |------->
// result: |-------------------------->
//
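// For example (a sketch with made-up values): vp1 has con = 0 and size1 = 16,
// and vp1_next has con = 16 and size1_next = 16. Then new_size1 = 16 + 16 - 0 = 32,
// and the union [0, 32) covers both memory regions.
//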
size1 = MAX2(size1, new_size1.value());
size2 = MAX2(size2, new_size2.value());
group_end++;
}
// Create a "union" VPointer that covers all VPointers of the group.
const VPointer vp1_union = vp1->make_with_size(size1);
const VPointer vp2_union = vp2->make_with_size(size2);
#ifdef ASSERT
if (_trace._speculative_aliasing_analysis || _trace._speculative_runtime_checks) {
tty->print_cr("\nUnion of %d weak aliasing edges:", group_end - group_start);
vp1_union.print_on(tty);
vp2_union.print_on(tty);
}
// Verification: the union must contain all VPointers of the group.
for (int i = group_start; i < group_end; i++) {
const VPointer& vp1_i = weak_aliasing_pairs.at(i).vp1();
const VPointer& vp2_i = weak_aliasing_pairs.at(i).vp2();
assert(vp1_union.con() <= vp1_i.con(), "must start before");
assert(vp2_union.con() <= vp2_i.con(), "must start before");
assert(vp1_union.size() >= vp1_i.size(), "must end after");
assert(vp2_union.size() >= vp2_i.size(), "must end after");
}
#endif
add_speculative_check([&] (Node* ctrl) {
return vp1_union.make_speculative_aliasing_check_with(vp2_union, ctrl);
});
group_start = group_end;
}
}
}
// Runtime Checks:
// Some required properties cannot be proven statically, and require a
// runtime check:
// - Alignment:
// See VTransform::add_speculative_alignment_check
// - Aliasing:
// See VTransform::apply_speculative_aliasing_runtime_checks
// There is a two-staged approach for compilation:
// - AutoVectorization Predicate:
// See VM flag UseAutoVectorizationPredicate and documentation in predicates.hpp
// We speculate that the checks pass, and only compile a vectorized loop.
// We expect the checks to pass in almost all cases, and so we only need
// to compile and cache the vectorized loop.
// If the predicate ever fails, we deoptimize, and eventually compile
// without predicate. This means we will recompile with multiversioning.
// - Multiversioning:
// See VM flag LoopMultiversioning and documentation in loopUnswitch.cpp
// If the predicate is not available or previously failed, then we compile
// a vectorized and a scalar loop. If the runtime check passes we take the
// vectorized loop, else the scalar loop.
// Multiversioning takes more compile time and code cache, but it also
// produces fast code for when the runtime check passes (vectorized) and
// when it fails (scalar performance).
//
// Callback:
// In some cases, we require the ctrl just before the check iff_speculate to
// generate the values required in the check. We pass this ctrl into the
// callback, which is expected to produce the check, i.e. a BoolNode.
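// For example, the alignment check above simply returns an already constructed
// BoolNode and ignores the ctrl:
//
//   add_speculative_check([&] (Node* ctrl) { return bol_alignment; });
//
// whereas the aliasing check below builds its BoolNode at the given ctrl.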
template<typename Callback>
void VTransform::add_speculative_check(Callback callback) {
assert(_vloop.are_speculative_checks_possible(), "otherwise we cannot make speculative assumptions");
ParsePredicateSuccessProj* parse_predicate_proj = _vloop.auto_vectorization_parse_predicate_proj();
IfTrueNode* new_check_proj = nullptr;
if (parse_predicate_proj != nullptr) {
new_check_proj = phase()->create_new_if_for_predicate(parse_predicate_proj, nullptr,
Deoptimization::Reason_auto_vectorization_check,
Op_If);
} else {
new_check_proj = phase()->create_new_if_for_multiversion(_vloop.multiversioning_fast_proj());
}
Node* iff_speculate = new_check_proj->in(0);
// Create the check, given the ctrl just before the iff.
BoolNode* bol = callback(iff_speculate->in(0));
igvn().replace_input_of(iff_speculate, 1, bol);
TRACE_SPECULATIVE_ALIGNMENT_CHECK(iff_speculate);
}
// Helper-class for VTransformGraph::has_store_to_load_forwarding_failure.
// It wraps a VPointer. The VPointer has an iv_offset applied, which
// simulates a virtual unrolling. Together they represent the memory region:
// [adr, adr + size)
// adr = base + invar + iv_scale * (iv + iv_offset) + con
class VMemoryRegion : public ResourceObj {
private:
// Note: VPointer has no default constructor, so we cannot use VMemoryRegion
// in-place in a GrowableArray. Hence, we make VMemoryRegion a resource
// allocated object, so the GrowableArray of VMemoryRegion* has a default
// nullptr element.
const VPointer _vpointer;
bool _is_load; // load or store?
uint _schedule_order;
public:
VMemoryRegion(const VPointer& vpointer, bool is_load, uint schedule_order) :
_vpointer(vpointer),
_is_load(is_load),
_schedule_order(schedule_order) {}
const VPointer& vpointer() const { return _vpointer; }
bool is_load() const { return _is_load; }
uint schedule_order() const { return _schedule_order; }
static int cmp_for_sort_by_group(VMemoryRegion* r1, VMemoryRegion* r2) {
// Sort by mem_pointer (base, invar, iv_scale), except for the con.
return MemPointer::cmp_summands(r1->vpointer().mem_pointer(),
r2->vpointer().mem_pointer());
}
static int cmp_for_sort(VMemoryRegion** r1, VMemoryRegion** r2) {
int cmp_group = cmp_for_sort_by_group(*r1, *r2);
if (cmp_group != 0) { return cmp_group; }
// We use two comparisons, because a subtraction could underflow.
jint con1 = (*r1)->vpointer().con();
jint con2 = (*r2)->vpointer().con();
if (con1 < con2) { return -1; }
if (con1 > con2) { return 1; }
return 0;
}
enum Aliasing { DIFFERENT_GROUP, BEFORE, EXACT_OVERLAP, PARTIAL_OVERLAP, AFTER };
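// A worked example for aliasing() below (same group, i.e. same summands; cons and
// sizes made up):
//   this:  con = 0, size = 8 -> covers bytes [0, 8)
//   other: con = 4, size = 8 -> covers bytes [4, 12)
// Neither region ends before the other starts, and they are not identical,
// hence: PARTIAL_OVERLAP.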
Aliasing aliasing(VMemoryRegion& other) {
VMemoryRegion* p1 = this;
VMemoryRegion* p2 = &other;
if (cmp_for_sort_by_group(p1, p2) != 0) { return DIFFERENT_GROUP; }
jlong con1 = p1->vpointer().con();
jlong con2 = p2->vpointer().con();
jlong size1 = p1->vpointer().size();
jlong size2 = p2->vpointer().size();
if (con1 >= con2 + size2) { return AFTER; }
if (con2 >= con1 + size1) { return BEFORE; }
if (con1 == con2 && size1 == size2) { return EXACT_OVERLAP; }
return PARTIAL_OVERLAP;
}
#ifndef PRODUCT
void print() const {
tty->print("VMemoryRegion[%s schedule_order(%4d), ",
_is_load ? "load, " : "store,", _schedule_order);
vpointer().print_on(tty, false);
tty->print_cr("]");
}
#endif
};
// Store-to-load-forwarding is a CPU memory optimization, where a load can directly fetch
// its value from the store-buffer, rather than from the L1 cache. This is many CPU cycles
// faster. However, this optimization comes with some restrictions, depending on the CPU.
// Generally, store-to-load-forwarding works if the load and store memory regions match
// exactly (same start and width). Generally problematic are partial overlaps - though
// some CPUs can handle even some subsets of these cases. We conservatively assume that
// all such partial overlaps lead to a store-to-load-forwarding failure, which means the
// load has to stall until the store goes from the store-buffer into the L1 cache, incurring
// a penalty of many CPU cycles.
//
// Example (with "iteration distance" 2):
//   for (int i = 10; i < SIZE; i++) {
//     aI[i] = aI[i - 2] + 1;
//   }
//
//   load_4_bytes( ptr +  -8)
//   store_4_bytes(ptr +   0)    *
//   load_4_bytes( ptr +  -4)    |
//   store_4_bytes(ptr +   4)    |  *
//   load_4_bytes( ptr +   0)  <-+  |
//   store_4_bytes(ptr +   8)       |
//   load_4_bytes( ptr +   4)  <----+
//   store_4_bytes(ptr +  12)
//   ...
//
// In the scalar loop, we can forward the stores from 2 iterations back.
//
// Assume we have 2-element vectors (2*4 = 8 bytes), with the "iteration distance" 2
// example. This gives us this machine code:
//   load_8_bytes( ptr +  -8)
//   store_8_bytes(ptr +   0)    |
//   load_8_bytes( ptr +   0)    v
//   store_8_bytes(ptr +   8)    |
//   load_8_bytes( ptr +   8)    v
//   store_8_bytes(ptr +  16)
//   ...
//
// We packed 2 iterations, and the stores can perfectly forward to the loads of
// the next 2 iterations.
//
// Example (with "iteration distance" 3):
//   for (int i = 10; i < SIZE; i++) {
//     aI[i] = aI[i - 3] + 1;
//   }
//
//   load_4_bytes( ptr + -12)
//   store_4_bytes(ptr +   0)    *
//   load_4_bytes( ptr +  -8)    |
//   store_4_bytes(ptr +   4)    |
//   load_4_bytes( ptr +  -4)    |
//   store_4_bytes(ptr +   8)    |
//   load_4_bytes( ptr +   0)  <-+
//   store_4_bytes(ptr +  12)
//   ...
//
// In the scalar loop, we can forward the stores from 3 iterations back.
//
// Unfortunately, vectorization can introduce such store-to-load-forwarding failures.
// Assume we have 2-element vectors (2*4 = 8 bytes), with the "iteration distance" 3
// example. This gives us this machine code:
//   load_8_bytes( ptr + -12)
//   store_8_bytes(ptr +   0)    |  |
//   load_8_bytes( ptr +  -4)    x  |
//   store_8_bytes(ptr +   8)       ||
//   load_8_bytes( ptr +   4)       xx  <-- partial overlap with 2 stores
//   store_8_bytes(ptr +  16)
//   ...
//
// We see that eventually all loads are dependent on earlier stores, but the values cannot
// be forwarded because there is some partial overlap.
//
// Preferably, we would have some latency-based cost-model that accounts for such forwarding
// failures, and decide if vectorization with forwarding failures is still profitable. For
// now we go with a simpler heuristic: we simply forbid vectorization if we can PROVE that
// there will be a forwarding failure. This approach has at least 2 possible weaknesses:
//
// (1) There may be forwarding failures in cases where we cannot prove it.
//     Example:
//       for (int i = 10; i < SIZE; i++) {
//         bI[i] = aI[i - 3] + 1;
//       }
//
//     We do not know if aI and bI refer to the same array or not. However, it is
//     reasonable to assume that two different array references most likely refer to
//     different arrays (i.e. no aliasing), in which case we would have no forwarding
//     failures.
// (2) There could be some loops where vectorization introduces forwarding failures, and thus
// the latency of the loop body is high, but this does not matter because it is dominated
// by other latency/throughput based costs in the loop body.
//
// Performance measurements with the JMH benchmark StoreToLoadForwarding.java have indicated
// that there is some iteration threshold: if the failure happens between a store and load that
// have an iteration distance below this threshold, the latency is the limiting factor, and we
// should not vectorize to avoid the latency penalty of store-to-load-forwarding failures. If
// the iteration distance is larger than this threshold, the throughput is the limiting factor,
// and we should vectorize in these cases to improve throughput.
//
bool VTransformGraph::has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const {
if (SuperWordStoreToLoadForwardingFailureDetection == 0) { return false; }
// Collect all pointers for scalar and vector loads/stores.
ResourceMark rm;
// Use pointers, because no default constructor is available for the elements.
GrowableArray<VMemoryRegion*> memory_regions;
// To detect store-to-load-forwarding failures at the iteration threshold or below, we
// simulate a super-unrolling to reach SuperWordStoreToLoadForwardingFailureDetection
// iterations at least. This is a heuristic, and we are not trying to be very precise
// with the iteration distance. If we have already unrolled more than the iteration
// threshold, i.e. if "SuperWordStoreToLoadForwardingFailureDetection < unrolled_count",
// then we simply check if there are any store-to-load-forwarding failures in the unrolled
// loop body, which may be at larger distance than the desired threshold. We cannot do any
// more fine-grained analysis, because the unrolling has lost the information about the
// iteration distance.
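// A worked example (a sketch, with hypothetical values): assume
// SuperWordStoreToLoadForwardingFailureDetection = 16, and the loop was already
// unrolled 4x with an original stride of 1, i.e. unrolled_count = 4 and
// iv_stride = 4. Then simulated_super_unrolling_count = MAX2(1, 16 / 4) = 4,
// and below we record the memory regions at iv_offset = 0, 4, 8 and 12,
// covering 16 scalar iterations in total.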
int simulated_unrolling_count = SuperWordStoreToLoadForwardingFailureDetection;
int unrolled_count = vloop_analyzer.vloop().cl()->unrolled_count();
uint simulated_super_unrolling_count = MAX2(1, simulated_unrolling_count / unrolled_count);
int iv_stride = vloop_analyzer.vloop().iv_stride();
int schedule_order = 0;
for (uint k = 0; k < simulated_super_unrolling_count; k++) {
int iv_offset = k * iv_stride; // virtual super-unrolling
for (int i = 0; i < _schedule.length(); i++) {
VTransformNode* vtn = _schedule.at(i);
if (vtn->is_load_or_store_in_loop()) {
const VPointer& p = vtn->vpointer();
if (p.is_valid()) {
bool is_load = vtn->is_load_in_loop();
const VPointer iv_offset_p(p.make_with_iv_offset(iv_offset));
if (iv_offset_p.is_valid()) {
// The iv_offset may lead to overflows. This is a heuristic, so we do not
// care too much about those edge cases.
memory_regions.push(new VMemoryRegion(iv_offset_p, is_load, schedule_order++));
}
}
}
}
}
// Sort the pointers by group (same base, invar and iv_scale), and then by offset.
memory_regions.sort(VMemoryRegion::cmp_for_sort);
#ifndef PRODUCT
if (_trace._verbose) {
tty->print_cr("VTransformGraph::has_store_to_load_forwarding_failure:");
tty->print_cr(" simulated_unrolling_count = %d", simulated_unrolling_count);
tty->print_cr(" simulated_super_unrolling_count = %d", simulated_super_unrolling_count);
for (int i = 0; i < memory_regions.length(); i++) {
VMemoryRegion& region = *memory_regions.at(i);
region.print();
}
}
#endif
// For all pairs of pointers in the same group, check if they have a partial overlap.
for (int i = 0; i < memory_regions.length(); i++) {
VMemoryRegion& region1 = *memory_regions.at(i);
for (int j = i + 1; j < memory_regions.length(); j++) {
VMemoryRegion& region2 = *memory_regions.at(j);
const VMemoryRegion::Aliasing aliasing = region1.aliasing(region2);
if (aliasing == VMemoryRegion::Aliasing::DIFFERENT_GROUP ||
aliasing == VMemoryRegion::Aliasing::BEFORE) {
break; // We have reached the next group or pointers that are always after.
} else if (aliasing == VMemoryRegion::Aliasing::EXACT_OVERLAP) {
continue;
} else {
assert(aliasing == VMemoryRegion::Aliasing::PARTIAL_OVERLAP, "no other case can happen");
if ((region1.is_load() && !region2.is_load() && region1.schedule_order() > region2.schedule_order()) ||
(!region1.is_load() && region2.is_load() && region1.schedule_order() < region2.schedule_order())) {
// We predict that this leads to a store-to-load-forwarding failure penalty.
#ifndef PRODUCT
if (_trace._rejections) {
tty->print_cr("VTransformGraph::has_store_to_load_forwarding_failure:");
tty->print_cr(" Partial overlap of store->load. We predict that this leads to");
tty->print_cr(" a store-to-load-forwarding failure penalty which makes");
tty->print_cr(" vectorization unprofitable. These are the two pointers:");
region1.print();
region2.print();
}
#endif
return true;
}
}
}
}
return false;
}
void VTransformApplyState::set_transformed_node(VTransformNode* vtn, Node* n) {
assert(_vtnode_idx_to_transformed_node.at(vtn->_idx) == nullptr, "only set once");
_vtnode_idx_to_transformed_node.at_put(vtn->_idx, n);
}
Node* VTransformApplyState::transformed_node(const VTransformNode* vtn) const {
Node* n = _vtnode_idx_to_transformed_node.at(vtn->_idx);
assert(n != nullptr, "must find IR node for vtnode");
return n;
}
void VTransformApplyState::init_memory_states_and_uses_after_loop() {
const GrowableArray<Node*>& inputs = _vloop_analyzer.memory_slices().inputs();
const GrowableArray<PhiNode*>& heads = _vloop_analyzer.memory_slices().heads();
for (int i = 0; i < inputs.length(); i++) {
PhiNode* head = heads.at(i);
if (head != nullptr) {
// Slice with Phi (i.e. with stores) -> start with the phi (phi_mem)
_memory_states.at_put(i, head);
// Remember uses outside the loop of the last memory state (store).
StoreNode* last_store = head->in(2)->as_Store();
assert(vloop().in_bb(last_store), "backedge store should be in the loop");
for (DUIterator_Fast jmax, j = last_store->fast_outs(jmax); j < jmax; j++) {
Node* use = last_store->fast_out(j);
if (!vloop().in_bb(use)) {
for (uint k = 0; k < use->req(); k++) {
if (use->in(k) == last_store) {
_memory_state_uses_after_loop.push(MemoryStateUseAfterLoop(use, k, i));
}
}
}
}
} else {
// Slice without Phi (i.e. only loads) -> use the input state (entry_mem)
_memory_states.at_put(i, inputs.at(i));
}
}
}
// We may have reordered the scalar stores, or replaced them with vectors. Now
// the last memory state in the loop may have changed. Thus, we need to change
// the uses of the old last memory state to the new last memory state.
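// For example (hypothetical node numbers): if the last store of a slice used to be
// a StoreI #42, and we replaced the store pack with a single StoreVector #99, then
// a use of #42 after the loop (e.g. a memory Phi after the loop) must now consume
// #99 instead.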
void VTransformApplyState::fix_memory_state_uses_after_loop() {
for (int i = 0; i < _memory_state_uses_after_loop.length(); i++) {
MemoryStateUseAfterLoop& use = _memory_state_uses_after_loop.at(i);
Node* last_state = memory_state(use._alias_idx);
phase()->igvn().replace_input_of(use._use, use._in_idx, last_state);
}
}
void VTransformNode::apply_vtn_inputs_to_node(Node* n, VTransformApplyState& apply_state) const {
PhaseIdealLoop* phase = apply_state.phase();
for (uint i = 0; i < req(); i++) {
VTransformNode* vtn_def = in_req(i);
if (vtn_def != nullptr) {
Node* def = apply_state.transformed_node(vtn_def);
phase->igvn().replace_input_of(n, i, def);
}
}
}
float VTransformMemopScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
// This is an identity transform, but loads and stores must be counted.
assert(!vloop_analyzer.has_zero_cost(_node), "memop nodes must be counted");
return vloop_analyzer.cost_for_scalar_node(_node->Opcode());
}
VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& apply_state) const {
apply_vtn_inputs_to_node(_node, apply_state);
// The memory state has to be applied separately: the vtn does not hold it. This allows reordering.
Node* mem = apply_state.memory_state(_node->adr_type());
apply_state.phase()->igvn().replace_input_of(_node, 1, mem);
if (_node->is_Store()) {
apply_state.set_memory_state(_node->adr_type(), _node);
}
return VTransformApplyResult::make_scalar(_node);
}
float VTransformDataScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
// Since this is an identity transform, we may encounter nodes that
// VLoopAnalyzer::cost also does not count for the scalar loop.
if (vloop_analyzer.has_zero_cost(_node)) {
return 0;
} else {
return vloop_analyzer.cost_for_scalar_node(_node->Opcode());
}
}
VTransformApplyResult VTransformDataScalarNode::apply(VTransformApplyState& apply_state) const {
apply_vtn_inputs_to_node(_node, apply_state);
return VTransformApplyResult::make_scalar(_node);
}
VTransformApplyResult VTransformPhiScalarNode::apply(VTransformApplyState& apply_state) const {
PhaseIdealLoop* phase = apply_state.phase();
Node* in0 = apply_state.transformed_node(in_req(0));
Node* in1 = apply_state.transformed_node(in_req(1));
phase->igvn().replace_input_of(_node, 0, in0);
phase->igvn().replace_input_of(_node, 1, in1);
// Note: the backedge is hooked up later.
return VTransformApplyResult::make_scalar(_node);
}
// Cleanup backedges. In the schedule, the backedges come after their phis. Hence,
// we only have the transformed backedges after the phis are already transformed.
// We hook the backedges into the phis now, during cleanup.
void VTransformPhiScalarNode::apply_backedge(VTransformApplyState& apply_state) const {
assert(_node == apply_state.transformed_node(this), "sanity");
PhaseIdealLoop* phase = apply_state.phase();
if (_node->is_memory_phi()) {
// Memory phi/backedge
// The last memory state of that slice is the backedge.
Node* last_state = apply_state.memory_state(_node->adr_type());
phase->igvn().replace_input_of(_node, 2, last_state);
} else {
// Data phi/backedge
Node* in2 = apply_state.transformed_node(in_req(2));
phase->igvn().replace_input_of(_node, 2, in2);
}
}
VTransformApplyResult VTransformCFGNode::apply(VTransformApplyState& apply_state) const {
// We do not modify the inputs of the CountedLoop (and certainly not its backedge)
if (!_node->is_CountedLoop()) {
apply_vtn_inputs_to_node(_node, apply_state);
}
return VTransformApplyResult::make_scalar(_node);
}
VTransformApplyResult VTransformOuterNode::apply(VTransformApplyState& apply_state) const {
apply_vtn_inputs_to_node(_node, apply_state);
return VTransformApplyResult::make_scalar(_node);
}
float VTransformReplicateNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
return vloop_analyzer.cost_for_vector_node(Op_Replicate, _vlen, _element_type);
}
VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply_state) const {
Node* val = apply_state.transformed_node(in_req(1));
VectorNode* vn = VectorNode::scalar2vector(val, _vlen, _element_type);
register_new_node_from_vectorization(apply_state, vn);
return VTransformApplyResult::make_vector(vn);
}
float VTransformConvI2LNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
return vloop_analyzer.cost_for_scalar_node(Op_ConvI2L);
}
VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_state) const {
Node* val = apply_state.transformed_node(in_req(1));
Node* n = new ConvI2LNode(val);
register_new_node_from_vectorization(apply_state, n);
return VTransformApplyResult::make_scalar(n);
}
float VTransformShiftCountNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
int shift_count_opc = VectorNode::shift_count_opcode(_shift_opcode);
return vloop_analyzer.cost_for_scalar_node(Op_AndI) +
vloop_analyzer.cost_for_vector_node(shift_count_opc, _vlen, _element_bt);
}
VTransformApplyResult VTransformShiftCountNode::apply(VTransformApplyState& apply_state) const {
PhaseIdealLoop* phase = apply_state.phase();
Node* shift_count_in = apply_state.transformed_node(in_req(1));
assert(shift_count_in->bottom_type()->isa_int(), "int type only for shift count");
// The shift_count_in would be automatically truncated to the lowest _mask
// bits in a scalar shift operation. But vector shift does not truncate, so
// we must apply the mask now.
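// For example (assuming int elements, i.e. _mask = 31): a scalar "x << 33"
// effectively shifts by 33 & 31 = 1, so the vector shift count must also be
// masked down to 1 before it is broadcast.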
Node* shift_count_masked = new AndINode(shift_count_in, phase->intcon(_mask));
register_new_node_from_vectorization(apply_state, shift_count_masked);
// Now the masked value is "broadcast" (some platforms only set the lowest element).
VectorNode* vn = VectorNode::shift_count(_shift_opcode, shift_count_masked, _vlen, _element_bt);
register_new_node_from_vectorization(apply_state, vn);
return VTransformApplyResult::make_vector(vn);
}
float VTransformPopulateIndexNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
return vloop_analyzer.cost_for_vector_node(Op_PopulateIndex, _vlen, _element_bt);
}
VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& apply_state) const {
PhaseIdealLoop* phase = apply_state.phase();
Node* val = apply_state.transformed_node(in_req(1));
assert(val->is_Phi(), "expected to be iv");
assert(VectorNode::is_populate_index_supported(_element_bt), "should support");
const TypeVect* vt = TypeVect::make(_element_bt, _vlen);
VectorNode* vn = new PopulateIndexNode(val, phase->intcon(1), vt);
register_new_node_from_vectorization(apply_state, vn);
return VTransformApplyResult::make_vector(vn);
}
float VTransformElementWiseVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
return vloop_analyzer.cost_for_vector_node(_vector_opcode, vector_length(), element_basic_type());
}
VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyState& apply_state) const {
assert(2 <= req() && req() <= 4, "Must have 1-3 inputs");
const TypeVect* vt = TypeVect::make(element_basic_type(), vector_length());
Node* in1 = apply_state.transformed_node(in_req(1));
Node* in2 = (req() >= 3) ? apply_state.transformed_node(in_req(2)) : nullptr;
VectorNode* vn = nullptr;
if (req() <= 3) {
vn = VectorNode::make(_vector_opcode, in1, in2, vt); // unary and binary
} else {
Node* in3 = apply_state.transformed_node(in_req(3));
vn = VectorNode::make(_vector_opcode, in1, in2, in3, vt); // ternary
}
register_new_node_from_vectorization(apply_state, vn);
return VTransformApplyResult::make_vector(vn);
}
float VTransformElementWiseLongOpWithCastToIntVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
int vopc = VectorNode::opcode(scalar_opcode(), element_basic_type());
return vloop_analyzer.cost_for_vector_node(vopc, vector_length(), element_basic_type()) +
vloop_analyzer.cost_for_vector_node(Op_VectorCastL2X, vector_length(), T_INT);
}
VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply(VTransformApplyState& apply_state) const {
uint vlen = vector_length();
int sopc = scalar_opcode();
Node* in1 = apply_state.transformed_node(in_req(1));
// The scalar operation was a long -> int operation.
// However, the vector operation is long -> long.
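// (For example: a scalar PopCountL takes a long and produces an int, while its
// vector counterpart produces a vector of longs, which is why the cast below is
// needed. See VectorNode::opcode for the actual opcode mapping.)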
VectorNode* long_vn = VectorNode::make(sopc, in1, nullptr, vlen, T_LONG);
register_new_node_from_vectorization(apply_state, long_vn);
// Cast long -> int, to mimic the scalar long -> int operation.
VectorNode* vn = VectorCastNode::make(Op_VectorCastL2X, long_vn, T_INT, vlen);
register_new_node_from_vectorization(apply_state, vn);
return VTransformApplyResult::make_vector(vn);
}
float VTransformReinterpretVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
return vloop_analyzer.cost_for_vector_node(Op_VectorReinterpret, vector_length(), element_basic_type());
}
VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyState& apply_state) const {
const TypeVect* dst_vt = TypeVect::make(element_basic_type(), vector_length());
const TypeVect* src_vt = TypeVect::make(_src_bt, vector_length());
assert(VectorNode::is_reinterpret_opcode(scalar_opcode()), "scalar opcode must be reinterpret");
Node* in1 = apply_state.transformed_node(in_req(1));
VectorNode* vn = new VectorReinterpretNode(in1, src_vt, dst_vt);
register_new_node_from_vectorization(apply_state, vn);
return VTransformApplyResult::make_vector(vn);
}
float VTransformBoolVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
assert(scalar_opcode() == Op_Bool, "");
return vloop_analyzer.cost_for_vector_node(Op_VectorMaskCmp, vector_length(), element_basic_type());
}
VTransformApplyResult VTransformBoolVectorNode::apply(VTransformApplyState& apply_state) const {
const TypeVect* vt = TypeVect::make(element_basic_type(), vector_length());
assert(scalar_opcode() == Op_Bool, "");
// Cmp + Bool -> VectorMaskCmp
VTransformCmpVectorNode* vtn_cmp = in_req(1)->isa_CmpVector();
assert(vtn_cmp != nullptr, "bool vtn expects cmp vtn as input");
Node* cmp_in1 = apply_state.transformed_node(vtn_cmp->in_req(1));
Node* cmp_in2 = apply_state.transformed_node(vtn_cmp->in_req(2));
BoolTest::mask mask = test()._mask;
PhaseIdealLoop* phase = apply_state.phase();
ConINode* mask_node = phase->intcon((int)mask);
VectorNode* vn = new VectorMaskCmpNode(mask, cmp_in1, cmp_in2, mask_node, vt);
register_new_node_from_vectorization(apply_state, vn);
return VTransformApplyResult::make_vector(vn);
}
bool VTransformReductionVectorNode::optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) {
return optimize_move_non_strict_order_reductions_out_of_loop(vloop_analyzer, vtransform);
}
int VTransformReductionVectorNode::vector_reduction_opcode() const {
return ReductionNode::opcode(scalar_opcode(), element_basic_type());
}
bool VTransformReductionVectorNode::requires_strict_order() const {
int vopc = vector_reduction_opcode();
return ReductionNode::auto_vectorization_requires_strict_order(vopc);
}
// Having ReductionNodes in the loop is expensive. They need to recursively
// fold together the vector values, for every vectorized loop iteration. If
// we encounter the following pattern, we can vector accumulate the values
// inside the loop, and only have a single UnorderedReduction after the loop.
//
// Note: UnorderedReduction represents a ReductionNode which does not require
// calculating in strict order.
//
//   CountedLoop     init
//        |            |
//        +--------+   |   +------------------------+
//                 |   |   |                        |
//                PhiNode (s)                       |
//                    |                             |
//                    |         Vector              |
//                    |           |                 |
//                 UnorderedReduction (first_red)   |
//                    |                             |
//                   ...        Vector              |
//                    |           |                 |
//                 UnorderedReduction (last_red)    |
//                          |                       |
//                          +-----------------------+
//
// We patch the graph to look like this:
//
//   CountedLoop   identity_vector
//        |            |
//        +--------+   |   +---------------+
//                 |   |   |               |
//                PhiNode (v)              |
//                    |                    |
//                    |      Vector        |
//                    |        |           |
//                  VectorAccumulator      |
//                    |                    |
//                   ...     Vector        |
//                    |        |           |
//   init           VectorAccumulator      |
//    |               |        |           |
//   UnorderedReduction        +-----------+
//
// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we
// use vector_accumulators, which do the same reductions, but only element
// wise. This is a single operation per vector_accumulator, rather than many
// for a UnorderedReduction. We can then reduce the last vector_accumulator
// after the loop, and also reduce the init value into it.
//
// We cannot do this for all reductions. Some reductions do not allow the
// reordering of operations (for example float addition/multiplication require
// strict order).
//
// Note: we must perform this optimization already during auto vectorization,
// before we evaluate the cost-model. Without this optimization, we may
// still have expensive reduction nodes in the loop which can make
// vectorization unprofitable. Only with the optimization does vectorization
// become profitable, since the expensive reduction node is moved
// outside the loop, and instead cheaper element-wise vector accumulations
// are performed inside the loop.
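//
// A minimal Java example where this applies (a sketch, assuming an int add
// reduction, which does not require strict order):
//
//   int sum = init;
//   for (int i = 0; i < SIZE; i++) {
//     sum += aI[i];
//   }
//
// Inside the loop we then accumulate with element-wise AddVI nodes, and a single
// AddReductionVI of the last accumulator (and init) remains after the loop.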
bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop_preconditions(VTransform& vtransform) {
// We have a phi with a single use.
VTransformPhiScalarNode* phi = in_req(1)->isa_PhiScalar();
if (phi == nullptr) {
return false;
}
if (phi->out_strong_edges() != 1) {
TRACE_OPTIMIZE(
tty->print(" Cannot move out of loop, phi has multiple uses:");
print();
tty->print(" phi: ");
phi->print();
)
return false;
}
if (requires_strict_order()) {
TRACE_OPTIMIZE(
tty->print(" Cannot move out of loop, strict order required: ");
print();
)
return false;
}
const int sopc = scalar_opcode();
const uint vlen = vector_length();
const BasicType bt = element_basic_type();
const int ropc = vector_reduction_opcode();
const int vopc = VectorNode::opcode(sopc, bt);
if (!Matcher::match_rule_supported_auto_vectorization(vopc, vlen, bt)) {
// The element-wise vector operation needed for the vector accumulator
// is not implemented / supported.
return false;
}
// Traverse up the chain of non strict order reductions, checking that it loops
// back to the phi. Check that all non strict order reductions only have a single
// use, except for the last (last_red), which only has phi as a use in the loop,
// and all other uses are outside the loop.
VTransformReductionVectorNode* first_red = this;
VTransformReductionVectorNode* last_red = phi->in_req(2)->isa_ReductionVector();
VTransformReductionVectorNode* current_red = last_red;
while (true) {
if (current_red == nullptr ||
current_red->vector_reduction_opcode() != ropc ||
current_red->element_basic_type() != bt ||
current_red->vector_length() != vlen) {
TRACE_OPTIMIZE(
tty->print(" Cannot move out of loop, other reduction node does not match:");
print();
tty->print(" other: ");
current_red->print();
)
return false; // not compatible
}
VTransformVectorNode* vector_input = current_red->in_req(2)->isa_Vector();
if (vector_input == nullptr) {
assert(false, "reduction has a bad vector input");
return false;
}
// Expect single use of the non strict order reduction. Except for the last_red.
if (current_red == last_red) {
// All uses must be outside loop body, except for the phi.
for (uint i = 0; i < current_red->out_strong_edges(); i++) {
VTransformNode* use = current_red->out_strong_edge(i);
if (use->isa_PhiScalar() == nullptr &&
use->isa_Outer() == nullptr) {
// Should not be allowed by SuperWord::mark_reductions
assert(false, "reduction has use inside loop");
return false;
}
}
} else {
if (current_red->out_strong_edges() != 1) {
TRACE_OPTIMIZE(
tty->print(" Cannot move out of loop, other reduction node has use outside loop:");
print();
tty->print(" other: ");
current_red->print();
)
return false; // Only single use allowed
}
}
// If the scalar input is a phi, we passed all checks.
VTransformNode* scalar_input = current_red->in_req(1);
if (scalar_input == phi) {
break;
}
// We expect another non strict reduction, verify it in the next iteration.
current_red = scalar_input->isa_ReductionVector();
}
return true; // success
}
bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) {
if (!optimize_move_non_strict_order_reductions_out_of_loop_preconditions(vtransform)) {
return false;
}
// All checks were successful. Edit the vtransform graph now.
TRACE_OPTIMIZE(
tty->print_cr("VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop");
)
const int sopc = scalar_opcode();
const uint vlen = vector_length();
const BasicType bt = element_basic_type();
const int vopc = VectorNode::opcode(sopc, bt);
PhaseIdealLoop* phase = vloop_analyzer.vloop().phase();
// Create a vector of identity values.
Node* identity = ReductionNode::make_identity_con_scalar(phase->igvn(), sopc, bt);
phase->set_root_as_ctrl(identity);
VTransformNode* vtn_identity = new (vtransform.arena()) VTransformOuterNode(vtransform, identity);
VTransformNode* vtn_identity_vector = new (vtransform.arena()) VTransformReplicateNode(vtransform, vlen, bt);
vtn_identity_vector->init_req(1, vtn_identity);
// Look at old scalar phi.
VTransformPhiScalarNode* phi_scalar = in_req(1)->isa_PhiScalar();
PhiNode* old_phi = phi_scalar->node();
VTransformNode* init = phi_scalar->in_req(1);
TRACE_OPTIMIZE(
tty->print(" phi_scalar ");
phi_scalar->print();
)
// Create new vector phi
const VTransformVectorNodeProperties properties = VTransformVectorNodeProperties::make_for_phi_vector(old_phi, vlen, bt);
VTransformPhiVectorNode* phi_vector = new (vtransform.arena()) VTransformPhiVectorNode(vtransform, 3, properties);
phi_vector->init_req(0, phi_scalar->in_req(0));
phi_vector->init_req(1, vtn_identity_vector);
// Note: backedge comes later
// Traverse down the chain of reductions, and replace them with vector_accumulators.
VTransformReductionVectorNode* first_red = this;
VTransformReductionVectorNode* last_red = phi_scalar->in_req(2)->isa_ReductionVector();
VTransformReductionVectorNode* current_red = first_red;
VTransformNode* current_vector_accumulator = phi_vector;
while (true) {
VTransformNode* vector_input = current_red->in_req(2);
VTransformVectorNode* vector_accumulator = new (vtransform.arena()) VTransformElementWiseVectorNode(vtransform, 3, current_red->properties(), vopc);
vector_accumulator->init_req(1, current_vector_accumulator);
vector_accumulator->init_req(2, vector_input);
TRACE_OPTIMIZE(
tty->print(" replace ");
current_red->print();
tty->print(" with ");
vector_accumulator->print();
)
current_vector_accumulator = vector_accumulator;
if (current_red == last_red) { break; }
current_red = current_red->unique_out_strong_edge()->isa_ReductionVector();
}
// Feed vector accumulator into the backedge.
phi_vector->set_req(2, current_vector_accumulator);
// Create post-loop reduction. last_red keeps all uses outside the loop.
last_red->set_req(1, init);
last_red->set_req(2, current_vector_accumulator);
TRACE_OPTIMIZE(
tty->print(" phi_scalar ");
phi_scalar->print();
tty->print(" after loop ");
last_red->print();
)
return true; // success
}
float VTransformReductionVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
uint vlen = vector_length();
BasicType bt = element_basic_type();
int vopc = vector_reduction_opcode();
bool requires_strict_order = ReductionNode::auto_vectorization_requires_strict_order(vopc);
return vloop_analyzer.cost_for_vector_reduction_node(vopc, vlen, bt, requires_strict_order);
}
VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& apply_state) const {
Node* init = apply_state.transformed_node(in_req(1));
Node* vec = apply_state.transformed_node(in_req(2));
ReductionNode* vn = ReductionNode::make(scalar_opcode(), nullptr, init, vec, element_basic_type());
register_new_node_from_vectorization(apply_state, vn);
return VTransformApplyResult::make_vector(vn, vn->vect_type());
}
VTransformApplyResult VTransformPhiVectorNode::apply(VTransformApplyState& apply_state) const {
PhaseIdealLoop* phase = apply_state.phase();
Node* in0 = apply_state.transformed_node(in_req(0));
Node* in1 = apply_state.transformed_node(in_req(1));
// We create a new phi node, because its type differs from that of the scalar phi.
PhiNode* old_phi = approximate_origin()->as_Phi();
PhiNode* new_phi = old_phi->clone()->as_Phi();
phase->igvn().replace_input_of(new_phi, 0, in0);
phase->igvn().replace_input_of(new_phi, 1, in1);
// Note: the backedge is hooked up later.
// Give the new phi node the correct vector type.
const TypeVect* vt = TypeVect::make(element_basic_type(), vector_length());
new_phi->as_Type()->set_type(vt);
phase->igvn().set_type(new_phi, vt);
return VTransformApplyResult::make_vector(new_phi, vt);
}
// Cleanup backedges. In the schedule, the backedges come after their phis. Hence,
// we only have the transformed backedges after the phis are already transformed.
// We hook the backedges into the phis now, during cleanup.
void VTransformPhiVectorNode::apply_backedge(VTransformApplyState& apply_state) const {
PhaseIdealLoop* phase = apply_state.phase();
PhiNode* new_phi = apply_state.transformed_node(this)->as_Phi();
Node* in2 = apply_state.transformed_node(in_req(2));
phase->igvn().replace_input_of(new_phi, 2, in2);
}
float VTransformLoadVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
uint vlen = vector_length();
BasicType bt = element_basic_type();
return vloop_analyzer.cost_for_vector_node(Op_LoadVector, vlen, bt);
}
VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& apply_state) const {
int sopc = scalar_opcode();
uint vlen = vector_length();
BasicType bt = element_basic_type();
// The memory state has to be applied separately: the vtn does not hold it. This allows reordering.
Node* ctrl = apply_state.transformed_node(in_req(MemNode::Control));
Node* mem = apply_state.memory_state(_adr_type);
Node* adr = apply_state.transformed_node(in_req(MemNode::Address));
// Set the memory dependency of the LoadVector as early as possible.
// Walk up the memory chain, and ignore any StoreVector that provably
// does not have any memory dependency.
const VPointer& load_p = vpointer();
while (mem->is_StoreVector()) {
VPointer store_p(mem->as_Mem(), apply_state.vloop());
if (store_p.never_overlaps_with(load_p)) {
mem = mem->in(MemNode::Memory);
} else {
break;
}
}
LoadVectorNode* vn = LoadVectorNode::make(sopc, ctrl, mem, adr, _adr_type, vlen, bt, _control_dependency);
DEBUG_ONLY( if (VerifyAlignVector) { vn->set_must_verify_alignment(); } )
register_new_node_from_vectorization(apply_state, vn);
return VTransformApplyResult::make_vector(vn, vn->vect_type());
}
float VTransformStoreVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
uint vlen = vector_length();
BasicType bt = element_basic_type();
return vloop_analyzer.cost_for_vector_node(Op_StoreVector, vlen, bt);
}
VTransformApplyResult VTransformStoreVectorNode::apply(VTransformApplyState& apply_state) const {
int sopc = scalar_opcode();
uint vlen = vector_length();
// The memory state has to be applied separately: the vtn does not hold it. This allows reordering.
Node* ctrl = apply_state.transformed_node(in_req(MemNode::Control));
Node* mem = apply_state.memory_state(_adr_type);
Node* adr = apply_state.transformed_node(in_req(MemNode::Address));
Node* value = apply_state.transformed_node(in_req(MemNode::ValueIn));
StoreVectorNode* vn = StoreVectorNode::make(sopc, ctrl, mem, adr, _adr_type, value, vlen);
DEBUG_ONLY( if (VerifyAlignVector) { vn->set_must_verify_alignment(); } )
register_new_node_from_vectorization(apply_state, vn);
apply_state.set_memory_state(_adr_type, vn);
return VTransformApplyResult::make_vector(vn, vn->vect_type());
}
void VTransformNode::register_new_node_from_vectorization(VTransformApplyState& apply_state, Node* vn) const {
PhaseIdealLoop* phase = apply_state.phase();
// Using the cl is sometimes not the most accurate, but still correct. We do not have to be
// perfectly accurate, because we will set major_progress anyway.
phase->register_new_node(vn, apply_state.vloop().cl());
phase->igvn()._worklist.push(vn);
VectorNode::trace_new_vector(vn, "AutoVectorization");
}
#ifndef PRODUCT
void VTransformGraph::print_vtnodes() const {
tty->print_cr("\nVTransformGraph::print_vtnodes:");
for (int i = 0; i < _vtnodes.length(); i++) {
_vtnodes.at(i)->print();
}
}
void VTransformGraph::print_schedule() const {
tty->print_cr("\nVTransformGraph::print_schedule:");
for (int i = 0; i < _schedule.length(); i++) {
tty->print(" %3d: ", i);
VTransformNode* vtn = _schedule.at(i);
if (vtn == nullptr) {
tty->print_cr("nullptr");
} else {
vtn->print();
}
}
}
void VTransformNode::print() const {
tty->print("%3d %s (", _idx, name());
for (uint i = 0; i < _req; i++) {
print_node_idx(_in.at(i));
}
if ((uint)_in.length() > _req) {
tty->print(" | strong:");
for (uint i = _req; i < _in_end_strong_memory_edges; i++) {
print_node_idx(_in.at(i));
}
}
if ((uint)_in.length() > _in_end_strong_memory_edges) {
tty->print(" | weak:");
for (uint i = _in_end_strong_memory_edges; i < (uint)_in.length(); i++) {
print_node_idx(_in.at(i));
}
}
tty->print(") %s[", _is_alive ? "" : "dead ");
for (uint i = 0; i < _out_end_strong_edges; i++) {
print_node_idx(_out.at(i));
}
if ((uint)_out.length() > _out_end_strong_edges) {
tty->print(" | weak:");
for (uint i = _out_end_strong_edges; i < (uint)_out.length(); i++) {
print_node_idx(_out.at(i));
}
}
tty->print("] ");
print_spec();
tty->cr();
}
void VTransformNode::print_node_idx(const VTransformNode* vtn) {
if (vtn == nullptr) {
tty->print(" _");
} else {
tty->print(" %d", vtn->_idx);
}
}
void VTransformMemopScalarNode::print_spec() const {
tty->print("node[%d %s] ", _node->_idx, _node->Name());
_vpointer.print_on(tty, false);
}
void VTransformDataScalarNode::print_spec() const {
tty->print("node[%d %s]", _node->_idx, _node->Name());
}
void VTransformPhiScalarNode::print_spec() const {
tty->print("node[%d %s]", _node->_idx, _node->Name());
}
void VTransformCFGNode::print_spec() const {
tty->print("node[%d %s]", _node->_idx, _node->Name());
}
void VTransformOuterNode::print_spec() const {
tty->print("node[%d %s]", _node->_idx, _node->Name());
}
void VTransformReplicateNode::print_spec() const {
tty->print("vlen=%d element_type=%s", _vlen, type2name(_element_type));
}
void VTransformShiftCountNode::print_spec() const {
tty->print("vlen=%d element_bt=%s mask=%d shift_opcode=%s",
_vlen, type2name(_element_bt), _mask,
NodeClassNames[_shift_opcode]);
}
void VTransformPopulateIndexNode::print_spec() const {
tty->print("vlen=%d element_bt=%s", _vlen, type2name(_element_bt));
}
void VTransformVectorNode::print_spec() const {
tty->print("Properties[orig=[%d %s] sopc=%s vlen=%d element_bt=%s]",
approximate_origin()->_idx,
approximate_origin()->Name(),
NodeClassNames[scalar_opcode()],
vector_length(),
type2name(element_basic_type()));
if (is_load_or_store_in_loop()) {
tty->print(" ");
vpointer().print_on(tty, false);
}
}
void VTransformElementWiseVectorNode::print_spec() const {
VTransformVectorNode::print_spec();
tty->print(" vopc=%s", NodeClassNames[_vector_opcode]);
}
void VTransformReinterpretVectorNode::print_spec() const {
VTransformVectorNode::print_spec();
tty->print(" src_bt=%s", type2name(_src_bt));
}
void VTransformBoolVectorNode::print_spec() const {
VTransformVectorNode::print_spec();
BoolTest::mask m = BoolTest::mask(_test._mask & ~BoolTest::unsigned_compare);
const BoolTest bt(m);
tty->print(" test=%s", m == _test._mask ? "" : "unsigned ");
bt.dump_on(tty);
}
#endif