mirror of
https://github.com/openjdk/jdk.git
synced 2026-03-15 02:13:19 +00:00
1634 lines
66 KiB
C++
1634 lines
66 KiB
C++
/*
|
|
* Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
|
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
|
*
|
|
* This code is free software; you can redistribute it and/or modify it
|
|
* under the terms of the GNU General Public License version 2 only, as
|
|
* published by the Free Software Foundation.
|
|
*
|
|
* This code is distributed in the hope that it will be useful, but WITHOUT
|
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
* version 2 for more details (a copy is included in the LICENSE file that
|
|
* accompanied this code).
|
|
*
|
|
* You should have received a copy of the GNU General Public License version
|
|
* 2 along with this work; if not, write to the Free Software Foundation,
|
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
*
|
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
|
* or visit www.oracle.com if you need additional information or have any
|
|
* questions.
|
|
*/
|
|
|
|
#include "opto/castnode.hpp"
|
|
#include "opto/convertnode.hpp"
|
|
#include "opto/rootnode.hpp"
|
|
#include "opto/vectorization.hpp"
|
|
#include "opto/vectornode.hpp"
|
|
#include "opto/vtransform.hpp"
|
|
|
|
// Register a node with the graph by appending it to _vtnodes.
// Nodes must be added in _idx order, so that a node's _idx is also its
// position in _vtnodes (verified in debug builds).
void VTransformGraph::add_vtnode(VTransformNode* vtnode) {
  assert(vtnode->_idx == _vtnodes.length(), "position must match idx");
  _vtnodes.push(vtnode);
}
|
|
|
|
// Tracing helper for the optimize phase. In non-product builds, "code" is
// executed only if optimization tracing is enabled. The macro expects a
// variable named "vtransform" to be in scope at the expansion site.
#define TRACE_OPTIMIZE(code)                          \
  NOT_PRODUCT(                                        \
    if (vtransform.vloop().is_trace_optimization()) { \
      code                                            \
    }                                                 \
  )
|
|
|
|
// This is similar to IGVN optimization. But we are a bit lazy, and don't care about
|
|
// notification / worklist, since the list of nodes is rather small, and we don't
|
|
// expect optimizations that trickle over the whole graph.
|
|
void VTransformGraph::optimize(VTransform& vtransform) {
|
|
TRACE_OPTIMIZE( tty->print_cr("\nVTransformGraph::optimize"); )
|
|
|
|
bool progress = true;
|
|
DEBUG_ONLY(int pass_count = 0;)
|
|
while (progress) {
|
|
progress = false;
|
|
assert(++pass_count < 10, "ensure we do not have endless loops");
|
|
for (int i = 0; i < _vtnodes.length(); i++) {
|
|
VTransformNode* vtn = _vtnodes.at(i);
|
|
if (!vtn->is_alive()) { continue; }
|
|
progress |= vtn->optimize(_vloop_analyzer, vtransform);
|
|
|
|
// Nodes that have no use any more are dead.
|
|
if (vtn->out_strong_edges() == 0 &&
|
|
// There are some exceptions:
|
|
// 1. Memory phi uses are not modeled, so they appear to have no use here, but must be kept alive.
|
|
// 2. Similarly, some stores may not have their memory uses modeled, but need to be kept alive.
|
|
// 3. Outer node with strong inputs: is a use after the loop that we must keep alive.
|
|
!(vtn->isa_PhiScalar() != nullptr ||
|
|
vtn->is_load_or_store_in_loop() ||
|
|
(vtn->isa_Outer() != nullptr && vtn->has_strong_in_edge()))) {
|
|
vtn->mark_dead();
|
|
progress = true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Compute a linearization of the graph. We do this with a reverse-post-order of a DFS.
|
|
// This only works if the graph is a directed acyclic graph (DAG). The C2 graph, and
|
|
// the VLoopDependencyGraph are both DAGs, but after introduction of vectors/packs, the
|
|
// graph has additional constraints which can introduce cycles. Example:
|
|
//
|
|
// +--------+
|
|
// A -> X | v
|
|
// Pack [A,B] and [X,Y] [A,B] [X,Y]
|
|
// Y -> B ^ |
|
|
// +--------+
|
|
//
|
|
// We return "true" IFF we find no cycle, i.e. if the linearization succeeds.
|
|
bool VTransformGraph::schedule() {
|
|
assert(!is_scheduled(), "not yet scheduled");
|
|
|
|
#ifndef PRODUCT
|
|
if (_trace._verbose) {
|
|
print_vtnodes();
|
|
}
|
|
#endif
|
|
|
|
ResourceMark rm;
|
|
GrowableArray<VTransformNode*> stack;
|
|
VectorSet pre_visited;
|
|
VectorSet post_visited;
|
|
|
|
collect_nodes_without_strong_in_edges(stack);
|
|
const int num_alive_nodes = count_alive_vtnodes();
|
|
|
|
// We create a reverse-post-visit order. This gives us a linearization, if there are
|
|
// no cycles. Then, we simply reverse the order, and we have a schedule.
|
|
int rpo_idx = num_alive_nodes - 1;
|
|
while (!stack.is_empty()) {
|
|
VTransformNode* vtn = stack.top();
|
|
if (!pre_visited.test_set(vtn->_idx)) {
|
|
// Forward arc in graph (pre-visit).
|
|
} else if (!post_visited.test(vtn->_idx)) {
|
|
// Forward arc in graph. Check if all uses were already visited:
|
|
// Yes -> post-visit.
|
|
// No -> we are mid-visit.
|
|
bool all_uses_already_visited = true;
|
|
|
|
// We only need to respect the strong edges (data edges and strong memory edges).
|
|
// Violated weak memory edges are allowed, but require a speculative aliasing
|
|
// runtime check, see VTransform::apply_speculative_aliasing_runtime_checks.
|
|
for (uint i = 0; i < vtn->out_strong_edges(); i++) {
|
|
VTransformNode* use = vtn->out_strong_edge(i);
|
|
|
|
// Skip dead nodes
|
|
if (!use->is_alive()) { continue; }
|
|
|
|
// Skip backedges.
|
|
if ((use->is_loop_head_phi() || use->isa_CountedLoop() != nullptr) && use->in_req(2) == vtn) {
|
|
continue;
|
|
}
|
|
|
|
if (post_visited.test(use->_idx)) { continue; }
|
|
if (pre_visited.test(use->_idx)) {
|
|
// Cycle detected!
|
|
// The nodes that are pre_visited but not yet post_visited form a path from
|
|
// the "root" to the current vtn. Now, we are looking at an edge (vtn, use),
|
|
// and discover that use is also pre_visited but not post_visited. Thus, use
|
|
// lies on that path from "root" to vtn, and the edge (vtn, use) closes a
|
|
// cycle.
|
|
NOT_PRODUCT(if (_trace._rejections) { trace_schedule_cycle(stack, pre_visited, post_visited); } )
|
|
return false;
|
|
}
|
|
stack.push(use);
|
|
all_uses_already_visited = false;
|
|
}
|
|
|
|
if (all_uses_already_visited) {
|
|
stack.pop();
|
|
post_visited.set(vtn->_idx); // post-visit
|
|
_schedule.at_put_grow(rpo_idx--, vtn); // assign rpo_idx
|
|
}
|
|
} else {
|
|
stack.pop(); // Already post-visited. Ignore secondary edge.
|
|
}
|
|
}
|
|
|
|
#ifndef PRODUCT
|
|
if (_trace._info) {
|
|
print_schedule();
|
|
}
|
|
#endif
|
|
|
|
assert(rpo_idx == -1, "used up all rpo_idx, rpo_idx=%d", rpo_idx);
|
|
return true;
|
|
}
|
|
|
|
// Push all "root" nodes, i.e. those that have no strong input edges (data edges and strong memory edges):
|
|
void VTransformGraph::collect_nodes_without_strong_in_edges(GrowableArray<VTransformNode*>& stack) const {
|
|
for (int i = 0; i < _vtnodes.length(); i++) {
|
|
VTransformNode* vtn = _vtnodes.at(i);
|
|
if (!vtn->is_alive()) { continue; }
|
|
if (!vtn->has_strong_in_edge()) {
|
|
stack.push(vtn);
|
|
}
|
|
// If an Outer node has both inputs and outputs, we will most likely have cycles in the final graph.
|
|
// This is not a correctness problem, but it just will prevent vectorization. If this ever happens
|
|
// try to find a way to avoid the cycle somehow.
|
|
assert(vtn->isa_Outer() == nullptr || (vtn->has_strong_in_edge() != (vtn->out_strong_edges() > 0)),
|
|
"Outer nodes should either be inputs or outputs, but not both, otherwise we may get cycles");
|
|
}
|
|
}
|
|
|
|
// Returns the number of nodes in _vtnodes that are still alive.
int VTransformGraph::count_alive_vtnodes() const {
  const int num_nodes = _vtnodes.length();
  int num_alive = 0;
  for (int idx = 0; idx < num_nodes; idx++) {
    if (_vtnodes.at(idx)->is_alive()) {
      num_alive++;
    }
  }
  return num_alive;
}
|
|
|
|
// Find all nodes that in the loop, in a 2-phase process:
|
|
// - First, find all nodes that are not before the loop:
|
|
// - loop-phis
|
|
// - loads and stores that are in the loop
|
|
// - and all their transitive uses.
|
|
// - Second, we find all nodes that are not after the loop:
|
|
// - backedges
|
|
// - loads and stores that are in the loop
|
|
// - and all their transitive uses.
|
|
//
|
|
// in_loop: vtn->_idx -> bool
|
|
void VTransformGraph::mark_vtnodes_in_loop(VectorSet& in_loop) const {
|
|
assert(is_scheduled(), "must already be scheduled");
|
|
|
|
// Phase 1: find all nodes that are not before the loop.
|
|
VectorSet is_not_before_loop;
|
|
for (int i = 0; i < _schedule.length(); i++) {
|
|
VTransformNode* vtn = _schedule.at(i);
|
|
// Is vtn a loop-phi?
|
|
if (vtn->is_loop_head_phi() ||
|
|
vtn->is_load_or_store_in_loop()) {
|
|
is_not_before_loop.set(vtn->_idx);
|
|
continue;
|
|
}
|
|
// Or one of its transitive uses?
|
|
for (uint j = 0; j < vtn->req(); j++) {
|
|
VTransformNode* def = vtn->in_req(j);
|
|
if (def != nullptr && is_not_before_loop.test(def->_idx)) {
|
|
is_not_before_loop.set(vtn->_idx);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Phase 2: find all nodes that are not after the loop.
|
|
for (int i = _schedule.length()-1; i >= 0; i--) {
|
|
VTransformNode* vtn = _schedule.at(i);
|
|
if (!is_not_before_loop.test(vtn->_idx)) { continue; }
|
|
// Is load or store?
|
|
if (vtn->is_load_or_store_in_loop()) {
|
|
in_loop.set(vtn->_idx);
|
|
continue;
|
|
}
|
|
for (uint i = 0; i < vtn->out_strong_edges(); i++) {
|
|
VTransformNode* use = vtn->out_strong_edge(i);
|
|
// Or is vtn a backedge or one of its transitive defs?
|
|
if (in_loop.test(use->_idx) || use->is_loop_head_phi()) {
|
|
in_loop.set(vtn->_idx);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
float VTransformGraph::cost_for_vector_loop() const {
|
|
assert(is_scheduled(), "must already be scheduled");
|
|
#ifndef PRODUCT
|
|
if (_vloop.is_trace_cost()) {
|
|
tty->print_cr("\nVTransformGraph::cost_for_vector_loop:");
|
|
}
|
|
#endif
|
|
|
|
// We only want to count the cost of nodes that are in the loop.
|
|
// This is especially important for cases where we were able to move
|
|
// some nodes outside the loop during VTransform::optimize, e.g.:
|
|
// VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
|
|
ResourceMark rm;
|
|
VectorSet in_loop; // vtn->_idx -> bool
|
|
mark_vtnodes_in_loop(in_loop);
|
|
|
|
float sum = 0;
|
|
for (int i = 0; i < _schedule.length(); i++) {
|
|
VTransformNode* vtn = _schedule.at(i);
|
|
if (!in_loop.test(vtn->_idx)) { continue; }
|
|
float c = vtn->cost(_vloop_analyzer);
|
|
sum += c;
|
|
#ifndef PRODUCT
|
|
if (c != 0 && _vloop.is_trace_cost_verbose()) {
|
|
tty->print(" -> cost = %.2f for ", c);
|
|
vtn->print();
|
|
}
|
|
#endif
|
|
}
|
|
|
|
#ifndef PRODUCT
|
|
if (_vloop.is_trace_cost()) {
|
|
tty->print_cr(" total_cost = %.2f", sum);
|
|
}
|
|
#endif
|
|
return sum;
|
|
}
|
|
|
|
#ifndef PRODUCT
|
|
void VTransformGraph::trace_schedule_cycle(const GrowableArray<VTransformNode*>& stack,
|
|
const VectorSet& pre_visited,
|
|
const VectorSet& post_visited) const {
|
|
tty->print_cr("\nVTransform::schedule found a cycle on path (P), vectorization attempt fails.");
|
|
for (int j = 0; j < stack.length(); j++) {
|
|
VTransformNode* n = stack.at(j);
|
|
bool on_path = pre_visited.test(n->_idx) && !post_visited.test(n->_idx);
|
|
tty->print(" %s ", on_path ? "P" : "_");
|
|
n->print();
|
|
}
|
|
}
|
|
|
|
// Print which vtnode was applied, and the node that resulted from it
// (or "nullptr" if this result holds no node).
void VTransformApplyResult::trace(VTransformNode* vtnode) const {
  tty->print(" apply: ");
  vtnode->print();
  tty->print(" -> ");
  if (_node != nullptr) {
    _node->dump();
  } else {
    tty->print_cr("nullptr");
  }
}
|
|
#endif
|
|
|
|
void VTransform::apply_speculative_alignment_runtime_checks() {
|
|
if (VLoop::vectors_should_be_aligned()) {
|
|
#ifdef ASSERT
|
|
if (_trace._align_vector || _trace._speculative_runtime_checks) {
|
|
tty->print_cr("\nVTransform::apply_speculative_alignment_runtime_checks: native memory alignment");
|
|
}
|
|
#endif
|
|
|
|
const GrowableArray<VTransformNode*>& vtnodes = _graph.vtnodes();
|
|
for (int i = 0; i < vtnodes.length(); i++) {
|
|
VTransformMemVectorNode* vtn = vtnodes.at(i)->isa_MemVector();
|
|
if (vtn == nullptr) { continue; }
|
|
const VPointer& vp = vtn->vpointer();
|
|
if (vp.mem_pointer().base().is_object()) { continue; }
|
|
assert(vp.mem_pointer().base().is_native(), "VPointer base must be object or native");
|
|
|
|
// We have a native memory reference. Build a runtime check for it.
|
|
// See: AlignmentSolver::solve
|
|
// In a future RFE we may be able to speculate on invar alignment as
|
|
// well, and allow vectorization of more cases.
|
|
add_speculative_alignment_check(vp.mem_pointer().base().native(), ObjectAlignmentInBytes);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Debug-only tracing helper for the speculative runtime checks. If either
// _trace._align_vector or _trace._speculative_runtime_checks is set, it prints
// the argument expression as written at the expansion site (stringified),
// followed by a dump of the node.
// Note: the last line of the macro must not end with a line continuation,
// otherwise the line following the macro would silently become part of it.
#define TRACE_SPECULATIVE_ALIGNMENT_CHECK(node) {                       \
  DEBUG_ONLY(                                                           \
    if (_trace._align_vector || _trace._speculative_runtime_checks) {   \
      tty->print(" " #node ": ");                                       \
      (node)->dump();                                                   \
    }                                                                   \
  )                                                                     \
}
|
|
|
|
// Check: (node % alignment) == 0.
|
|
void VTransform::add_speculative_alignment_check(Node* node, juint alignment) {
|
|
TRACE_SPECULATIVE_ALIGNMENT_CHECK(node);
|
|
Node* ctrl = phase()->get_ctrl(node);
|
|
|
|
// Cast adr/long -> int
|
|
if (node->bottom_type()->basic_type() == T_ADDRESS) {
|
|
// adr -> int/long
|
|
node = new CastP2XNode(nullptr, node);
|
|
phase()->register_new_node(node, ctrl);
|
|
TRACE_SPECULATIVE_ALIGNMENT_CHECK(node);
|
|
}
|
|
if (node->bottom_type()->basic_type() == T_LONG) {
|
|
// long -> int
|
|
node = new ConvL2INode(node);
|
|
phase()->register_new_node(node, ctrl);
|
|
TRACE_SPECULATIVE_ALIGNMENT_CHECK(node);
|
|
}
|
|
|
|
Node* mask_alignment = phase()->intcon(alignment-1);
|
|
Node* base_alignment = new AndINode(node, mask_alignment);
|
|
phase()->register_new_node(base_alignment, ctrl);
|
|
TRACE_SPECULATIVE_ALIGNMENT_CHECK(mask_alignment);
|
|
TRACE_SPECULATIVE_ALIGNMENT_CHECK(base_alignment);
|
|
|
|
Node* zero = phase()->intcon(0);
|
|
Node* cmp_alignment = CmpNode::make(base_alignment, zero, T_INT, false);
|
|
BoolNode* bol_alignment = new BoolNode(cmp_alignment, BoolTest::eq);
|
|
phase()->register_new_node(cmp_alignment, ctrl);
|
|
phase()->register_new_node(bol_alignment, ctrl);
|
|
TRACE_SPECULATIVE_ALIGNMENT_CHECK(cmp_alignment);
|
|
TRACE_SPECULATIVE_ALIGNMENT_CHECK(bol_alignment);
|
|
|
|
add_speculative_check([&] (Node* ctrl) { return bol_alignment; });
|
|
}
|
|
|
|
class VPointerWeakAliasingPair : public StackObj {
|
|
private:
|
|
// Using references instead of pointers would be preferrable, but GrowableArray
|
|
// requires a default constructor, and we do not have a default constructor for
|
|
// VPointer.
|
|
const VPointer* _vp1 = nullptr;
|
|
const VPointer* _vp2 = nullptr;
|
|
|
|
VPointerWeakAliasingPair(const VPointer& vp1, const VPointer& vp2) : _vp1(&vp1), _vp2(&vp2) {
|
|
assert(vp1.is_valid(), "sanity");
|
|
assert(vp2.is_valid(), "sanity");
|
|
assert(!vp1.never_overlaps_with(vp2), "otherwise no aliasing");
|
|
assert(!vp1.always_overlaps_with(vp2), "otherwise must be strong");
|
|
assert(VPointer::cmp_summands_and_con(vp1, vp2) <= 0, "must be sorted");
|
|
}
|
|
|
|
public:
|
|
// Default constructor to make GrowableArray happy.
|
|
VPointerWeakAliasingPair() : _vp1(nullptr), _vp2(nullptr) {}
|
|
|
|
static VPointerWeakAliasingPair make(const VPointer& vp1, const VPointer& vp2) {
|
|
if (VPointer::cmp_summands_and_con(vp1, vp2) <= 0) {
|
|
return VPointerWeakAliasingPair(vp1, vp2);
|
|
} else {
|
|
return VPointerWeakAliasingPair(vp2, vp1);
|
|
}
|
|
}
|
|
|
|
const VPointer& vp1() const { return *_vp1; }
|
|
const VPointer& vp2() const { return *_vp2; }
|
|
|
|
// Sort by summands, so that pairs with same summands (summand1, summands2) are adjacent.
|
|
static int cmp_for_sort(VPointerWeakAliasingPair* pair1, VPointerWeakAliasingPair* pair2) {
|
|
int cmp_summands1 = VPointer::cmp_summands(pair1->vp1(), pair2->vp1());
|
|
if (cmp_summands1 != 0) { return cmp_summands1; }
|
|
return VPointer::cmp_summands(pair1->vp2(), pair2->vp2());
|
|
}
|
|
};
|
|
|
|
void VTransform::apply_speculative_aliasing_runtime_checks() {
|
|
|
|
if (_vloop.use_speculative_aliasing_checks()) {
|
|
|
|
#ifdef ASSERT
|
|
if (_trace._speculative_aliasing_analysis || _trace._speculative_runtime_checks) {
|
|
tty->print_cr("\nVTransform::apply_speculative_aliasing_runtime_checks: speculative aliasing analysis runtime checks");
|
|
}
|
|
#endif
|
|
|
|
// It would be nice to add a ResourceMark here. But it would collide with resource allocation
|
|
// in PhaseIdealLoop::set_idom for _idom and _dom_depth. See also JDK-8337015.
|
|
VectorSet visited;
|
|
GrowableArray<VPointerWeakAliasingPair> weak_aliasing_pairs;
|
|
|
|
const GrowableArray<VTransformNode*>& schedule = _graph.get_schedule();
|
|
for (int i = 0; i < schedule.length(); i++) {
|
|
VTransformNode* vtn = schedule.at(i);
|
|
for (uint i = 0; i < vtn->out_weak_edges(); i++) {
|
|
VTransformNode* use = vtn->out_weak_edge(i);
|
|
if (visited.test(use->_idx)) {
|
|
// The use node was already visited, i.e. is higher up in the schedule.
|
|
// The "out" edge thus points backward, i.e. it is violated.
|
|
const VPointer& vp1 = vtn->vpointer();
|
|
const VPointer& vp2 = use->vpointer();
|
|
#ifdef ASSERT
|
|
if (_trace._speculative_aliasing_analysis || _trace._speculative_runtime_checks) {
|
|
tty->print_cr("\nViolated Weak Edge:");
|
|
vtn->print();
|
|
vp1.print_on(tty);
|
|
use->print();
|
|
vp2.print_on(tty);
|
|
}
|
|
#endif
|
|
|
|
// We could generate checks for the pair (vp1, vp2) directly. But in
|
|
// some graphs, this generates quadratically many checks. Example:
|
|
//
|
|
// set1: a[i+0] a[i+1] a[i+2] a[i+3]
|
|
// set2: b[i+0] b[i+1] b[i+2] b[i+3]
|
|
//
|
|
// We may have a weak memory edge between every memory access from
|
|
// set1 to every memory access from set2. In this example, this would
|
|
// be 4 * 4 = 16 checks. But instead, we can create a union VPointer
|
|
// for set1 and set2 each, and only create a single check.
|
|
//
|
|
// set1: a[i+0, size = 4]
|
|
// set1: b[i+0, size = 4]
|
|
//
|
|
// For this, we add all pairs to an array, and process it below.
|
|
weak_aliasing_pairs.push(VPointerWeakAliasingPair::make(vp1, vp2));
|
|
}
|
|
}
|
|
visited.set(vtn->_idx);
|
|
}
|
|
|
|
// Sort so that all pairs with the same summands (summands1, summands2)
|
|
// are consecutive, i.e. in the same group. This allows us to do a linear
|
|
// walk over all pairs of a group and create the union VPointers.
|
|
weak_aliasing_pairs.sort(VPointerWeakAliasingPair::cmp_for_sort);
|
|
|
|
int group_start = 0;
|
|
while (group_start < weak_aliasing_pairs.length()) {
|
|
// New group: pick the first pair as the reference.
|
|
const VPointer* vp1 = &weak_aliasing_pairs.at(group_start).vp1();
|
|
const VPointer* vp2 = &weak_aliasing_pairs.at(group_start).vp2();
|
|
jint size1 = vp1->size();
|
|
jint size2 = vp2->size();
|
|
int group_end = group_start + 1;
|
|
while (group_end < weak_aliasing_pairs.length()) {
|
|
const VPointer* vp1_next = &weak_aliasing_pairs.at(group_end).vp1();
|
|
const VPointer* vp2_next = &weak_aliasing_pairs.at(group_end).vp2();
|
|
jint size1_next = vp1_next->size();
|
|
jint size2_next = vp2_next->size();
|
|
|
|
// Different summands -> different group.
|
|
if (VPointer::cmp_summands(*vp1, *vp1_next) != 0) { break; }
|
|
if (VPointer::cmp_summands(*vp2, *vp2_next) != 0) { break; }
|
|
|
|
// Pick the one with the lower con as the reference.
|
|
if (vp1->con() > vp1_next->con()) {
|
|
swap(vp1, vp1_next);
|
|
swap(size1, size1_next);
|
|
}
|
|
if (vp2->con() > vp2_next->con()) {
|
|
swap(vp2, vp2_next);
|
|
swap(size2, size2_next);
|
|
}
|
|
|
|
// Compute the distance from vp1 to vp1_next + size, to get a size that would include vp1_next.
|
|
NoOverflowInt new_size1 = NoOverflowInt(vp1_next->con()) + NoOverflowInt(size1_next) - NoOverflowInt(vp1->con());
|
|
NoOverflowInt new_size2 = NoOverflowInt(vp2_next->con()) + NoOverflowInt(size2_next) - NoOverflowInt(vp2->con());
|
|
if (new_size1.is_NaN() || new_size2.is_NaN()) { break; /* overflow -> new group */ }
|
|
|
|
// The "next" VPointer indeed belong to the group.
|
|
//
|
|
// vp1: |-------------->
|
|
// vp1_next: |---------------->
|
|
// result: |-------------------------->
|
|
//
|
|
// vp1: |-------------------------->
|
|
// vp1_next: |------->
|
|
// result: |-------------------------->
|
|
//
|
|
size1 = MAX2(size1, new_size1.value());
|
|
size2 = MAX2(size2, new_size2.value());
|
|
group_end++;
|
|
}
|
|
// Create "union" VPointer that cover all VPointer from the group.
|
|
const VPointer vp1_union = vp1->make_with_size(size1);
|
|
const VPointer vp2_union = vp2->make_with_size(size2);
|
|
|
|
#ifdef ASSERT
|
|
if (_trace._speculative_aliasing_analysis || _trace._speculative_runtime_checks) {
|
|
tty->print_cr("\nUnion of %d weak aliasing edges:", group_end - group_start);
|
|
vp1_union.print_on(tty);
|
|
vp2_union.print_on(tty);
|
|
}
|
|
|
|
// Verification - union must contain all VPointer of the group.
|
|
for (int i = group_start; i < group_end; i++) {
|
|
const VPointer& vp1_i = weak_aliasing_pairs.at(i).vp1();
|
|
const VPointer& vp2_i = weak_aliasing_pairs.at(i).vp2();
|
|
assert(vp1_union.con() <= vp1_i.con(), "must start before");
|
|
assert(vp2_union.con() <= vp2_i.con(), "must start before");
|
|
assert(vp1_union.size() >= vp1_i.size(), "must end after");
|
|
assert(vp2_union.size() >= vp2_i.size(), "must end after");
|
|
}
|
|
#endif
|
|
|
|
add_speculative_check([&] (Node* ctrl) {
|
|
return vp1_union.make_speculative_aliasing_check_with(vp2_union, ctrl);
|
|
});
|
|
|
|
group_start = group_end;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Runtime Checks:
|
|
// Some required properties cannot be proven statically, and require a
|
|
// runtime check:
|
|
// - Alignment:
|
|
// See VTransform::add_speculative_alignment_check
|
|
// - Aliasing:
|
|
// See VTransform::apply_speculative_aliasing_runtime_checks
|
|
// There is a two staged approach for compilation:
|
|
// - AutoVectorization Predicate:
|
|
// See VM flag UseAutoVectorizationPredicate and documentation in predicates.hpp
|
|
// We speculate that the checks pass, and only compile a vectorized loop.
|
|
// We expect the checks to pass in almost all cases, and so we only need
|
|
// to compile and cache the vectorized loop.
|
|
// If the predicate ever fails, we deoptimize, and eventually compile
|
|
// without predicate. This means we will recompile with multiversioning.
|
|
// - Multiversioning:
|
|
// See VM Flag LoopMultiversioning and documentation in loopUnswitch.cpp
|
|
// If the predicate is not available or previously failed, then we compile
|
|
// a vectorized and a scalar loop. If the runtime check passes we take the
|
|
// vectorized loop, else the scalar loop.
|
|
// Multiversioning takes more compile time and code cache, but it also
|
|
// produces fast code for when the runtime check passes (vectorized) and
|
|
// when it fails (scalar performance).
|
|
//
|
|
// Callback:
|
|
// In some cases, we require the ctrl just before the check iff_speculate to
|
|
// generate the values required in the check. We pass this ctrl into the
|
|
// callback, which is expected to produce the check, i.e. a BoolNode.
|
|
template<typename Callback>
|
|
void VTransform::add_speculative_check(Callback callback) {
|
|
assert(_vloop.are_speculative_checks_possible(), "otherwise we cannot make speculative assumptions");
|
|
ParsePredicateSuccessProj* parse_predicate_proj = _vloop.auto_vectorization_parse_predicate_proj();
|
|
IfTrueNode* new_check_proj = nullptr;
|
|
if (parse_predicate_proj != nullptr) {
|
|
new_check_proj = phase()->create_new_if_for_predicate(parse_predicate_proj, nullptr,
|
|
Deoptimization::Reason_auto_vectorization_check,
|
|
Op_If);
|
|
} else {
|
|
new_check_proj = phase()->create_new_if_for_multiversion(_vloop.multiversioning_fast_proj());
|
|
}
|
|
Node* iff_speculate = new_check_proj->in(0);
|
|
|
|
// Create the check, given the ctrl just before the iff.
|
|
BoolNode* bol = callback(iff_speculate->in(0));
|
|
|
|
igvn().replace_input_of(iff_speculate, 1, bol);
|
|
TRACE_SPECULATIVE_ALIGNMENT_CHECK(iff_speculate);
|
|
}
|
|
|
|
// Helper-class for VTransformGraph::has_store_to_load_forwarding_failure.
|
|
// It wraps a VPointer. The VPointer has an iv_offset applied, which
|
|
// simulates a virtual unrolling. They represent the memory region:
|
|
// [adr, adr + size)
|
|
// adr = base + invar + iv_scale * (iv + iv_offset) + con
|
|
class VMemoryRegion : public ResourceObj {
|
|
private:
|
|
// Note: VPointer has no default constructor, so we cannot use VMemoryRegion
|
|
// in-place in a GrowableArray. Hence, we make VMemoryRegion a resource
|
|
// allocated object, so the GrowableArray of VMemoryRegion* has a default
|
|
// nullptr element.
|
|
const VPointer _vpointer;
|
|
bool _is_load; // load or store?
|
|
uint _schedule_order;
|
|
|
|
public:
|
|
VMemoryRegion(const VPointer& vpointer, bool is_load, uint schedule_order) :
|
|
_vpointer(vpointer),
|
|
_is_load(is_load),
|
|
_schedule_order(schedule_order) {}
|
|
|
|
const VPointer& vpointer() const { return _vpointer; }
|
|
bool is_load() const { return _is_load; }
|
|
uint schedule_order() const { return _schedule_order; }
|
|
|
|
static int cmp_for_sort_by_group(VMemoryRegion* r1, VMemoryRegion* r2) {
|
|
// Sort by mem_pointer (base, invar, iv_scale), except for the con.
|
|
return MemPointer::cmp_summands(r1->vpointer().mem_pointer(),
|
|
r2->vpointer().mem_pointer());
|
|
}
|
|
|
|
static int cmp_for_sort(VMemoryRegion** r1, VMemoryRegion** r2) {
|
|
int cmp_group = cmp_for_sort_by_group(*r1, *r2);
|
|
if (cmp_group != 0) { return cmp_group; }
|
|
|
|
// We use two comparisons, because a subtraction could underflow.
|
|
jint con1 = (*r1)->vpointer().con();
|
|
jint con2 = (*r2)->vpointer().con();
|
|
if (con1 < con2) { return -1; }
|
|
if (con1 > con2) { return 1; }
|
|
return 0;
|
|
}
|
|
|
|
enum Aliasing { DIFFERENT_GROUP, BEFORE, EXACT_OVERLAP, PARTIAL_OVERLAP, AFTER };
|
|
|
|
Aliasing aliasing(VMemoryRegion& other) {
|
|
VMemoryRegion* p1 = this;
|
|
VMemoryRegion* p2 = &other;
|
|
if (cmp_for_sort_by_group(p1, p2) != 0) { return DIFFERENT_GROUP; }
|
|
|
|
jlong con1 = p1->vpointer().con();
|
|
jlong con2 = p2->vpointer().con();
|
|
jlong size1 = p1->vpointer().size();
|
|
jlong size2 = p2->vpointer().size();
|
|
|
|
if (con1 >= con2 + size2) { return AFTER; }
|
|
if (con2 >= con1 + size1) { return BEFORE; }
|
|
if (con1 == con2 && size1 == size2) { return EXACT_OVERLAP; }
|
|
return PARTIAL_OVERLAP;
|
|
}
|
|
|
|
#ifndef PRODUCT
|
|
void print() const {
|
|
tty->print("VMemoryRegion[%s schedule_order(%4d), ",
|
|
_is_load ? "load, " : "store,", _schedule_order);
|
|
vpointer().print_on(tty, false);
|
|
tty->print_cr("]");
|
|
}
|
|
#endif
|
|
};
|
|
|
|
// Store-to-load-forwarding is a CPU memory optimization, where a load can directly fetch
|
|
// its value from the store-buffer, rather than from the L1 cache. This is many CPU cycles
|
|
// faster. However, this optimization comes with some restrictions, depending on the CPU.
|
|
// Generally, store-to-load-forwarding works if the load and store memory regions match
|
|
// exactly (same start and width). Generally problematic are partial overlaps - though
|
|
// some CPUs can handle even some subsets of these cases. We conservatively assume that
|
|
// all such partial overlaps lead to store-to-load-forwarding failures, which means the
|
|
// load has to stall until the store goes from the store-buffer into the L1 cache, incurring
|
|
// a penalty of many CPU cycles.
|
|
//
|
|
// Example (with "iteration distance" 2):
|
|
// for (int i = 10; i < SIZE; i++) {
|
|
// aI[i] = aI[i - 2] + 1;
|
|
// }
|
|
//
|
|
// load_4_bytes( ptr + -8)
|
|
// store_4_bytes(ptr + 0) *
|
|
// load_4_bytes( ptr + -4) |
|
|
// store_4_bytes(ptr + 4) | *
|
|
// load_4_bytes( ptr + 0) <-+ |
|
|
// store_4_bytes(ptr + 8) |
|
|
// load_4_bytes( ptr + 4) <---+
|
|
// store_4_bytes(ptr + 12)
|
|
// ...
|
|
//
|
|
// In the scalar loop, we can forward the stores from 2 iterations back.
|
|
//
|
|
// Assume we have 2-element vectors (2*4 = 8 bytes), with the "iteration distance" 2
|
|
// example. This gives us this machine code:
|
|
// load_8_bytes( ptr + -8)
|
|
// store_8_bytes(ptr + 0) |
|
|
// load_8_bytes( ptr + 0) v
|
|
// store_8_bytes(ptr + 8) |
|
|
// load_8_bytes( ptr + 8) v
|
|
// store_8_bytes(ptr + 16)
|
|
// ...
|
|
//
|
|
// We packed 2 iterations, and the stores can perfectly forward to the loads of
|
|
// the next 2 iterations.
|
|
//
|
|
// Example (with "iteration distance" 3):
|
|
// for (int i = 10; i < SIZE; i++) {
|
|
// aI[i] = aI[i - 3] + 1;
|
|
// }
|
|
//
|
|
// load_4_bytes( ptr + -12)
|
|
// store_4_bytes(ptr + 0) *
|
|
// load_4_bytes( ptr + -8) |
|
|
// store_4_bytes(ptr + 4) |
|
|
// load_4_bytes( ptr + -4) |
|
|
// store_4_bytes(ptr + 8) |
|
|
// load_4_bytes( ptr + 0) <-+
|
|
// store_4_bytes(ptr + 12)
|
|
// ...
|
|
//
|
|
// In the scalar loop, we can forward the stores from 3 iterations back.
|
|
//
|
|
// Unfortunately, vectorization can introduce such store-to-load-forwarding failures.
|
|
// Assume we have 2-element vectors (2*4 = 8 bytes), with the "iteration distance" 3
|
|
// example. This gives us this machine code:
|
|
// load_8_bytes( ptr + -12)
|
|
// store_8_bytes(ptr + 0) | |
|
|
// load_8_bytes( ptr + -4) x |
|
|
// store_8_bytes(ptr + 8) ||
|
|
// load_8_bytes( ptr + 4) xx <-- partial overlap with 2 stores
|
|
// store_8_bytes(ptr + 16)
|
|
// ...
|
|
//
|
|
// We see that eventually all loads are dependent on earlier stores, but the values cannot
|
|
// be forwarded because there is some partial overlap.
|
|
//
|
|
// Preferably, we would have some latency-based cost-model that accounts for such forwarding
|
|
// failures, and decide if vectorization with forwarding failures is still profitable. For
|
|
// now we go with a simpler heuristic: we simply forbid vectorization if we can PROVE that
|
|
// there will be a forwarding failure. This approach has at least 2 possible weaknesses:
|
|
//
|
|
// (1) There may be forwarding failures in cases where we cannot prove it.
|
|
// Example:
|
|
// for (int i = 10; i < SIZE; i++) {
|
|
// bI[i] = aI[i - 3] + 1;
|
|
// }
|
|
//
|
|
// We do not know if aI and bI refer to the same array or not. However, it is reasonable
|
|
// to assume that if we have two different array references, that they most likely refer
|
|
// to different arrays (i.e. no aliasing), where we would have no forwarding failures.
|
|
// (2) There could be some loops where vectorization introduces forwarding failures, and thus
|
|
// the latency of the loop body is high, but this does not matter because it is dominated
|
|
// by other latency/throughput based costs in the loop body.
|
|
//
|
|
// Performance measurements with the JMH benchmark StoreToLoadForwarding.java have indicated
|
|
// that there is some iteration threshold: if the failure happens between a store and load that
|
|
// have an iteration distance below this threshold, the latency is the limiting factor, and we
|
|
// should not vectorize to avoid the latency penalty of store-to-load-forwarding failures. If
|
|
// the iteration distance is larger than this threshold, the throughput is the limiting factor,
|
|
// and we should vectorize in these cases to improve throughput.
|
|
//
|
|
bool VTransformGraph::has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const {
|
|
if (SuperWordStoreToLoadForwardingFailureDetection == 0) { return false; }
|
|
|
|
// Collect all pointers for scalar and vector loads/stores.
|
|
ResourceMark rm;
|
|
// Use pointers because no default constructor for elements available.
|
|
GrowableArray<VMemoryRegion*> memory_regions;
|
|
|
|
// To detect store-to-load-forwarding failures at the iteration threshold or below, we
|
|
// simulate a super-unrolling to reach SuperWordStoreToLoadForwardingFailureDetection
|
|
// iterations at least. This is a heuristic, and we are not trying to be very precise
|
|
// with the iteration distance. If we have already unrolled more than the iteration
|
|
// threshold, i.e. if "SuperWordStoreToLoadForwardingFailureDetection < unrolled_count",
|
|
// then we simply check if there are any store-to-load-forwarding failures in the unrolled
|
|
// loop body, which may be at larger distance than the desired threshold. We cannot do any
|
|
// more fine-grained analysis, because the unrolling has lost the information about the
|
|
// iteration distance.
|
|
int simulated_unrolling_count = SuperWordStoreToLoadForwardingFailureDetection;
|
|
int unrolled_count = vloop_analyzer.vloop().cl()->unrolled_count();
|
|
uint simulated_super_unrolling_count = MAX2(1, simulated_unrolling_count / unrolled_count);
|
|
int iv_stride = vloop_analyzer.vloop().iv_stride();
|
|
int schedule_order = 0;
|
|
for (uint k = 0; k < simulated_super_unrolling_count; k++) {
|
|
int iv_offset = k * iv_stride; // virtual super-unrolling
|
|
for (int i = 0; i < _schedule.length(); i++) {
|
|
VTransformNode* vtn = _schedule.at(i);
|
|
if (vtn->is_load_or_store_in_loop()) {
|
|
const VPointer& p = vtn->vpointer();
|
|
if (p.is_valid()) {
|
|
VTransformVectorNode* vector = vtn->isa_Vector();
|
|
bool is_load = vtn->is_load_in_loop();
|
|
const VPointer iv_offset_p(p.make_with_iv_offset(iv_offset));
|
|
if (iv_offset_p.is_valid()) {
|
|
// The iv_offset may lead to overflows. This is a heuristic, so we do not
|
|
// care too much about those edge cases.
|
|
memory_regions.push(new VMemoryRegion(iv_offset_p, is_load, schedule_order++));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Sort the pointers by group (same base, invar and stride), and then by offset.
|
|
memory_regions.sort(VMemoryRegion::cmp_for_sort);
|
|
|
|
#ifndef PRODUCT
|
|
if (_trace._verbose) {
|
|
tty->print_cr("VTransformGraph::has_store_to_load_forwarding_failure:");
|
|
tty->print_cr(" simulated_unrolling_count = %d", simulated_unrolling_count);
|
|
tty->print_cr(" simulated_super_unrolling_count = %d", simulated_super_unrolling_count);
|
|
for (int i = 0; i < memory_regions.length(); i++) {
|
|
VMemoryRegion& region = *memory_regions.at(i);
|
|
region.print();
|
|
}
|
|
}
|
|
#endif
|
|
|
|
// For all pairs of pointers in the same group, check if they have a partial overlap.
|
|
for (int i = 0; i < memory_regions.length(); i++) {
|
|
VMemoryRegion& region1 = *memory_regions.at(i);
|
|
|
|
for (int j = i + 1; j < memory_regions.length(); j++) {
|
|
VMemoryRegion& region2 = *memory_regions.at(j);
|
|
|
|
const VMemoryRegion::Aliasing aliasing = region1.aliasing(region2);
|
|
if (aliasing == VMemoryRegion::Aliasing::DIFFERENT_GROUP ||
|
|
aliasing == VMemoryRegion::Aliasing::BEFORE) {
|
|
break; // We have reached the next group or pointers that are always after.
|
|
} else if (aliasing == VMemoryRegion::Aliasing::EXACT_OVERLAP) {
|
|
continue;
|
|
} else {
|
|
assert(aliasing == VMemoryRegion::Aliasing::PARTIAL_OVERLAP, "no other case can happen");
|
|
if ((region1.is_load() && !region2.is_load() && region1.schedule_order() > region2.schedule_order()) ||
|
|
(!region1.is_load() && region2.is_load() && region1.schedule_order() < region2.schedule_order())) {
|
|
// We predict that this leads to a store-to-load-forwarding failure penalty.
|
|
#ifndef PRODUCT
|
|
if (_trace._rejections) {
|
|
tty->print_cr("VTransformGraph::has_store_to_load_forwarding_failure:");
|
|
tty->print_cr(" Partial overlap of store->load. We predict that this leads to");
|
|
tty->print_cr(" a store-to-load-forwarding failure penalty which makes");
|
|
tty->print_cr(" vectorization unprofitable. These are the two pointers:");
|
|
region1.print();
|
|
region2.print();
|
|
}
|
|
#endif
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
// Remember that vtn was transformed to the IR node n. The entry for a vtnode
// may only be set once (asserted).
void VTransformApplyState::set_transformed_node(VTransformNode* vtn, Node* n) {
  const auto vtn_idx = vtn->_idx;
  assert(_vtnode_idx_to_transformed_node.at(vtn_idx) == nullptr, "only set once");
  _vtnode_idx_to_transformed_node.at_put(vtn_idx, n);
}
|
|
|
|
// Look up the IR node that vtn was transformed to. The entry must already
// have been set (asserted).
Node* VTransformApplyState::transformed_node(const VTransformNode* vtn) const {
  Node* const transformed = _vtnode_idx_to_transformed_node.at(vtn->_idx);
  assert(transformed != nullptr, "must find IR node for vtnode");
  return transformed;
}
|
|
|
|
void VTransformApplyState::init_memory_states_and_uses_after_loop() {
|
|
const GrowableArray<Node*>& inputs = _vloop_analyzer.memory_slices().inputs();
|
|
const GrowableArray<PhiNode*>& heads = _vloop_analyzer.memory_slices().heads();
|
|
for (int i = 0; i < inputs.length(); i++) {
|
|
PhiNode* head = heads.at(i);
|
|
if (head != nullptr) {
|
|
// Slice with Phi (i.e. with stores) -> start with the phi (phi_mem)
|
|
_memory_states.at_put(i, head);
|
|
|
|
// Remember uses outside the loop of the last memory state (store).
|
|
StoreNode* last_store = head->in(2)->as_Store();
|
|
assert(vloop().in_bb(last_store), "backedge store should be in the loop");
|
|
for (DUIterator_Fast jmax, j = last_store->fast_outs(jmax); j < jmax; j++) {
|
|
Node* use = last_store->fast_out(j);
|
|
if (!vloop().in_bb(use)) {
|
|
for (uint k = 0; k < use->req(); k++) {
|
|
if (use->in(k) == last_store) {
|
|
_memory_state_uses_after_loop.push(MemoryStateUseAfterLoop(use, k, i));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
// Slice without Phi (i.e. only loads) -> use the input state (entry_mem)
|
|
_memory_states.at_put(i, inputs.at(i));
|
|
}
|
|
}
|
|
}
|
|
|
|
// We may have reordered the scalar stores, or replaced them with vectors. Now
|
|
// the last memory state in the loop may have changed. Thus, we need to change
|
|
// the uses of the old last memory state the new last memory state.
|
|
void VTransformApplyState::fix_memory_state_uses_after_loop() {
|
|
for (int i = 0; i < _memory_state_uses_after_loop.length(); i++) {
|
|
MemoryStateUseAfterLoop& use = _memory_state_uses_after_loop.at(i);
|
|
Node* last_state = memory_state(use._alias_idx);
|
|
phase()->igvn().replace_input_of(use._use, use._in_idx, last_state);
|
|
}
|
|
}
|
|
|
|
// For every non-null req input of this vtnode, set the corresponding input
// of n to the transformed node of that input. Null inputs are skipped, so
// those inputs of n are left unchanged.
void VTransformNode::apply_vtn_inputs_to_node(Node* n, VTransformApplyState& apply_state) const {
  PhaseIdealLoop* phase = apply_state.phase();
  for (uint idx = 0; idx < req(); idx++) {
    VTransformNode* input_vtn = in_req(idx);
    if (input_vtn == nullptr) { continue; }
    Node* input_node = apply_state.transformed_node(input_vtn);
    phase->igvn().replace_input_of(n, idx, input_node);
  }
}
|
|
|
|
float VTransformMemopScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
|
// This is an identity transform, but loads and stores must be counted.
|
|
assert(!vloop_analyzer.has_zero_cost(_node), "memop nodes must be counted");
|
|
return vloop_analyzer.cost_for_scalar_node(_node->Opcode());
|
|
}
|
|
|
|
// Rewire the inputs of the scalar memop, and hook it into the current memory
// state of its slice. A store becomes the new memory state of the slice.
VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& apply_state) const {
  apply_vtn_inputs_to_node(_node, apply_state);

  // The vtn does not hold the memory state, so that memops can be reordered.
  // Hence, the memory input has to be set separately.
  Node* current_mem_state = apply_state.memory_state(_node->adr_type());
  apply_state.phase()->igvn().replace_input_of(_node, MemNode::Memory, current_mem_state);

  if (_node->is_Store()) {
    apply_state.set_memory_state(_node->adr_type(), _node);
  }

  return VTransformApplyResult::make_scalar(_node);
}
|
|
|
|
float VTransformDataScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
|
// Since this is an identity transform, we may have nodes that also
|
|
// VLoopAnalyzer::cost does not count for the scalar loop.
|
|
if (vloop_analyzer.has_zero_cost(_node)) {
|
|
return 0;
|
|
} else {
|
|
return vloop_analyzer.cost_for_scalar_node(_node->Opcode());
|
|
}
|
|
}
|
|
|
|
// The scalar node is kept, only its inputs are set to the transformed inputs.
VTransformApplyResult VTransformDataScalarNode::apply(VTransformApplyState& apply_state) const {
  Node* const scalar_node = _node;
  apply_vtn_inputs_to_node(scalar_node, apply_state);
  return VTransformApplyResult::make_scalar(scalar_node);
}
|
|
|
|
// The scalar phi is kept. Only inputs 0 and 1 are set here, the backedge
// (input 2) is set later, see apply_backedge.
VTransformApplyResult VTransformPhiScalarNode::apply(VTransformApplyState& apply_state) const {
  PhaseIdealLoop* phase = apply_state.phase();
  Node* new_in0 = apply_state.transformed_node(in_req(0));
  Node* new_in1 = apply_state.transformed_node(in_req(1));

  phase->igvn().replace_input_of(_node, 0, new_in0);
  phase->igvn().replace_input_of(_node, 1, new_in1);

  return VTransformApplyResult::make_scalar(_node);
}
|
|
|
|
// Cleanup backedges. In the schedule, the backedges come after their phis. Hence,
|
|
// we only have the transformed backedges after the phis are already transformed.
|
|
// We hook the backedges into the phis now, during cleanup.
|
|
void VTransformPhiScalarNode::apply_backedge(VTransformApplyState& apply_state) const {
|
|
assert(_node == apply_state.transformed_node(this), "sanity");
|
|
PhaseIdealLoop* phase = apply_state.phase();
|
|
if (_node->is_memory_phi()) {
|
|
// Memory phi/backedge
|
|
// The last memory state of that slice is the backedge.
|
|
Node* last_state = apply_state.memory_state(_node->adr_type());
|
|
phase->igvn().replace_input_of(_node, 2, last_state);
|
|
} else {
|
|
// Data phi/backedge
|
|
Node* in2 = apply_state.transformed_node(in_req(2));
|
|
phase->igvn().replace_input_of(_node, 2, in2);
|
|
}
|
|
}
|
|
|
|
// The CFG node is kept. Its inputs are set to the transformed inputs, except
// for the CountedLoop, whose inputs (including the backedge) are not touched.
VTransformApplyResult VTransformCFGNode::apply(VTransformApplyState& apply_state) const {
  const bool keep_inputs_untouched = _node->is_CountedLoop();
  if (!keep_inputs_untouched) {
    apply_vtn_inputs_to_node(_node, apply_state);
  }
  return VTransformApplyResult::make_scalar(_node);
}
|
|
|
|
// The node is kept, only its inputs are set to the transformed inputs.
VTransformApplyResult VTransformOuterNode::apply(VTransformApplyState& apply_state) const {
  Node* const outer_node = _node;
  apply_vtn_inputs_to_node(outer_node, apply_state);
  return VTransformApplyResult::make_scalar(outer_node);
}
|
|
|
|
float VTransformReplicateNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
|
return vloop_analyzer.cost_for_vector_node(Op_Replicate, _vlen, _element_type);
|
|
}
|
|
|
|
// Create a vector from the scalar input (input 1) with VectorNode::scalar2vector,
// and register the new node.
VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply_state) const {
  Node* scalar_in = apply_state.transformed_node(in_req(1));
  VectorNode* replicate = VectorNode::scalar2vector(scalar_in, _vlen, _element_type);
  register_new_node_from_vectorization(apply_state, replicate);
  return VTransformApplyResult::make_vector(replicate);
}
|
|
|
|
float VTransformConvI2LNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
|
return vloop_analyzer.cost_for_scalar_node(Op_ConvI2L);
|
|
}
|
|
|
|
// Create a new scalar ConvI2L node for the transformed input (input 1), and
// register the new node.
VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_state) const {
  Node* int_in = apply_state.transformed_node(in_req(1));
  Node* conv = new ConvI2LNode(int_in);
  register_new_node_from_vectorization(apply_state, conv);
  return VTransformApplyResult::make_scalar(conv);
}
|
|
|
|
float VTransformShiftCountNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
|
int shift_count_opc = VectorNode::shift_count_opcode(_shift_opcode);
|
|
return vloop_analyzer.cost_for_scalar_node(Op_AndI) +
|
|
vloop_analyzer.cost_for_vector_node(shift_count_opc, _vlen, _element_bt);
|
|
}
|
|
|
|
// Mask the scalar shift count (input 1) with _mask, and create the vector
// shift count node from the masked value.
VTransformApplyResult VTransformShiftCountNode::apply(VTransformApplyState& apply_state) const {
  PhaseIdealLoop* phase = apply_state.phase();
  Node* count_in = apply_state.transformed_node(in_req(1));
  assert(count_in->bottom_type()->isa_int(), "int type only for shift count");

  // A scalar shift operation would automatically truncate the shift count to
  // the lowest _mask bits. The vector shift does not truncate, hence we have
  // to apply the mask explicitly.
  Node* count_masked = new AndINode(count_in, phase->intcon(_mask));
  register_new_node_from_vectorization(apply_state, count_masked);

  // The masked value is now "broadcast" (some platforms only set the lowest element).
  VectorNode* count_vector = VectorNode::shift_count(_shift_opcode, count_masked, _vlen, _element_bt);
  register_new_node_from_vectorization(apply_state, count_vector);

  return VTransformApplyResult::make_vector(count_vector);
}
|
|
|
|
float VTransformPopulateIndexNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
|
return vloop_analyzer.cost_for_vector_node(Op_PopulateIndex, _vlen, _element_bt);
|
|
}
|
|
|
|
// Create a PopulateIndex vector node from the iv phi (input 1) and the
// int constant 1, and register the new node.
VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& apply_state) const {
  PhaseIdealLoop* phase = apply_state.phase();
  Node* iv = apply_state.transformed_node(in_req(1));
  assert(iv->is_Phi(), "expected to be iv");
  assert(VectorNode::is_populate_index_supported(_element_bt), "should support");

  const TypeVect* vector_type = TypeVect::make(_element_bt, _vlen);
  VectorNode* populate_index = new PopulateIndexNode(iv, phase->intcon(1), vector_type);
  register_new_node_from_vectorization(apply_state, populate_index);

  return VTransformApplyResult::make_vector(populate_index);
}
|
|
|
|
float VTransformElementWiseVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
|
return vloop_analyzer.cost_for_vector_node(_vector_opcode, vector_length(), element_basic_type());
|
|
}
|
|
|
|
// Create the vector node with opcode _vector_opcode from the 1, 2 or 3
// transformed inputs, and register the new node.
VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyState& apply_state) const {
  assert(2 <= req() && req() <= 4, "Must have 1-3 inputs");
  const TypeVect* vector_type = TypeVect::make(element_basic_type(), vector_length());

  Node* first = apply_state.transformed_node(in_req(1));
  Node* second = nullptr; // stays nullptr for unary operations
  if (req() >= 3) {
    second = apply_state.transformed_node(in_req(2));
  }

  VectorNode* result = nullptr;
  if (req() > 3) {
    // ternary
    Node* third = apply_state.transformed_node(in_req(3));
    result = VectorNode::make(_vector_opcode, first, second, third, vector_type);
  } else {
    // unary and binary
    result = VectorNode::make(_vector_opcode, first, second, vector_type);
  }

  register_new_node_from_vectorization(apply_state, result);
  return VTransformApplyResult::make_vector(result);
}
|
|
|
|
float VTransformElementWiseLongOpWithCastToIntVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
|
int vopc = VectorNode::opcode(scalar_opcode(), element_basic_type());
|
|
return vloop_analyzer.cost_for_vector_node(vopc, vector_length(), element_basic_type()) +
|
|
vloop_analyzer.cost_for_vector_node(Op_VectorCastL2X, vector_length(), T_INT);
|
|
}
|
|
|
|
// Create the long vector operation, followed by a vector cast to int.
VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply(VTransformApplyState& apply_state) const {
  const uint num_elements = vector_length();
  const int scalar_opc = scalar_opcode();
  Node* long_in = apply_state.transformed_node(in_req(1));

  // While the scalar operation was long -> int, the vector operation
  // is long -> long.
  VectorNode* long_result = VectorNode::make(scalar_opc, long_in, nullptr, num_elements, T_LONG);
  register_new_node_from_vectorization(apply_state, long_result);

  // To mimic the scalar long -> int operation, we cast long -> int.
  VectorNode* int_result = VectorCastNode::make(Op_VectorCastL2X, long_result, T_INT, num_elements);
  register_new_node_from_vectorization(apply_state, int_result);

  return VTransformApplyResult::make_vector(int_result);
}
|
|
|
|
float VTransformReinterpretVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
|
return vloop_analyzer.cost_for_vector_node(Op_VectorReinterpret, vector_length(), element_basic_type());
|
|
}
|
|
|
|
// Create a VectorReinterpret node that reinterprets the input vector with
// element type _src_bt as a vector with the element type of this vtnode.
// Both vector types have the same number of elements.
VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyState& apply_state) const {
  const TypeVect* to_type = TypeVect::make(element_basic_type(), vector_length());
  const TypeVect* from_type = TypeVect::make(_src_bt, vector_length());
  assert(VectorNode::is_reinterpret_opcode(scalar_opcode()), "scalar opcode must be reinterpret");

  Node* vector_in = apply_state.transformed_node(in_req(1));
  VectorNode* reinterpret = new VectorReinterpretNode(vector_in, from_type, to_type);
  register_new_node_from_vectorization(apply_state, reinterpret);

  return VTransformApplyResult::make_vector(reinterpret);
}
|
|
|
|
float VTransformBoolVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
|
assert(scalar_opcode() == Op_Bool, "");
|
|
return vloop_analyzer.cost_for_vector_node(Op_VectorMaskCmp, vector_length(), element_basic_type());
|
|
}
|
|
|
|
// The scalar Cmp + Bool pair is replaced with a single VectorMaskCmp. Its
// two data inputs are the inputs of the cmp vtnode (input 1 of this vtnode).
VTransformApplyResult VTransformBoolVectorNode::apply(VTransformApplyState& apply_state) const {
  const TypeVect* vector_type = TypeVect::make(element_basic_type(), vector_length());
  assert(scalar_opcode() == Op_Bool, "");

  VTransformCmpVectorNode* cmp_vtn = in_req(1)->isa_CmpVector();
  assert(cmp_vtn != nullptr, "bool vtn expects cmp vtn as input");

  Node* lhs = apply_state.transformed_node(cmp_vtn->in_req(1));
  Node* rhs = apply_state.transformed_node(cmp_vtn->in_req(2));
  BoolTest::mask test_mask = test()._mask;

  // The mask of the test is passed both as value and as int constant node.
  ConINode* test_mask_con = apply_state.phase()->intcon((int)test_mask);
  VectorNode* mask_cmp = new VectorMaskCmpNode(test_mask, lhs, rhs, test_mask_con, vector_type);
  register_new_node_from_vectorization(apply_state, mask_cmp);

  return VTransformApplyResult::make_vector(mask_cmp);
}
|
|
|
|
bool VTransformReductionVectorNode::optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) {
|
|
return optimize_move_non_strict_order_reductions_out_of_loop(vloop_analyzer, vtransform);
|
|
}
|
|
|
|
int VTransformReductionVectorNode::vector_reduction_opcode() const {
|
|
return ReductionNode::opcode(scalar_opcode(), element_basic_type());
|
|
}
|
|
|
|
bool VTransformReductionVectorNode::requires_strict_order() const {
|
|
int vopc = vector_reduction_opcode();
|
|
return ReductionNode::auto_vectorization_requires_strict_order(vopc);
|
|
}
|
|
|
|
// Having ReductionNodes in the loop is expensive. They need to recursively
|
|
// fold together the vector values, for every vectorized loop iteration. If
|
|
// we encounter the following pattern, we can vector accumulate the values
|
|
// inside the loop, and only have a single UnorderedReduction after the loop.
|
|
//
|
|
// Note: UnorderedReduction represents a ReductionNode which does not require
|
|
// calculating in strict order.
|
|
//
|
|
// CountedLoop init
|
|
// | |
|
|
// +------+ | +------------------------+
|
|
// | | | |
|
|
// PhiNode (s) |
|
|
// | |
|
|
// | Vector |
|
|
// | | |
|
|
// UnorderedReduction (first_red) |
|
|
// | |
|
|
// ... Vector |
|
|
// | | |
|
|
// UnorderedReduction (last_red) |
|
|
// | |
|
|
// +----------------------+
|
|
//
|
|
// We patch the graph to look like this:
|
|
//
|
|
// CountedLoop identity_vector
|
|
// | |
|
|
// +-------+ | +---------------+
|
|
// | | | |
|
|
// PhiNode (v) |
|
|
// | |
|
|
// | Vector |
|
|
// | | |
|
|
// VectorAccumulator |
|
|
// | |
|
|
// ... Vector |
|
|
// | | |
|
|
// init VectorAccumulator |
|
|
// | | | |
|
|
// UnorderedReduction +-----------+
|
|
//
|
|
// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we
|
|
// use vector_accumulators, which do the same reductions, but only element
|
|
// wise. This is a single operation per vector_accumulator, rather than many
|
|
// for a UnorderedReduction. We can then reduce the last vector_accumulator
|
|
// after the loop, and also reduce the init value into it.
|
|
//
|
|
// We can not do this with all reductions. Some reductions do not allow the
|
|
// reordering of operations (for example float addition/multiplication require
|
|
// strict order).
|
|
//
|
|
// Note: we must perform this optimization already during auto vectorization,
|
|
// before we evaluate the cost-model. Without this optimization, we may
|
|
// still have expensive reduction nodes in the loop which can make
|
|
// vectorization unprofitable. Only with the optimization does vectorization
|
|
// become profitable, since the expensive reduction node is moved
|
|
// outside the loop, and instead cheaper element-wise vector accumulations
|
|
// are performed inside the loop.
|
|
// Check if the chain of reductions that starts at this node (first_red) and ends at
// the backedge of its phi (last_red) can be moved out of the loop. Returns true only
// if all preconditions hold. This method only inspects the vtransform graph, it does
// not edit it. The vtransform is only used for tracing.
bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop_preconditions(VTransform& vtransform) {
  // We have a phi with a single use.
  VTransformPhiScalarNode* phi = in_req(1)->isa_PhiScalar();
  if (phi == nullptr) {
    return false;
  }
  if (phi->out_strong_edges() != 1) {
    TRACE_OPTIMIZE(
      tty->print(" Cannot move out of loop, phi has multiple uses:");
      print();
      tty->print(" phi: ");
      phi->print();
    )
    return false;
  }

  if (requires_strict_order()) {
    TRACE_OPTIMIZE(
      tty->print(" Cannot move out of loop, strict order required: ");
      print();
    )
    return false;
  }

  const int sopc = scalar_opcode();
  const uint vlen = vector_length();
  const BasicType bt = element_basic_type();
  const int ropc = vector_reduction_opcode();
  const int vopc = VectorNode::opcode(sopc, bt);
  if (!Matcher::match_rule_supported_auto_vectorization(vopc, vlen, bt)) {
    // The element-wise vector operation needed for the vector accumulator
    // is not implemented / supported.
    return false;
  }

  // Traverse up the chain of non strict order reductions, checking that it loops
  // back to the phi. Check that all non strict order reductions only have a single
  // use, except for the last (last_red), which only has phi as a use in the loop,
  // and all other uses are outside the loop.
  // Note: this node is the first reduction (first_red) of the chain.
  VTransformReductionVectorNode* last_red = phi->in_req(2)->isa_ReductionVector();
  VTransformReductionVectorNode* current_red = last_red;
  while (true) {
    if (current_red == nullptr ||
        current_red->vector_reduction_opcode() != ropc ||
        current_red->element_basic_type() != bt ||
        current_red->vector_length() != vlen) {
      TRACE_OPTIMIZE(
        tty->print(" Cannot move out of loop, other reduction node does not match:");
        print();
        tty->print(" other: ");
        // We also get here if the node is not a reduction at all, and then
        // current_red is nullptr and must not be dereferenced.
        if (current_red != nullptr) {
          current_red->print();
        } else {
          tty->print_cr("nullptr");
        }
      )
      return false; // not compatible
    }

    VTransformVectorNode* vector_input = current_red->in_req(2)->isa_Vector();
    if (vector_input == nullptr) {
      assert(false, "reduction has a bad vector input");
      return false;
    }

    // Expect single use of the non strict order reduction. Except for the last_red.
    if (current_red == last_red) {
      // All uses must be outside loop body, except for the phi.
      for (uint i = 0; i < current_red->out_strong_edges(); i++) {
        VTransformNode* use = current_red->out_strong_edge(i);
        if (use->isa_PhiScalar() == nullptr &&
            use->isa_Outer() == nullptr) {
          // Should not be allowed by SuperWord::mark_reductions
          assert(false, "reduction has use inside loop");
          return false;
        }
      }
    } else {
      if (current_red->out_strong_edges() != 1) {
        TRACE_OPTIMIZE(
          tty->print(" Cannot move out of loop, other reduction node has use outside loop:");
          print();
          tty->print(" other: ");
          current_red->print();
        )
        return false; // Only single use allowed
      }
    }

    // If the scalar input is a phi, we passed all checks.
    VTransformNode* scalar_input = current_red->in_req(1);
    if (scalar_input == phi) {
      break;
    }

    // We expect another non strict reduction, verify it in the next iteration.
    // If it is not a reduction, then current_red is nullptr, and we bail out above.
    current_red = scalar_input->isa_ReductionVector();
  }
  return true; // success
}
|
|
|
|
bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) {
|
|
if (!optimize_move_non_strict_order_reductions_out_of_loop_preconditions(vtransform)) {
|
|
return false;
|
|
}
|
|
|
|
// All checks were successful. Edit the vtransform graph now.
|
|
TRACE_OPTIMIZE(
|
|
tty->print_cr("VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop");
|
|
)
|
|
|
|
const int sopc = scalar_opcode();
|
|
const uint vlen = vector_length();
|
|
const BasicType bt = element_basic_type();
|
|
const int vopc = VectorNode::opcode(sopc, bt);
|
|
PhaseIdealLoop* phase = vloop_analyzer.vloop().phase();
|
|
|
|
// Create a vector of identity values.
|
|
Node* identity = ReductionNode::make_identity_con_scalar(phase->igvn(), sopc, bt);
|
|
phase->set_root_as_ctrl(identity);
|
|
VTransformNode* vtn_identity = new (vtransform.arena()) VTransformOuterNode(vtransform, identity);
|
|
|
|
VTransformNode* vtn_identity_vector = new (vtransform.arena()) VTransformReplicateNode(vtransform, vlen, bt);
|
|
vtn_identity_vector->init_req(1, vtn_identity);
|
|
|
|
// Look at old scalar phi.
|
|
VTransformPhiScalarNode* phi_scalar = in_req(1)->isa_PhiScalar();
|
|
PhiNode* old_phi = phi_scalar->node();
|
|
VTransformNode* init = phi_scalar->in_req(1);
|
|
|
|
TRACE_OPTIMIZE(
|
|
tty->print(" phi_scalar ");
|
|
phi_scalar->print();
|
|
)
|
|
|
|
// Create new vector phi
|
|
const VTransformVectorNodeProperties properties = VTransformVectorNodeProperties::make_for_phi_vector(old_phi, vlen, bt);
|
|
VTransformPhiVectorNode* phi_vector = new (vtransform.arena()) VTransformPhiVectorNode(vtransform, 3, properties);
|
|
phi_vector->init_req(0, phi_scalar->in_req(0));
|
|
phi_vector->init_req(1, vtn_identity_vector);
|
|
// Note: backedge comes later
|
|
|
|
// Traverse down the chain of reductions, and replace them with vector_accumulators.
|
|
VTransformReductionVectorNode* first_red = this;
|
|
VTransformReductionVectorNode* last_red = phi_scalar->in_req(2)->isa_ReductionVector();
|
|
VTransformReductionVectorNode* current_red = first_red;
|
|
VTransformNode* current_vector_accumulator = phi_vector;
|
|
while (true) {
|
|
VTransformNode* vector_input = current_red->in_req(2);
|
|
VTransformVectorNode* vector_accumulator = new (vtransform.arena()) VTransformElementWiseVectorNode(vtransform, 3, current_red->properties(), vopc);
|
|
vector_accumulator->init_req(1, current_vector_accumulator);
|
|
vector_accumulator->init_req(2, vector_input);
|
|
TRACE_OPTIMIZE(
|
|
tty->print(" replace ");
|
|
current_red->print();
|
|
tty->print(" with ");
|
|
vector_accumulator->print();
|
|
)
|
|
current_vector_accumulator = vector_accumulator;
|
|
if (current_red == last_red) { break; }
|
|
current_red = current_red->unique_out_strong_edge()->isa_ReductionVector();
|
|
}
|
|
|
|
// Feed vector accumulator into the backedge.
|
|
phi_vector->set_req(2, current_vector_accumulator);
|
|
|
|
// Create post-loop reduction. last_red keeps all uses outside the loop.
|
|
last_red->set_req(1, init);
|
|
last_red->set_req(2, current_vector_accumulator);
|
|
|
|
TRACE_OPTIMIZE(
|
|
tty->print(" phi_scalar ");
|
|
phi_scalar->print();
|
|
tty->print(" after loop ");
|
|
last_red->print();
|
|
)
|
|
return true; // success
|
|
}
|
|
|
|
float VTransformReductionVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
|
uint vlen = vector_length();
|
|
BasicType bt = element_basic_type();
|
|
int vopc = vector_reduction_opcode();
|
|
bool requires_strict_order = ReductionNode::auto_vectorization_requires_strict_order(vopc);
|
|
return vloop_analyzer.cost_for_vector_reduction_node(vopc, vlen, bt, requires_strict_order);
|
|
}
|
|
|
|
// Create the ReductionNode from the scalar input (input 1) and the vector
// input (input 2), without control, and register the new node.
VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& apply_state) const {
  Node* scalar_in = apply_state.transformed_node(in_req(1));
  Node* vector_in = apply_state.transformed_node(in_req(2));

  ReductionNode* reduction = ReductionNode::make(scalar_opcode(), nullptr, scalar_in, vector_in, element_basic_type());
  register_new_node_from_vectorization(apply_state, reduction);

  return VTransformApplyResult::make_vector(reduction, reduction->vect_type());
}
|
|
|
|
// Create the vector phi as a clone of the origin phi, with inputs 0 and 1 set
// and with a vector type. The backedge (input 2) is set later, see apply_backedge.
VTransformApplyResult VTransformPhiVectorNode::apply(VTransformApplyState& apply_state) const {
  PhaseIdealLoop* phase = apply_state.phase();
  Node* new_in0 = apply_state.transformed_node(in_req(0));
  Node* new_in1 = apply_state.transformed_node(in_req(1));

  // The scalar phi has a different type, so we need a new phi node.
  PhiNode* scalar_phi = approximate_origin()->as_Phi();
  PhiNode* vector_phi = scalar_phi->clone()->as_Phi();

  phase->igvn().replace_input_of(vector_phi, 0, new_in0);
  phase->igvn().replace_input_of(vector_phi, 1, new_in1);

  // Set the vector type, both on the node and in the igvn.
  const TypeVect* vector_type = TypeVect::make(element_basic_type(), vector_length());
  vector_phi->as_Type()->set_type(vector_type);
  phase->igvn().set_type(vector_phi, vector_type);

  return VTransformApplyResult::make_vector(vector_phi, vector_type);
}
|
|
|
|
// Cleanup backedges. In the schedule, the backedges come after their phis. Hence,
|
|
// we only have the transformed backedges after the phis are already transformed.
|
|
// We hook the backedges into the phis now, during cleanup.
|
|
void VTransformPhiVectorNode::apply_backedge(VTransformApplyState& apply_state) const {
|
|
PhaseIdealLoop* phase = apply_state.phase();
|
|
PhiNode* new_phi = apply_state.transformed_node(this)->as_Phi();
|
|
Node* in2 = apply_state.transformed_node(in_req(2));
|
|
phase->igvn().replace_input_of(new_phi, 2, in2);
|
|
}
|
|
|
|
float VTransformLoadVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
|
uint vlen = vector_length();
|
|
BasicType bt = element_basic_type();
|
|
return vloop_analyzer.cost_for_vector_node(Op_LoadVector, vlen, bt);
|
|
}
|
|
|
|
// Create the LoadVector node. The memory input is the current memory state of
// the slice, or an earlier state if we can skip StoreVector nodes that never
// overlap with the load.
VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& apply_state) const {
  const int scalar_opc = scalar_opcode();
  const uint num_elements = vector_length();
  const BasicType elem_bt = element_basic_type();

  // The vtn does not hold the memory state, so that memops can be reordered.
  // Hence, the memory state is taken from the apply_state.
  Node* ctrl = apply_state.transformed_node(in_req(MemNode::Control));
  Node* mem = apply_state.memory_state(_adr_type);
  Node* adr = apply_state.transformed_node(in_req(MemNode::Address));

  // We want the memory dependency of the LoadVector to be as early as possible:
  // walk up the memory chain, as long as we find StoreVector nodes that provably
  // have no memory dependency with the load.
  const VPointer& load_pointer = vpointer();
  while (mem->is_StoreVector()) {
    VPointer store_pointer(mem->as_Mem(), apply_state.vloop());
    if (!store_pointer.never_overlaps_with(load_pointer)) {
      break;
    }
    mem = mem->in(MemNode::Memory);
  }

  LoadVectorNode* load = LoadVectorNode::make(scalar_opc, ctrl, mem, adr, _adr_type, num_elements, elem_bt, _control_dependency);
  DEBUG_ONLY( if (VerifyAlignVector) { load->set_must_verify_alignment(); } )
  register_new_node_from_vectorization(apply_state, load);
  return VTransformApplyResult::make_vector(load, load->vect_type());
}
|
|
|
|
float VTransformStoreVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
|
uint vlen = vector_length();
|
|
BasicType bt = element_basic_type();
|
|
return vloop_analyzer.cost_for_vector_node(Op_StoreVector, vlen, bt);
|
|
}
|
|
|
|
// Create the StoreVector node, with the current memory state of the slice as
// memory input. The new store becomes the new memory state of the slice.
VTransformApplyResult VTransformStoreVectorNode::apply(VTransformApplyState& apply_state) const {
  const int scalar_opc = scalar_opcode();
  const uint num_elements = vector_length();

  // The vtn does not hold the memory state, so that memops can be reordered.
  // Hence, the memory state is taken from the apply_state.
  Node* ctrl = apply_state.transformed_node(in_req(MemNode::Control));
  Node* mem = apply_state.memory_state(_adr_type);
  Node* adr = apply_state.transformed_node(in_req(MemNode::Address));
  Node* stored_value = apply_state.transformed_node(in_req(MemNode::ValueIn));

  StoreVectorNode* store = StoreVectorNode::make(scalar_opc, ctrl, mem, adr, _adr_type, stored_value, num_elements);
  DEBUG_ONLY( if (VerifyAlignVector) { store->set_must_verify_alignment(); } )
  register_new_node_from_vectorization(apply_state, store);
  apply_state.set_memory_state(_adr_type, store);
  return VTransformApplyResult::make_vector(store, store->vect_type());
}
|
|
|
|
// Register the new node vn with the loop phase (with the cl as its control),
// push it to the igvn worklist, and trace its creation.
void VTransformNode::register_new_node_from_vectorization(VTransformApplyState& apply_state, Node* vn) const {
  PhaseIdealLoop* phase = apply_state.phase();
  // The cl is correct as control, but not always the most accurate. This is
  // good enough, because major_progress will be set anyway.
  Node* ctrl = apply_state.vloop().cl();
  phase->register_new_node(vn, ctrl);
  phase->igvn()._worklist.push(vn);
  VectorNode::trace_new_vector(vn, "AutoVectorization");
}
|
|
|
|
#ifndef PRODUCT
|
|
void VTransformGraph::print_vtnodes() const {
|
|
tty->print_cr("\nVTransformGraph::print_vtnodes:");
|
|
for (int i = 0; i < _vtnodes.length(); i++) {
|
|
_vtnodes.at(i)->print();
|
|
}
|
|
}
|
|
|
|
void VTransformGraph::print_schedule() const {
|
|
tty->print_cr("\nVTransformGraph::print_schedule:");
|
|
for (int i = 0; i < _schedule.length(); i++) {
|
|
tty->print(" %3d: ", i);
|
|
VTransformNode* vtn = _schedule.at(i);
|
|
if (vtn == nullptr) {
|
|
tty->print_cr("nullptr");
|
|
} else {
|
|
vtn->print();
|
|
}
|
|
}
|
|
}
|
|
|
|
void VTransformNode::print() const {
|
|
tty->print("%3d %s (", _idx, name());
|
|
for (uint i = 0; i < _req; i++) {
|
|
print_node_idx(_in.at(i));
|
|
}
|
|
if ((uint)_in.length() > _req) {
|
|
tty->print(" | strong:");
|
|
for (uint i = _req; i < _in_end_strong_memory_edges; i++) {
|
|
print_node_idx(_in.at(i));
|
|
}
|
|
}
|
|
if ((uint)_in.length() > _in_end_strong_memory_edges) {
|
|
tty->print(" | weak:");
|
|
for (uint i = _in_end_strong_memory_edges; i < (uint)_in.length(); i++) {
|
|
print_node_idx(_in.at(i));
|
|
}
|
|
}
|
|
tty->print(") %s[", _is_alive ? "" : "dead ");
|
|
for (uint i = 0; i < _out_end_strong_edges; i++) {
|
|
print_node_idx(_out.at(i));
|
|
}
|
|
if ((uint)_out.length() > _out_end_strong_edges) {
|
|
tty->print(" | weak:");
|
|
for (uint i = _out_end_strong_edges; i < (uint)_out.length(); i++) {
|
|
print_node_idx(_out.at(i));
|
|
}
|
|
}
|
|
tty->print("] ");
|
|
print_spec();
|
|
tty->cr();
|
|
}
|
|
|
|
// Prints " <idx>" for the given vtnode, or " _" as a placeholder for nullptr.
void VTransformNode::print_node_idx(const VTransformNode* vtn) {
  if (vtn != nullptr) {
    tty->print(" %d", vtn->_idx);
    return;
  }
  tty->print(" _");
}
|
|
|
|
// Prints the wrapped node's idx and name, followed by this memop's vpointer.
void VTransformMemopScalarNode::print_spec() const {
  tty->print("node[%d %s] ",
             _node->_idx,
             _node->Name());
  _vpointer.print_on(tty, false);
}
|
|
|
|
// Prints the idx and name of the wrapped node.
void VTransformDataScalarNode::print_spec() const {
  tty->print("node[%d %s]",
             _node->_idx,
             _node->Name());
}
|
|
|
|
// Prints the idx and name of the wrapped node.
void VTransformPhiScalarNode::print_spec() const {
  tty->print("node[%d %s]",
             _node->_idx,
             _node->Name());
}
|
|
|
|
// Prints the idx and name of the wrapped node.
void VTransformCFGNode::print_spec() const {
  tty->print("node[%d %s]",
             _node->_idx,
             _node->Name());
}
|
|
|
|
// Prints the idx and name of the wrapped node.
void VTransformOuterNode::print_spec() const {
  tty->print("node[%d %s]",
             _node->_idx,
             _node->Name());
}
|
|
|
|
// Prints the vector length and the name of the element type.
void VTransformReplicateNode::print_spec() const {
  tty->print("vlen=%d element_type=%s",
             _vlen,
             type2name(_element_type));
}
|
|
|
|
// Prints the vector length, element basic type name, mask, and the class name
// of the shift opcode.
void VTransformShiftCountNode::print_spec() const {
  tty->print("vlen=%d element_bt=%s mask=%d shift_opcode=%s",
             _vlen,
             type2name(_element_bt),
             _mask,
             NodeClassNames[_shift_opcode]);
}
|
|
|
|
// Prints the vector length and the name of the element basic type.
void VTransformPopulateIndexNode::print_spec() const {
  tty->print("vlen=%d element_bt=%s",
             _vlen,
             type2name(_element_bt));
}
|
|
|
|
void VTransformVectorNode::print_spec() const {
|
|
tty->print("Properties[orig=[%d %s] sopc=%s vlen=%d element_bt=%s]",
|
|
approximate_origin()->_idx,
|
|
approximate_origin()->Name(),
|
|
NodeClassNames[scalar_opcode()],
|
|
vector_length(),
|
|
type2name(element_basic_type()));
|
|
if (is_load_or_store_in_loop()) {
|
|
tty->print(" ");
|
|
vpointer().print_on(tty, false);
|
|
}
|
|
}
|
|
|
|
// Prints the common vector node properties, then the vector opcode's class name.
void VTransformElementWiseVectorNode::print_spec() const {
  VTransformVectorNode::print_spec();
  const char* vopc_name = NodeClassNames[_vector_opcode];
  tty->print(" vopc=%s", vopc_name);
}
|
|
|
|
// Prints the common vector node properties, then the source basic type name.
void VTransformReinterpretVectorNode::print_spec() const {
  VTransformVectorNode::print_spec();
  tty->print(" src_bt=%s",
             type2name(_src_bt));
}
|
|
|
|
void VTransformBoolVectorNode::print_spec() const {
|
|
VTransformVectorNode::print_spec();
|
|
BoolTest::mask m = BoolTest::mask(_test._mask & ~BoolTest::unsigned_compare);
|
|
const BoolTest bt(m);
|
|
tty->print(" test=%s", m == _test._mask ? "" : "unsigned ");
|
|
bt.dump_on(tty);
|
|
}
|
|
#endif
|