jdk/src/hotspot/share/opto/superword.cpp
2024-06-13 06:35:26 +00:00

3569 lines
129 KiB
C++

/*
* Copyright (c) 2007, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
#include "precompiled.hpp"
#include "libadt/vectset.hpp"
#include "memory/allocation.inline.hpp"
#include "memory/resourceArea.hpp"
#include "opto/addnode.hpp"
#include "opto/c2compiler.hpp"
#include "opto/castnode.hpp"
#include "opto/convertnode.hpp"
#include "opto/matcher.hpp"
#include "opto/memnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/opaquenode.hpp"
#include "opto/rootnode.hpp"
#include "opto/superword.hpp"
#include "opto/vectornode.hpp"
#include "opto/movenode.hpp"
#include "utilities/powerOfTwo.hpp"
// Constructs the SuperWord state for the loop that 'vloop_analyzer' describes.
// Everything is set up in the initializer list; the constructor body is empty.
SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) :
_vloop_analyzer(vloop_analyzer),
_vloop(vloop_analyzer.vloop()),
_arena(mtCompiler),
_clone_map(phase()->C->clone_map()), // map of nodes created in cloning
// The pair set and the pack set are both handed the arena and the analyzer.
_pairset(&_arena, _vloop_analyzer),
_packset(&_arena, _vloop_analyzer
// Non-product builds pass two extra trace flags to the pack set.
NOT_PRODUCT(COMMA is_trace_superword_packset())
NOT_PRODUCT(COMMA is_trace_superword_rejections())
),
// No alignment reference has been chosen yet.
_mem_ref_for_main_loop_alignment(nullptr),
_aw_for_main_loop_alignment(0),
_do_vector_loop(phase()->C->do_vector_loop()), // whether to do vectorization/simd style
_num_work_vecs(0), // amount of vector work we have
_num_reductions(0) // amount of reduction work we have
{
}
// Analyzes the body of the loop described by 'vloop' to decide whether it is a
// candidate for SLP and, if so, which unroll factor suits it.
//
// vloop                    - supplies the loop tree (lpt), the counted loop
//                            node (cl), its exit (cl_exit) and the
//                            PhaseIdealLoop.
// local_loop_unroll_factor - in/out. Read as the currently requested unroll
//                            factor; overwritten with the chosen vector size
//                            when both passes below accept the loop.
//
// Effects on cl happen only if the first pass accepts the loop:
// mark_was_slp() always, mark_passed_slp() only if the second pass accepts as
// well, and set_slp_max_unroll() only when cl is a main loop.
void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_factor) {
IdealLoopTree* lpt = vloop.lpt();
CountedLoopNode* cl = vloop.cl();
Node* cl_exit = vloop.cl_exit();
PhaseIdealLoop* phase = vloop.phase();
bool is_slp = true;
// ignored_loop_nodes[i] stays -1 while body node i is still a candidate for
// the vector-size search; otherwise it holds the _idx of the ignored node.
size_t ignored_size = lpt->_body.size();
int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
Node_Stack nstack((int)ignored_size);
// First clear the entries
for (uint i = 0; i < lpt->_body.size(); i++) {
ignored_loop_nodes[i] = -1;
}
// Start from the vector size for T_BYTE; the second pass may replace it.
int max_vector = Matcher::max_vector_size_auto_vectorization(T_BYTE);
// Process the loop, some/all of the stack entries will not be in order, ergo
// need to preprocess the ignored initial state before we process the loop
//
// First pass: classify every body node as ignored, acceptable, or fatal for
// SLP (is_slp = false and stop).
for (uint i = 0; i < lpt->_body.size(); i++) {
Node* n = lpt->_body.at(i);
// Loop-control and address nodes never take part in the vector-size search.
if (n == cl->incr() ||
n->is_AddP() ||
n->is_Cmp() ||
n->is_Bool() ||
n->is_IfTrue() ||
n->is_CountedLoop() ||
(n == cl_exit)) {
ignored_loop_nodes[i] = n->_idx;
continue;
}
// A loop-exit If with known count and probability is ignored here.
if (n->is_If()) {
IfNode *iff = n->as_If();
if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
if (lpt->is_loop_exit(iff)) {
ignored_loop_nodes[i] = n->_idx;
continue;
}
}
}
// A memory phi whose loop-back input differs from its entry input must be
// fed by a memory node on the back edge; anything else rejects the loop.
if (n->is_memory_phi()) {
Node* n_tail = n->in(LoopNode::LoopBackControl);
if (n_tail != n->in(LoopNode::EntryControl)) {
if (!n_tail->is_Mem()) {
is_slp = false;
break;
}
}
}
// This must happen after check of phi/if
if (n->is_Phi() || n->is_If()) {
ignored_loop_nodes[i] = n->_idx;
continue;
}
// LoadStore, MergeMem and non-CFG projections reject the loop outright.
if (n->is_LoadStore() || n->is_MergeMem() ||
(n->is_Proj() && !n->as_Proj()->is_CFG())) {
is_slp = false;
break;
}
// Ignore nodes with non-primitive type.
// Memory nodes are typed by what they access, other nodes by their bottom type.
BasicType bt;
if (n->is_Mem()) {
bt = n->as_Mem()->memory_type();
} else {
bt = n->bottom_type()->basic_type();
}
if (is_java_primitive(bt) == false) {
ignored_loop_nodes[i] = n->_idx;
continue;
}
if (n->is_Mem()) {
MemNode* current = n->as_Mem();
Node* adr = n->in(MemNode::Address);
Node* n_ctrl = phase->get_ctrl(adr);
// save a queue of post process nodes
// Only addresses whose control lies inside this loop are processed.
if (n_ctrl != nullptr && lpt->is_member(phase->get_loop(n_ctrl))) {
// Process the memory expression
int stack_idx = 0;
bool have_side_effects = true;
if (adr->is_AddP() == false) {
nstack.push(adr, stack_idx++);
} else {
// Mark the components of the memory operation in nstack
VPointer p1(current, vloop, &nstack);
have_side_effects = p1.node_stack()->is_nonempty();
}
// Process the pointer stack
// Every node left on nstack is looked up in the loop body by a linear scan
// and marked as ignored; the stack is drained completely.
while (have_side_effects) {
Node* pointer_node = nstack.node();
for (uint j = 0; j < lpt->_body.size(); j++) {
Node* cur_node = lpt->_body.at(j);
if (cur_node == pointer_node) {
ignored_loop_nodes[j] = cur_node->_idx;
break;
}
}
nstack.pop();
have_side_effects = nstack.is_nonempty();
}
}
}
}
if (is_slp) {
// Now we try to find the maximum supported consistent vector which the machine
// description can use
//
// Second pass: visit only the nodes the first pass did not ignore.
// flag_small_bt is set once a subword type has raised max_vector; after that
// max_vector is no longer lowered.
bool flag_small_bt = false;
for (uint i = 0; i < lpt->_body.size(); i++) {
if (ignored_loop_nodes[i] != -1) continue;
BasicType bt;
Node* n = lpt->_body.at(i);
if (n->is_Mem()) {
bt = n->as_Mem()->memory_type();
} else {
bt = n->bottom_type()->basic_type();
}
if (is_java_primitive(bt) == false) continue;
int cur_max_vector = Matcher::max_vector_size_auto_vectorization(bt);
// If the max vector size for this type is smaller than the requested
// local_loop_unroll_factor, give up on SLP for this loop.
if (cur_max_vector < local_loop_unroll_factor) {
is_slp = false;
#ifndef PRODUCT
if (TraceSuperWordLoopUnrollAnalysis) {
tty->print_cr("slp analysis fails: unroll limit greater than max vector\n");
}
#endif
break;
}
// Map the maximal common vector except conversion nodes, because we can't get
// the precise basic type for conversion nodes in the stage of early analysis.
if (!VectorNode::is_convert_opcode(n->Opcode()) &&
VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
if (cur_max_vector < max_vector && !flag_small_bt) {
max_vector = cur_max_vector;
} else if (cur_max_vector > max_vector && UseSubwordForMaxVector) {
// Analyse subword in the loop to set maximum vector size to take advantage of full vector width for subword types.
// Here we analyze if narrowing is likely to happen and if it is we set vector size more aggressively.
// We check for possibility of narrowing by looking through chain operations using subword types.
if (is_subword_type(bt)) {
uint start, end;
VectorNode::vector_operands(n, &start, &end);
for (uint j = start; j < end; j++) {
Node* in = n->in(j);
// Don't propagate through a memory
if (!in->is_Mem() && vloop.in_bb(in) && in->bottom_type()->basic_type() == T_INT) {
// same_type is cleared only by a use that is outside the block AND has a
// basic type different from bt.
bool same_type = true;
for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
Node *use = in->fast_out(k);
if (!vloop.in_bb(use) && use->bottom_type()->basic_type() != bt) {
same_type = false;
break;
}
}
if (same_type) {
max_vector = cur_max_vector;
flag_small_bt = true;
cl->mark_subword_loop();
}
}
}
}
}
}
}
// Publish the result. is_slp may have been cleared by the second pass, in
// which case the unroll factor is left as the caller passed it in.
if (is_slp) {
local_loop_unroll_factor = max_vector;
cl->mark_passed_slp();
}
cl->mark_was_slp();
if (cl->is_main_loop()) {
#ifndef PRODUCT
if (TraceSuperWordLoopUnrollAnalysis) {
tty->print_cr("slp analysis: set max unroll to %d", local_loop_unroll_factor);
}
#endif
cl->set_slp_max_unroll(local_loop_unroll_factor);
}
}
}
bool VLoopReductions::is_reduction(const Node* n) {
if (!is_reduction_operator(n)) {
return false;
}
// Test whether there is a reduction cycle via every edge index
// (typically indices 1 and 2).
for (uint input = 1; input < n->req(); input++) {
if (in_reduction_cycle(n, input)) {
return true;
}
}
return false;
}
// Returns true if ReductionNode::opcode maps n's opcode (for n's basic type)
// to a different opcode.
// NOTE(review): presumably the mapping returns its input unchanged when no
// reduction form exists - confirm against ReductionNode::opcode.
bool VLoopReductions::is_reduction_operator(const Node* n) {
  const int scalar_opc = n->Opcode();
  const int mapped_opc = ReductionNode::opcode(scalar_opc, n->bottom_type()->basic_type());
  return mapped_opc != scalar_opc;
}
bool VLoopReductions::in_reduction_cycle(const Node* n, uint input) {
// First find input reduction path to phi node.
auto has_my_opcode = [&](const Node* m){ return m->Opcode() == n->Opcode(); };
PathEnd path_to_phi = find_in_path(n, input, LoopMaxUnroll, has_my_opcode,
[&](const Node* m) { return m->is_Phi(); });
const Node* phi = path_to_phi.first;
if (phi == nullptr) {
return false;
}
// If there is an input reduction path from the phi's loop-back to n, then n
// is part of a reduction cycle.
const Node* first = phi->in(LoopNode::LoopBackControl);
PathEnd path_from_phi = find_in_path(first, input, LoopMaxUnroll, has_my_opcode,
[&](const Node* m) { return m == n; });
return path_from_phi.first != nullptr;
}
// Returns input i of n as it was before any edge swap: when n reports swapped
// edges, indices 1 and 2 are exchanged; every other index is read as is.
Node* VLoopReductions::original_input(const Node* n, uint i) {
  uint idx = i;
  if (n->has_swapped_edges()) {
    assert(n->is_Add() || n->is_Mul(), "n should be commutative");
    switch (i) {
      case 1:  idx = 2; break;
      case 2:  idx = 1; break;
      default: break;
    }
  }
  return n->in(idx);
}
void VLoopReductions::mark_reductions() {
assert(_loop_reductions.is_empty(), "must not yet be computed");
CountedLoopNode* cl = _vloop.cl();
// Iterate through all phi nodes associated to the loop and search for
// reduction cycles in the basic block.
for (DUIterator_Fast imax, i = cl->fast_outs(imax); i < imax; i++) {
const Node* phi = cl->fast_out(i);
if (!phi->is_Phi()) {
continue;
}
if (phi->outcnt() == 0) {
continue;
}
if (phi == _vloop.iv()) {
continue;
}
// The phi's loop-back is considered the first node in the reduction cycle.
const Node* first = phi->in(LoopNode::LoopBackControl);
if (first == nullptr) {
continue;
}
// Test that the node fits the standard pattern for a reduction operator.
if (!is_reduction_operator(first)) {
continue;
}
// Test that 'first' is the beginning of a reduction cycle ending in 'phi'.
// To contain the number of searched paths, assume that all nodes in a
// reduction cycle are connected via the same edge index, modulo swapped
// inputs. This assumption is realistic because reduction cycles usually
// consist of nodes cloned by loop unrolling.
int reduction_input = -1;
int path_nodes = -1;
for (uint input = 1; input < first->req(); input++) {
// Test whether there is a reduction path in the basic block from 'first'
// to the phi node following edge index 'input'.
PathEnd path =
find_in_path(
first, input, _vloop.lpt()->_body.size(),
[&](const Node* n) { return n->Opcode() == first->Opcode() &&
_vloop.in_bb(n); },
[&](const Node* n) { return n == phi; });
if (path.first != nullptr) {
reduction_input = input;
path_nodes = path.second;
break;
}
}
if (reduction_input == -1) {
continue;
}
// Test that reduction nodes do not have any users in the loop besides their
// reduction cycle successors.
const Node* current = first;
const Node* succ = phi; // current's successor in the reduction cycle.
bool used_in_loop = false;
for (int i = 0; i < path_nodes; i++) {
for (DUIterator_Fast jmax, j = current->fast_outs(jmax); j < jmax; j++) {
Node* u = current->fast_out(j);
if (!_vloop.in_bb(u)) {
continue;
}
if (u == succ) {
continue;
}
used_in_loop = true;
break;
}
if (used_in_loop) {
break;
}
succ = current;
current = original_input(current, reduction_input);
}
if (used_in_loop) {
continue;
}
// Reduction cycle found. Mark all nodes in the found path as reductions.
current = first;
for (int i = 0; i < path_nodes; i++) {
_loop_reductions.set(current->_idx);
current = original_input(current, reduction_input);
}
}
}
// Entry point for vectorizing the loop: runs SLP_extract() and returns its
// result. Non-product builds trace the loop before and the outcome after.
bool SuperWord::transform_loop() {
  assert(phase()->C->do_superword(), "SuperWord option should be enabled");
  assert(cl()->is_main_loop(), "SLP should only work on main loops");

#ifndef PRODUCT
  if (is_trace_superword_any()) {
    tty->print_cr("\nSuperWord::transform_loop:");
    lpt()->dump_head();
    cl()->dump();
  }
#endif

  const bool vectorized = SLP_extract();

#ifndef PRODUCT
  if (is_trace_superword_any()) {
    if (vectorized) {
      tty->print_cr("\nSuperWord::transform_loop: success");
    } else {
      tty->print_cr("\nSuperWord::transform_loop failed: SuperWord::SLP_extract did not vectorize");
    }
  }
#endif

  return vectorized;
}
//------------------------------SLP_extract---------------------------
// Extract the superword level parallelism
//
// 1) A reverse post-order of nodes in the block is constructed. By scanning
// this list from first to last, all definitions are visited before their uses.
//
// 2) A point-to-point dependence graph is constructed between memory references.
// This simplifies the upcoming "independence" checker.
//
// 3) The maximum depth in the node graph from the beginning of the block
// to each node is computed. This is used to prune the graph search
// in the independence checker.
//
// 4) For integer types, the necessary bit width is propagated backwards
// from stores to allow packed operations on byte, char, and short
// integers. This reverses the promotion to type "int" that javac
// did for operations like: char c1,c2,c3; c1 = c2 + c3.
//
// 5) One of the memory references is picked to be an aligned vector reference.
// The pre-loop trip count is adjusted to align this reference in the
// unrolled body.
//
// 6) The initial set of pack pairs is seeded with memory references.
//
// 7) The set of pack pairs is extended by following use->def and def->use links.
//
// 8) The pairs are combined into vector sized packs.
//
// 9) Reorder the memory slices to co-locate members of the memory packs.
//
// 10) Generate ideal vector nodes for the final set of packs and where necessary,
// inserting scalar promotion, vector creation from multiple scalars, and
// extraction of scalar values from vectors.
//
bool SuperWord::SLP_extract() {
assert(cl()->is_main_loop(), "SLP should only work on main loops");
// Find "seed" pairs.
create_adjacent_memop_pairs();
if (_pairset.is_empty()) {
#ifndef PRODUCT
if (is_trace_superword_any()) {
tty->print_cr("\nNo pair packs generated, abort SuperWord.");
tty->cr();
}
#endif
return false;
}
extend_pairset_with_more_pairs_by_following_use_and_def();
combine_pairs_to_longer_packs();
split_packs_at_use_def_boundaries(); // a first time: create natural boundaries
split_packs_only_implemented_with_smaller_size();
split_packs_to_break_mutual_dependence();
split_packs_at_use_def_boundaries(); // again: propagate split of other packs
filter_packs_for_power_of_2_size();
filter_packs_for_mutual_independence();
filter_packs_for_alignment();
filter_packs_for_implemented();
filter_packs_for_profitable();
DEBUG_ONLY(verify_packs();)
DEBUG_ONLY(verify_no_extract());
return schedule_and_apply();
}
// Find the "seed" memops pairs. These are pairs that we strongly suspect would lead to vectorization.
void SuperWord::create_adjacent_memop_pairs() {
ResourceMark rm;
GrowableArray<const VPointer*> vpointers;
collect_valid_vpointers(vpointers);
// Sort the VPointers. This does 2 things:
// - Separate the VPointer into groups: all memops that have the same opcode and the same
// VPointer, except for the offset. Adjacent memops must have the same opcode and the
// same VPointer, except for a shift in the offset. Thus, two memops can only be adjacent
// if they are in the same group. This decreases the work.
// - Sort by offset inside the groups. This decreases the work needed to determine adjacent
// memops inside a group.
vpointers.sort(VPointer::cmp_for_sort);
#ifndef PRODUCT
if (is_trace_superword_adjacent_memops()) {
tty->print_cr("\nSuperWord::create_adjacent_memop_pairs:");
}
#endif
create_adjacent_memop_pairs_in_all_groups(vpointers);
#ifndef PRODUCT
if (is_trace_superword_packset()) {
tty->print_cr("\nAfter Superword::create_adjacent_memop_pairs");
_pairset.print();
}
#endif
}
// Appends to 'vpointers' the VPointer of every memop that could potentially be
// vectorized: the pointer is valid, the memop is not a LoadStore, and its
// memory type is a Java primitive.
void SuperWord::collect_valid_vpointers(GrowableArray<const VPointer*>& vpointers) {
  for_each_mem([&] (const MemNode* mem, int bb_idx) {
    const VPointer& candidate = vpointer(mem);
    if (!candidate.valid()) { return; }
    if (mem->is_LoadStore()) { return; }
    if (!is_java_primitive(mem->memory_type())) { return; }
    vpointers.append(&candidate);
  });
}
// For each group, find the adjacent memops.
void SuperWord::create_adjacent_memop_pairs_in_all_groups(const GrowableArray<const VPointer*> &vpointers) {
int group_start = 0;
while (group_start < vpointers.length()) {
int group_end = find_group_end(vpointers, group_start);
create_adjacent_memop_pairs_in_one_group(vpointers, group_start, group_end);
group_start = group_end;
}
}
// Returns the index one past the last VPointer in the group that starts at
// 'group_start': the first later index whose entry compares unequal under
// cmp_for_sort_by_group, or the array length if there is none.
int SuperWord::find_group_end(const GrowableArray<const VPointer*>& vpointers, int group_start) {
  int probe = group_start + 1;
  for (; probe < vpointers.length(); probe++) {
    const int order = VPointer::cmp_for_sort_by_group(vpointers.adr_at(group_start),
                                                      vpointers.adr_at(probe));
    if (order != 0) {
      break; // first entry of another group
    }
  }
  return probe;
}
// Find adjacent memops for a single group, e.g. for all LoadI of the same base, invar, etc.
// Create pairs and add them to the pairset.
//
// vpointers   - all candidate VPointers, sorted with VPointer::cmp_for_sort.
// group_start - index of the first entry of the group (inclusive).
// group_end   - index one past the last entry of the group (exclusive).
//
// At most one pair is created with any given memop as its left element.
void SuperWord::create_adjacent_memop_pairs_in_one_group(const GrowableArray<const VPointer*>& vpointers, const int group_start, const int group_end) {
#ifndef PRODUCT
  if (is_trace_superword_adjacent_memops()) {
    tty->print_cr(" group:");
    for (int i = group_start; i < group_end; i++) {
      const VPointer* p = vpointers.at(i);
      tty->print(" ");
      p->print();
    }
  }
#endif

  MemNode* first = vpointers.at(group_start)->mem();
  int element_size = data_size(first);

  // For each ref in group: find others that can be paired:
  for (int i = group_start; i < group_end; i++) {
    const VPointer* p1 = vpointers.at(i);
    MemNode* mem1 = p1->mem();

    // Set once mem1 has been added as the left element of a pair.
    bool found = false;
    // For each ref in group with larger or equal offset:
    for (int j = i + 1; j < group_end; j++) {
      const VPointer* p2 = vpointers.at(j);
      MemNode* mem2 = p2->mem();
      assert(mem1 != mem2, "look only at pair of different memops");

      // Check for correct distance.
      assert(data_size(mem1) == element_size, "all nodes in group must have the same element size");
      assert(data_size(mem2) == element_size, "all nodes in group must have the same element size");
      assert(p1->offset_in_bytes() <= p2->offset_in_bytes(), "must be sorted by offset");
      // Too close (overlapping): a later entry may still be at the right distance.
      if (p1->offset_in_bytes() + element_size > p2->offset_in_bytes()) { continue; }
      // Too far: the group is sorted by offset, so every later entry is too far as well.
      if (p1->offset_in_bytes() + element_size < p2->offset_in_bytes()) { break; }

      // Only allow nodes from same origin idx to be packed (see CompileCommand Option Vectorize)
      if (_do_vector_loop && !same_origin_idx(mem1, mem2)) { continue; }

      if (!can_pack_into_pair(mem1, mem2)) { continue; }

#ifndef PRODUCT
      if (is_trace_superword_adjacent_memops()) {
        if (found) {
          tty->print_cr(" WARNING: multiple pairs with the same node. Ignored pairing:");
        } else {
          tty->print_cr(" pair:");
        }
        tty->print(" ");
        p1->print();
        tty->print(" ");
        p2->print();
      }
#endif

      if (!found) {
        _pairset.add_pair(mem1, mem2);
        // Record the pairing. Previously the flag was never set, so this guard
        // was always true and the WARNING branch above was dead; duplicates
        // were only kept out by can_pack_into_pair rejecting a node that is
        // already the left element of a pair.
        found = true;
      }
    }
  }
}
void VLoopMemorySlices::find_memory_slices() {
assert(_heads.is_empty(), "not yet computed");
assert(_tails.is_empty(), "not yet computed");
CountedLoopNode* cl = _vloop.cl();
// Iterate over all memory phis
for (DUIterator_Fast imax, i = cl->fast_outs(imax); i < imax; i++) {
PhiNode* phi = cl->fast_out(i)->isa_Phi();
if (phi != nullptr && _vloop.in_bb(phi) && phi->is_memory_phi()) {
Node* phi_tail = phi->in(LoopNode::LoopBackControl);
if (phi_tail != phi->in(LoopNode::EntryControl)) {
_heads.push(phi);
_tails.push(phi_tail->as_Mem());
}
}
}
NOT_PRODUCT( if (_vloop.is_trace_memory_slices()) { print(); } )
}
#ifndef PRODUCT
// Dumps every recorded slice as its index, head and tail; prints "NONE" in
// the title line when no slice was recorded.
void VLoopMemorySlices::print() const {
  const int slice_count = heads().length();
  tty->print_cr("\nVLoopMemorySlices::print: %s",
                slice_count > 0 ? "" : "NONE");
  for (int idx = 0; idx < slice_count; idx++) {
    tty->print("%6d ", idx);
    heads().at(idx)->dump();
    tty->print(" ");
    tails().at(idx)->dump();
  }
}
#endif
// Get all memory nodes of a slice, in reverse order.
//
// head  - memory phi at which the walk stops; it is not added to 'slice'.
// tail  - last memory node of the slice; the walk starts here and follows the
//         MemNode::Memory input towards 'head'.
// slice - output, must be empty on entry. For every visited node, the in-block
//         loads that use it are pushed first, then (unless it is 'head') the
//         node itself.
void VLoopMemorySlices::get_slice_in_reverse_order(PhiNode* head, MemNode* tail, GrowableArray<MemNode*> &slice) const {
  assert(slice.is_empty(), "start empty");
  Node* n = tail;
  Node* prev = nullptr; // node visited in the previous iteration
  while (true) {
    assert(_vloop.in_bb(n), "must be in block");
    for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
      Node* out = n->fast_out(i);
      if (out->is_Load()) {
        if (_vloop.in_bb(out)) {
          slice.push(out->as_Load());
        }
      } else {
        // FIXME
        if (out->is_MergeMem() && !_vloop.in_bb(out)) {
          // Either unrolling is causing a memory edge not to disappear,
          // or need to run igvn.optimize() again before SLP
        } else if (out->is_memory_phi() && !_vloop.in_bb(out)) {
          // Ditto. Not sure what else to check further.
        } else if (out->Opcode() == Op_StoreCM && out->in(MemNode::OopStore) == n) {
          // StoreCM has an input edge used as a precedence edge.
          // Maybe an issue when oop stores are vectorized.
        } else {
          // Any other non-load user must be the node we came from.
          assert(out == prev || prev == nullptr, "no branches off of store slice");
        }
      }
    }
    if (n == head) { break; }
    // Check the node class before casting. The assert used to sit after the
    // as_Mem() call, where it could no longer be the check that catches a
    // non-memory node (presumably as_Mem() asserts the class itself in debug
    // builds - confirm).
    assert(n->is_Mem(), "unexpected node %s", n->Name());
    slice.push(n->as_Mem());
    prev = n;
    n = n->in(MemNode::Memory);
  }

#ifndef PRODUCT
  if (_vloop.is_trace_memory_slices()) {
    tty->print_cr("\nVLoopMemorySlices::get_slice_in_reverse_order:");
    head->dump();
    for (int j = slice.length() - 1; j >= 0 ; j--) {
      slice.at(j)->dump();
    }
  }
#endif
}
// Decides whether (s1, s2) may be added to the pair set as a new pair, with s1
// as the left and s2 as the right element.
bool SuperWord::can_pack_into_pair(Node* s1, Node* s2) {
  // SuperWord handles primitive element types only.
  const BasicType bt1 = velt_basic_type(s1);
  const BasicType bt2 = velt_basic_type(s2);
  if (!(is_java_primitive(bt1) && is_java_primitive(bt2))) {
    return false;
  }

  // The element type, and the longer type of a conversion if there is one,
  // must each allow vectors of at least two elements.
  const BasicType longer_bt = longer_type_for_conversion(s1);
  if (Matcher::max_vector_size_auto_vectorization(bt1) < 2) {
    return false; // No vectors for this type
  }
  if (longer_bt != T_ILLEGAL && Matcher::max_vector_size_auto_vectorization(longer_bt) < 2) {
    return false; // No vectors for this type
  }

  // Anything that looks like a PopulateIndex is kept out of the pair set. It
  // does not need to be packed and is still vectorized by SuperWord::vector_opd.
  if (!isomorphic(s1, s2) || is_populate_index(s1, s2)) {
    return false;
  }

  // Either the nodes are independent with similar inputs, or they form a
  // reduction pair.
  const bool packable_shape = (independent(s1, s2) && have_similar_inputs(s1, s2)) || reduction(s1, s2);
  if (!packable_shape) {
    return false;
  }

  // s1 must not already be a left element, and s2 not already a right element.
  if (_pairset.is_left(s1) || _pairset.is_right(s2)) {
    return false;
  }

  // Memory operations must additionally be adjacent in memory.
  return !s1->is_Mem() || are_adjacent_refs(s1, s2);
}
//------------------------------are_adjacent_refs---------------------------
// Returns true if s1 and s2 are memory operations in the block and s2 starts
// exactly data_size(s1) bytes after s1, i.e. s1 is immediately before s2.
bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) const {
  if (!(s1->is_Mem() && s2->is_Mem())) { return false; }
  if (!(in_bb(s1) && in_bb(s2))) { return false; }

  MemNode* mem1 = s1->as_Mem();
  MemNode* mem2 = s2->as_Mem();

  // Do not use superword for non-primitives
  if (!is_java_primitive(mem1->memory_type())) { return false; }
  if (!is_java_primitive(mem2->memory_type())) { return false; }

  // Both references must live on the same memory slice.
  if (!same_memory_slice(mem1, mem2)) { return false; }

  // They must share the base, be comparable, and have the right distance.
  const VPointer& ptr1 = vpointer(mem1);
  const VPointer& ptr2 = vpointer(mem2);
  if (ptr1.base() != ptr2.base()) { return false; }
  if (!ptr1.comparable(ptr2)) { return false; }

  const int distance = ptr2.offset_in_bytes() - ptr1.offset_in_bytes();
  return distance == data_size(s1);
}
//------------------------------isomorphic---------------------------
// Returns true if s1 and s2 are similar: same opcode, same number of required
// inputs, same vector element type, the same test for Bool nodes, and
// compatible control inputs.
bool SuperWord::isomorphic(Node* s1, Node* s2) {
  if (s1->Opcode() != s2->Opcode()) { return false; }
  if (s1->req() != s2->req()) { return false; }
  if (!same_velt_type(s1, s2)) { return false; }
  if (s1->is_Bool() && s1->as_Bool()->_test._test != s2->as_Bool()->_test._test) {
    return false;
  }

  Node* ctrl1 = s1->in(0);
  Node* ctrl2 = s2->in(0);
  // Identical control inputs need no further checks.
  if (ctrl1 == ctrl2) {
    return true;
  }

  // Differing control inputs are acceptable only if each of them is either
  // absent or invariant for the loop.
  const bool ctrl1_ok = (ctrl1 == nullptr) || lpt()->is_invariant(ctrl1);
  const bool ctrl2_ok = (ctrl2 == nullptr) || lpt()->is_invariant(ctrl2);
  return ctrl1_ok && ctrl2_ok;
}
// Look for pattern n1 = (iv + c) and n2 = (iv + c + 1), which may lead to PopulateIndex vector node.
// We skip the pack creation of these nodes. They will be vectorized by SuperWord::vector_opd.
bool SuperWord::is_populate_index(const Node* n1, const Node* n2) const {
return n1->is_Add() &&
n2->is_Add() &&
n1->in(1) == iv() &&
n2->in(1) == iv() &&
n1->in(2)->is_Con() &&
n2->in(2)->is_Con() &&
n2->in(2)->get_int() - n1->in(2)->get_int() == 1;
}
// Returns true if there is no path from s1 to s2 and none from s2 to s1 in the
// dependency graph, restricted to nodes in the block.
bool VLoopDependencyGraph::independent(Node* s1, Node* s2) const {
  const int depth1 = depth(s1);
  const int depth2 = depth(s2);

  // Equal depth: the same node is dependent on itself; two different nodes on
  // the same level cannot be connected by a path.
  if (depth1 == depth2) {
    return s1 != s2;
  }

  // Search from the deeper node for the shallower one. Predecessors above the
  // shallower node's depth cannot lead to it, so the search is pruned there.
  Node* deep;
  Node* shallow;
  int min_d;
  if (depth1 > depth2) {
    deep = s1; shallow = s2; min_d = depth2;
  } else {
    deep = s2; shallow = s1; min_d = depth1;
  }

  ResourceMark rm;
  Unique_Node_List worklist;
  worklist.push(deep);
  for (uint next = 0; next < worklist.size(); next++) {
    Node* n = worklist.at(next);
    for (PredsIterator preds(*this, n); !preds.done(); preds.next()) {
      Node* pred = preds.current();
      if (!_vloop.in_bb(pred) || depth(pred) < min_d) {
        continue; // outside the block, or pruned by depth
      }
      if (pred == shallow) {
        return false; // found it -> dependent
      }
      worklist.push(pred);
    }
  }
  return true; // not found -> independent
}
// Are all nodes in nodes list mutually independent?
// We could query independent(s1, s2) for all pairs, but that results
// in O(size * size) graph traversals. We can do it all in one BFS!
// Start the BFS traversal at all nodes from the nodes list. Traverse
// Preds recursively, for nodes that have at least depth min_d, which
// is the smallest depth of all nodes from the nodes list. Once we have
// traversed all those nodes, and have not found another node from the
// nodes list, we know that all nodes in the nodes list are independent.
bool VLoopDependencyGraph::mutually_independent(const Node_List* nodes) const {
ResourceMark rm;
Unique_Node_List worklist;
VectorSet nodes_set;
int min_d = depth(nodes->at(0));
for (uint k = 0; k < nodes->size(); k++) {
Node* n = nodes->at(k);
min_d = MIN2(min_d, depth(n));
worklist.push(n); // start traversal at all nodes in nodes list
nodes_set.set(_body.bb_idx(n));
}
for (uint i = 0; i < worklist.size(); i++) {
Node* n = worklist.at(i);
for (PredsIterator preds(*this, n); !preds.done(); preds.next()) {
Node* pred = preds.current();
if (_vloop.in_bb(pred) && depth(pred) >= min_d) {
if (nodes_set.test(_body.bb_idx(pred))) {
return false; // found one -> dependent
}
worklist.push(pred);
}
}
}
return true; // not found -> independent
}
//--------------------------have_similar_inputs-----------------------
// For a pair (s1, s2) that the caller already found isomorphic and
// independent: do the two nodes have similar input edges? Loads, stores and
// nodes without data inputs are accepted without looking at their inputs.
bool SuperWord::have_similar_inputs(Node* s1, Node* s2) {
  if (s1->req() <= 1 || s1->is_Store() || s1->is_Load()) {
    return true;
  }

  for (uint i = 1; i < s1->req(); i++) {
    Node* in1 = s1->in(i);
    Node* in2 = s2->in(i);

    // Expressions with the loop iv, like "b[i] = a[i] * i": one node takes the
    // phi itself as input, the other takes an Add based on that phi.
    const bool phi_and_phi_plus_offset = in1->is_Phi() && in2->is_Add() && in2->in(1) == in1;
    if (phi_and_phi_plus_offset) {
      // Accepted only when the phi is the int trip counter.
      if (!in1->as_Phi()->is_tripcount(T_INT)) {
        return false;
      }
    } else if (in1->Opcode() != in2->Opcode()) {
      return false; // all other inputs must agree in opcode
    }
  }
  return true;
}
bool VLoopReductions::is_marked_reduction_pair(const Node* s1, const Node* s2) const {
if (is_marked_reduction(s1) &&
is_marked_reduction(s2)) {
// This is an ordered set, so s1 should define s2
for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
Node* t1 = s1->fast_out(i);
if (t1 == s2) {
// both nodes are reductions and connected
return true;
}
}
}
return false;
}
// Extend pairset by following use->def and def->use links from pair members.
void SuperWord::extend_pairset_with_more_pairs_by_following_use_and_def() {
bool changed;
do {
changed = false;
// Iterate the pairs in insertion order.
for (int i = 0; i < _pairset.length(); i++) {
Node* left = _pairset.left_at_in_insertion_order(i);
Node* right = _pairset.right_at_in_insertion_order(i);
changed |= extend_pairset_with_more_pairs_by_following_def(left, right);
changed |= extend_pairset_with_more_pairs_by_following_use(left, right);
}
} while (changed);
// During extend_pairset_with_more_pairs_by_following_use, we may have re-ordered the
// inputs of some nodes, when calling order_inputs_of_uses_to_match_def_pair. If a def
// node has multiple uses, we may have re-ordered some of the inputs one use after
// packing another use with the old order. Now that we have all pairs, we must ensure
// that the order between the pairs is matching again. Since the PairSetIterator visits
// all pair-chains from left-to-right, we essencially impose the order of the first
// element on all other elements in the pair-chain.
for (PairSetIterator pair(_pairset); !pair.done(); pair.next()) {
Node* left = pair.left();
Node* right = pair.right();
order_inputs_of_all_use_pairs_to_match_def_pair(left, right);
}
#ifndef PRODUCT
if (is_trace_superword_packset()) {
tty->print_cr("\nAfter Superword::extend_pairset_with_more_pairs_by_following_use_and_def");
_pairset.print();
}
#endif
}
// For the pair (s1, s2), tries to pair up their inputs position by position.
// Returns true if at least one pair was added. Loads are not followed; for
// stores only the value input is considered.
bool SuperWord::extend_pairset_with_more_pairs_by_following_def(Node* s1, Node* s2) {
  assert(_pairset.is_pair(s1, s2), "(s1, s2) must be a pair");
  assert(s1->req() == s2->req(), "just checking");

  if (s1->is_Load()) {
    return false;
  }

  // Range of input indices to inspect: [first_in, limit_in).
  int first_in;
  int limit_in;
  if (s1->is_Store()) {
    first_in = MemNode::ValueIn;
    limit_in = MemNode::ValueIn+1;
  } else {
    first_in = 1;
    limit_in = s1->req();
  }

  bool added_any = false;
  for (int in_idx = first_in; in_idx < limit_in; in_idx++) {
    Node* def1 = s1->in(in_idx);
    Node* def2 = s2->in(in_idx);
    if (!in_bb(def1) || !in_bb(def2) || def1->is_Mem() || def2->is_Mem()) {
      // Only follow non-memory nodes in block - we do not want to resurrect misaligned packs.
      continue;
    }
    // Add the pair only if it is packable and does not cost more than it saves.
    if (can_pack_into_pair(def1, def2) &&
        estimate_cost_savings_when_packing_as_pair(def1, def2) >= 0) {
      _pairset.add_pair(def1, def2);
      added_any = true;
    }
  }
  return added_any;
}
// For the pair (s1, s2), looks at every combination of a use of s1 and a use
// of s2 and adds the single packable combination with the highest estimated
// savings. Returns true if a pair was added.
//
// Note: only one pair is added per call. The caller keeps calling as long as
// anything changes, so all packable pairs are eventually added.
bool SuperWord::extend_pairset_with_more_pairs_by_following_use(Node* s1, Node* s2) {
  assert(_pairset.is_pair(s1, s2), "(s1, s2) must be a pair");
  assert(s1->req() == s2->req(), "just checking");

  if (s1->is_Store()) {
    return false;
  }

  // Best candidate so far. best_savings stays at -1 until a candidate with
  // non-negative savings shows up.
  int best_savings = -1;
  Node* best_use1 = nullptr;
  Node* best_use2 = nullptr;

  for (DUIterator_Fast amax, a = s1->fast_outs(amax); a < amax; a++) {
    Node* use1 = s1->fast_out(a);
    if (!in_bb(use1) || use1->is_Mem()) {
      // Only follow non-memory nodes in block - we do not want to resurrect misaligned packs.
      continue;
    }
    for (DUIterator_Fast bmax, b = s2->fast_outs(bmax); b < bmax; b++) {
      Node* use2 = s2->fast_out(b);
      if (!in_bb(use2) || use2->is_Mem()) {
        // Only follow non-memory nodes in block - we do not want to resurrect misaligned packs.
        continue;
      }
      if (use2->Opcode() == Op_AddI && use2 == cl()->incr()) {
        continue; // don't mess with the iv
      }
      // The inputs of the uses must be brought into the order of (s1, s2)
      // before the uses can be considered as a pair.
      if (order_inputs_of_uses_to_match_def_pair(s1, s2, use1, use2) != PairOrderStatus::Ordered) {
        continue;
      }
      if (!can_pack_into_pair(use1, use2)) {
        continue;
      }
      const int candidate_savings = estimate_cost_savings_when_packing_as_pair(use1, use2);
      if (candidate_savings > best_savings) {
        best_savings = candidate_savings;
        best_use1 = use1;
        best_use2 = use2;
      }
    }
  }

  if (best_savings < 0) {
    return false; // no change
  }
  _pairset.add_pair(best_use1, best_use2);
  return true; // changed
}
// For a pair (def1, def2), walks the uses of def1, looks up the pair that has
// such a use as its left element, and makes sure the inputs of that use pair
// are ordered to match (def1, def2). Stores and marked reductions are left
// alone.
// NOTE(review): the walk stops at the first use that is not commutative or
// that has no pair, rather than skipping it - confirm this is intended.
void SuperWord::order_inputs_of_all_use_pairs_to_match_def_pair(Node* def1, Node* def2) {
  assert(_pairset.is_pair(def1, def2), "(def1, def2) must be a pair");

  // Nothing to do for stores; reductions are always managed beforehand.
  if (def1->is_Store() || is_marked_reduction(def1)) {
    return;
  }

  for (DUIterator_Fast umax, u = def1->fast_outs(umax); u < umax; u++) {
    Node* left_use = def1->fast_out(u);

    // Operands may only be swapped on commuting operations.
    const bool commutes = left_use->is_Add() || left_use->is_Mul() || VectorNode::is_muladds2i(left_use);
    if (!commutes) {
      break;
    }

    // Find the pair (left_use, right_use).
    Node* right_use = _pairset.get_right_or_null_for(left_use);
    if (right_use == nullptr) {
      break;
    }

    order_inputs_of_uses_to_match_def_pair(def1, def2, left_use, right_use);
  }
}
// For a def-pair (def1, def2), and their use-nodes (use1, use2):
// Ensure that the input order of (use1, use2) matches the order of (def1, def2).
//
// We have different cases:
//
// 1. Reduction (use1, use2): must always reduce left-to-right. Make sure that we have pattern:
//
// phi/reduction x1 phi/reduction x2 phi/reduction x1
// | | | | and hopefully: | |
// use1 use2 use1 x2
// | |
// use2
//
// 2: Commutative operations, such as Add/Mul and their subclasses: we can try to swap edges:
//
// def1 x1 x2 def2 def1 x1 def2 x2
// | | | | ==> | | | |
// use1 use2 use1 use2
//
// 3: MulAddS2I (use1, use2): we can try to swap edges:
//
// (x1 * x2) + (x3 * x4) ==> 3.a: (x2 * x1) + (x4 * x3)
// 3.b: (x4 * x3) + (x2 * x1)
// 3.c: (x3 * x4) + (x1 * x2)
//
// Note: MulAddS2I with its 4 inputs is too complicated, if there is any mismatch, we always
// return PairOrderStatus::Unknown.
// Therefore, extend_pairset_with_more_pairs_by_following_use cannot extend to MulAddS2I,
// but there is a chance that extend_pairset_with_more_pairs_by_following_def can do it.
//
// 4: Otherwise, check if the inputs of (use1, use2) already match (def1, def2), i.e. for all input indices i:
//
// use1->in(i) == def1 || use2->in(i) == def2 -> use1->in(i) == def1 && use2->in(i) == def2
//
// Try to bring the inputs of (use1, use2) into the same order as the def-pair (def1, def2).
// This may swap input edges of use1 / use2 as a side effect.
// Returns:
//   Ordered:   reduction case, or every position where use1 has def1 is a position where
//              use2 has def2 (possibly after swapping edges of a commutative use2).
//   Unordered: the use nodes have a different number of inputs, or a mismatch was found
//              that cannot be repaired.
//   Unknown:   MulAddS2I case, where edges may have been swapped but the result is not verified.
SuperWord::PairOrderStatus SuperWord::order_inputs_of_uses_to_match_def_pair(Node* def1, Node* def2, Node* use1, Node* use2) {
assert(_pairset.is_pair(def1, def2), "(def1, def2) must be a pair");
// 1. Reduction
if (is_marked_reduction(use1) && is_marked_reduction(use2)) {
// If input 2 of a use is a Phi or a marked reduction, swap it into input position 1.
Node* use1_in2 = use1->in(2);
if (use1_in2->is_Phi() || is_marked_reduction(use1_in2)) {
use1->swap_edges(1, 2);
}
Node* use2_in2 = use2->in(2);
if (use2_in2->is_Phi() || is_marked_reduction(use2_in2)) {
use2->swap_edges(1, 2);
}
return PairOrderStatus::Ordered;
}
// From here on, both uses must have the same number of inputs.
uint ct = use1->req();
if (ct != use2->req()) { return PairOrderStatus::Unordered; };
// i1: index of the current occurrence of def1 in the inputs of use1.
// i2: index of the current occurrence of def2 in the inputs of use2.
// Both start before the first data input, and become ct once no further occurrence exists.
uint i1 = 0;
uint i2 = 0;
do {
// Advance each index to the next occurrence of its def (or to ct if there is none).
for (i1++; i1 < ct; i1++) { if (use1->in(i1) == def1) { break; } }
for (i2++; i2 < ct; i2++) { if (use2->in(i2) == def2) { break; } }
if (i1 != i2) {
// i1 == 3 - i2 holds exactly for the index pairs (1, 2) and (2, 1).
if ((i1 == (3-i2)) && (use2->is_Add() || use2->is_Mul())) {
// 2. Commutative: swap edges, and hope the other position matches too.
use2->swap_edges(i1, i2);
} else if (VectorNode::is_muladds2i(use2) && use1 != use2) {
// 3.a/b: MulAddS2I.
if (i1 == 5 - i2) { // ((i1 == 3 && i2 == 2) || (i1 == 2 && i2 == 3) || (i1 == 1 && i2 == 4) || (i1 == 4 && i2 == 1))
use2->swap_edges(1, 2);
use2->swap_edges(3, 4);
}
if (i1 == 3 - i2 || i1 == 7 - i2) { // ((i1 == 1 && i2 == 2) || (i1 == 2 && i2 == 1) || (i1 == 3 && i2 == 4) || (i1 == 4 && i2 == 3))
use2->swap_edges(2, 3);
use2->swap_edges(1, 4);
}
// The result of the swaps is not verified here, so the order is reported as unknown.
return PairOrderStatus::Unknown;
} else {
// 4. The inputs are not ordered, and we cannot do anything about it.
return PairOrderStatus::Unordered;
}
} else if (i1 == i2 && VectorNode::is_muladds2i(use2) && use1 != use2) {
// 3.c: MulAddS2I.
use2->swap_edges(1, 3);
use2->swap_edges(2, 4);
return PairOrderStatus::Unknown;
}
// Continue until all occurrences of def1 in use1 have been visited.
} while (i1 < ct);
// 4. All inputs match.
return PairOrderStatus::Ordered;
}
// Estimate the savings from executing s1 and s2 as a pair.
int SuperWord::estimate_cost_savings_when_packing_as_pair(const Node* s1, const Node* s2) const {
int save_in = 2 - 1; // 2 operations per instruction in packed form
const int adjacent_profit = 2;
auto pack_cost = [&] (const int size) { return size; };
auto unpack_cost = [&] (const int size) { return size; };
// inputs
for (uint i = 1; i < s1->req(); i++) {
Node* x1 = s1->in(i);
Node* x2 = s2->in(i);
if (x1 != x2) {
if (are_adjacent_refs(x1, x2)) {
save_in += adjacent_profit;
} else if (!_pairset.is_pair(x1, x2)) {
save_in -= pack_cost(2);
} else {
save_in += unpack_cost(2);
}
}
}
// uses of result
uint number_of_packed_use_pairs = 0;
int save_use = 0;
for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
Node* use1 = s1->fast_out(i);
// Find pair (use1, use2)
Node* use2 = _pairset.get_right_or_null_for(use1);
if (use2 == nullptr) { continue; }
for (DUIterator_Fast kmax, k = s2->fast_outs(kmax); k < kmax; k++) {
if (use2 == s2->fast_out(k)) {
// We have pattern:
//
// s1 s2
// | |
// [use1, use2]
//
number_of_packed_use_pairs++;
if (are_adjacent_refs(use1, use2)) {
save_use += adjacent_profit;
}
}
}
}
if (number_of_packed_use_pairs < s1->outcnt()) save_use += unpack_cost(1);
if (number_of_packed_use_pairs < s2->outcnt()) save_use += unpack_cost(1);
return MAX2(save_in, save_use);
}
// Combine pairs (n1, n2), (n2, n3), ... into pack (n1, n2, n3 ...)
void SuperWord::combine_pairs_to_longer_packs() {
#ifdef ASSERT
assert(!_pairset.is_empty(), "pairset not empty");
assert(_packset.is_empty(), "packset not empty");
#endif
// Iterate pair-chain by pair-chain, each from left-most to right-most.
Node_List* pack = nullptr;
for (PairSetIterator pair(_pairset); !pair.done(); pair.next()) {
Node* left = pair.left();
Node* right = pair.right();
if (_pairset.is_left_in_a_left_most_pair(left)) {
assert(pack == nullptr, "no unfinished pack");
pack = new (arena()) Node_List(arena());
pack->push(left);
}
assert(pack != nullptr, "must have unfinished pack");
pack->push(right);
if (_pairset.is_right_in_a_right_most_pair(right)) {
_packset.add_pack(pack);
pack = nullptr;
}
}
assert(pack == nullptr, "no unfinished pack");
assert(!_packset.is_empty(), "must have combined some packs");
#ifndef PRODUCT
if (is_trace_superword_packset()) {
tty->print_cr("\nAfter Superword::combine_pairs_to_longer_packs");
_packset.print();
}
#endif
}
// Apply "task" to "pack" and report the outcome as a SplitStatus:
//  - unchanged task:  the pack is returned untouched.
//  - rejected task:   all nodes of the pack are unmapped, no pack remains.
//  - split task:      the last split_size nodes are separated from the rest. A part that
//                     would consist of a single node is dropped (the node is unmapped).
//                     If both parts are single nodes, the whole pack is rejected.
SplitStatus PackSet::split_pack(const char* split_name,
                                Node_List* pack,
                                SplitTask task)
{
  const uint pack_size = pack->size();

  if (task.is_unchanged()) {
    return SplitStatus::make_unchanged(pack);
  }

  if (task.is_rejected()) {
#ifndef PRODUCT
    if (is_trace_superword_rejections()) {
      tty->cr();
      tty->print_cr("WARNING: Removed pack: %s:", task.message());
      print_pack(pack);
    }
#endif
    unmap_all_nodes_in_pack(pack);
    return SplitStatus::make_rejected();
  }

  const uint split_size = task.split_size();
  assert(0 < split_size && split_size < pack_size, "split_size must be in range");

  // The last "back_size" nodes are split off, the first "front_size" nodes stay in "pack".
  const uint back_size  = split_size;
  const uint front_size = pack_size - back_size;

#ifndef PRODUCT
  if (is_trace_superword_packset()) {
    tty->cr();
    tty->print_cr("INFO: splitting pack (sizes: %d %d): %s:",
                  front_size, back_size, task.message());
    print_pack(pack);
  }
#endif

  // Are both sizes too small to be a pack?
  if (front_size < 2 && back_size < 2) {
    assert(front_size == 1 && back_size == 1, "implied");
#ifndef PRODUCT
    if (is_trace_superword_rejections()) {
      tty->cr();
      tty->print_cr("WARNING: Removed size 2 pack, cannot be split: %s:", task.message());
      print_pack(pack);
    }
#endif
    unmap_all_nodes_in_pack(pack);
    return SplitStatus::make_rejected();
  }

  // Just pop off a single node?
  if (back_size < 2) {
    assert(back_size == 1 && front_size >= 2, "implied");
    Node* removed = pack->pop();
    unmap_node_in_pack(removed);
#ifndef PRODUCT
    if (is_trace_superword_rejections()) {
      tty->cr();
      tty->print_cr("WARNING: Removed node from pack, because of split: %s:", task.message());
      removed->dump();
    }
#endif
    return SplitStatus::make_modified(pack);
  }

  // Just remove a single node at front?
  if (front_size < 2) {
    assert(front_size == 1 && back_size >= 2, "implied");
    Node* removed = pack->at(0);
    pack->remove(0);
    unmap_node_in_pack(removed);
#ifndef PRODUCT
    if (is_trace_superword_rejections()) {
      tty->cr();
      tty->print_cr("WARNING: Removed node from pack, because of split: %s:", task.message());
      removed->dump();
    }
#endif
    return SplitStatus::make_modified(pack);
  }

  // We will have two packs
  assert(front_size >= 2 && back_size >= 2, "implied");
  Node_List* back_pack = new Node_List(back_size);

  // Copy the trailing nodes into the new pack, and map them to it.
  for (uint idx = front_size; idx < pack_size; idx++) {
    Node* n = pack->at(idx);
    back_pack->push(n);
    remap_node_in_pack(n, back_pack);
  }

  // Remove the same number of nodes from the end of the old pack.
  for (uint popped = 0; popped < back_size; popped++) {
    pack->pop();
  }

  // We assume that the new pack is more "stable" (i.e. will have to be split less than "pack").
  // Put "pack" second, so that we insert it later in the list, and iterate over it again sooner.
  return SplitStatus::make_split(back_pack, pack);
}
// Run "strategy" over every pack and apply the resulting SplitTask with split_pack.
// Whole passes over the packset are repeated until a pass leaves every pack unchanged.
//   split_name: forwarded to split_pack, and used in the trace output after the last pass.
//   strategy:   callable that is handed a pack and yields the SplitTask for that pack.
template <typename SplitStrategy>
void PackSet::split_packs(const char* split_name,
SplitStrategy strategy) {
bool changed;
do {
changed = false;
// Number of packs kept so far in this pass. Kept packs are compacted towards the
// front of _packs, into the slots [0, new_packset_length).
int new_packset_length = 0;
// Note: _packs.length() is evaluated on every iteration, so packs that are appended
// below are visited later in this same pass.
for (int i = 0; i < _packs.length(); i++) {
Node_List* pack = _packs.at(i);
assert(pack != nullptr && pack->size() >= 2, "no nullptr, at least size 2");
SplitTask task = strategy(pack);
SplitStatus status = split_pack(split_name, pack, task);
// Any status other than "unchanged" triggers another pass.
changed |= !status.is_unchanged();
// Either pack of the status may be nullptr, see the checks below.
Node_List* first_pack = status.first_pack();
Node_List* second_pack = status.second_pack();
_packs.at_put(i, nullptr); // take out pack
if (first_pack != nullptr) {
// The first pack can be put at the current position
assert(i >= new_packset_length, "only move packs down");
_packs.at_put(new_packset_length++, first_pack);
}
if (second_pack != nullptr) {
// The second node has to be appended at the end
_packs.append(second_pack);
}
}
// Cut off the slots behind the compacted packs.
_packs.trunc_to(new_packset_length);
} while (changed);
#ifndef PRODUCT
if (is_trace_superword_packset()) {
tty->print_cr("\nAfter %s", split_name);
print();
}
#endif
}
// Split packs at boundaries where left and right have different use or def packs.
// A boundary of zero means that the pack has no such boundary and stays unchanged.
void SuperWord::split_packs_at_use_def_boundaries() {
  auto split_strategy = [&](const Node_List* pack) {
    const uint size = pack->size();
    const uint boundary = find_use_def_boundary(pack);
    assert(boundary < size, "valid boundary %d", boundary);
    if (boundary == 0) {
      return SplitTask::make_unchanged();
    }
    // The split size is the number of nodes from the boundary to the end of the pack.
    return SplitTask::make_split(size - boundary, "found a use/def boundary");
  };
  _packset.split_packs("SuperWord::split_packs_at_use_def_boundaries", split_strategy);
}
// Split packs that are only implemented with a smaller pack size. Also splits packs
// such that they eventually have power of 2 size. Packs that are not implemented at
// any size are rejected.
void SuperWord::split_packs_only_implemented_with_smaller_size() {
  auto split_strategy = [&](const Node_List* pack) {
    const uint current_size = pack->size();
    const uint supported_size = max_implemented_size(pack);
    if (supported_size == 0) {
      return SplitTask::make_rejected("not implemented at any smaller size");
    }
    assert(is_power_of_2(supported_size), "power of 2 size or zero: %d", supported_size);
    if (supported_size == current_size) {
      return SplitTask::make_unchanged();
    }
    return SplitTask::make_split(supported_size, "only implemented at smaller size");
  };
  _packset.split_packs("SuperWord::split_packs_only_implemented_with_smaller_size", split_strategy);
}
// Split packs that have a mutual dependency, until all packs are mutually_independent.
// Packs of marked reductions are never split here.
void SuperWord::split_packs_to_break_mutual_dependence() {
  auto split_strategy = [&](const Node_List* pack) {
    const uint size = pack->size();
    assert(is_power_of_2(size), "ensured by earlier splits %d", size);
    const bool has_dependence = !is_marked_reduction(pack->at(0)) &&
                                !mutually_independent(pack);
    if (!has_dependence) {
      return SplitTask::make_unchanged();
    }
    // As a best guess, we split the pack in half. This way, we iteratively make the
    // packs smaller, until there is no dependency.
    return SplitTask::make_split(size / 2, "was not mutually independent");
  };
  _packset.split_packs("SuperWord::split_packs_to_break_mutual_dependence", split_strategy);
}
// Keep the packs for which "filter" returns true, and reject all other packs with
// "rejection_message". Implemented on top of split_packs, with a strategy that only
// ever produces "unchanged" or "rejected" tasks.
template <typename FilterPredicate>
void PackSet::filter_packs(const char* filter_name,
                           const char* rejection_message,
                           FilterPredicate filter) {
  auto keep_or_reject = [&](const Node_List* pack) {
    return filter(pack) ? SplitTask::make_unchanged()
                        : SplitTask::make_rejected(rejection_message);
  };
  split_packs(filter_name, keep_or_reject);
}
// Remove all packs whose size is not a power of 2.
void SuperWord::filter_packs_for_power_of_2_size() {
  auto has_power_of_2_size = [&](const Node_List* pack) {
    const uint size = pack->size();
    return is_power_of_2(size);
  };
  _packset.filter_packs("SuperWord::filter_packs_for_power_of_2_size",
                        "size is not a power of 2", has_power_of_2_size);
}
// We know that the nodes in a pair pack were independent - this gives us independence
// at distance 1. But now that we may have more than 2 nodes in a pack, we need to check
// if they are all mutually independent. If there is a dependence we remove the pack.
// This is better than giving up completely - we can have partial vectorization if some
// are rejected and others still accepted.
//
// Examples with dependence at distance 1 (pack pairs are not created):
// for (int i ...) { v[i + 1] = v[i] + 5; }
// for (int i ...) { v[i] = v[i - 1] + 5; }
//
// Example with independence at distance 1, but dependence at distance 2 (pack pairs are
// created and we need to filter them out now):
// for (int i ...) { v[i + 2] = v[i] + 5; }
// for (int i ...) { v[i] = v[i - 2] + 5; }
//
// Note: dependencies are created when a later load may reference the same memory location
// as an earlier store. This happens in "read backward" or "store forward" cases. On the
// other hand, "read forward" or "store backward" cases do not have such dependencies:
// for (int i ...) { v[i] = v[i + 1] + 5; }
// for (int i ...) { v[i - 1] = v[i] + 5; }
void SuperWord::filter_packs_for_mutual_independence() {
  auto is_reduction_or_independent = [&](const Node_List* pack) {
    Node* first = pack->at(0);
    // reductions are trivially connected, so they are accepted without the independence check
    return is_marked_reduction(first) || mutually_independent(pack);
  };
  _packset.filter_packs("SuperWord::filter_packs_for_mutual_independence",
                        "found dependency between nodes at distance greater than 1",
                        is_reduction_or_independent);
}
// Find the set of alignment solutions for a load/store pack.
// The solver is set up from the VPointer of the first memop of the pack, the pack size,
// and the init / stride of the pre-loop and the stride of the iv.
const AlignmentSolution* SuperWord::pack_alignment_solution(const Node_List* pack) {
  assert(pack != nullptr && (pack->at(0)->is_Load() || pack->at(0)->is_Store()), "only load/store packs");

  // Pointer information of the first memop in the pack.
  const MemNode* first_mem = pack->at(0)->as_Mem();
  const VPointer& ptr = vpointer(first_mem);

  const CountedLoopEndNode* pre_loop_end = _vloop.pre_loop_end();
  assert(pre_loop_end->stride_is_con(), "pre loop stride is constant");

  AlignmentSolver solver(pack->at(0)->as_Mem(),
                         pack->size(),
                         ptr.base(),
                         ptr.offset_in_bytes(),
                         ptr.invar(),
                         ptr.invar_factor(),
                         ptr.scale_in_bytes(),
                         pre_loop_end->init_trip(),
                         pre_loop_end->stride_con(),
                         iv_stride()
                         DEBUG_ONLY(COMMA is_trace_align_vector()));
  return solver.solve();
}
// Ensure all packs are aligned, if AlignVector is on.
// Find an alignment solution: find the set of pre_iter that memory align all packs.
// Start with the maximal set (pre_iter >= 0) and filter it with the constraints
// that the packs impose. Remove packs that do not have a compatible solution.
void SuperWord::filter_packs_for_alignment() {
// We do not need to filter if no alignment is required.
if (!VLoop::vectors_should_be_aligned()) {
return;
}
#ifndef PRODUCT
if (is_trace_superword_info() || is_trace_align_vector()) {
tty->print_cr("\nSuperWord::filter_packs_for_alignment:");
}
#endif
ResourceMark rm;
// Start with trivial (unconstrained) solution space
AlignmentSolution const* current = new TrivialAlignmentSolution();
int mem_ops_count = 0;
int mem_ops_rejected = 0;
auto filter = [&](const Node_List* pack) {
// Only memops need to be aligned.
if (!pack->at(0)->is_Load() &&
!pack->at(0)->is_Store()) {
return true; // accept all non memops
}
mem_ops_count++;
const AlignmentSolution* s = pack_alignment_solution(pack);
const AlignmentSolution* intersect = current->filter(s);
#ifndef PRODUCT
if (is_trace_align_vector()) {
tty->print(" solution for pack: ");
s->print();
tty->print(" intersection with current: ");
intersect->print();
}
#endif
if (intersect->is_empty()) {
mem_ops_rejected++;
return false; // reject because of empty solution
}
current = intersect;
return true; // accept because of non-empty solution
};
_packset.filter_packs("SuperWord::filter_packs_for_alignment",
"rejected by AlignVector (strict alignment requirement)", filter);
#ifndef PRODUCT
if (is_trace_superword_info() || is_trace_align_vector()) {
tty->print("\n final solution: ");
current->print();
tty->print_cr(" rejected mem_ops packs: %d of %d", mem_ops_rejected, mem_ops_count);
tty->cr();
}
#endif
assert(!current->is_empty(), "solution must be non-empty");
if (current->is_constrained()) {
// Solution is constrained (not trivial)
// -> must change pre-limit to achieve alignment
MemNode const* mem = current->as_constrained()->mem_ref();
Node_List* pack = get_pack(mem);
assert(pack != nullptr, "memop of final solution must still be packed");
_mem_ref_for_main_loop_alignment = mem;
_aw_for_main_loop_alignment = pack->size() * mem->memory_size();
}
}
// Remove packs that are not implemented at their current size.
void SuperWord::filter_packs_for_implemented() {
  auto is_implemented_at_full_size = [&](const Node_List* pack) {
    const uint size = pack->size();
    return implemented(pack, size);
  };
  _packset.filter_packs("SuperWord::filter_packs_for_implemented",
                        "Unimplemented", is_implemented_at_full_size);
}
// Remove packs that are not profitable.
void SuperWord::filter_packs_for_profitable() {
// Count the number of reductions vs other vector ops, for the
// reduction profitability heuristic.
for (int i = 0; i < _packset.length(); i++) {
Node_List* pack = _packset.at(i);
Node* n = pack->at(0);
if (is_marked_reduction(n)) {
_num_reductions++;
} else {
_num_work_vecs++;
}
}
// Remove packs that are not profitable
auto filter = [&](const Node_List* pack) {
return profitable(pack);
};
_packset.filter_packs("Superword::filter_packs_for_profitable",
"not profitable", filter);
}
// Can code be generated for the pack, restricted to size nodes?
// The decision is made from the first node of the pack: its opcode selects which of the
// vector node "implemented" queries is asked.
bool SuperWord::implemented(const Node_List* pack, const uint size) const {
  assert(size >= 2 && size <= pack->size() && is_power_of_2(size), "valid size");

  Node* p0 = pack->at(0);
  if (p0 == nullptr) {
    return false;
  }

  int opc = p0->Opcode();

  if (is_marked_reduction(p0)) {
    const Type* arith_type = p0->bottom_type();
    // Length 2 reductions of INT/LONG do not offer performance benefits
    const bool is_int_or_long = (arith_type->basic_type() == T_INT) ||
                                (arith_type->basic_type() == T_LONG);
    if (is_int_or_long && (size == 2)) {
      return false;
    }
    return ReductionNode::implemented(opc, size, arith_type->basic_type());
  }

  if (VectorNode::is_convert_opcode(opc)) {
    return VectorCastNode::implemented(opc, size, velt_basic_type(p0->in(1)), velt_basic_type(p0));
  }

  if (VectorNode::is_minmax_opcode(opc) && is_subword_type(velt_basic_type(p0))) {
    // Java API for Math.min/max operations supports only int, long, float
    // and double types. Thus, avoid generating vector min/max nodes for
    // integer subword types with superword vectorization.
    // See JDK-8294816 for miscompilation issues with shorts.
    return false;
  }

  if (p0->is_Cmp()) {
    // Cmp -> Bool -> Cmove
    return UseVectorCmov;
  }

  if (VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(opc)) {
    // Requires extra vector long -> int conversion.
    return VectorNode::implemented(opc, size, T_LONG) &&
           VectorCastNode::implemented(Op_ConvL2I, size, T_LONG, T_INT);
  }

  // All remaining operations, where URShiftI may be replaced with RShiftI.
  if (VectorNode::can_use_RShiftI_instead_of_URShiftI(p0, velt_basic_type(p0))) {
    opc = Op_RShiftI;
  }
  return VectorNode::implemented(opc, size, velt_basic_type(p0));
}
// Find the maximal implemented size smaller or equal to the packs size.
// Only power of 2 sizes are tried. Returns 0 if no size down to 2 is implemented.
uint SuperWord::max_implemented_size(const Node_List* pack) {
  uint size = round_down_power_of_2(pack->size());
  // Iteratively divide size by 2, until an implemented size is found.
  while (!implemented(pack, size)) {
    size >>= 1;
    if (size < 2) {
      return 0; // not implementable at all
    }
  }
  return size;
}
// If all nodes in the pack have the same node as input at "index": return that input,
// else nullptr.
Node* PackSet::same_inputs_at_index_or_null(const Node_List* pack, const int index) const {
  // The input of the first node is the reference that all other nodes are compared against.
  Node* const expected = pack->at(0)->in(index);
  const uint size = pack->size();
  for (uint pos = 1; pos < size; pos++) {
    Node* actual = pack->at(pos)->in(index);
    if (actual != expected) {
      return nullptr; // not same
    }
  }
  return expected;
}
// Compute the test for a pack of Bool nodes, taken from the first Bool of the pack.
// The result carries the mask, and a flag that tells if the mask was negated (see below).
VTransformBoolTest PackSet::get_bool_test(const Node_List* bool_pack) const {
  BoolNode* bol = bool_pack->at(0)->as_Bool();
  const BoolTest::mask mask = bol->_test._test;
  assert(mask == BoolTest::eq ||
         mask == BoolTest::ne ||
         mask == BoolTest::ge ||
         mask == BoolTest::gt ||
         mask == BoolTest::lt ||
         mask == BoolTest::le,
         "Bool should be one of: eq, ne, ge, gt, lt, le");

#ifdef ASSERT
  for (uint j = 0; j < bool_pack->size(); j++) {
    Node* m = bool_pack->at(j);
    assert(m->as_Bool()->_test._test == mask,
           "all bool nodes must have same test");
  }
#endif

  CmpNode* cmp0 = bol->in(1)->as_Cmp();
  assert(get_pack(cmp0) != nullptr, "Bool must have matching Cmp pack");

  // If we have a Float or Double comparison, we must be careful with
  // handling NaN's correctly. CmpF and CmpD have a return code, as
  // they are based on the java bytecodes fcmpl/dcmpl:
  //   -1: cmp_in1 <  cmp_in2, or at least one of the two is a NaN
  //    0: cmp_in1 == cmp_in2  (no NaN)
  //    1: cmp_in1 >  cmp_in2  (no NaN)
  //
  // The "mask" selects which of the [-1, 0, 1] cases lead to "true".
  //
  // Note: ordered   (O) comparison returns "false" if either input is NaN.
  //       unordered (U) comparison returns "true"  if either input is NaN.
  //
  // The VectorMaskCmpNode does a comparison directly on in1 and in2, in the java
  // standard way (all comparisons are ordered, except NEQ is unordered).
  //
  // In the following, "mask" already matches the cmp code for VectorMaskCmpNode:
  //   BoolTest::eq:  Case 0     -> EQ_O
  //   BoolTest::ne:  Case -1, 1 -> NEQ_U
  //   BoolTest::ge:  Case 0, 1  -> GE_O
  //   BoolTest::gt:  Case 1     -> GT_O
  //
  // But the lt and le comparisons must be converted from unordered to ordered:
  //   BoolTest::lt:  Case -1    -> LT_U -> VectorMaskCmp would interpret lt as LT_O
  //   BoolTest::le:  Case -1, 0 -> LE_U -> VectorMaskCmp would interpret le as LE_O
  const bool is_fp_cmp = cmp0->Opcode() == Op_CmpF || cmp0->Opcode() == Op_CmpD;
  const bool is_lt_or_le = (mask == BoolTest::lt || mask == BoolTest::le);
  if (!is_fp_cmp || !is_lt_or_le) {
    // The mask can be used as it is.
    return VTransformBoolTest(mask, false);
  }

  // Negating the mask gives us the negated result, since all non-NaN cases are
  // negated, and the unordered (U) comparisons are turned into ordered (O) comparisons.
  //          VectorMaskCmp(LT_U, in1_cmp, in2_cmp)
  // <==> NOT VectorMaskCmp(GE_O, in1_cmp, in2_cmp)
  //          VectorMaskCmp(LE_U, in1_cmp, in2_cmp)
  // <==> NOT VectorMaskCmp(GT_O, in1_cmp, in2_cmp)
  //
  // When a VectorBlend uses the negated mask, it can simply swap its blend-inputs:
  //      VectorBlend(    VectorMaskCmp(LT_U, in1_cmp, in2_cmp), in1_blend, in2_blend)
  // <==> VectorBlend(NOT VectorMaskCmp(GE_O, in1_cmp, in2_cmp), in1_blend, in2_blend)
  // <==> VectorBlend(    VectorMaskCmp(GE_O, in1_cmp, in2_cmp), in2_blend, in1_blend)
  //      VectorBlend(    VectorMaskCmp(LE_U, in1_cmp, in2_cmp), in1_blend, in2_blend)
  // <==> VectorBlend(NOT VectorMaskCmp(GT_O, in1_cmp, in2_cmp), in1_blend, in2_blend)
  // <==> VectorBlend(    VectorMaskCmp(GT_O, in1_cmp, in2_cmp), in2_blend, in1_blend)
  const BoolTest::mask negated_mask = bol->_test.negate();
  return VTransformBoolTest(negated_mask, true);
}
//------------------------------profitable---------------------------
// For pack p, are all operands and all uses (within the block) vector?
// Returns true only if the pack passes all of the checks below:
//  - all vector operands of the first node are vector uses,
//  - reductions have an input pack of matching size, and there is other vector work,
//  - shifts have a single scalar shift count,
//  - all uses of non-store nodes are vector uses (with exceptions for reductions),
//  - Cmp / Bool / CMove packs form a complete Cmp -> Bool -> CMove chain of packs.
bool SuperWord::profitable(const Node_List* p) const {
  Node* p0 = p->at(0);
  uint start, end;
  VectorNode::vector_operands(p0, &start, &end);

  // Return false if some inputs are not vectors or vectors with different
  // size or alignment.
  // Also, for now, return false if not scalar promotion case when inputs are
  // the same. Later, implement PackNode and allow differing, non-vector inputs
  // (maybe just the ones from outside the block.)
  for (uint i = start; i < end; i++) {
    if (!is_vector_use(p0, i)) {
      return false;
    }
  }

  // Check if reductions are connected
  if (is_marked_reduction(p0)) {
    Node* second_in = p0->in(2);
    Node_List* second_pk = get_pack(second_in);
    if ((second_pk == nullptr) || (_num_work_vecs == _num_reductions)) {
      // No parent pack or not enough work
      // to cover reduction expansion overhead
      return false;
    } else if (second_pk->size() != p->size()) {
      return false;
    }
  }

  if (VectorNode::is_shift(p0)) {
    // For now, return false if shift count is vector or not scalar promotion
    // case (different shift counts) because it is not supported yet.
    Node* cnt = p0->in(2);
    Node_List* cnt_pk = get_pack(cnt);
    if (cnt_pk != nullptr || _packset.same_inputs_at_index_or_null(p, 2) == nullptr) {
      return false;
    }
  }

  if (!p0->is_Store()) {
    // For now, return false if not all uses are vector.
    // Later, implement ExtractNode and allow non-vector uses (maybe
    // just the ones outside the block.)
    for (uint i = 0; i < p->size(); i++) {
      Node* def = p->at(i);
      for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
        Node* use = def->fast_out(j);
        // Check every input position at which "use" has "def" as input.
        for (uint k = 0; k < use->req(); k++) {
          Node* n = use->in(k);
          if (def == n) {
            // Reductions should only have a Phi use at the loop head or a non-phi use
            // outside of the loop if it is the last element of the pack (e.g. SafePoint).
            if (is_marked_reduction(def) &&
                ((use->is_Phi() && use->in(0) == lpt()->_head) ||
                 (!lpt()->is_member(phase()->get_loop(phase()->ctrl_or_self(use))) && i == p->size()-1))) {
              continue;
            }
            if (!is_vector_use(use, k)) {
              return false;
            }
          }
        }
      }
    }
  }

  if (p0->is_Cmp()) {
    // Verify that Cmp pack only has Bool pack uses
    for (DUIterator_Fast jmax, j = p0->fast_outs(jmax); j < jmax; j++) {
      Node* bol = p0->fast_out(j);
      if (!bol->is_Bool() || bol->in(0) != nullptr || !is_vector_use(bol, 1)) {
        return false;
      }
    }
  }

  if (p0->is_Bool()) {
    // Verify that Bool pack only has CMove pack uses
    for (DUIterator_Fast jmax, j = p0->fast_outs(jmax); j < jmax; j++) {
      Node* cmove = p0->fast_out(j);
      if (!cmove->is_CMove() || cmove->in(0) != nullptr || !is_vector_use(cmove, 1)) {
        return false;
      }
    }
  }

  if (p0->is_CMove()) {
    // Note: the checked isa_* casts are used here. The as_* casts assert the node class
    // and never return nullptr, which would turn the nullptr checks below into dead code.

    // Verify that CMove has a matching Bool pack
    BoolNode* bol = p0->in(1)->isa_Bool();
    if (bol == nullptr || get_pack(bol) == nullptr) {
      return false;
    }
    // Verify that Bool has a matching Cmp pack
    CmpNode* cmp = bol->in(1)->isa_Cmp();
    if (cmp == nullptr || get_pack(cmp) == nullptr) {
      return false;
    }
  }
  return true;
}
#ifdef ASSERT
void SuperWord::verify_packs() const {
_packset.verify();
// All packs must be:
for (int i = 0; i < _packset.length(); i++) {
Node_List* pack = _packset.at(i);
// 1. Mutually independent (or a reduction).
if (!is_marked_reduction(pack->at(0)) &&
!mutually_independent(pack)) {
tty->print_cr("FAILURE: nodes not mutually independent in pack[%d]", i);
_packset.print_pack(pack);
assert(false, "pack nodes not mutually independent");
}
// 2. Implemented.
if (!implemented(pack, pack->size())) {
tty->print_cr("FAILURE: nodes not implementable in pack[%d]", i);
_packset.print_pack(pack);
assert(false, "pack not implementable");
}
// 3. Profitable.
if (!profitable(pack)) {
tty->print_cr("FAILURE: nodes not profitable in pack[%d]", i);
_packset.print_pack(pack);
assert(false, "pack not profitable");
}
}
}
void PackSet::verify() const {
// Verify all nodes in packset have pack set correctly.
ResourceMark rm;
Unique_Node_List processed;
for (int i = 0; i < _packs.length(); i++) {
Node_List* p = _packs.at(i);
for (uint k = 0; k < p->size(); k++) {
Node* n = p->at(k);
assert(_vloop.in_bb(n), "only nodes in bb can be in packset");
assert(!processed.member(n), "node should only occur once in packset");
assert(get_pack(n) == p, "n has consisten packset info");
processed.push(n);
}
}
// Check that no other node has pack set.
for (int i = 0; i < _body.body().length(); i++) {
Node* n = _body.body().at(i);
if (!processed.member(n)) {
assert(get_pack(n) == nullptr, "should not have pack if not in packset");
}
}
}
#endif
// The PacksetGraph combines the dependency graph with the packset. In the
// PacksetGraph, we have two kinds of nodes:
// (1) pack-node: Represents all nodes of some pack p in a single node, which
// shall later become a vector node.
// (2) scalar-node: Represents a node that is not in any pack.
// For any edge (n1, n2) in the dependency graph, we add an edge to the PacksetGraph for
// the PacksetGraph nodes corresponding to n1 and n2.
// We work from the dependency graph, because it gives us all the data-dependencies,
// as well as more refined memory-dependencies than the C2 graph. The dependency graph
// does not have cycles. But packing nodes can introduce cyclic dependencies. Example:
//
// +--------+
// A -> X | v
// Pack [A,B] and [X,Y] [A,B] [X,Y]
// Y -> B ^ |
// +--------+
//
class PacksetGraph {
private:
// pid: packset graph node id.
GrowableArray<int> _pid; // bb_idx(n) -> pid
GrowableArray<Node*> _pid_to_node; // one node per pid, find rest via _packset.pack
GrowableArray<GrowableArray<int>> _out; // out-edges
GrowableArray<int> _incnt; // number of (implicit) in-edges
int _max_pid = 0;
bool _schedule_success;
SuperWord* _slp;
public:
PacksetGraph(SuperWord* slp)
: _pid(8, 0, /* default */ 0), _slp(slp) {
}
// Get pid, if there is a packset node that n belongs to. Else return 0.
int get_pid_or_zero(const Node* n) const {
if (!_slp->in_bb(n)) {
return 0;
}
int idx = _slp->bb_idx(n);
if (idx >= _pid.length()) {
return 0;
} else {
return _pid.at(idx);
}
}
int get_pid(const Node* n) {
int poz = get_pid_or_zero(n);
assert(poz != 0, "pid should not be zero");
return poz;
}
void set_pid(Node* n, int pid) {
assert(n != nullptr && pid > 0, "sane inputs");
assert(_slp->in_bb(n), "must be");
int idx = _slp->bb_idx(n);
_pid.at_put_grow(idx, pid);
_pid_to_node.at_put_grow(pid - 1, n, nullptr);
}
Node* get_node(int pid) {
assert(pid > 0 && pid <= _pid_to_node.length(), "pid must be mapped");
Node* n = _pid_to_node.at(pid - 1);
assert(n != nullptr, "sanity");
return n;
}
int new_pid() {
_incnt.push(0);
_out.push(GrowableArray<int>());
return ++_max_pid;
}
int incnt(int pid) { return _incnt.at(pid - 1); }
void incnt_set(int pid, int cnt) { return _incnt.at_put(pid - 1, cnt); }
GrowableArray<int>& out(int pid) { return _out.at(pid - 1); }
bool schedule_success() const { return _schedule_success; }
// Create nodes (from packs and scalar-nodes), and add edges, based on the dependency graph.
void build() {
const PackSet& packset = _slp->packset();
const GrowableArray<Node*>& body = _slp->body();
// Map nodes in packsets
for (int i = 0; i < packset.length(); i++) {
Node_List* p = packset.at(i);
int pid = new_pid();
for (uint k = 0; k < p->size(); k++) {
Node* n = p->at(k);
set_pid(n, pid);
assert(packset.get_pack(n) == p, "matching packset");
}
}
int max_pid_packset = _max_pid;
// Map nodes not in packset
for (int i = 0; i < body.length(); i++) {
Node* n = body.at(i);
if (n->is_Phi() || n->is_CFG()) {
continue; // ignore control flow
}
int pid = get_pid_or_zero(n);
if (pid == 0) {
pid = new_pid();
set_pid(n, pid);
assert(packset.get_pack(n) == nullptr, "no packset");
}
}
// Map edges for packset nodes
VectorSet set;
for (int i = 0; i < packset.length(); i++) {
Node_List* p = packset.at(i);
set.clear();
int pid = get_pid(p->at(0));
for (uint k = 0; k < p->size(); k++) {
Node* n = p->at(k);
assert(pid == get_pid(n), "all nodes in pack have same pid");
for (VLoopDependencyGraph::PredsIterator preds(_slp->dependency_graph(), n); !preds.done(); preds.next()) {
Node* pred = preds.current();
int pred_pid = get_pid_or_zero(pred);
if (pred_pid == pid && _slp->is_marked_reduction(n)) {
continue; // reduction -> self-cycle is not a cyclic dependency
}
// Only add edges once, and only for mapped nodes (in body)
if (pred_pid > 0 && !set.test_set(pred_pid)) {
incnt_set(pid, incnt(pid) + 1); // increment
out(pred_pid).push(pid);
}
}
}
}
// Map edges for nodes not in packset
for (int i = 0; i < body.length(); i++) {
Node* n = body.at(i);
int pid = get_pid_or_zero(n); // zero for Phi or CFG
if (pid <= max_pid_packset) {
continue; // Only scalar-nodes
}
for (VLoopDependencyGraph::PredsIterator preds(_slp->dependency_graph(), n); !preds.done(); preds.next()) {
Node* pred = preds.current();
int pred_pid = get_pid_or_zero(pred);
// Only add edges for mapped nodes (in body)
if (pred_pid > 0) {
incnt_set(pid, incnt(pid) + 1); // increment
out(pred_pid).push(pid);
}
}
}
}
// Topologically sort the PacksetGraph. A pid is ready once its incnt is zero;
// ready pids are appended to the worklist, and processing a pid releases its
// successors by decrementing their incnt. Whenever a processed pid stands for
// memops (a single scalar memop, or all members of a pack), those memops are
// appended to memops_schedule, which is returned. _schedule_success records
// whether every pid could be scheduled, i.e. whether the graph had no cycle.
Node_List schedule() {
  Node_List memops_schedule;
  GrowableArray<int> worklist;

  // Seed the worklist with every pid that has no in-edges at all.
  for (int pid = 1; pid <= _max_pid; pid++) {
    if (incnt(pid) != 0) {
      continue;
    }
    worklist.push(pid);
  }

  // Drain the worklist; it keeps growing while we walk over it.
  int next = 0;
  while (next < worklist.length()) {
    const int pid = worklist.at(next++);

    // Memops go to the memops_schedule: a packed memop brings its whole pack.
    Node* n = get_node(pid);
    if (n->is_Mem()) {
      Node_List* pack = _slp->packset().get_pack(n);
      if (pack != nullptr) {
        for (uint k = 0; k < pack->size(); k++) {
          memops_schedule.push(pack->at(k));
          assert(pack->at(k)->is_Mem(), "only schedule memops");
        }
      } else {
        memops_schedule.push(n);
      }
    }

    // Release the successors: each one loses the in-edge coming from pid.
    for (int j = 0; j < out(pid).length(); j++) {
      const int succ_pid = out(pid).at(j);
      const int remaining = incnt(succ_pid) - 1;
      incnt_set(succ_pid, remaining);
      if (remaining == 0) {
        // That was the last outstanding input, the successor is ready now.
        worklist.push(succ_pid);
      }
    }
  }

  // Any pid that never reached the worklist is part of (or behind) a cycle.
  _schedule_success = (worklist.length() == _max_pid);
  return memops_schedule;
}
// Dump the PacksetGraph to tty.
//   print_nodes      = true:  also dump all C2 nodes belonging to each
//                             PacksetGraph node (non-product builds only).
//   print_zero_incnt = false: skip nodes that have no in-edges (any more).
void print(bool print_nodes, bool print_zero_incnt) {
  const GrowableArray<Node*> &body = _slp->body();
  tty->print_cr("PacksetGraph");
  for (int pid = 1; pid <= _max_pid; pid++) {
    const bool has_no_in_edges = (incnt(pid) == 0);
    if (has_no_in_edges && !print_zero_incnt) {
      continue;
    }

    // One line per graph node: its pid, its incnt, and its out-edges.
    tty->print("Node %d. incnt %d [", pid, incnt(pid));
    const int num_out = out(pid).length();
    for (int j = 0; j < num_out; j++) {
      tty->print("%d ", out(pid).at(j));
    }
    tty->print_cr("]");

#ifndef PRODUCT
    if (!print_nodes) {
      continue;
    }
    // List every body node that is mapped to this pid.
    for (int i = 0; i < body.length(); i++) {
      Node* n = body.at(i);
      if (get_pid_or_zero(n) != pid) {
        continue;
      }
      tty->print(" ");
      n->dump();
    }
#endif
  }
}
};
// Replace the packed scalars of the PackSet with vector operations. This needs
// the memory graph to be scheduled and re-ordered first. The steps are:
//  (1) Build the PacksetGraph, which merges the dependency graph with the
//      packset. Its edges are the dependencies that must still hold after
//      scheduling.
//  (2) Schedule the PacksetGraph. The result is the memops_schedule: a linear
//      order of all memops in the body that respects those dependencies.
//  (3) If the PacksetGraph turned out to be cyclic, no schedule exists: drop
//      all packs and give up.
//  (4) Otherwise apply the vectorization: re-order the memops and replace the
//      packed scalars with vector operations.
// Returns false if the packset is empty or cyclic, else the result of apply().
bool SuperWord::schedule_and_apply() {
  if (_packset.is_empty()) { return false; }

  ResourceMark rm;

  // (1) + (2): build the graph and try to schedule it.
  PacksetGraph graph(this);
  graph.build();
  Node_List memops_schedule = graph.schedule();

  // (3) Independent packs (see verify_packs) are necessary but not sufficient
  //     for the graph to remain acyclic (a DAG) once the packs are merged
  //     into single nodes, so the schedule itself has to tell us whether the
  //     packs introduced a cycle. The SuperWord paper mentions the need for
  //     this in "3.7 Scheduling".
  if (graph.schedule_success()) {
    // (4) Apply the vectorization, including re-ordering the memops.
    return apply(memops_schedule);
  }

#ifndef PRODUCT
  if (is_trace_superword_rejections()) {
    tty->print_cr("SuperWord::schedule found cycle in PacksetGraph:");
    graph.print(true, false);
    tty->print_cr("removing all packs from packset.");
  }
#endif
  _packset.clear();
  return false;
}
// Run the graph-modifying steps in order: re-order the memops according to
// memops_schedule, adjust the pre-loop limit for main-loop vector alignment,
// then replace the packs by vector nodes. The graph is printed (level 4)
// before the first step and after every step. Returns the result of
// apply_vectorization().
bool SuperWord::apply(Node_List& memops_schedule) {
  Compile* compile = phase()->C;
  CountedLoopNode* loop_head = lpt()->_head->as_CountedLoop();

  compile->print_method(PHASE_AUTO_VECTORIZATION1_BEFORE_APPLY, 4, loop_head);

  apply_memops_reordering_with_schedule(memops_schedule);
  compile->print_method(PHASE_AUTO_VECTORIZATION2_AFTER_REORDER, 4, loop_head);

  adjust_pre_loop_limit_to_align_main_loop_vectors();
  compile->print_method(PHASE_AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, 4, loop_head);

  const bool vectorized = apply_vectorization();
  compile->print_method(PHASE_AUTO_VECTORIZATION4_AFTER_APPLY, 4, loop_head);

  return vectorized;
}
// Rewire the memory graph of all slices at once so that it follows the given
// memops_schedule. The schedule is walked a single time while we remember,
// per slice (alias index), which node currently represents the memory state.
void SuperWord::apply_memops_reordering_with_schedule(Node_List& memops_schedule) {
#ifndef PRODUCT
  if (is_trace_superword_info()) {
    tty->print_cr("\nSuperWord::apply_memops_reordering_with_schedule:");
    memops_schedule.dump();
  }
#endif

  Compile* C = phase()->C;
  const int num_slices = C->num_alias_types();

  // Per slice: the memory state reached so far while walking the schedule.
  // This is either the slice's Phi or a store inside the loop.
  GrowableArray<Node*> state_of_slice(num_slices, num_slices, nullptr);

  // Per slice: the store that used to be last in the loop. It defines the
  // memory state after the loop, so if re-ordering makes a different store
  // the last one, the uses of the old one have to be moved over.
  GrowableArray<Node*> old_last_store_of_slice(num_slices, num_slices, nullptr);

  const GrowableArray<PhiNode*>& slice_heads = _vloop_analyzer.memory_slices().heads();

  // (1) Every slice starts at its Phi. The old last store is what the Phi
  //     sees over the backedge: a memory Phi implies a store in the loop.
  for (int i = 0; i < slice_heads.length(); i++) {
    Node* phi = slice_heads.at(i);
    assert(phi->is_Phi(), "must be phi");
    const int slice = C->get_alias_index(phi->adr_type());
    state_of_slice.at_put(slice, phi);
    StoreNode* backedge_store = phi->in(2)->as_Store();
    old_last_store_of_slice.at_put(slice, backedge_store);
  }

  // (2) Chain every scheduled memop onto the current state of its slice.
  //     A Store becomes the new state of the slice, a Load leaves it as is.
  for (uint i = 0; i < memops_schedule.size(); i++) {
    MemNode* memop = memops_schedule.at(i)->as_Mem();
    assert(memop->is_Load() || memop->is_Store(), "only loads or stores");
    const int slice = C->get_alias_index(memop->adr_type());
    Node* state = state_of_slice.at(slice);
    if (state != nullptr) {
      igvn().replace_input_of(memop, MemNode::Memory, state);
      if (memop->is_Store()) {
        state_of_slice.at_put(slice, memop);
      }
      continue;
    }
    // No Phi for this slice: it is never written in the loop, so it only has
    // loads, and they keep using the memory state from outside the loop.
    assert(memop->is_Load() && !in_bb(memop->in(MemNode::Memory)),
           "only loads can have memory state from outside loop");
  }

  // (3) Close every slice: the final state goes onto the Phi's backedge, and
  //     whoever used the old last store outside the loop body must now use
  //     the new last store.
  Node_List outside_uses;
  for (int i = 0; i < slice_heads.length(); i++) {
    Node* phi = slice_heads.at(i);
    const int slice = C->get_alias_index(phi->adr_type());
    Node* new_last_store = state_of_slice.at(slice);
    assert(new_last_store != nullptr, "slice is mapped");
    assert(new_last_store != phi, "did some work in between");
    assert(new_last_store->is_Store(), "sanity");
    igvn().replace_input_of(phi, 2, new_last_store);

    Node* old_last_store = old_last_store_of_slice.at(slice);
    assert(old_last_store != nullptr, "we have a old last store");

    // Collect first, modify afterwards: editing the graph while iterating
    // over the out-edges would break the iterator.
    outside_uses.clear();
    for (DUIterator_Fast kmax, k = old_last_store->fast_outs(kmax); k < kmax; k++) {
      Node* use = old_last_store->fast_out(k);
      if (in_bb(use)) {
        continue;
      }
      outside_uses.push(use);
    }
    for (uint k = 0; k < outside_uses.size(); k++) {
      Node* use = outside_uses.at(k);
      for (uint j = 0; j < use->req(); j++) {
        if (use->in(j) != old_last_store) {
          continue;
        }
        igvn().replace_input_of(use, j, new_last_store);
      }
    }
  }
}
// Convert packs into vector node operations
// At this point, all correctness and profitability checks have passed.
// We start the irreversible process of editing the C2 graph. Should
// there be an unexpected situation (assert fails), then we can only
// bail out of the compilation, as the graph has already been partially
// modified. We bail out, and retry without SuperWord.
//
// Returns true once all packs were replaced. Returns false after recording a
// C2Compiler::retry_no_superword() failure if a vector node (or, on the paths
// that check for it, one of its operands) could not be created.
//
// NOTE(review): only the Store and the generic two-input (req() == 3) paths
// null-check the result of vector_opd; the other paths pass it on unchecked.
// vector_opd returns nullptr only right after an assert(false), so this only
// matters in builds without asserts - confirm whether those paths should bail
// out the same way.
bool SuperWord::apply_vectorization() {
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
assert(cl->is_main_loop(), "SLP should only work on main loops");
Compile* C = phase()->C;
assert(!_packset.is_empty(), "vectorization requires non-empty packset");
#ifndef PRODUCT
if (TraceLoopOpts) {
tty->print("SuperWord::apply_vectorization ");
lpt()->dump_head();
}
#endif
// Largest vector seen so far, in bytes and in number of elements.
uint max_vlen_in_bytes = 0;
uint max_vlen = 0;
for (int i = 0; i < body().length(); i++) {
Node* n = body().at(i);
Node_List* p = get_pack(n);
// Handle each pack exactly once: when the walk over the body reaches the
// node that is the last element of the pack.
if (p != nullptr && n == p->at(p->size()-1)) {
// After apply_memops_reordering_with_schedule, we know that the memops have the same order in the pack
// as in the memory slice. Hence, "first" is the first memop in the slice from the pack,
// and "n" is the last node in the slice from the pack.
Node* first = p->at(0);
uint vlen = p->size();
uint vlen_in_bytes = 0;
Node* vn = nullptr;
int opc = n->Opcode();
if (n->is_Load()) {
Node* ctl = n->in(MemNode::Control);
Node* mem = first->in(MemNode::Memory);
// Set the memory dependency of the LoadVector as early as possible.
// Walk up the memory chain, and ignore any StoreVector that provably
// does not have any memory dependency.
while (mem->is_StoreVector()) {
VPointer p_store(mem->as_Mem(), _vloop);
if (p_store.overlap_possible_with_any_in(p)) {
break;
} else {
mem = mem->in(MemNode::Memory);
}
}
Node* adr = first->in(MemNode::Address);
const TypePtr* atyp = n->adr_type();
vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n), control_dependency(p));
vlen_in_bytes = vn->as_LoadVector()->memory_size();
} else if (n->is_Store()) {
// Promote value to be stored to vector
Node* val = vector_opd(p, MemNode::ValueIn);
if (val == nullptr) {
assert(false, "input to vector store was not created");
C->record_failure(C2Compiler::retry_no_superword());
return false; // bailout
}
Node* ctl = n->in(MemNode::Control);
Node* mem = first->in(MemNode::Memory);
Node* adr = first->in(MemNode::Address);
const TypePtr* atyp = n->adr_type();
vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
vlen_in_bytes = vn->as_StoreVector()->memory_size();
} else if (VectorNode::is_scalar_rotate(n)) {
Node* in1 = vector_opd(p, 1);
Node* in2 = first->in(2);
// If rotation count is non-constant or greater than 8bit value create a vector.
if (!in2->is_Con() || !Matcher::supports_vector_constant_rotates(in2->get_int())) {
in2 = vector_opd(p, 2);
}
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else if (VectorNode::is_roundopD(n)) {
// The rounding mode (second input) stays a scalar constant.
Node* in1 = vector_opd(p, 1);
Node* in2 = first->in(2);
assert(in2->is_Con(), "Constant rounding mode expected.");
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else if (VectorNode::is_muladds2i(n)) {
// Only operands 1 and 2 are requested here; for MulAddS2I packs that need
// a PackNode, vector_opd itself also adds the inputs at opd_idx + 2.
assert(n->req() == 5u, "MulAddS2I should have 4 operands.");
Node* in1 = vector_opd(p, 1);
Node* in2 = vector_opd(p, 2);
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else if (opc == Op_SignumF || opc == Op_SignumD) {
assert(n->req() == 4, "four inputs expected");
Node* in = vector_opd(p, 1);
Node* zero = vector_opd(p, 2);
Node* one = vector_opd(p, 3);
vn = VectorNode::make(opc, in, zero, one, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else if (n->is_Cmp()) {
// Bool + Cmp + CMove -> VectorMaskCmp + VectorBlend
// Nothing to emit for the Cmp pack itself, the CMove case below consumes it.
continue;
} else if (n->is_Bool()) {
// Bool + Cmp + CMove -> VectorMaskCmp + VectorBlend
// Nothing to emit for the Bool pack itself, the CMove case below consumes it.
continue;
} else if (n->is_CMove()) {
// Bool + Cmp + CMove -> VectorMaskCmp + VectorBlend
// NOTE(review): vlen_in_bytes is not set on this path and stays 0, so the
// VectorBlend does not contribute to max_vlen_in_bytes - confirm intended.
BoolNode* bol = n->in(1)->as_Bool();
assert(bol != nullptr, "must have Bool above CMove");
Node_List* bool_pack = get_pack(bol);
assert(bool_pack != nullptr, "CMove must have matching Bool pack");
CmpNode* cmp = bol->in(1)->as_Cmp();
assert(cmp != nullptr, "must have cmp above CMove");
Node_List* cmp_pack = get_pack(cmp);
assert(cmp_pack != nullptr, "Bool must have matching Cmp pack");
Node* cmp_in1 = vector_opd(cmp_pack, 1);
Node* cmp_in2 = vector_opd(cmp_pack, 2);
Node* blend_in1 = vector_opd(p, 2);
Node* blend_in2 = vector_opd(p, 3);
VTransformBoolTest bool_test = _packset.get_bool_test(bool_pack);
BoolTest::mask test_mask = bool_test._mask;
if (bool_test._is_negated) {
// We can cancel out the negation by swapping the blend inputs.
swap(blend_in1, blend_in2);
}
// VectorMaskCmp
ConINode* test_mask_node = igvn().intcon((int)test_mask);
BasicType bt = velt_basic_type(cmp);
const TypeVect* vt = TypeVect::make(bt, vlen);
VectorNode* mask = new VectorMaskCmpNode(test_mask, cmp_in1, cmp_in2, test_mask_node, vt);
phase()->register_new_node_with_ctrl_of(mask, p->at(0));
igvn()._worklist.push(mask);
// VectorBlend
vn = new VectorBlendNode(blend_in1, blend_in2, mask);
} else if (n->req() == 3) {
// Generic two-input operation, possibly a reduction.
// Promote operands to vector
Node* in1 = nullptr;
bool node_isa_reduction = is_marked_reduction(n);
if (node_isa_reduction) {
// the input to the first reduction operation is retained
in1 = first->in(1);
} else {
in1 = vector_opd(p, 1);
if (in1 == nullptr) {
assert(false, "input in1 to vector operand was not created");
C->record_failure(C2Compiler::retry_no_superword());
return false; // bailout
}
}
Node* in2 = vector_opd(p, 2);
if (in2 == nullptr) {
assert(false, "input in2 to vector operand was not created");
C->record_failure(C2Compiler::retry_no_superword());
return false; // bailout
}
if (in1->Opcode() == Op_Replicate && (node_isa_reduction == false) && (n->is_Add() || n->is_Mul())) {
// Move invariant vector input into second position to avoid register spilling.
Node* tmp = in1;
in1 = in2;
in2 = tmp;
}
if (node_isa_reduction) {
const Type *arith_type = n->bottom_type();
vn = ReductionNode::make(opc, nullptr, in1, in2, arith_type->basic_type());
// The vector size is taken from the vector input in2, not from vn.
if (in2->is_Load()) {
vlen_in_bytes = in2->as_LoadVector()->memory_size();
} else {
vlen_in_bytes = in2->as_Vector()->length_in_bytes();
}
} else {
if (VectorNode::can_use_RShiftI_instead_of_URShiftI(n, velt_basic_type(n))) {
opc = Op_RShiftI;
}
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
}
} else if (VectorNode::is_scalar_unary_op_with_equal_input_and_output_types(opc)) {
assert(n->req() == 2, "only one input expected");
Node* in = vector_opd(p, 1);
vn = VectorNode::make(opc, in, nullptr, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else if (VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(opc)) {
assert(n->req() == 2, "only one input expected");
Node* in = vector_opd(p, 1);
// The vector operation is created with T_LONG elements ...
Node* longval = VectorNode::make(opc, in, nullptr, vlen, T_LONG);
phase()->register_new_node_with_ctrl_of(longval, first);
// Requires extra vector long -> int conversion.
vn = VectorCastNode::make(Op_VectorCastL2X, longval, T_INT, vlen);
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else if (VectorNode::is_convert_opcode(opc)) {
assert(n->req() == 2, "only one input expected");
BasicType bt = velt_basic_type(n);
Node* in = vector_opd(p, 1);
// The cast opcode is selected from the element type of the vector input.
int vopc = VectorCastNode::opcode(opc, in->bottom_type()->is_vect()->element_basic_type());
vn = VectorCastNode::make(vopc, in, bt, vlen);
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else if (opc == Op_FmaD || opc == Op_FmaF) {
// Promote operands to vector
Node* in1 = vector_opd(p, 1);
Node* in2 = vector_opd(p, 2);
Node* in3 = vector_opd(p, 3);
vn = VectorNode::make(opc, in1, in2, in3, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else {
assert(false, "Unhandled scalar opcode (%s)", NodeClassNames[opc]);
C->record_failure(C2Compiler::retry_no_superword());
return false; // bailout
}
if (vn == nullptr) {
assert(false, "got null node instead of vector node");
C->record_failure(C2Compiler::retry_no_superword());
return false; // bailout
}
#ifdef ASSERT
// Mark Load/Store Vector for alignment verification
if (VerifyAlignVector) {
if (vn->Opcode() == Op_LoadVector) {
vn->as_LoadVector()->set_must_verify_alignment();
} else if (vn->Opcode() == Op_StoreVector) {
vn->as_StoreVector()->set_must_verify_alignment();
}
}
#endif
// Register the vector node and replace every scalar pack member by it.
phase()->register_new_node_with_ctrl_of(vn, first);
for (uint j = 0; j < p->size(); j++) {
Node* pm = p->at(j);
igvn().replace_node(pm, vn);
}
igvn()._worklist.push(vn);
if (vlen > max_vlen) {
max_vlen = vlen;
}
if (vlen_in_bytes > max_vlen_in_bytes) {
max_vlen_in_bytes = vlen_in_bytes;
}
VectorNode::trace_new_vector(vn, "SuperWord");
}
}//for (int i = 0; i < body().length(); i++)
// Raise the compilation's recorded max vector size if we created a wider vector.
if (max_vlen_in_bytes > C->max_vector_size()) {
C->set_max_vector_size(max_vlen_in_bytes);
}
if (max_vlen_in_bytes > 0) {
cl->mark_loop_vectorized();
}
if (SuperWordLoopUnrollAnalysis) {
if (cl->has_passed_slp()) {
uint slp_max_unroll_factor = cl->slp_max_unroll();
if (slp_max_unroll_factor == max_vlen) {
#ifndef PRODUCT
if (TraceSuperWordLoopUnrollAnalysis) {
tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte);
}
#endif
// For atomic unrolled loops which are vector mapped, instigate more unrolling
cl->set_notpassed_slp();
// if vector resources are limited, do not allow additional unrolling
if (Matcher::float_pressure_limit() > 8) {
C->set_major_progress();
cl->mark_do_unroll_only();
}
}
}
}
return true;
}
//------------------------------vector_opd---------------------------
// Create a vector operand for the nodes in pack p for operand: in(opd_idx)
//
// Depending on the inputs of the pack at opd_idx, the result is one of:
//  - a PopulateIndex vector, if the first input is the iv and the inputs are
//    not all the same node,
//  - the input itself, if all inputs are the same node and it already is a
//    vector,
//  - a vector shift count, for the count input (opd_idx == 2) of a shift,
//  - a scalar2vector broadcast of the single common scalar input,
//  - a PackNode that collects the (different) scalar inputs.
// Returns nullptr (after a failing assert) in unexpected situations; the
// caller is responsible for bailing out in that case.
Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
  Node* p0 = p->at(0);
  uint vlen = p->size();
  Node* opd = p0->in(opd_idx);
  Node* same_input = _packset.same_inputs_at_index_or_null(p, opd_idx);

  // Insert index population operation to create a vector of increasing
  // indices starting from the iv value. In some special unrolled loops
  // (see JDK-8286125), we need scalar replications of the iv value if
  // all inputs are the same iv, so we do a same inputs check here.
  if (opd == iv() && same_input == nullptr) {
    BasicType p0_bt = velt_basic_type(p0);
    BasicType iv_bt = is_subword_type(p0_bt) ? p0_bt : T_INT;
    assert(VectorNode::is_populate_index_supported(iv_bt), "Should support");
    const TypeVect* vt = TypeVect::make(iv_bt, vlen);
    Node* vn = new PopulateIndexNode(iv(), igvn().intcon(1), vt);
    VectorNode::trace_new_vector(vn, "SuperWord");
    phase()->register_new_node_with_ctrl_of(vn, opd);
    return vn;
  }

  if (same_input != nullptr) {
    if (opd->is_Vector() || opd->is_LoadVector()) {
      if (opd_idx == 2 && VectorNode::is_shift(p0)) {
        assert(false, "shift's count can't be vector");
        return nullptr;
      }
      return opd; // input is matching vector
    }

    if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
      Node* cnt = opd;
      // Vector instructions do not mask shift count, do it here.
      juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
      const TypeInt* t = opd->find_int_type();
      if (t != nullptr && t->is_con()) {
        juint shift = t->get_con();
        if (shift > mask) { // Unsigned cmp
          cnt = igvn().intcon(shift & mask);
          phase()->set_ctrl(cnt, phase()->C->root());
        }
      } else {
        // Validate the operand type first, so that we do not create and
        // register an AndI on the operand just before bailing out.
        if (!opd->bottom_type()->isa_int()) {
          assert(false, "int type only");
          return nullptr;
        }
        // No int type found on the node, or a range that may exceed the
        // mask: mask the count explicitly.
        if (t == nullptr || t->_lo < 0 || t->_hi > (int)mask) {
          cnt = igvn().intcon(mask);
          cnt = new AndINode(opd, cnt);
          phase()->register_new_node_with_ctrl_of(cnt, opd);
        }
      }
      // Move shift count into vector register.
      cnt = VectorNode::shift_count(p0->Opcode(), cnt, vlen, velt_basic_type(p0));
      phase()->register_new_node_with_ctrl_of(cnt, opd);
      return cnt;
    }

    if (opd->is_StoreVector()) {
      assert(false, "StoreVector is not expected here");
      return nullptr;
    }

    // Convert scalar input to vector with the same number of elements as
    // p0's vector. Use p0's type because size of operand's container in
    // vector should match p0's size regardless operand's size.
    const Type* p0_t = nullptr;
    VectorNode* vn = nullptr;
    if (opd_idx == 2 && VectorNode::is_scalar_rotate(p0)) {
      // Rotate count: broadcast as int, or as long (via ConvI2L) if p0 is long.
      Node* conv = opd;
      p0_t = TypeInt::INT;
      if (p0->bottom_type()->isa_long()) {
        p0_t = TypeLong::LONG;
        conv = new ConvI2LNode(opd);
        phase()->register_new_node_with_ctrl_of(conv, opd);
      }
      vn = VectorNode::scalar2vector(conv, vlen, p0_t);
    } else {
      p0_t = velt_type(p0);
      vn = VectorNode::scalar2vector(opd, vlen, p0_t);
    }

    phase()->register_new_node_with_ctrl_of(vn, opd);
    VectorNode::trace_new_vector(vn, "SuperWord");
    return vn;
  }

  // Insert pack operation
  BasicType bt = velt_basic_type(p0);
  PackNode* pk = PackNode::make(opd, vlen, bt);
  DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); )

  // The first input (opd) was already handed to PackNode::make above.
  for (uint i = 1; i < vlen; i++) {
    Node* pi = p->at(i);
    Node* in = pi->in(opd_idx);
    if (get_pack(in) != nullptr) {
      assert(false, "Should already have been unpacked");
      return nullptr;
    }
    assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
    pk->add_opd(in);
    if (VectorNode::is_muladds2i(pi)) {
      // MulAddS2I: also add the input two positions further on.
      Node* in2 = pi->in(opd_idx + 2);
      if (get_pack(in2) != nullptr) {
        assert(false, "Should already have been unpacked");
        return nullptr;
      }
      assert(opd_bt == in2->bottom_type()->basic_type(), "all same type");
      pk->add_opd(in2);
    }
  }
  phase()->register_new_node_with_ctrl_of(pk, opd);
  VectorNode::trace_new_vector(pk, "SuperWord");
  return pk;
}
#ifdef ASSERT
// Debug-only check: for every pack (p_def), each use of each of its members
// must be in a pack and must be a proper vector use of that def. Store packs
// are skipped because they have no uses, and uses of marked reductions are
// not checked.
void SuperWord::verify_no_extract() {
  for (int pack_idx = 0; pack_idx < _packset.length(); pack_idx++) {
    Node_List* p_def = _packset.at(pack_idx);

    // A vector store has no uses
    if (p_def->at(0)->is_Store()) {
      continue;
    }

    for (uint def_idx = 0; def_idx < p_def->size(); def_idx++) {
      Node* def = p_def->at(def_idx);
      for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
        Node* use = def->fast_out(j);
        // A use may reference def over several input edges, check them all.
        for (uint k = 0; k < use->req(); k++) {
          if (use->in(k) != def) {
            continue;
          }
          if (is_marked_reduction(def)) {
            continue;
          }
          assert(get_pack(use) != nullptr && is_vector_use(use, k), "all uses must be vector uses");
        }
      }
    }
  }
}
#endif
// Returns true if the pack uses of n_super are a superset of the pack uses of
// n_sub: for every edge use_sub->in(i) == n_sub where use_sub is in a pack,
// n_super must have a use in that same pack over input i (for MulAddS2I uses,
// being in the same pack suffices). n_super and n_sub must be in the same pack.
bool SuperWord::has_use_pack_superset(const Node* n_super, const Node* n_sub) const {
  Node_List* pack = get_pack(n_super);
  assert(pack != nullptr && pack == get_pack(n_sub), "must have the same pack");

  for (DUIterator_Fast jmax, j = n_sub->fast_outs(jmax); j < jmax; j++) {
    Node* use_sub = n_sub->fast_out(j);
    Node_List* pack_use_sub = get_pack(use_sub);
    if (pack_use_sub != nullptr) {
      // Only the vector operand range of the packed use is of interest.
      uint start, end;
      VectorNode::vector_operands(use_sub, &start, &end);
      for (uint i = start; i < end; i++) {
        if (use_sub->in(i) == n_sub) {
          // Search the uses of n_super for a matching edge into the same pack.
          bool has_matching_edge = false;
          for (DUIterator_Fast kmax, k = n_super->fast_outs(kmax); k < kmax; k++) {
            Node* use_super = n_super->fast_out(k);
            if (get_pack(use_super) == pack_use_sub) {
              // Same pack. The edge must be over the same input index, except
              // for MulAddS2I, where defs are expected over different inputs.
              if (use_super->in(i) == n_super || VectorNode::is_muladds2i(use_super)) {
                has_matching_edge = true;
                break;
              }
            }
          }
          if (!has_matching_edge) {
            // n_sub feeds a pack over input i, but n_super has no such edge
            // into that pack: the uses of n_super are not a superset.
            return false;
          }
        }
      }
    }
  }

  // Every packed use-edge of n_sub has a counterpart at n_super.
  return true;
}
// Look for a position inside the pack where the left and the right neighbour
// disagree on their def packs or on their use packs. Such a position is a
// natural place to split the pack so that use and def packs match up. The
// neighbours are examined from the back of the pack towards the front, and the
// index of the right neighbour of the first mismatch is returned. Returns zero
// if there is no such boundary.
uint SuperWord::find_use_def_boundary(const Node_List* pack) const {
  Node* first = pack->at(0);
  Node* second = pack->at(1);
  const bool is_reduction_pack = reduction(first, second);

  // Range of inputs to compare.
  uint start, end;
  VectorNode::vector_operands(first, &start, &end);

  for (int i = pack->size() - 2; i >= 0; i--) {
    Node* left = pack->at(i);
    Node* right = pack->at(i + 1);

    // 1. Defs: both neighbours must take each input from the same pack. The
    //    only tolerated exception is the reduction edge between the two.
    for (uint j = start; j < end; j++) {
      Node* left_def = left->in(j);
      Node* right_def = right->in(j);
      if (get_pack(left_def) == get_pack(right_def)) {
        continue;
      }
      const bool is_reduction_edge = is_reduction_pack &&
                                     (left == right_def || right == left_def);
      if (!is_reduction_edge) {
        return i + 1;
      }
    }

    // 2. Uses: they match if each neighbour's pack uses are a superset of the
    //    other's. Reductions have no pack uses, so they match trivially.
    if (is_reduction_pack) {
      continue;
    }
    if (!has_use_pack_superset(left, right) ||
        !has_use_pack_superset(right, left)) {
      return i + 1;
    }
  }
  return 0;
}
//------------------------------is_vector_use---------------------------
// Is use->in(u_idx) a vector use?
//
// That is the case if use is in a pack (u_pk) and one of these holds:
//  - use is a marked reduction and u_idx == 1 (internal connection),
//  - the def is not in a pack, and the inputs of u_pk at u_idx either form an
//    index population (iv, iv + 1, iv + 2, ...) or are all the same scalar,
//  - the def is in a pack (d_pk) of compatible element size, and the i-th
//    element of u_pk uses the i-th element of d_pk at u_idx (for MulAddS2I,
//    d_pk must have twice the size of u_pk instead).
bool SuperWord::is_vector_use(Node* use, int u_idx) const {
  Node_List* u_pk = get_pack(use);
  if (u_pk == nullptr) { return false; }

  // Reduction: first input is internal connection.
  if (is_marked_reduction(use) && u_idx == 1) {
#ifdef ASSERT
    for (uint i = 1; i < u_pk->size(); i++) {
      assert(u_pk->at(i - 1) == u_pk->at(i)->in(1), "internal connection");
    }
#endif
    return true;
  }

  Node* def = use->in(u_idx);
  Node_List* d_pk = get_pack(def);
  if (d_pk == nullptr) {
    Node* n = u_pk->at(0)->in(u_idx);
    if (n == iv()) {
      // check for index population
      BasicType bt = velt_basic_type(use);
      if (!VectorNode::is_populate_index_supported(bt)) { return false; }
      for (uint i = 1; i < u_pk->size(); i++) {
        // We can create a vector filled with iv indices if all other nodes
        // in use pack have inputs of iv plus node index.
        // NOTE(review): is_Add() may match more node kinds than integer
        // addition - confirm whether this should be restricted to Op_AddI.
        Node* use_in = u_pk->at(i)->in(u_idx);
        if (!use_in->is_Add() || use_in->in(1) != n) { return false; }
        // isa_int() yields nullptr for a non-int type, so the null check
        // below is meaningful; is_int() would assert instead.
        const TypeInt* offset_t = use_in->in(2)->bottom_type()->isa_int();
        if (offset_t == nullptr || !offset_t->is_con() ||
            offset_t->get_con() != (jint) i) {
          return false;
        }
      }
    } else {
      // check for scalar promotion
      for (uint i = 1; i < u_pk->size(); i++) {
        if (u_pk->at(i)->in(u_idx) != n) { return false; }
      }
    }
    return true;
  }

  if (!is_velt_basic_type_compatible_use_def(use, def)) {
    return false;
  }

  if (VectorNode::is_muladds2i(use)) {
    // MulAddS2I takes shorts and produces ints.
    return u_pk->size() * 2 == d_pk->size();
  }

  if (u_pk->size() != d_pk->size()) {
    return false;
  }

  // Element-wise match: the i-th use must use the i-th def.
  for (uint i = 0; i < u_pk->size(); i++) {
    Node* ui = u_pk->at(i);
    Node* di = d_pk->at(i);
    if (ui->in(u_idx) != di) {
      return false;
    }
  }
  return true;
}
// Is the output type of def compatible with the input type of use? In general
// this means that the vector element types of both have the same size in
// bytes; conversions and a few operations with differing input and output
// sizes are handled separately. Both nodes must be in the loop body.
bool SuperWord::is_velt_basic_type_compatible_use_def(Node* use, Node* def) const {
  assert(in_bb(def) && in_bb(use), "both use and def are in loop");

  const int use_opc = use->Opcode();

  // Conversions are trivially compatible.
  if (VectorNode::is_convert_opcode(use_opc)) {
    return true;
  }

  BasicType use_bt = velt_basic_type(use);
  BasicType def_bt = velt_basic_type(def);
  assert(is_java_primitive(use_bt), "sanity %s", type2name(use_bt));
  assert(is_java_primitive(def_bt), "sanity %s", type2name(def_bt));

  const int use_size = type2aelembytes(use_bt);
  const int def_size = type2aelembytes(def_bt);

  if (VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(use_opc)) {
    // Nodes like Long.bitCount: expect long input, and int output.
    return def_size == 8 && use_size == 4;
  }

  if (VectorNode::is_muladds2i(use)) {
    // MulAddS2I: expect short input, and int output.
    return def_size == 2 && use_size == 4;
  }

  // Default case: input size of use equals output size of def.
  return use_size == def_size;
}
// Fill _body with the nodes of the loop body in reverse-post-order, and map
// every body node to its index in _body (bb_idx).
// Returns VStatus::make_success() on success, else a VStatus failure carrying
// the failure message.
VStatus VLoopBody::construct() {
assert(_body.is_empty(), "body is empty");
// First pass over loop body:
// (1) Check that there are no unwanted nodes (LoadStore, MergeMem, data Proj).
// (2) Count number of nodes, and create a temporary map (_idx -> bb_idx).
// (3) Verify that all non-ctrl nodes have an input inside the loop.
int body_count = 0;
for (uint i = 0; i < _vloop.lpt()->_body.size(); i++) {
Node* n = _vloop.lpt()->_body.at(i);
set_bb_idx(n, i); // Create a temporary map
if (_vloop.in_bb(n)) {
body_count++;
if (n->is_LoadStore() || n->is_MergeMem() ||
(n->is_Proj() && !n->as_Proj()->is_CFG())) {
// Bailout if the loop has LoadStore, MergeMem or data Proj
// nodes. Superword optimization does not work with them.
#ifndef PRODUCT
if (_vloop.is_trace_body()) {
tty->print_cr("VLoopBody::construct: fails because of unhandled node:");
n->dump();
}
#endif
return VStatus::make_failure(VLoopBody::FAILURE_NODE_NOT_ALLOWED);
}
if (!n->is_CFG()) {
// Look for at least one input of the data node that is inside the loop.
bool found = false;
for (uint j = 0; j < n->req(); j++) {
Node* def = n->in(j);
if (def != nullptr && _vloop.in_bb(def)) {
found = true;
break;
}
}
if (!found) {
// If all inputs to a data-node are outside the loop, the node itself should be outside the loop.
#ifndef PRODUCT
if (_vloop.is_trace_body()) {
tty->print_cr("VLoopBody::construct: fails because data node in loop has no input in loop:");
n->dump();
}
#endif
return VStatus::make_failure(VLoopBody::FAILURE_UNEXPECTED_CTRL);
}
}
}
}
// Create a reverse-post-order list of nodes in body
ResourceMark rm;
GrowableArray<Node*> stack;
VectorSet visited;
VectorSet post_visited;
// The walk starts at the loop head, which is marked visited up front.
visited.set(bb_idx(_vloop.cl()));
stack.push(_vloop.cl());
// Do a depth first walk over out edges
// Post-visited nodes fill _body from the back (index body_count - 1) to the
// front, which yields the reverse of the post-order.
int rpo_idx = body_count - 1;
while (!stack.is_empty()) {
Node* n = stack.top(); // Leave node on stack
if (!visited.test_set(bb_idx(n))) {
// forward arc in graph
// First encounter: n is only marked visited here, and stays on the stack.
} else if (!post_visited.test(bb_idx(n))) {
// cross or back arc
// n is visited but not yet post-visited: push its unvisited uses; if there
// are none left, n is post-visited below.
const int old_length = stack.length();
// If a Load depends on the same memory state as a Store, we must make sure that
// the Load is ordered before the Store.
//
//      mem
//       |
//    +--+--+
//    |     |
//    |    Load (n)
//    |
//   Store (mem_use)
//
if (n->is_Load()) {
Node* mem = n->in(MemNode::Memory);
for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
Node* mem_use = mem->fast_out(i);
if (mem_use->is_Store() && _vloop.in_bb(mem_use) && !visited.test(bb_idx(mem_use))) {
stack.push(mem_use); // Ordering edge: Load (n) -> Store (mem_use)
}
}
}
for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
Node* use = n->fast_out(i);
if (_vloop.in_bb(use) && !visited.test(bb_idx(use)) &&
// Don't go around backedge
(!use->is_Phi() || n == _vloop.cl())) {
stack.push(use); // Ordering edge: n -> use
}
}
if (stack.length() == old_length) {
// There were no additional uses, post visit node now
stack.pop(); // Remove node from stack
assert(rpo_idx >= 0, "must still have idx to pass out");
_body.at_put_grow(rpo_idx, n);
rpo_idx--;
post_visited.set(bb_idx(n));
assert(rpo_idx >= 0 || stack.is_empty(), "still have idx left or are finished");
}
} else {
stack.pop(); // Remove post-visited node from stack
}
}
// Create real map of body indices for nodes
// This overwrites the temporary map from the first pass for all body nodes.
for (int j = 0; j < _body.length(); j++) {
Node* n = _body.at(j);
set_bb_idx(n, j);
}
#ifndef PRODUCT
if (_vloop.is_trace_body()) {
print();
}
#endif
assert(rpo_idx == -1 && body_count == _body.length(), "all body members found");
return VStatus::make_success();
}
// For a conversion node (or a scalar op that returns int while its vector op
// returns long) whose input is in the loop body, return the larger of the
// vector element types of input and output. Returns T_ILLEGAL if n is no such
// node, if a type is not a Java primitive or is an unsigned subword type, or
// if input and output have the same size.
BasicType SuperWord::longer_type_for_conversion(Node* n) const {
  const int opc = n->Opcode();
  const bool is_candidate =
      VectorNode::is_convert_opcode(opc) ||
      VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(opc);
  if (!is_candidate || !in_bb(n->in(1))) {
    return T_ILLEGAL;
  }
  assert(in_bb(n), "must be in the bb");

  BasicType src_t = velt_basic_type(n->in(1));
  BasicType dst_t = velt_basic_type(n);

  // Do not use superword for non-primitives.
  // Superword does not support casting involving unsigned types.
  const bool src_supported = is_java_primitive(src_t) && !is_unsigned_subword_type(src_t);
  const bool dst_supported = is_java_primitive(dst_t) && !is_unsigned_subword_type(dst_t);
  if (!src_supported || !dst_supported) {
    return T_ILLEGAL;
  }

  const int src_size = type2aelembytes(src_t);
  const int dst_size = type2aelembytes(dst_t);
  if (src_size == dst_size) {
    return T_ILLEGAL;
  }
  return (src_size > dst_size) ? src_t : dst_t;
}
// Compute the vector element type (velt_type) of every node in the loop body.
// The computation runs in three passes over the body:
//   1. Every node starts with its container_type().
//   2. In reverse body order, a node whose element type is T_INT may hand a
//      narrower element type to its operands (see the conditions in the loop).
//   3. Bool / Cmp nodes without a control input take the element type of the
//      Cmp input that is in the loop.
// In non-product builds the result is dumped when tracing is enabled.
void VLoopTypes::compute_vector_element_type() {
#ifndef PRODUCT
if (_vloop.is_trace_vector_element_type()) {
tty->print_cr("\nVLoopTypes::compute_vector_element_type:");
}
#endif
const GrowableArray<Node*>& body = _body.body();
assert(_velt_type.is_empty(), "must not yet be computed");
// reserve space
_velt_type.at_put_grow(body.length()-1, nullptr);
// Initial type
for (int i = 0; i < body.length(); i++) {
Node* n = body.at(i);
set_velt_type(n, container_type(n));
}
// Propagate integer narrowed type backwards through operations
// that don't depend on higher order bits
for (int i = body.length() - 1; i >= 0; i--) {
Node* n = body.at(i);
// Only integer types need be examined
const Type* vtn = velt_type(n);
if (vtn->basic_type() == T_INT) {
uint start, end;
VectorNode::vector_operands(n, &start, &end);
for (uint j = start; j < end; j++) {
Node* in = n->in(j);
// Don't propagate through a memory
// The operand must also be in the loop, have a T_INT element type, and
// currently have a larger data size than n.
if (!in->is_Mem() &&
_vloop.in_bb(in) &&
velt_type(in)->basic_type() == T_INT &&
data_size(n) < data_size(in)) {
// The operand may only be narrowed if every one of its uses is in the
// loop and has the same element type as n.
bool same_type = true;
for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
Node *use = in->fast_out(k);
if (!_vloop.in_bb(use) || !same_velt_type(use, n)) {
same_type = false;
break;
}
}
if (same_type) {
// In any Java arithmetic operation, operands of small integer types
// (boolean, byte, char & short) should be promoted to int first.
// During narrowed integer type backward propagation, for some operations
// like RShiftI, Abs, and ReverseBytesI,
// the compiler has to know the higher order bits of the 1st operand,
// which will be lost in the narrowed type. These operations shouldn't
// be vectorized if the higher order bits info is imprecise.
const Type* vt = vtn;
int op = in->Opcode();
if (VectorNode::is_shift_opcode(op) || op == Op_AbsI || op == Op_ReverseBytesI) {
Node* load = in->in(1);
if (load->is_Load() &&
_vloop.in_bb(load) &&
(velt_type(load)->basic_type() == T_INT)) {
// Only Load nodes distinguish signed (LoadS/LoadB) and unsigned
// (LoadUS/LoadUB) values. Store nodes only have one version.
vt = velt_type(load);
} else if (op != Op_LShiftI) {
// Widen type to int to avoid the creation of vector nodes. Note
// that left shifts work regardless of the signedness.
vt = TypeInt::INT;
}
}
set_velt_type(in, vt);
}
}
}
}
}
// Bool and Cmp nodes without a control input take the element type of the
// Cmp's input that is in the loop: in(1) if it is in the loop, otherwise in(2).
// For a Bool, the type is looked up through the Cmp above it but is set on the
// Bool itself (n).
for (int i = 0; i < body.length(); i++) {
Node* n = body.at(i);
Node* nn = n;
if (nn->is_Bool() && nn->in(0) == nullptr) {
nn = nn->in(1);
assert(nn->is_Cmp(), "always have Cmp above Bool");
}
if (nn->is_Cmp() && nn->in(0) == nullptr) {
assert(_vloop.in_bb(nn->in(1)) || _vloop.in_bb(nn->in(2)),
"one of the inputs must be in the loop, too");
if (_vloop.in_bb(nn->in(1))) {
set_velt_type(n, velt_type(nn->in(1)));
} else {
set_velt_type(n, velt_type(nn->in(2)));
}
}
}
#ifndef PRODUCT
// Tracing: print each body node preceded by its computed element type.
if (_vloop.is_trace_vector_element_type()) {
for (int i = 0; i < body.length(); i++) {
Node* n = body.at(i);
velt_type(n)->dump();
tty->print("\t");
n->dump();
}
}
#endif
}
// Smallest type containing the range of values of n.
//   - Memory nodes: derived from the memory type of the access, with two
//     adjustments (char stores, unsigned byte loads) described below.
//   - Other nodes: the igvn type of n, except that every T_INT type is
//     reported as the full TypeInt::INT.
const Type* VLoopTypes::container_type(Node* n) const {
  if (!n->is_Mem()) {
    const Type* igvn_type = _vloop.phase()->igvn().type(n);
    if (igvn_type->basic_type() != T_INT) {
      return igvn_type;
    }
    // The narrow type of an arithmetic operation is determined later, by
    // propagating the type of the memory operations.
    return TypeInt::INT;
  }

  BasicType mem_bt = n->as_Mem()->memory_type();
  if ((mem_bt == T_CHAR) && n->is_Store()) {
    // Stored values use T_SHORT rather than T_CHAR, because any preceding
    // arithmetic operation extends values to signed Int.
    mem_bt = T_SHORT;
  }
  if (n->Opcode() == Op_LoadUB) {
    // Unsigned byte loads need their own type, which matters for right shifts.
    // There is no basic type representing TypeInt::UBYTE, so T_BOOLEAN is
    // used instead. That is fine for vectors because only the size (one byte)
    // and the sign are important.
    mem_bt = T_BOOLEAN;
  }
  return Type::get_const_basic_type(mem_bt);
}
// Two memory nodes are in the same memory slice if their address types map
// to the same alias index.
bool VLoopMemorySlices::same_memory_slice(MemNode* m1, MemNode* m2) const {
  const int alias_idx1 = _vloop.phase()->C->get_alias_index(m1->adr_type());
  const int alias_idx2 = _vloop.phase()->C->get_alias_index(m2->adr_type());
  return alias_idx1 == alias_idx2;
}
// Compute the control dependency for a pack of loads.
// The result starts as DependsOnlyOnTest. Every load that does not depend only
// on its test upgrades it: to UnknownControl if that load has an unknown
// control dependency and the result is not already Pinned, otherwise to Pinned.
LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) {
  LoadNode::ControlDependency result = LoadNode::DependsOnlyOnTest;
  const uint pack_size = p->size();
  for (uint idx = 0; idx < pack_size; idx++) {
    Node* member = p->at(idx);
    assert(member->is_Load(), "only meaningful for loads");
    if (member->depends_only_on_test()) {
      continue; // This load does not restrict the result.
    }
    const bool upgrade_to_unknown =
        member->as_Load()->has_unknown_control_dependency() &&
        result != LoadNode::Pinned;
    // Either upgrade to unknown control, or we must pin.
    result = upgrade_to_unknown ? LoadNode::UnknownControl : LoadNode::Pinned;
  }
  return result;
}
// Find the memop pack with the maximum vector width, unless they were already
// determined by SuperWord::filter_packs_for_alignment().
void SuperWord::determine_mem_ref_and_aw_for_main_loop_alignment() {
if (_mem_ref_for_main_loop_alignment != nullptr) {
assert(VLoop::vectors_should_be_aligned(), "mem_ref only set if filtered for alignment");
return;
}
MemNode const* mem_ref = nullptr;
int max_aw = 0;
for (int i = 0; i < _packset.length(); i++) {
Node_List* pack = _packset.at(i);
MemNode* first = pack->at(0)->isa_Mem();
if (first == nullptr) { continue; }
int vw = first->memory_size() * pack->size();
if (vw > max_aw) {
max_aw = vw;
mem_ref = first;
}
}
assert(mem_ref != nullptr && max_aw > 0, "found mem_ref and aw");
_mem_ref_for_main_loop_alignment = mem_ref;
_aw_for_main_loop_alignment = max_aw;
}
// Tracing helper for the alignment code: if is_trace_align_vector() is true,
// print the spelling of the argument expression as a label and dump the node.
// The body is wrapped in DEBUG_ONLY (presumably compiled only in debug builds).
// The definition ends on the closing brace: there is deliberately no line
// continuation after it, so the line that follows the macro in the file is
// never spliced into the macro body.
#define TRACE_ALIGN_VECTOR_NODE(node) { \
  DEBUG_ONLY(                           \
    if (is_trace_align_vector()) {      \
      tty->print(" " #node ": ");       \
      (node)->dump();                   \
    }                                   \
  )                                     \
}
// Ensure that the main loop vectors are aligned by adjusting the pre loop limit. We memory-align
// the address of "_mem_ref_for_main_loop_alignment" to "_aw_for_main_loop_alignment", which is a
// sufficiently large alignment width. We adjust the pre-loop iteration count by adjusting the
// pre-loop limit.
// If stride or scale is zero or not a power of 2, or if abs(scale) >= aw, the function returns
// without changing the pre-loop limit.
void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
determine_mem_ref_and_aw_for_main_loop_alignment();
const MemNode* align_to_ref = _mem_ref_for_main_loop_alignment;
const int aw = _aw_for_main_loop_alignment;
assert(align_to_ref != nullptr && aw > 0, "must have alignment reference and aw");
assert(cl()->is_main_loop(), "can only do alignment for main loop");
// The opaque node for the limit, where we adjust the input
Opaque1Node* pre_opaq = _vloop.pre_loop_end()->limit()->as_Opaque1();
// Current pre-loop limit.
Node* old_limit = pre_opaq->in(1);
// Where we put new limit calculations.
Node* pre_ctrl = _vloop.pre_loop_head()->in(LoopNode::EntryControl);
// Ensure the original loop limit is available from the pre-loop Opaque1 node.
Node* orig_limit = pre_opaq->original_loop_limit();
assert(orig_limit != nullptr && igvn().type(orig_limit) != Type::TOP, "");
const VPointer& align_to_ref_p = vpointer(align_to_ref);
assert(align_to_ref_p.valid(), "sanity");
// For the main-loop, we want the address of align_to_ref to be memory aligned
// with some alignment width (aw, a power of 2). When we enter the main-loop,
// we know that iv is equal to the pre-loop limit. If we adjust the pre-loop
// limit by executing adjust_pre_iter many extra iterations, we can change the
// alignment of the address.
//
// adr = base + offset + invar + scale * iv (1)
// adr % aw = 0 (2)
//
// Note, that we are defining the modulo operator "%" such that the remainder is
// always positive, see AlignmentSolution::mod(i, q). Since we are only computing
// modulo with powers of 2, we can instead simply use the last log2(q) bits of
// a number i, to get "i % q". This is performed with a bitmask.
//
// The limit of the pre-loop needs to be adjusted:
//
// old_limit: current pre-loop limit
// new_limit: new pre-loop limit
// adjust_pre_iter: additional pre-loop iterations for alignment adjustment
//
// We want to find adjust_pre_iter, such that the address is aligned when entering
// the main-loop:
//
// iv = new_limit = old_limit + adjust_pre_iter (3a, stride > 0)
// iv = new_limit = old_limit - adjust_pre_iter (3b, stride < 0)
//
// We define boi as:
//
// boi = base + offset + invar (4)
//
// And now we can simplify the address using (1), (3), and (4):
//
// adr = boi + scale * new_limit
// adr = boi + scale * (old_limit + adjust_pre_iter) (5a, stride > 0)
// adr = boi + scale * (old_limit - adjust_pre_iter) (5b, stride < 0)
//
// And hence we can restate (2) with (5), and solve the equation for adjust_pre_iter:
//
// (boi + scale * (old_limit + adjust_pre_iter)) % aw = 0 (6a, stride > 0)
// (boi + scale * (old_limit - adjust_pre_iter)) % aw = 0 (6b, stride < 0)
//
// In most cases, scale is the element size, for example:
//
// for (i = 0; i < a.length; i++) { a[i] = ...; }
//
// It is thus reasonable to assume that both abs(scale) and abs(stride) are
// strictly positive powers of 2. Further, they can be assumed to be non-zero,
// otherwise the address does not depend on iv, and the alignment cannot be
// affected by adjusting the pre-loop limit.
//
// Further, if abs(scale) >= aw, then adjust_pre_iter has no effect on alignment, and
// we are not able to affect the alignment at all. Hence, we require abs(scale) < aw.
//
// Moreover, for alignment to be achievable, boi must be a multiple of scale. If strict
// alignment is required (i.e. -XX:+AlignVector), this is guaranteed by the filtering
// done with the AlignmentSolver / AlignmentSolution. If strict alignment is not
// required, then alignment is still preferable for performance, but not necessary.
// In many cases boi will be a multiple of scale, but if it is not, then the adjustment
// does not guarantee alignment, but the code is still correct.
//
// Hence, in what follows we assume that boi is a multiple of scale, and in fact all
// terms in (6) are multiples of scale. Therefore we divide all terms by scale:
//
// AW = aw / abs(scale) (power of 2) (7)
// BOI = boi / abs(scale) (8)
//
// and restate (6), using (7) and (8), i.e. we divide (6) by abs(scale):
//
// (BOI + sign(scale) * (old_limit + adjust_pre_iter)) % AW = 0 (9a, stride > 0)
// (BOI + sign(scale) * (old_limit - adjust_pre_iter)) % AW = 0 (9b, stride < 0)
//
// where: sign(scale) = scale / abs(scale) = (scale > 0 ? 1 : -1)
//
// Note, (9) allows for periodic solutions of adjust_pre_iter, with periodicity AW.
// But we would like to spend as few iterations in the pre-loop as possible,
// hence we want the smallest adjust_pre_iter, and so:
//
// 0 <= adjust_pre_iter < AW (10)
//
// We solve (9) for adjust_pre_iter, in the following 4 cases:
//
// Case A: scale > 0 && stride > 0 (i.e. sign(scale) = 1)
// (BOI + old_limit + adjust_pre_iter) % AW = 0
// adjust_pre_iter = (-BOI - old_limit) % AW (11a)
//
// Case B: scale < 0 && stride > 0 (i.e. sign(scale) = -1)
// (BOI - old_limit - adjust_pre_iter) % AW = 0
// adjust_pre_iter = (BOI - old_limit) % AW (11b)
//
// Case C: scale > 0 && stride < 0 (i.e. sign(scale) = 1)
// (BOI + old_limit - adjust_pre_iter) % AW = 0
// adjust_pre_iter = (BOI + old_limit) % AW (11c)
//
// Case D: scale < 0 && stride < 0 (i.e. sign(scale) = -1)
// (BOI - old_limit + adjust_pre_iter) % AW = 0
// adjust_pre_iter = (-BOI + old_limit) % AW (11d)
//
// We now generalize the equations (11*) by using:
//
// OP: (stride > 0) ? SUB : ADD
// XBOI: (stride * scale > 0) ? -BOI : BOI
//
// which gives us the final pre-loop limit adjustment:
//
// adjust_pre_iter = (XBOI OP old_limit) % AW (12)
//
// We can construct XBOI by additionally defining:
//
// xboi = (stride * scale > 0) ? -boi : boi (13)
//
// which gives us:
//
// XBOI = (stride * scale > 0) ? -BOI : BOI
// = (stride * scale > 0) ? -boi / abs(scale) : boi / abs(scale)
// = xboi / abs(scale) (14)
//
// When we have computed adjust_pre_iter, we update the pre-loop limit
// with (3a, b). However, we have to make sure that the adjust_pre_iter
// additional pre-loop iterations do not lead the pre-loop to execute
// iterations that would step over the original limit (orig_limit) of
// the loop. Hence, we must constrain the updated limit as follows:
//
// constrained_limit = MIN(old_limit + adjust_pre_iter, orig_limit)
// = MIN(new_limit, orig_limit) (15a, stride > 0)
// constrained_limit = MAX(old_limit - adjust_pre_iter, orig_limit)
// = MAX(new_limit, orig_limit) (15b, stride < 0)
//
const int stride = iv_stride();
const int scale = align_to_ref_p.scale_in_bytes();
const int offset = align_to_ref_p.offset_in_bytes();
Node* base = align_to_ref_p.adr();
Node* invar = align_to_ref_p.invar();
#ifdef ASSERT
if (is_trace_align_vector()) {
tty->print_cr("\nadjust_pre_loop_limit_to_align_main_loop_vectors:");
tty->print(" align_to_ref:");
align_to_ref->dump();
tty->print_cr(" aw: %d", aw);
tty->print_cr(" stride: %d", stride);
tty->print_cr(" scale: %d", scale);
tty->print_cr(" offset: %d", offset);
tty->print(" base:");
base->dump();
if (invar == nullptr) {
tty->print_cr(" invar: null");
} else {
tty->print(" invar:");
invar->dump();
}
tty->print(" old_limit: ");
old_limit->dump();
tty->print(" orig_limit: ");
orig_limit->dump();
}
#endif
// Check the preconditions of the derivation above: stride and scale must be
// non-zero powers of 2 (in absolute value), and abs(scale) < aw.
if (stride == 0 || !is_power_of_2(abs(stride)) ||
scale == 0 || !is_power_of_2(abs(scale)) ||
abs(scale) >= aw) {
#ifdef ASSERT
if (is_trace_align_vector()) {
tty->print_cr(" Alignment cannot be affected by changing pre-loop limit because");
tty->print_cr(" stride or scale are not power of 2, or abs(scale) >= aw.");
}
#endif
// Cannot affect alignment, abort.
return;
}
assert(stride != 0 && is_power_of_2(abs(stride)) &&
scale != 0 && is_power_of_2(abs(scale)) &&
abs(scale) < aw, "otherwise we cannot affect alignment with pre-loop");
// Compute (7): alignment width in units of abs(scale).
const int AW = aw / abs(scale);
#ifdef ASSERT
if (is_trace_align_vector()) {
tty->print_cr(" AW = aw(%d) / abs(scale(%d)) = %d", aw, scale, AW);
}
#endif
// 1: Compute (13):
// xboi = -boi = (-base - offset - invar) (stride * scale > 0)
// xboi = +boi = (+base + offset + invar) (stride * scale < 0)
const bool is_sub = scale * stride > 0;
// 1.1: offset
Node* xboi = igvn().intcon(is_sub ? -offset : offset);
TRACE_ALIGN_VECTOR_NODE(xboi);
// 1.2: invar (if it exists)
if (invar != nullptr) {
if (igvn().type(invar)->isa_long()) {
// Computations are done % (vector width/element size) so it's
// safe to simply convert invar to an int and lose the upper 32
// bit half.
invar = new ConvL2INode(invar);
phase()->register_new_node(invar, pre_ctrl);
TRACE_ALIGN_VECTOR_NODE(invar);
}
if (is_sub) {
xboi = new SubINode(xboi, invar);
} else {
xboi = new AddINode(xboi, invar);
}
phase()->register_new_node(xboi, pre_ctrl);
TRACE_ALIGN_VECTOR_NODE(xboi);
}
// 1.3: base (unless base is guaranteed aw aligned)
if (aw > ObjectAlignmentInBytes || align_to_ref_p.base()->is_top()) {
// The base is only aligned with ObjectAlignmentInBytes with arrays.
// When the base() is top, we have no alignment guarantee at all.
// Hence, we must now take the base into account for the calculation.
Node* xbase = new CastP2XNode(nullptr, base);
phase()->register_new_node(xbase, pre_ctrl);
TRACE_ALIGN_VECTOR_NODE(xbase);
#ifdef _LP64
// On 64-bit platforms, narrow the casted base to an int before it is used
// in the int arithmetic below.
xbase = new ConvL2INode(xbase);
phase()->register_new_node(xbase, pre_ctrl);
TRACE_ALIGN_VECTOR_NODE(xbase);
#endif
if (is_sub) {
xboi = new SubINode(xboi, xbase);
} else {
xboi = new AddINode(xboi, xbase);
}
phase()->register_new_node(xboi, pre_ctrl);
TRACE_ALIGN_VECTOR_NODE(xboi);
}
// 2: Compute (14):
// XBOI = xboi / abs(scale)
// The division is executed as shift
Node* log2_abs_scale = igvn().intcon(exact_log2(abs(scale)));
Node* XBOI = new URShiftINode(xboi, log2_abs_scale);
phase()->register_new_node(XBOI, pre_ctrl);
TRACE_ALIGN_VECTOR_NODE(log2_abs_scale);
TRACE_ALIGN_VECTOR_NODE(XBOI);
// 3: Compute (12):
// adjust_pre_iter = (XBOI OP old_limit) % AW
//
// 3.1: XBOI_OP_old_limit = XBOI OP old_limit
Node* XBOI_OP_old_limit = nullptr;
if (stride > 0) {
XBOI_OP_old_limit = new SubINode(XBOI, old_limit);
} else {
XBOI_OP_old_limit = new AddINode(XBOI, old_limit);
}
phase()->register_new_node(XBOI_OP_old_limit, pre_ctrl);
TRACE_ALIGN_VECTOR_NODE(XBOI_OP_old_limit);
// 3.2: Compute:
// adjust_pre_iter = (XBOI OP old_limit) % AW
// = XBOI_OP_old_limit % AW
// = XBOI_OP_old_limit AND (AW - 1)
// Since AW is a power of 2, the modulo operation can be replaced with
// a bitmask operation.
Node* mask_AW = igvn().intcon(AW-1);
Node* adjust_pre_iter = new AndINode(XBOI_OP_old_limit, mask_AW);
phase()->register_new_node(adjust_pre_iter, pre_ctrl);
TRACE_ALIGN_VECTOR_NODE(mask_AW);
TRACE_ALIGN_VECTOR_NODE(adjust_pre_iter);
// 4: Compute (3a, b):
// new_limit = old_limit + adjust_pre_iter (stride > 0)
// new_limit = old_limit - adjust_pre_iter (stride < 0)
Node* new_limit = nullptr;
if (stride < 0) {
new_limit = new SubINode(old_limit, adjust_pre_iter);
} else {
new_limit = new AddINode(old_limit, adjust_pre_iter);
}
phase()->register_new_node(new_limit, pre_ctrl);
TRACE_ALIGN_VECTOR_NODE(new_limit);
// 5: Compute (15a, b):
// Prevent pre-loop from going past the original limit of the loop.
Node* constrained_limit =
(stride > 0) ? (Node*) new MinINode(new_limit, orig_limit)
: (Node*) new MaxINode(new_limit, orig_limit);
phase()->register_new_node(constrained_limit, pre_ctrl);
TRACE_ALIGN_VECTOR_NODE(constrained_limit);
// 6: Hack the pre-loop limit: make the constrained limit the new input (1) of
// the pre-loop Opaque1 node.
igvn().replace_input_of(pre_opaq, 1, constrained_limit);
}
#ifndef PRODUCT
void PairSet::print() const {
tty->print_cr("\nPairSet::print: %d pairs", length());
int chain = 0;
int chain_index = 0;
for (PairSetIterator pair(*this); !pair.done(); pair.next()) {
Node* left = pair.left();
Node* right = pair.right();
if (is_left_in_a_left_most_pair(left)) {
chain_index = 0;
tty->print_cr(" Pair-chain %d:", chain++);
tty->print(" %3d: ", chain_index++);
left->dump();
}
tty->print(" %3d: ", chain_index++);
right->dump();
}
}
void PackSet::print() const {
tty->print_cr("\nPackSet::print: %d packs", _packs.length());
for (int i = 0; i < _packs.length(); i++) {
tty->print_cr(" Pack: %d", i);
Node_List* pack = _packs.at(i);
if (pack == nullptr) {
tty->print_cr(" nullptr");
} else {
print_pack(pack);
}
}
}
// Print the members of a single pack, each prefixed with its position in the pack.
void PackSet::print_pack(Node_List* pack) {
  uint pos = 0;
  while (pos < pack->size()) {
    tty->print(" %3d: ", pos);
    pack->at(pos)->dump();
    pos++;
  }
}
#endif
#ifndef PRODUCT
void VLoopBody::print() const {
tty->print_cr("\nBlock");
for (int i = 0; i < body().length(); i++) {
Node* n = body().at(i);
tty->print("%d ", i);
if (n != nullptr) {
n->dump();
}
}
}
#endif
//
// --------------------------------- vectorization/simd -----------------------------------
//
// True if both nodes are non-null and the clone map reports the same idx
// (CloneMap::same_idx) for their node indices.
bool SuperWord::same_origin_idx(Node* a, Node* b) const {
  if (a == nullptr || b == nullptr) {
    return false;
  }
  return _clone_map.same_idx(a->_idx, b->_idx);
}
// True if both nodes are non-null and the clone map reports the same
// generation (CloneMap::same_gen) for their node indices.
bool SuperWord::same_generation(Node* a, Node* b) const {
  if (a == nullptr || b == nullptr) {
    return false;
  }
  return _clone_map.same_gen(a->_idx, b->_idx);
}