mirror of
https://github.com/openjdk/jdk.git
synced 2026-03-04 05:00:31 +00:00
3569 lines
129 KiB
C++
3569 lines
129 KiB
C++
/*
|
|
* Copyright (c) 2007, 2024, Oracle and/or its affiliates. All rights reserved.
|
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
|
*
|
|
* This code is free software; you can redistribute it and/or modify it
|
|
* under the terms of the GNU General Public License version 2 only, as
|
|
* published by the Free Software Foundation.
|
|
*
|
|
* This code is distributed in the hope that it will be useful, but WITHOUT
|
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
* version 2 for more details (a copy is included in the LICENSE file that
|
|
* accompanied this code).
|
|
*
|
|
* You should have received a copy of the GNU General Public License version
|
|
* 2 along with this work; if not, write to the Free Software Foundation,
|
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
*
|
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
|
* or visit www.oracle.com if you need additional information or have any
|
|
* questions.
|
|
*/
|
|
|
|
#include "precompiled.hpp"
|
|
#include "libadt/vectset.hpp"
|
|
#include "memory/allocation.inline.hpp"
|
|
#include "memory/resourceArea.hpp"
|
|
#include "opto/addnode.hpp"
|
|
#include "opto/c2compiler.hpp"
|
|
#include "opto/castnode.hpp"
|
|
#include "opto/convertnode.hpp"
|
|
#include "opto/matcher.hpp"
|
|
#include "opto/memnode.hpp"
|
|
#include "opto/opcodes.hpp"
|
|
#include "opto/opaquenode.hpp"
|
|
#include "opto/rootnode.hpp"
|
|
#include "opto/superword.hpp"
|
|
#include "opto/vectornode.hpp"
|
|
#include "opto/movenode.hpp"
|
|
#include "utilities/powerOfTwo.hpp"
|
|
|
|
// Set up the SuperWord state for the loop described by vloop_analyzer.
// The pairset and packset are both handed a pointer to this object's arena and the
// analyzer. No memory reference is selected for main-loop alignment yet, and the
// work counters start at zero.
SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) :
  _vloop_analyzer(vloop_analyzer),
  _vloop(vloop_analyzer.vloop()),
  _arena(mtCompiler),
  // Map of nodes created in cloning.
  _clone_map(phase()->C->clone_map()),
  _pairset(&_arena, _vloop_analyzer),
  // Non-product builds additionally pass the packset/rejection tracing flags.
  _packset(&_arena, _vloop_analyzer
           NOT_PRODUCT(COMMA is_trace_superword_packset())
           NOT_PRODUCT(COMMA is_trace_superword_rejections())
           ),
  _mem_ref_for_main_loop_alignment(nullptr),
  _aw_for_main_loop_alignment(0),
  // Whether to do vectorization/simd style.
  _do_vector_loop(phase()->C->do_vector_loop()),
  // Amount of vector work we have.
  _num_work_vecs(0),
  // Amount of reduction work we have.
  _num_reductions(0)
{}
|
|
|
|
void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_factor) {
|
|
IdealLoopTree* lpt = vloop.lpt();
|
|
CountedLoopNode* cl = vloop.cl();
|
|
Node* cl_exit = vloop.cl_exit();
|
|
PhaseIdealLoop* phase = vloop.phase();
|
|
|
|
bool is_slp = true;
|
|
size_t ignored_size = lpt->_body.size();
|
|
int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
|
|
Node_Stack nstack((int)ignored_size);
|
|
|
|
// First clear the entries
|
|
for (uint i = 0; i < lpt->_body.size(); i++) {
|
|
ignored_loop_nodes[i] = -1;
|
|
}
|
|
|
|
int max_vector = Matcher::max_vector_size_auto_vectorization(T_BYTE);
|
|
|
|
// Process the loop, some/all of the stack entries will not be in order, ergo
|
|
// need to preprocess the ignored initial state before we process the loop
|
|
for (uint i = 0; i < lpt->_body.size(); i++) {
|
|
Node* n = lpt->_body.at(i);
|
|
if (n == cl->incr() ||
|
|
n->is_AddP() ||
|
|
n->is_Cmp() ||
|
|
n->is_Bool() ||
|
|
n->is_IfTrue() ||
|
|
n->is_CountedLoop() ||
|
|
(n == cl_exit)) {
|
|
ignored_loop_nodes[i] = n->_idx;
|
|
continue;
|
|
}
|
|
|
|
if (n->is_If()) {
|
|
IfNode *iff = n->as_If();
|
|
if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
|
|
if (lpt->is_loop_exit(iff)) {
|
|
ignored_loop_nodes[i] = n->_idx;
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (n->is_memory_phi()) {
|
|
Node* n_tail = n->in(LoopNode::LoopBackControl);
|
|
if (n_tail != n->in(LoopNode::EntryControl)) {
|
|
if (!n_tail->is_Mem()) {
|
|
is_slp = false;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// This must happen after check of phi/if
|
|
if (n->is_Phi() || n->is_If()) {
|
|
ignored_loop_nodes[i] = n->_idx;
|
|
continue;
|
|
}
|
|
|
|
if (n->is_LoadStore() || n->is_MergeMem() ||
|
|
(n->is_Proj() && !n->as_Proj()->is_CFG())) {
|
|
is_slp = false;
|
|
break;
|
|
}
|
|
|
|
// Ignore nodes with non-primitive type.
|
|
BasicType bt;
|
|
if (n->is_Mem()) {
|
|
bt = n->as_Mem()->memory_type();
|
|
} else {
|
|
bt = n->bottom_type()->basic_type();
|
|
}
|
|
if (is_java_primitive(bt) == false) {
|
|
ignored_loop_nodes[i] = n->_idx;
|
|
continue;
|
|
}
|
|
|
|
if (n->is_Mem()) {
|
|
MemNode* current = n->as_Mem();
|
|
Node* adr = n->in(MemNode::Address);
|
|
Node* n_ctrl = phase->get_ctrl(adr);
|
|
|
|
// save a queue of post process nodes
|
|
if (n_ctrl != nullptr && lpt->is_member(phase->get_loop(n_ctrl))) {
|
|
// Process the memory expression
|
|
int stack_idx = 0;
|
|
bool have_side_effects = true;
|
|
if (adr->is_AddP() == false) {
|
|
nstack.push(adr, stack_idx++);
|
|
} else {
|
|
// Mark the components of the memory operation in nstack
|
|
VPointer p1(current, vloop, &nstack);
|
|
have_side_effects = p1.node_stack()->is_nonempty();
|
|
}
|
|
|
|
// Process the pointer stack
|
|
while (have_side_effects) {
|
|
Node* pointer_node = nstack.node();
|
|
for (uint j = 0; j < lpt->_body.size(); j++) {
|
|
Node* cur_node = lpt->_body.at(j);
|
|
if (cur_node == pointer_node) {
|
|
ignored_loop_nodes[j] = cur_node->_idx;
|
|
break;
|
|
}
|
|
}
|
|
nstack.pop();
|
|
have_side_effects = nstack.is_nonempty();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (is_slp) {
|
|
// Now we try to find the maximum supported consistent vector which the machine
|
|
// description can use
|
|
bool flag_small_bt = false;
|
|
for (uint i = 0; i < lpt->_body.size(); i++) {
|
|
if (ignored_loop_nodes[i] != -1) continue;
|
|
|
|
BasicType bt;
|
|
Node* n = lpt->_body.at(i);
|
|
if (n->is_Mem()) {
|
|
bt = n->as_Mem()->memory_type();
|
|
} else {
|
|
bt = n->bottom_type()->basic_type();
|
|
}
|
|
|
|
if (is_java_primitive(bt) == false) continue;
|
|
|
|
int cur_max_vector = Matcher::max_vector_size_auto_vectorization(bt);
|
|
|
|
// If a max vector exists which is not larger than _local_loop_unroll_factor
|
|
// stop looking, we already have the max vector to map to.
|
|
if (cur_max_vector < local_loop_unroll_factor) {
|
|
is_slp = false;
|
|
#ifndef PRODUCT
|
|
if (TraceSuperWordLoopUnrollAnalysis) {
|
|
tty->print_cr("slp analysis fails: unroll limit greater than max vector\n");
|
|
}
|
|
#endif
|
|
break;
|
|
}
|
|
|
|
// Map the maximal common vector except conversion nodes, because we can't get
|
|
// the precise basic type for conversion nodes in the stage of early analysis.
|
|
if (!VectorNode::is_convert_opcode(n->Opcode()) &&
|
|
VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
|
|
if (cur_max_vector < max_vector && !flag_small_bt) {
|
|
max_vector = cur_max_vector;
|
|
} else if (cur_max_vector > max_vector && UseSubwordForMaxVector) {
|
|
// Analyse subword in the loop to set maximum vector size to take advantage of full vector width for subword types.
|
|
// Here we analyze if narrowing is likely to happen and if it is we set vector size more aggressively.
|
|
// We check for possibility of narrowing by looking through chain operations using subword types.
|
|
if (is_subword_type(bt)) {
|
|
uint start, end;
|
|
VectorNode::vector_operands(n, &start, &end);
|
|
|
|
for (uint j = start; j < end; j++) {
|
|
Node* in = n->in(j);
|
|
// Don't propagate through a memory
|
|
if (!in->is_Mem() && vloop.in_bb(in) && in->bottom_type()->basic_type() == T_INT) {
|
|
bool same_type = true;
|
|
for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
|
|
Node *use = in->fast_out(k);
|
|
if (!vloop.in_bb(use) && use->bottom_type()->basic_type() != bt) {
|
|
same_type = false;
|
|
break;
|
|
}
|
|
}
|
|
if (same_type) {
|
|
max_vector = cur_max_vector;
|
|
flag_small_bt = true;
|
|
cl->mark_subword_loop();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (is_slp) {
|
|
local_loop_unroll_factor = max_vector;
|
|
cl->mark_passed_slp();
|
|
}
|
|
cl->mark_was_slp();
|
|
if (cl->is_main_loop()) {
|
|
#ifndef PRODUCT
|
|
if (TraceSuperWordLoopUnrollAnalysis) {
|
|
tty->print_cr("slp analysis: set max unroll to %d", local_loop_unroll_factor);
|
|
}
|
|
#endif
|
|
cl->set_slp_max_unroll(local_loop_unroll_factor);
|
|
}
|
|
}
|
|
}
|
|
|
|
bool VLoopReductions::is_reduction(const Node* n) {
|
|
if (!is_reduction_operator(n)) {
|
|
return false;
|
|
}
|
|
// Test whether there is a reduction cycle via every edge index
|
|
// (typically indices 1 and 2).
|
|
for (uint input = 1; input < n->req(); input++) {
|
|
if (in_reduction_cycle(n, input)) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool VLoopReductions::is_reduction_operator(const Node* n) {
|
|
int opc = n->Opcode();
|
|
return (opc != ReductionNode::opcode(opc, n->bottom_type()->basic_type()));
|
|
}
|
|
|
|
bool VLoopReductions::in_reduction_cycle(const Node* n, uint input) {
|
|
// First find input reduction path to phi node.
|
|
auto has_my_opcode = [&](const Node* m){ return m->Opcode() == n->Opcode(); };
|
|
PathEnd path_to_phi = find_in_path(n, input, LoopMaxUnroll, has_my_opcode,
|
|
[&](const Node* m) { return m->is_Phi(); });
|
|
const Node* phi = path_to_phi.first;
|
|
if (phi == nullptr) {
|
|
return false;
|
|
}
|
|
// If there is an input reduction path from the phi's loop-back to n, then n
|
|
// is part of a reduction cycle.
|
|
const Node* first = phi->in(LoopNode::LoopBackControl);
|
|
PathEnd path_from_phi = find_in_path(first, input, LoopMaxUnroll, has_my_opcode,
|
|
[&](const Node* m) { return m == n; });
|
|
return path_from_phi.first != nullptr;
|
|
}
|
|
|
|
// Returns the i-th input of n as it was before any edge swap: if n has swapped edges,
// inputs 1 and 2 are exchanged; every other index is returned unchanged.
Node* VLoopReductions::original_input(const Node* n, uint i) {
  uint edge = i;
  if (n->has_swapped_edges()) {
    assert(n->is_Add() || n->is_Mul(), "n should be commutative");
    if (i == 1 || i == 2) {
      edge = 3 - i; // 1 <-> 2
    }
  }
  return n->in(edge);
}
|
|
|
|
void VLoopReductions::mark_reductions() {
|
|
assert(_loop_reductions.is_empty(), "must not yet be computed");
|
|
CountedLoopNode* cl = _vloop.cl();
|
|
|
|
// Iterate through all phi nodes associated to the loop and search for
|
|
// reduction cycles in the basic block.
|
|
for (DUIterator_Fast imax, i = cl->fast_outs(imax); i < imax; i++) {
|
|
const Node* phi = cl->fast_out(i);
|
|
if (!phi->is_Phi()) {
|
|
continue;
|
|
}
|
|
if (phi->outcnt() == 0) {
|
|
continue;
|
|
}
|
|
if (phi == _vloop.iv()) {
|
|
continue;
|
|
}
|
|
// The phi's loop-back is considered the first node in the reduction cycle.
|
|
const Node* first = phi->in(LoopNode::LoopBackControl);
|
|
if (first == nullptr) {
|
|
continue;
|
|
}
|
|
// Test that the node fits the standard pattern for a reduction operator.
|
|
if (!is_reduction_operator(first)) {
|
|
continue;
|
|
}
|
|
// Test that 'first' is the beginning of a reduction cycle ending in 'phi'.
|
|
// To contain the number of searched paths, assume that all nodes in a
|
|
// reduction cycle are connected via the same edge index, modulo swapped
|
|
// inputs. This assumption is realistic because reduction cycles usually
|
|
// consist of nodes cloned by loop unrolling.
|
|
int reduction_input = -1;
|
|
int path_nodes = -1;
|
|
for (uint input = 1; input < first->req(); input++) {
|
|
// Test whether there is a reduction path in the basic block from 'first'
|
|
// to the phi node following edge index 'input'.
|
|
PathEnd path =
|
|
find_in_path(
|
|
first, input, _vloop.lpt()->_body.size(),
|
|
[&](const Node* n) { return n->Opcode() == first->Opcode() &&
|
|
_vloop.in_bb(n); },
|
|
[&](const Node* n) { return n == phi; });
|
|
if (path.first != nullptr) {
|
|
reduction_input = input;
|
|
path_nodes = path.second;
|
|
break;
|
|
}
|
|
}
|
|
if (reduction_input == -1) {
|
|
continue;
|
|
}
|
|
// Test that reduction nodes do not have any users in the loop besides their
|
|
// reduction cycle successors.
|
|
const Node* current = first;
|
|
const Node* succ = phi; // current's successor in the reduction cycle.
|
|
bool used_in_loop = false;
|
|
for (int i = 0; i < path_nodes; i++) {
|
|
for (DUIterator_Fast jmax, j = current->fast_outs(jmax); j < jmax; j++) {
|
|
Node* u = current->fast_out(j);
|
|
if (!_vloop.in_bb(u)) {
|
|
continue;
|
|
}
|
|
if (u == succ) {
|
|
continue;
|
|
}
|
|
used_in_loop = true;
|
|
break;
|
|
}
|
|
if (used_in_loop) {
|
|
break;
|
|
}
|
|
succ = current;
|
|
current = original_input(current, reduction_input);
|
|
}
|
|
if (used_in_loop) {
|
|
continue;
|
|
}
|
|
// Reduction cycle found. Mark all nodes in the found path as reductions.
|
|
current = first;
|
|
for (int i = 0; i < path_nodes; i++) {
|
|
_loop_reductions.set(current->_idx);
|
|
current = original_input(current, reduction_input);
|
|
}
|
|
}
|
|
}
|
|
|
|
bool SuperWord::transform_loop() {
|
|
assert(phase()->C->do_superword(), "SuperWord option should be enabled");
|
|
assert(cl()->is_main_loop(), "SLP should only work on main loops");
|
|
#ifndef PRODUCT
|
|
if (is_trace_superword_any()) {
|
|
tty->print_cr("\nSuperWord::transform_loop:");
|
|
lpt()->dump_head();
|
|
cl()->dump();
|
|
}
|
|
#endif
|
|
|
|
if (!SLP_extract()) {
|
|
#ifndef PRODUCT
|
|
if (is_trace_superword_any()) {
|
|
tty->print_cr("\nSuperWord::transform_loop failed: SuperWord::SLP_extract did not vectorize");
|
|
}
|
|
#endif
|
|
return false;
|
|
}
|
|
|
|
#ifndef PRODUCT
|
|
if (is_trace_superword_any()) {
|
|
tty->print_cr("\nSuperWord::transform_loop: success");
|
|
}
|
|
#endif
|
|
return true;
|
|
}
|
|
|
|
//------------------------------SLP_extract---------------------------
|
|
// Extract the superword level parallelism
|
|
//
|
|
// 1) A reverse post-order of nodes in the block is constructed. By scanning
|
|
// this list from first to last, all definitions are visited before their uses.
|
|
//
|
|
// 2) A point-to-point dependence graph is constructed between memory references.
|
|
// This simplifies the upcoming "independence" checker.
|
|
//
|
|
// 3) The maximum depth in the node graph from the beginning of the block
|
|
// to each node is computed. This is used to prune the graph search
|
|
// in the independence checker.
|
|
//
|
|
// 4) For integer types, the necessary bit width is propagated backwards
|
|
// from stores to allow packed operations on byte, char, and short
|
|
// integers. This reverses the promotion to type "int" that javac
|
|
// did for operations like: char c1,c2,c3; c1 = c2 + c3.
|
|
//
|
|
// 5) One of the memory references is picked to be an aligned vector reference.
|
|
// The pre-loop trip count is adjusted to align this reference in the
|
|
// unrolled body.
|
|
//
|
|
// 6) The initial set of pack pairs is seeded with memory references.
|
|
//
|
|
// 7) The set of pack pairs is extended by following use->def and def->use links.
|
|
//
|
|
// 8) The pairs are combined into vector sized packs.
|
|
//
|
|
// 9) Reorder the memory slices to co-locate members of the memory packs.
|
|
//
|
|
// 10) Generate ideal vector nodes for the final set of packs and where necessary,
|
|
// inserting scalar promotion, vector creation from multiple scalars, and
|
|
// extraction of scalar values from vectors.
|
|
//
|
|
bool SuperWord::SLP_extract() {
|
|
assert(cl()->is_main_loop(), "SLP should only work on main loops");
|
|
|
|
// Find "seed" pairs.
|
|
create_adjacent_memop_pairs();
|
|
|
|
if (_pairset.is_empty()) {
|
|
#ifndef PRODUCT
|
|
if (is_trace_superword_any()) {
|
|
tty->print_cr("\nNo pair packs generated, abort SuperWord.");
|
|
tty->cr();
|
|
}
|
|
#endif
|
|
return false;
|
|
}
|
|
|
|
extend_pairset_with_more_pairs_by_following_use_and_def();
|
|
|
|
combine_pairs_to_longer_packs();
|
|
|
|
split_packs_at_use_def_boundaries(); // a first time: create natural boundaries
|
|
split_packs_only_implemented_with_smaller_size();
|
|
split_packs_to_break_mutual_dependence();
|
|
split_packs_at_use_def_boundaries(); // again: propagate split of other packs
|
|
|
|
filter_packs_for_power_of_2_size();
|
|
filter_packs_for_mutual_independence();
|
|
filter_packs_for_alignment();
|
|
filter_packs_for_implemented();
|
|
filter_packs_for_profitable();
|
|
|
|
DEBUG_ONLY(verify_packs();)
|
|
DEBUG_ONLY(verify_no_extract());
|
|
|
|
return schedule_and_apply();
|
|
}
|
|
|
|
// Find the "seed" memops pairs. These are pairs that we strongly suspect would lead to vectorization.
|
|
void SuperWord::create_adjacent_memop_pairs() {
|
|
ResourceMark rm;
|
|
GrowableArray<const VPointer*> vpointers;
|
|
|
|
collect_valid_vpointers(vpointers);
|
|
|
|
// Sort the VPointers. This does 2 things:
|
|
// - Separate the VPointer into groups: all memops that have the same opcode and the same
|
|
// VPointer, except for the offset. Adjacent memops must have the same opcode and the
|
|
// same VPointer, except for a shift in the offset. Thus, two memops can only be adjacent
|
|
// if they are in the same group. This decreases the work.
|
|
// - Sort by offset inside the groups. This decreases the work needed to determine adjacent
|
|
// memops inside a group.
|
|
vpointers.sort(VPointer::cmp_for_sort);
|
|
|
|
#ifndef PRODUCT
|
|
if (is_trace_superword_adjacent_memops()) {
|
|
tty->print_cr("\nSuperWord::create_adjacent_memop_pairs:");
|
|
}
|
|
#endif
|
|
|
|
create_adjacent_memop_pairs_in_all_groups(vpointers);
|
|
|
|
#ifndef PRODUCT
|
|
if (is_trace_superword_packset()) {
|
|
tty->print_cr("\nAfter Superword::create_adjacent_memop_pairs");
|
|
_pairset.print();
|
|
}
|
|
#endif
|
|
}
|
|
|
|
// Collect all memops vpointers that could potentially be vectorized.
|
|
void SuperWord::collect_valid_vpointers(GrowableArray<const VPointer*>& vpointers) {
|
|
for_each_mem([&] (const MemNode* mem, int bb_idx) {
|
|
const VPointer& p = vpointer(mem);
|
|
if (p.valid() &&
|
|
!mem->is_LoadStore() &&
|
|
is_java_primitive(mem->memory_type())) {
|
|
vpointers.append(&p);
|
|
}
|
|
});
|
|
}
|
|
|
|
// For each group, find the adjacent memops.
|
|
void SuperWord::create_adjacent_memop_pairs_in_all_groups(const GrowableArray<const VPointer*> &vpointers) {
|
|
int group_start = 0;
|
|
while (group_start < vpointers.length()) {
|
|
int group_end = find_group_end(vpointers, group_start);
|
|
create_adjacent_memop_pairs_in_one_group(vpointers, group_start, group_end);
|
|
group_start = group_end;
|
|
}
|
|
}
|
|
|
|
// Step forward until we find a VPointer of another group, or we reach the end of the array.
|
|
int SuperWord::find_group_end(const GrowableArray<const VPointer*>& vpointers, int group_start) {
|
|
int group_end = group_start + 1;
|
|
while (group_end < vpointers.length() &&
|
|
VPointer::cmp_for_sort_by_group(
|
|
vpointers.adr_at(group_start),
|
|
vpointers.adr_at(group_end)
|
|
) == 0) {
|
|
group_end++;
|
|
}
|
|
return group_end;
|
|
}
|
|
|
|
// Find adjacent memops for a single group, e.g. for all LoadI of the same base, invar, etc.
|
|
// Create pairs and add them to the pairset.
|
|
void SuperWord::create_adjacent_memop_pairs_in_one_group(const GrowableArray<const VPointer*>& vpointers, const int group_start, const int group_end) {
|
|
#ifndef PRODUCT
|
|
if (is_trace_superword_adjacent_memops()) {
|
|
tty->print_cr(" group:");
|
|
for (int i = group_start; i < group_end; i++) {
|
|
const VPointer* p = vpointers.at(i);
|
|
tty->print(" ");
|
|
p->print();
|
|
}
|
|
}
|
|
#endif
|
|
|
|
MemNode* first = vpointers.at(group_start)->mem();
|
|
int element_size = data_size(first);
|
|
|
|
// For each ref in group: find others that can be paired:
|
|
for (int i = group_start; i < group_end; i++) {
|
|
const VPointer* p1 = vpointers.at(i);
|
|
MemNode* mem1 = p1->mem();
|
|
|
|
bool found = false;
|
|
// For each ref in group with larger or equal offset:
|
|
for (int j = i + 1; j < group_end; j++) {
|
|
const VPointer* p2 = vpointers.at(j);
|
|
MemNode* mem2 = p2->mem();
|
|
assert(mem1 != mem2, "look only at pair of different memops");
|
|
|
|
// Check for correct distance.
|
|
assert(data_size(mem1) == element_size, "all nodes in group must have the same element size");
|
|
assert(data_size(mem2) == element_size, "all nodes in group must have the same element size");
|
|
assert(p1->offset_in_bytes() <= p2->offset_in_bytes(), "must be sorted by offset");
|
|
if (p1->offset_in_bytes() + element_size > p2->offset_in_bytes()) { continue; }
|
|
if (p1->offset_in_bytes() + element_size < p2->offset_in_bytes()) { break; }
|
|
|
|
// Only allow nodes from same origin idx to be packed (see CompileCommand Option Vectorize)
|
|
if (_do_vector_loop && !same_origin_idx(mem1, mem2)) { continue; }
|
|
|
|
if (!can_pack_into_pair(mem1, mem2)) { continue; }
|
|
|
|
#ifndef PRODUCT
|
|
if (is_trace_superword_adjacent_memops()) {
|
|
if (found) {
|
|
tty->print_cr(" WARNING: multiple pairs with the same node. Ignored pairing:");
|
|
} else {
|
|
tty->print_cr(" pair:");
|
|
}
|
|
tty->print(" ");
|
|
p1->print();
|
|
tty->print(" ");
|
|
p2->print();
|
|
}
|
|
#endif
|
|
|
|
if (!found) {
|
|
_pairset.add_pair(mem1, mem2);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void VLoopMemorySlices::find_memory_slices() {
|
|
assert(_heads.is_empty(), "not yet computed");
|
|
assert(_tails.is_empty(), "not yet computed");
|
|
CountedLoopNode* cl = _vloop.cl();
|
|
|
|
// Iterate over all memory phis
|
|
for (DUIterator_Fast imax, i = cl->fast_outs(imax); i < imax; i++) {
|
|
PhiNode* phi = cl->fast_out(i)->isa_Phi();
|
|
if (phi != nullptr && _vloop.in_bb(phi) && phi->is_memory_phi()) {
|
|
Node* phi_tail = phi->in(LoopNode::LoopBackControl);
|
|
if (phi_tail != phi->in(LoopNode::EntryControl)) {
|
|
_heads.push(phi);
|
|
_tails.push(phi_tail->as_Mem());
|
|
}
|
|
}
|
|
}
|
|
|
|
NOT_PRODUCT( if (_vloop.is_trace_memory_slices()) { print(); } )
|
|
}
|
|
|
|
#ifndef PRODUCT
|
|
void VLoopMemorySlices::print() const {
|
|
tty->print_cr("\nVLoopMemorySlices::print: %s",
|
|
heads().length() > 0 ? "" : "NONE");
|
|
for (int m = 0; m < heads().length(); m++) {
|
|
tty->print("%6d ", m); heads().at(m)->dump();
|
|
tty->print(" "); tails().at(m)->dump();
|
|
}
|
|
}
|
|
#endif
|
|
|
|
// Get all memory nodes of a slice, in reverse order
|
|
void VLoopMemorySlices::get_slice_in_reverse_order(PhiNode* head, MemNode* tail, GrowableArray<MemNode*> &slice) const {
|
|
assert(slice.is_empty(), "start empty");
|
|
Node* n = tail;
|
|
Node* prev = nullptr;
|
|
while (true) {
|
|
assert(_vloop.in_bb(n), "must be in block");
|
|
for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
|
|
Node* out = n->fast_out(i);
|
|
if (out->is_Load()) {
|
|
if (_vloop.in_bb(out)) {
|
|
slice.push(out->as_Load());
|
|
}
|
|
} else {
|
|
// FIXME
|
|
if (out->is_MergeMem() && !_vloop.in_bb(out)) {
|
|
// Either unrolling is causing a memory edge not to disappear,
|
|
// or need to run igvn.optimize() again before SLP
|
|
} else if (out->is_memory_phi() && !_vloop.in_bb(out)) {
|
|
// Ditto. Not sure what else to check further.
|
|
} else if (out->Opcode() == Op_StoreCM && out->in(MemNode::OopStore) == n) {
|
|
// StoreCM has an input edge used as a precedence edge.
|
|
// Maybe an issue when oop stores are vectorized.
|
|
} else {
|
|
assert(out == prev || prev == nullptr, "no branches off of store slice");
|
|
}
|
|
}//else
|
|
}//for
|
|
if (n == head) { break; }
|
|
slice.push(n->as_Mem());
|
|
prev = n;
|
|
assert(n->is_Mem(), "unexpected node %s", n->Name());
|
|
n = n->in(MemNode::Memory);
|
|
}
|
|
|
|
#ifndef PRODUCT
|
|
if (_vloop.is_trace_memory_slices()) {
|
|
tty->print_cr("\nVLoopMemorySlices::get_slice_in_reverse_order:");
|
|
head->dump();
|
|
for (int j = slice.length() - 1; j >= 0 ; j--) {
|
|
slice.at(j)->dump();
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
|
|
// Check if two nodes can be packed into a pair.
|
|
bool SuperWord::can_pack_into_pair(Node* s1, Node* s2) {
|
|
|
|
// Do not use superword for non-primitives
|
|
BasicType bt1 = velt_basic_type(s1);
|
|
BasicType bt2 = velt_basic_type(s2);
|
|
if(!is_java_primitive(bt1) || !is_java_primitive(bt2))
|
|
return false;
|
|
BasicType longer_bt = longer_type_for_conversion(s1);
|
|
if (Matcher::max_vector_size_auto_vectorization(bt1) < 2 ||
|
|
(longer_bt != T_ILLEGAL && Matcher::max_vector_size_auto_vectorization(longer_bt) < 2)) {
|
|
return false; // No vectors for this type
|
|
}
|
|
|
|
// Forbid anything that looks like a PopulateIndex to be packed. It does not need to be packed,
|
|
// and will still be vectorized by SuperWord::vector_opd.
|
|
if (isomorphic(s1, s2) && !is_populate_index(s1, s2)) {
|
|
if ((independent(s1, s2) && have_similar_inputs(s1, s2)) || reduction(s1, s2)) {
|
|
if (!_pairset.is_left(s1) && !_pairset.is_right(s2)) {
|
|
if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) {
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
//------------------------------are_adjacent_refs---------------------------
|
|
// Is s1 immediately before s2 in memory?
|
|
bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) const {
|
|
if (!s1->is_Mem() || !s2->is_Mem()) return false;
|
|
if (!in_bb(s1) || !in_bb(s2)) return false;
|
|
|
|
// Do not use superword for non-primitives
|
|
if (!is_java_primitive(s1->as_Mem()->memory_type()) ||
|
|
!is_java_primitive(s2->as_Mem()->memory_type())) {
|
|
return false;
|
|
}
|
|
|
|
// Adjacent memory references must be on the same slice.
|
|
if (!same_memory_slice(s1->as_Mem(), s2->as_Mem())) {
|
|
return false;
|
|
}
|
|
|
|
// Adjacent memory references must have the same base, be comparable
|
|
// and have the correct distance between them.
|
|
const VPointer& p1 = vpointer(s1->as_Mem());
|
|
const VPointer& p2 = vpointer(s2->as_Mem());
|
|
if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
|
|
int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
|
|
return diff == data_size(s1);
|
|
}
|
|
|
|
//------------------------------isomorphic---------------------------
|
|
// Are s1 and s2 similar?
|
|
bool SuperWord::isomorphic(Node* s1, Node* s2) {
|
|
if (s1->Opcode() != s2->Opcode() ||
|
|
s1->req() != s2->req() ||
|
|
!same_velt_type(s1, s2) ||
|
|
(s1->is_Bool() && s1->as_Bool()->_test._test != s2->as_Bool()->_test._test)) {
|
|
return false;
|
|
}
|
|
|
|
Node* s1_ctrl = s1->in(0);
|
|
Node* s2_ctrl = s2->in(0);
|
|
// If the control nodes are equivalent, no further checks are required to test for isomorphism.
|
|
if (s1_ctrl == s2_ctrl) {
|
|
return true;
|
|
} else {
|
|
// If the control nodes are not invariant for the loop, fail isomorphism test.
|
|
const bool s1_ctrl_inv = (s1_ctrl == nullptr) || lpt()->is_invariant(s1_ctrl);
|
|
const bool s2_ctrl_inv = (s2_ctrl == nullptr) || lpt()->is_invariant(s2_ctrl);
|
|
return s1_ctrl_inv && s2_ctrl_inv;
|
|
}
|
|
}
|
|
|
|
// Look for pattern n1 = (iv + c) and n2 = (iv + c + 1), which may lead to PopulateIndex vector node.
|
|
// We skip the pack creation of these nodes. They will be vectorized by SuperWord::vector_opd.
|
|
bool SuperWord::is_populate_index(const Node* n1, const Node* n2) const {
|
|
return n1->is_Add() &&
|
|
n2->is_Add() &&
|
|
n1->in(1) == iv() &&
|
|
n2->in(1) == iv() &&
|
|
n1->in(2)->is_Con() &&
|
|
n2->in(2)->is_Con() &&
|
|
n2->in(2)->get_int() - n1->in(2)->get_int() == 1;
|
|
}
|
|
|
|
// Is there no data path from s1 to s2 or s2 to s1?
|
|
bool VLoopDependencyGraph::independent(Node* s1, Node* s2) const {
|
|
int d1 = depth(s1);
|
|
int d2 = depth(s2);
|
|
|
|
if (d1 == d2) {
|
|
// Same depth:
|
|
// 1) same node -> dependent
|
|
// 2) different nodes -> same level implies there is no path
|
|
return s1 != s2;
|
|
}
|
|
|
|
// Traversal starting at the deeper node to find the shallower one.
|
|
Node* deep = d1 > d2 ? s1 : s2;
|
|
Node* shallow = d1 > d2 ? s2 : s1;
|
|
int min_d = MIN2(d1, d2); // prune traversal at min_d
|
|
|
|
ResourceMark rm;
|
|
Unique_Node_List worklist;
|
|
worklist.push(deep);
|
|
for (uint i = 0; i < worklist.size(); i++) {
|
|
Node* n = worklist.at(i);
|
|
for (PredsIterator preds(*this, n); !preds.done(); preds.next()) {
|
|
Node* pred = preds.current();
|
|
if (_vloop.in_bb(pred) && depth(pred) >= min_d) {
|
|
if (pred == shallow) {
|
|
return false; // found it -> dependent
|
|
}
|
|
worklist.push(pred);
|
|
}
|
|
}
|
|
}
|
|
return true; // not found -> independent
|
|
}
|
|
|
|
// Are all nodes in nodes list mutually independent?
|
|
// We could query independent(s1, s2) for all pairs, but that results
|
|
// in O(size * size) graph traversals. We can do it all in one BFS!
|
|
// Start the BFS traversal at all nodes from the nodes list. Traverse
|
|
// Preds recursively, for nodes that have at least depth min_d, which
|
|
// is the smallest depth of all nodes from the nodes list. Once we have
|
|
// traversed all those nodes, and have not found another node from the
|
|
// nodes list, we know that all nodes in the nodes list are independent.
|
|
bool VLoopDependencyGraph::mutually_independent(const Node_List* nodes) const {
|
|
ResourceMark rm;
|
|
Unique_Node_List worklist;
|
|
VectorSet nodes_set;
|
|
int min_d = depth(nodes->at(0));
|
|
for (uint k = 0; k < nodes->size(); k++) {
|
|
Node* n = nodes->at(k);
|
|
min_d = MIN2(min_d, depth(n));
|
|
worklist.push(n); // start traversal at all nodes in nodes list
|
|
nodes_set.set(_body.bb_idx(n));
|
|
}
|
|
for (uint i = 0; i < worklist.size(); i++) {
|
|
Node* n = worklist.at(i);
|
|
for (PredsIterator preds(*this, n); !preds.done(); preds.next()) {
|
|
Node* pred = preds.current();
|
|
if (_vloop.in_bb(pred) && depth(pred) >= min_d) {
|
|
if (nodes_set.test(_body.bb_idx(pred))) {
|
|
return false; // found one -> dependent
|
|
}
|
|
worklist.push(pred);
|
|
}
|
|
}
|
|
}
|
|
return true; // not found -> independent
|
|
}
|
|
|
|
//--------------------------have_similar_inputs-----------------------
|
|
// For a node pair (s1, s2) which is isomorphic and independent,
|
|
// do s1 and s2 have similar input edges?
|
|
bool SuperWord::have_similar_inputs(Node* s1, Node* s2) {
|
|
// assert(isomorphic(s1, s2) == true, "check isomorphic");
|
|
// assert(independent(s1, s2) == true, "check independent");
|
|
if (s1->req() > 1 && !s1->is_Store() && !s1->is_Load()) {
|
|
for (uint i = 1; i < s1->req(); i++) {
|
|
Node* s1_in = s1->in(i);
|
|
Node* s2_in = s2->in(i);
|
|
if (s1_in->is_Phi() && s2_in->is_Add() && s2_in->in(1) == s1_in) {
|
|
// Special handling for expressions with loop iv, like "b[i] = a[i] * i".
|
|
// In this case, one node has an input from the tripcount iv and another
|
|
// node has an input from iv plus an offset.
|
|
if (!s1_in->as_Phi()->is_tripcount(T_INT)) return false;
|
|
} else {
|
|
if (s1_in->Opcode() != s2_in->Opcode()) return false;
|
|
}
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool VLoopReductions::is_marked_reduction_pair(const Node* s1, const Node* s2) const {
|
|
if (is_marked_reduction(s1) &&
|
|
is_marked_reduction(s2)) {
|
|
// This is an ordered set, so s1 should define s2
|
|
for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
|
|
Node* t1 = s1->fast_out(i);
|
|
if (t1 == s2) {
|
|
// both nodes are reductions and connected
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Extend pairset by following use->def and def->use links from pair members.
|
|
void SuperWord::extend_pairset_with_more_pairs_by_following_use_and_def() {
|
|
bool changed;
|
|
do {
|
|
changed = false;
|
|
// Iterate the pairs in insertion order.
|
|
for (int i = 0; i < _pairset.length(); i++) {
|
|
Node* left = _pairset.left_at_in_insertion_order(i);
|
|
Node* right = _pairset.right_at_in_insertion_order(i);
|
|
changed |= extend_pairset_with_more_pairs_by_following_def(left, right);
|
|
changed |= extend_pairset_with_more_pairs_by_following_use(left, right);
|
|
}
|
|
} while (changed);
|
|
|
|
// During extend_pairset_with_more_pairs_by_following_use, we may have re-ordered the
|
|
// inputs of some nodes, when calling order_inputs_of_uses_to_match_def_pair. If a def
|
|
// node has multiple uses, we may have re-ordered some of the inputs one use after
|
|
// packing another use with the old order. Now that we have all pairs, we must ensure
|
|
// that the order between the pairs is matching again. Since the PairSetIterator visits
|
|
// all pair-chains from left-to-right, we essencially impose the order of the first
|
|
// element on all other elements in the pair-chain.
|
|
for (PairSetIterator pair(_pairset); !pair.done(); pair.next()) {
|
|
Node* left = pair.left();
|
|
Node* right = pair.right();
|
|
order_inputs_of_all_use_pairs_to_match_def_pair(left, right);
|
|
}
|
|
|
|
#ifndef PRODUCT
|
|
if (is_trace_superword_packset()) {
|
|
tty->print_cr("\nAfter Superword::extend_pairset_with_more_pairs_by_following_use_and_def");
|
|
_pairset.print();
|
|
}
|
|
#endif
|
|
}
|
|
|
|
// For the pair (s1, s2): look at the inputs at the same index of s1 and s2, and add
// every input pair that can be packed and has non-negative estimated savings.
// For stores only the value input is followed; loads are not followed at all.
// Returns true iff at least one pair was added.
bool SuperWord::extend_pairset_with_more_pairs_by_following_def(Node* s1, Node* s2) {
  assert(_pairset.is_pair(s1, s2), "(s1, s2) must be a pair");
  assert(s1->req() == s2->req(), "just checking");

  if (s1->is_Load()) {
    return false;
  }

  // Input index range [first, limit) to inspect.
  int first;
  int limit;
  if (s1->is_Store()) {
    first = MemNode::ValueIn;
    limit = MemNode::ValueIn + 1;
  } else {
    first = 1;
    limit = s1->req();
  }

  bool added_pair = false;
  for (int idx = first; idx < limit; idx++) {
    Node* def1 = s1->in(idx);
    Node* def2 = s2->in(idx);

    // Only follow non-memory nodes in block - we do not want to resurrect misaligned packs.
    const bool skip = !in_bb(def1) || !in_bb(def2) || def1->is_Mem() || def2->is_Mem();
    if (skip) {
      continue;
    }

    if (can_pack_into_pair(def1, def2) &&
        estimate_cost_savings_when_packing_as_pair(def1, def2) >= 0) {
      _pairset.add_pair(def1, def2);
      added_pair = true;
    }
  }
  return added_pair;
}
|
|
|
|
// Note: we only extend with a single pair (the one with most savings) for every call. Since we keep
|
|
// calling this method as long as there are some changes, we will eventually pack all pairs that
|
|
// can be packed.
|
|
bool SuperWord::extend_pairset_with_more_pairs_by_following_use(Node* s1, Node* s2) {
|
|
assert(_pairset.is_pair(s1, s2), "(s1, s2) must be a pair");
|
|
assert(s1->req() == s2->req(), "just checking");
|
|
|
|
if (s1->is_Store()) return false;
|
|
|
|
int savings = -1;
|
|
Node* u1 = nullptr;
|
|
Node* u2 = nullptr;
|
|
for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
|
|
Node* t1 = s1->fast_out(i);
|
|
if (!in_bb(t1) || t1->is_Mem()) {
|
|
// Only follow non-memory nodes in block - we do not want to resurrect misaligned packs.
|
|
continue;
|
|
}
|
|
for (DUIterator_Fast jmax, j = s2->fast_outs(jmax); j < jmax; j++) {
|
|
Node* t2 = s2->fast_out(j);
|
|
if (!in_bb(t2) || t2->is_Mem()) {
|
|
// Only follow non-memory nodes in block - we do not want to resurrect misaligned packs.
|
|
continue;
|
|
}
|
|
if (t2->Opcode() == Op_AddI && t2 == cl()->incr()) continue; // don't mess with the iv
|
|
if (order_inputs_of_uses_to_match_def_pair(s1, s2, t1, t2) != PairOrderStatus::Ordered) { continue; }
|
|
if (can_pack_into_pair(t1, t2)) {
|
|
int my_savings = estimate_cost_savings_when_packing_as_pair(t1, t2);
|
|
if (my_savings > savings) {
|
|
savings = my_savings;
|
|
u1 = t1;
|
|
u2 = t2;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (savings >= 0) {
|
|
_pairset.add_pair(u1, u2);
|
|
return true; // changed
|
|
}
|
|
return false; // no change
|
|
}
|
|
|
|
// For a pair (def1, def2), find all use packs (use1, use2), and ensure that their inputs have an order
|
|
// that matches the (def1, def2) pair.
|
|
void SuperWord::order_inputs_of_all_use_pairs_to_match_def_pair(Node* def1, Node* def2) {
|
|
assert(_pairset.is_pair(def1, def2), "(def1, def2) must be a pair");
|
|
|
|
if (def1->is_Store()) return;
|
|
|
|
// reductions are always managed beforehand
|
|
if (is_marked_reduction(def1)) return;
|
|
|
|
for (DUIterator_Fast imax, i = def1->fast_outs(imax); i < imax; i++) {
|
|
Node* use1 = def1->fast_out(i);
|
|
|
|
// Only allow operand swap on commuting operations
|
|
if (!use1->is_Add() && !use1->is_Mul() && !VectorNode::is_muladds2i(use1)) {
|
|
break;
|
|
}
|
|
|
|
// Find pair (use1, use2)
|
|
Node* use2 = _pairset.get_right_or_null_for(use1);
|
|
if (use2 == nullptr) { break; }
|
|
|
|
order_inputs_of_uses_to_match_def_pair(def1, def2, use1, use2);
|
|
}
|
|
}
|
|
|
|
// For a def-pair (def1, def2), and their use-nodes (use1, use2):
|
|
// Ensure that the input order of (use1, use2) matches the order of (def1, def2).
|
|
//
|
|
// We have different cases:
|
|
//
|
|
// 1. Reduction (use1, use2): must always reduce left-to-right. Make sure that we have pattern:
|
|
//
|
|
// phi/reduction x1 phi/reduction x2 phi/reduction x1
|
|
// | | | | and hopefully: | |
|
|
// use1 use2 use1 x2
|
|
// | |
|
|
// use2
|
|
//
|
|
// 2: Commutative operations, just as Add/Mul and their subclasses: we can try to swap edges:
|
|
//
|
|
// def1 x1 x2 def2 def1 x1 def2 x2
|
|
// | | | | ==> | | | |
|
|
// use1 use2 use1 use2
|
|
//
|
|
// 3: MulAddS2I (use1, use2): we can try to swap edges:
|
|
//
|
|
// (x1 * x2) + (x3 * x4) ==> 3.a: (x2 * x1) + (x4 * x3)
|
|
// 3.b: (x4 * x3) + (x2 * x1)
|
|
// 3.c: (x3 * x4) + (x1 * x2)
|
|
//
|
|
// Note: MulAddS2I with its 4 inputs is too complicated, if there is any mismatch, we always
|
|
// return PairOrderStatus::Unknown.
|
|
// Therefore, extend_pairset_with_more_pairs_by_following_use cannot extend to MulAddS2I,
|
|
// but there is a chance that extend_pairset_with_more_pairs_by_following_def can do it.
|
|
//
|
|
// 4: Otherwise, check if the inputs of (use1, use2) already match (def1, def2), i.e. for all input indices i:
|
|
//
|
|
// use1->in(i) == def1 || use2->in(i) == def2 -> use1->in(i) == def1 && use2->in(i) == def2
|
|
//
|
|
SuperWord::PairOrderStatus SuperWord::order_inputs_of_uses_to_match_def_pair(Node* def1, Node* def2, Node* use1, Node* use2) {
|
|
assert(_pairset.is_pair(def1, def2), "(def1, def2) must be a pair");
|
|
|
|
// 1. Reduction
|
|
if (is_marked_reduction(use1) && is_marked_reduction(use2)) {
|
|
Node* use1_in2 = use1->in(2);
|
|
if (use1_in2->is_Phi() || is_marked_reduction(use1_in2)) {
|
|
use1->swap_edges(1, 2);
|
|
}
|
|
Node* use2_in2 = use2->in(2);
|
|
if (use2_in2->is_Phi() || is_marked_reduction(use2_in2)) {
|
|
use2->swap_edges(1, 2);
|
|
}
|
|
return PairOrderStatus::Ordered;
|
|
}
|
|
|
|
uint ct = use1->req();
|
|
if (ct != use2->req()) { return PairOrderStatus::Unordered; };
|
|
uint i1 = 0;
|
|
uint i2 = 0;
|
|
do {
|
|
for (i1++; i1 < ct; i1++) { if (use1->in(i1) == def1) { break; } }
|
|
for (i2++; i2 < ct; i2++) { if (use2->in(i2) == def2) { break; } }
|
|
if (i1 != i2) {
|
|
if ((i1 == (3-i2)) && (use2->is_Add() || use2->is_Mul())) {
|
|
// 2. Commutative: swap edges, and hope the other position matches too.
|
|
use2->swap_edges(i1, i2);
|
|
} else if (VectorNode::is_muladds2i(use2) && use1 != use2) {
|
|
// 3.a/b: MulAddS2I.
|
|
if (i1 == 5 - i2) { // ((i1 == 3 && i2 == 2) || (i1 == 2 && i2 == 3) || (i1 == 1 && i2 == 4) || (i1 == 4 && i2 == 1))
|
|
use2->swap_edges(1, 2);
|
|
use2->swap_edges(3, 4);
|
|
}
|
|
if (i1 == 3 - i2 || i1 == 7 - i2) { // ((i1 == 1 && i2 == 2) || (i1 == 2 && i2 == 1) || (i1 == 3 && i2 == 4) || (i1 == 4 && i2 == 3))
|
|
use2->swap_edges(2, 3);
|
|
use2->swap_edges(1, 4);
|
|
}
|
|
return PairOrderStatus::Unknown;
|
|
} else {
|
|
// 4. The inputs are not ordered, and we cannot do anything about it.
|
|
return PairOrderStatus::Unordered;
|
|
}
|
|
} else if (i1 == i2 && VectorNode::is_muladds2i(use2) && use1 != use2) {
|
|
// 3.c: MulAddS2I.
|
|
use2->swap_edges(1, 3);
|
|
use2->swap_edges(2, 4);
|
|
return PairOrderStatus::Unknown;
|
|
}
|
|
} while (i1 < ct);
|
|
|
|
// 4. All inputs match.
|
|
return PairOrderStatus::Ordered;
|
|
}
|
|
|
|
// Estimate the savings from executing s1 and s2 as a pair.
|
|
int SuperWord::estimate_cost_savings_when_packing_as_pair(const Node* s1, const Node* s2) const {
|
|
int save_in = 2 - 1; // 2 operations per instruction in packed form
|
|
|
|
const int adjacent_profit = 2;
|
|
auto pack_cost = [&] (const int size) { return size; };
|
|
auto unpack_cost = [&] (const int size) { return size; };
|
|
|
|
// inputs
|
|
for (uint i = 1; i < s1->req(); i++) {
|
|
Node* x1 = s1->in(i);
|
|
Node* x2 = s2->in(i);
|
|
if (x1 != x2) {
|
|
if (are_adjacent_refs(x1, x2)) {
|
|
save_in += adjacent_profit;
|
|
} else if (!_pairset.is_pair(x1, x2)) {
|
|
save_in -= pack_cost(2);
|
|
} else {
|
|
save_in += unpack_cost(2);
|
|
}
|
|
}
|
|
}
|
|
|
|
// uses of result
|
|
uint number_of_packed_use_pairs = 0;
|
|
int save_use = 0;
|
|
for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
|
|
Node* use1 = s1->fast_out(i);
|
|
|
|
// Find pair (use1, use2)
|
|
Node* use2 = _pairset.get_right_or_null_for(use1);
|
|
if (use2 == nullptr) { continue; }
|
|
|
|
for (DUIterator_Fast kmax, k = s2->fast_outs(kmax); k < kmax; k++) {
|
|
if (use2 == s2->fast_out(k)) {
|
|
// We have pattern:
|
|
//
|
|
// s1 s2
|
|
// | |
|
|
// [use1, use2]
|
|
//
|
|
number_of_packed_use_pairs++;
|
|
if (are_adjacent_refs(use1, use2)) {
|
|
save_use += adjacent_profit;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (number_of_packed_use_pairs < s1->outcnt()) save_use += unpack_cost(1);
|
|
if (number_of_packed_use_pairs < s2->outcnt()) save_use += unpack_cost(1);
|
|
|
|
return MAX2(save_in, save_use);
|
|
}
|
|
|
|
// Combine pairs (n1, n2), (n2, n3), ... into pack (n1, n2, n3 ...)
|
|
void SuperWord::combine_pairs_to_longer_packs() {
|
|
#ifdef ASSERT
|
|
assert(!_pairset.is_empty(), "pairset not empty");
|
|
assert(_packset.is_empty(), "packset not empty");
|
|
#endif
|
|
|
|
// Iterate pair-chain by pair-chain, each from left-most to right-most.
|
|
Node_List* pack = nullptr;
|
|
for (PairSetIterator pair(_pairset); !pair.done(); pair.next()) {
|
|
Node* left = pair.left();
|
|
Node* right = pair.right();
|
|
if (_pairset.is_left_in_a_left_most_pair(left)) {
|
|
assert(pack == nullptr, "no unfinished pack");
|
|
pack = new (arena()) Node_List(arena());
|
|
pack->push(left);
|
|
}
|
|
assert(pack != nullptr, "must have unfinished pack");
|
|
pack->push(right);
|
|
if (_pairset.is_right_in_a_right_most_pair(right)) {
|
|
_packset.add_pack(pack);
|
|
pack = nullptr;
|
|
}
|
|
}
|
|
assert(pack == nullptr, "no unfinished pack");
|
|
|
|
assert(!_packset.is_empty(), "must have combined some packs");
|
|
|
|
#ifndef PRODUCT
|
|
if (is_trace_superword_packset()) {
|
|
tty->print_cr("\nAfter Superword::combine_pairs_to_longer_packs");
|
|
_packset.print();
|
|
}
|
|
#endif
|
|
}
|
|
|
|
// Apply a SplitTask to a single pack:
//  - unchanged task: the pack is returned as is.
//  - rejected task:  all nodes of the pack are unmapped, the pack is dropped.
//  - split task:     the last task.split_size() nodes are split off. A side of size 1
//                    cannot be a pack, so that node is unmapped; if both sides have
//                    size 1, the whole pack is dropped.
// The returned SplitStatus carries the resulting pack(s), if any.
// Note: split_name is not used in this method.
SplitStatus PackSet::split_pack(const char* split_name,
                                Node_List* pack,
                                SplitTask task)
{
  const uint pack_size = pack->size();

  if (task.is_unchanged()) {
    return SplitStatus::make_unchanged(pack);
  }

  if (task.is_rejected()) {
#ifndef PRODUCT
    if (is_trace_superword_rejections()) {
      tty->cr();
      tty->print_cr("WARNING: Removed pack: %s:", task.message());
      print_pack(pack);
    }
#endif
    unmap_all_nodes_in_pack(pack);
    return SplitStatus::make_rejected();
  }

  const uint split_size = task.split_size();
  assert(0 < split_size && split_size < pack_size, "split_size must be in range");

  // Split the size: the tail (new_size nodes) is split off, the head (old_size) stays.
  const uint new_size = split_size;
  const uint old_size = pack_size - new_size;

#ifndef PRODUCT
  if (is_trace_superword_packset()) {
    tty->cr();
    tty->print_cr("INFO: splitting pack (sizes: %d %d): %s:",
                  old_size, new_size, task.message());
    print_pack(pack);
  }
#endif

  const bool old_too_small = old_size < 2;
  const bool new_too_small = new_size < 2;

  // Are both sizes too small to be a pack?
  if (old_too_small && new_too_small) {
    assert(old_size == 1 && new_size == 1, "implied");
#ifndef PRODUCT
    if (is_trace_superword_rejections()) {
      tty->cr();
      tty->print_cr("WARNING: Removed size 2 pack, cannot be split: %s:", task.message());
      print_pack(pack);
    }
#endif
    unmap_all_nodes_in_pack(pack);
    return SplitStatus::make_rejected();
  }

  // Just pop off a single node at the back?
  if (new_too_small) {
    assert(new_size == 1 && old_size >= 2, "implied");
    Node* removed = pack->pop();
    unmap_node_in_pack(removed);
#ifndef PRODUCT
    if (is_trace_superword_rejections()) {
      tty->cr();
      tty->print_cr("WARNING: Removed node from pack, because of split: %s:", task.message());
      removed->dump();
    }
#endif
    return SplitStatus::make_modified(pack);
  }

  // Just remove a single node at the front?
  if (old_too_small) {
    assert(old_size == 1 && new_size >= 2, "implied");
    Node* removed = pack->at(0);
    pack->remove(0);
    unmap_node_in_pack(removed);
#ifndef PRODUCT
    if (is_trace_superword_rejections()) {
      tty->cr();
      tty->print_cr("WARNING: Removed node from pack, because of split: %s:", task.message());
      removed->dump();
    }
#endif
    return SplitStatus::make_modified(pack);
  }

  // We will have two packs.
  assert(old_size >= 2 && new_size >= 2, "implied");
  Node_List* new_pack = new Node_List(new_size);

  // Copy the tail into new_pack, and map the copied nodes to new_pack.
  for (uint src = old_size; src < pack_size; src++) {
    Node* moved = pack->at(src);
    new_pack->push(moved);
    remap_node_in_pack(moved, new_pack);
  }

  // Drop the tail from the old pack.
  for (uint remaining = new_size; remaining > 0; remaining--) {
    pack->pop();
  }

  // We assume that new_pack is more "stable" (i.e. will have to be split less than pack).
  // Put "pack" second, so that we insert it later in the list, and iterate over it again sooner.
  return SplitStatus::make_split(new_pack, pack);
}
|
|
|
|
template <typename SplitStrategy>
|
|
void PackSet::split_packs(const char* split_name,
|
|
SplitStrategy strategy) {
|
|
bool changed;
|
|
do {
|
|
changed = false;
|
|
int new_packset_length = 0;
|
|
for (int i = 0; i < _packs.length(); i++) {
|
|
Node_List* pack = _packs.at(i);
|
|
assert(pack != nullptr && pack->size() >= 2, "no nullptr, at least size 2");
|
|
SplitTask task = strategy(pack);
|
|
SplitStatus status = split_pack(split_name, pack, task);
|
|
changed |= !status.is_unchanged();
|
|
Node_List* first_pack = status.first_pack();
|
|
Node_List* second_pack = status.second_pack();
|
|
_packs.at_put(i, nullptr); // take out pack
|
|
if (first_pack != nullptr) {
|
|
// The first pack can be put at the current position
|
|
assert(i >= new_packset_length, "only move packs down");
|
|
_packs.at_put(new_packset_length++, first_pack);
|
|
}
|
|
if (second_pack != nullptr) {
|
|
// The second node has to be appended at the end
|
|
_packs.append(second_pack);
|
|
}
|
|
}
|
|
_packs.trunc_to(new_packset_length);
|
|
} while (changed);
|
|
|
|
#ifndef PRODUCT
|
|
if (is_trace_superword_packset()) {
|
|
tty->print_cr("\nAfter %s", split_name);
|
|
print();
|
|
}
|
|
#endif
|
|
}
|
|
|
|
// Split packs at boundaries where left and right have different use or def packs.
|
|
void SuperWord::split_packs_at_use_def_boundaries() {
|
|
auto split_strategy = [&](const Node_List* pack) {
|
|
uint pack_size = pack->size();
|
|
uint boundary = find_use_def_boundary(pack);
|
|
assert(boundary < pack_size, "valid boundary %d", boundary);
|
|
if (boundary != 0) {
|
|
return SplitTask::make_split(pack_size - boundary, "found a use/def boundary");
|
|
}
|
|
return SplitTask::make_unchanged();
|
|
};
|
|
_packset.split_packs("SuperWord::split_packs_at_use_def_boundaries", split_strategy);
|
|
}
|
|
|
|
// Split packs that are only implemented with a smaller pack size. Also splits packs
|
|
// such that they eventually have power of 2 size.
|
|
void SuperWord::split_packs_only_implemented_with_smaller_size() {
|
|
auto split_strategy = [&](const Node_List* pack) {
|
|
uint pack_size = pack->size();
|
|
uint implemented_size = max_implemented_size(pack);
|
|
if (implemented_size == 0) {
|
|
return SplitTask::make_rejected("not implemented at any smaller size");
|
|
}
|
|
assert(is_power_of_2(implemented_size), "power of 2 size or zero: %d", implemented_size);
|
|
if (implemented_size != pack_size) {
|
|
return SplitTask::make_split(implemented_size, "only implemented at smaller size");
|
|
}
|
|
return SplitTask::make_unchanged();
|
|
};
|
|
_packset.split_packs("SuperWord::split_packs_only_implemented_with_smaller_size", split_strategy);
|
|
}
|
|
|
|
// Split packs that have a mutual dependency, until all packs are mutually_independent.
|
|
void SuperWord::split_packs_to_break_mutual_dependence() {
|
|
auto split_strategy = [&](const Node_List* pack) {
|
|
uint pack_size = pack->size();
|
|
assert(is_power_of_2(pack_size), "ensured by earlier splits %d", pack_size);
|
|
if (!is_marked_reduction(pack->at(0)) &&
|
|
!mutually_independent(pack)) {
|
|
// As a best guess, we split the pack in half. This way, we iteratively make the
|
|
// packs smaller, until there is no dependency.
|
|
return SplitTask::make_split(pack_size >> 1, "was not mutually independent");
|
|
}
|
|
return SplitTask::make_unchanged();
|
|
};
|
|
_packset.split_packs("SuperWord::split_packs_to_break_mutual_dependence", split_strategy);
|
|
}
|
|
|
|
template <typename FilterPredicate>
|
|
void PackSet::filter_packs(const char* filter_name,
|
|
const char* rejection_message,
|
|
FilterPredicate filter) {
|
|
auto split_strategy = [&](const Node_List* pack) {
|
|
if (filter(pack)) {
|
|
return SplitTask::make_unchanged();
|
|
} else {
|
|
return SplitTask::make_rejected(rejection_message);
|
|
}
|
|
};
|
|
split_packs(filter_name, split_strategy);
|
|
}
|
|
|
|
void SuperWord::filter_packs_for_power_of_2_size() {
|
|
auto filter = [&](const Node_List* pack) {
|
|
return is_power_of_2(pack->size());
|
|
};
|
|
_packset.filter_packs("SuperWord::filter_packs_for_power_of_2_size",
|
|
"size is not a power of 2", filter);
|
|
}
|
|
|
|
// We know that the nodes in a pair pack were independent - this gives us independence
|
|
// at distance 1. But now that we may have more than 2 nodes in a pack, we need to check
|
|
// if they are all mutually independent. If there is a dependence we remove the pack.
|
|
// This is better than giving up completely - we can have partial vectorization if some
|
|
// are rejected and others still accepted.
|
|
//
|
|
// Examples with dependence at distance 1 (pack pairs are not created):
|
|
// for (int i ...) { v[i + 1] = v[i] + 5; }
|
|
// for (int i ...) { v[i] = v[i - 1] + 5; }
|
|
//
|
|
// Example with independence at distance 1, but dependence at distance 2 (pack pairs are
|
|
// created and we need to filter them out now):
|
|
// for (int i ...) { v[i + 2] = v[i] + 5; }
|
|
// for (int i ...) { v[i] = v[i - 2] + 5; }
|
|
//
|
|
// Note: dependencies are created when a later load may reference the same memory location
|
|
// as an earlier store. This happens in "read backward" or "store forward" cases. On the
|
|
// other hand, "read forward" or "store backward" cases do not have such dependencies:
|
|
// for (int i ...) { v[i] = v[i + 1] + 5; }
|
|
// for (int i ...) { v[i - 1] = v[i] + 5; }
|
|
void SuperWord::filter_packs_for_mutual_independence() {
|
|
auto filter = [&](const Node_List* pack) {
|
|
// reductions are trivially connected
|
|
return is_marked_reduction(pack->at(0)) ||
|
|
mutually_independent(pack);
|
|
};
|
|
_packset.filter_packs("SuperWord::filter_packs_for_mutual_independence",
|
|
"found dependency between nodes at distance greater than 1", filter);
|
|
}
|
|
|
|
// Find the set of alignment solutions for load/store pack.
|
|
const AlignmentSolution* SuperWord::pack_alignment_solution(const Node_List* pack) {
|
|
assert(pack != nullptr && (pack->at(0)->is_Load() || pack->at(0)->is_Store()), "only load/store packs");
|
|
|
|
const MemNode* mem_ref = pack->at(0)->as_Mem();
|
|
const VPointer& mem_ref_p = vpointer(mem_ref);
|
|
const CountedLoopEndNode* pre_end = _vloop.pre_loop_end();
|
|
assert(pre_end->stride_is_con(), "pre loop stride is constant");
|
|
|
|
AlignmentSolver solver(pack->at(0)->as_Mem(),
|
|
pack->size(),
|
|
mem_ref_p.base(),
|
|
mem_ref_p.offset_in_bytes(),
|
|
mem_ref_p.invar(),
|
|
mem_ref_p.invar_factor(),
|
|
mem_ref_p.scale_in_bytes(),
|
|
pre_end->init_trip(),
|
|
pre_end->stride_con(),
|
|
iv_stride()
|
|
DEBUG_ONLY(COMMA is_trace_align_vector()));
|
|
return solver.solve();
|
|
}
|
|
|
|
// Ensure all packs are aligned, if AlignVector is on.
|
|
// Find an alignment solution: find the set of pre_iter that memory align all packs.
|
|
// Start with the maximal set (pre_iter >= 0) and filter it with the constraints
|
|
// that the packs impose. Remove packs that do not have a compatible solution.
|
|
void SuperWord::filter_packs_for_alignment() {
|
|
// We do not need to filter if no alignment is required.
|
|
if (!VLoop::vectors_should_be_aligned()) {
|
|
return;
|
|
}
|
|
|
|
#ifndef PRODUCT
|
|
if (is_trace_superword_info() || is_trace_align_vector()) {
|
|
tty->print_cr("\nSuperWord::filter_packs_for_alignment:");
|
|
}
|
|
#endif
|
|
|
|
ResourceMark rm;
|
|
|
|
// Start with trivial (unconstrained) solution space
|
|
AlignmentSolution const* current = new TrivialAlignmentSolution();
|
|
int mem_ops_count = 0;
|
|
int mem_ops_rejected = 0;
|
|
|
|
auto filter = [&](const Node_List* pack) {
|
|
// Only memops need to be aligned.
|
|
if (!pack->at(0)->is_Load() &&
|
|
!pack->at(0)->is_Store()) {
|
|
return true; // accept all non memops
|
|
}
|
|
|
|
mem_ops_count++;
|
|
const AlignmentSolution* s = pack_alignment_solution(pack);
|
|
const AlignmentSolution* intersect = current->filter(s);
|
|
|
|
#ifndef PRODUCT
|
|
if (is_trace_align_vector()) {
|
|
tty->print(" solution for pack: ");
|
|
s->print();
|
|
tty->print(" intersection with current: ");
|
|
intersect->print();
|
|
}
|
|
#endif
|
|
if (intersect->is_empty()) {
|
|
mem_ops_rejected++;
|
|
return false; // reject because of empty solution
|
|
}
|
|
|
|
current = intersect;
|
|
return true; // accept because of non-empty solution
|
|
};
|
|
|
|
_packset.filter_packs("SuperWord::filter_packs_for_alignment",
|
|
"rejected by AlignVector (strict alignment requirement)", filter);
|
|
|
|
#ifndef PRODUCT
|
|
if (is_trace_superword_info() || is_trace_align_vector()) {
|
|
tty->print("\n final solution: ");
|
|
current->print();
|
|
tty->print_cr(" rejected mem_ops packs: %d of %d", mem_ops_rejected, mem_ops_count);
|
|
tty->cr();
|
|
}
|
|
#endif
|
|
|
|
assert(!current->is_empty(), "solution must be non-empty");
|
|
if (current->is_constrained()) {
|
|
// Solution is constrained (not trivial)
|
|
// -> must change pre-limit to achieve alignment
|
|
MemNode const* mem = current->as_constrained()->mem_ref();
|
|
Node_List* pack = get_pack(mem);
|
|
assert(pack != nullptr, "memop of final solution must still be packed");
|
|
_mem_ref_for_main_loop_alignment = mem;
|
|
_aw_for_main_loop_alignment = pack->size() * mem->memory_size();
|
|
}
|
|
}
|
|
|
|
// Remove packs that are not implemented
|
|
void SuperWord::filter_packs_for_implemented() {
|
|
auto filter = [&](const Node_List* pack) {
|
|
return implemented(pack, pack->size());
|
|
};
|
|
_packset.filter_packs("SuperWord::filter_packs_for_implemented",
|
|
"Unimplemented", filter);
|
|
}
|
|
|
|
// Remove packs that are not profitable.
|
|
void SuperWord::filter_packs_for_profitable() {
|
|
// Count the number of reductions vs other vector ops, for the
|
|
// reduction profitability heuristic.
|
|
for (int i = 0; i < _packset.length(); i++) {
|
|
Node_List* pack = _packset.at(i);
|
|
Node* n = pack->at(0);
|
|
if (is_marked_reduction(n)) {
|
|
_num_reductions++;
|
|
} else {
|
|
_num_work_vecs++;
|
|
}
|
|
}
|
|
|
|
// Remove packs that are not profitable
|
|
auto filter = [&](const Node_List* pack) {
|
|
return profitable(pack);
|
|
};
|
|
_packset.filter_packs("Superword::filter_packs_for_profitable",
|
|
"not profitable", filter);
|
|
}
|
|
|
|
// Can code be generated for the pack, restricted to size nodes?
|
|
bool SuperWord::implemented(const Node_List* pack, const uint size) const {
|
|
assert(size >= 2 && size <= pack->size() && is_power_of_2(size), "valid size");
|
|
bool retValue = false;
|
|
Node* p0 = pack->at(0);
|
|
if (p0 != nullptr) {
|
|
int opc = p0->Opcode();
|
|
if (is_marked_reduction(p0)) {
|
|
const Type *arith_type = p0->bottom_type();
|
|
// Length 2 reductions of INT/LONG do not offer performance benefits
|
|
if (((arith_type->basic_type() == T_INT) || (arith_type->basic_type() == T_LONG)) && (size == 2)) {
|
|
retValue = false;
|
|
} else {
|
|
retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
|
|
}
|
|
} else if (VectorNode::is_convert_opcode(opc)) {
|
|
retValue = VectorCastNode::implemented(opc, size, velt_basic_type(p0->in(1)), velt_basic_type(p0));
|
|
} else if (VectorNode::is_minmax_opcode(opc) && is_subword_type(velt_basic_type(p0))) {
|
|
// Java API for Math.min/max operations supports only int, long, float
|
|
// and double types. Thus, avoid generating vector min/max nodes for
|
|
// integer subword types with superword vectorization.
|
|
// See JDK-8294816 for miscompilation issues with shorts.
|
|
return false;
|
|
} else if (p0->is_Cmp()) {
|
|
// Cmp -> Bool -> Cmove
|
|
retValue = UseVectorCmov;
|
|
} else if (VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(opc)) {
|
|
// Requires extra vector long -> int conversion.
|
|
retValue = VectorNode::implemented(opc, size, T_LONG) &&
|
|
VectorCastNode::implemented(Op_ConvL2I, size, T_LONG, T_INT);
|
|
} else {
|
|
if (VectorNode::can_use_RShiftI_instead_of_URShiftI(p0, velt_basic_type(p0))) {
|
|
opc = Op_RShiftI;
|
|
}
|
|
retValue = VectorNode::implemented(opc, size, velt_basic_type(p0));
|
|
}
|
|
}
|
|
return retValue;
|
|
}
|
|
|
|
// Find the maximal implemented size smaller or equal to the packs size
|
|
uint SuperWord::max_implemented_size(const Node_List* pack) {
|
|
uint size = round_down_power_of_2(pack->size());
|
|
if (implemented(pack, size)) {
|
|
return size;
|
|
} else {
|
|
// Iteratively divide size by 2, and check.
|
|
for (uint s = size >> 1; s >= 2; s >>= 1) {
|
|
if (implemented(pack, s)) {
|
|
return s;
|
|
}
|
|
}
|
|
return 0; // not implementable at all
|
|
}
|
|
}
|
|
|
|
// If the j-th input for all nodes in the pack is the same input: return it, else nullptr.
|
|
Node* PackSet::same_inputs_at_index_or_null(const Node_List* pack, const int index) const {
|
|
Node* p0_in = pack->at(0)->in(index);
|
|
for (uint i = 1; i < pack->size(); i++) {
|
|
if (pack->at(i)->in(index) != p0_in) {
|
|
return nullptr; // not same
|
|
}
|
|
}
|
|
return p0_in;
|
|
}
|
|
|
|
// For a pack of Bool nodes: determine the test mask to be used for the vectorized
// compare, and whether the result of that compare is negated relative to the scalar
// Bool. The mask is negated (and flagged as such) for lt/le tests on CmpF/CmpD.
VTransformBoolTest PackSet::get_bool_test(const Node_List* bool_pack) const {
  BoolNode* bol = bool_pack->at(0)->as_Bool();
  BoolTest::mask mask = bol->_test._test;
  assert(mask == BoolTest::eq ||
         mask == BoolTest::ne ||
         mask == BoolTest::ge ||
         mask == BoolTest::gt ||
         mask == BoolTest::lt ||
         mask == BoolTest::le,
         "Bool should be one of: eq, ne, ge, gt, lt, le");

#ifdef ASSERT
  for (uint pos = 0; pos < bool_pack->size(); pos++) {
    Node* member = bool_pack->at(pos);
    assert(member->as_Bool()->_test._test == mask,
           "all bool nodes must have same test");
  }
#endif

  CmpNode* cmp0 = bol->in(1)->as_Cmp();
  assert(get_pack(cmp0) != nullptr, "Bool must have matching Cmp pack");

  // Float and Double comparisons need care with NaN. CmpF and CmpD produce a return
  // code, as they are based on the java bytecodes fcmpl/dcmpl:
  //   -1: cmp_in1 <  cmp_in2, or at least one of the two is a NaN
  //    0: cmp_in1 == cmp_in2  (no NaN)
  //    1: cmp_in1 >  cmp_in2  (no NaN)
  //
  // The "mask" selects which of the [-1, 0, 1] cases lead to "true".
  //
  // Note: an ordered   (O) comparison returns "false" if either input is NaN.
  //       an unordered (U) comparison returns "true"  if either input is NaN.
  //
  // The VectorMaskCmpNode compares in1 and in2 directly, in the java standard way
  // (all comparisons are ordered, except NEQ which is unordered).
  //
  // For these tests, "mask" already matches the cmp code of VectorMaskCmpNode:
  //   BoolTest::eq:  Case 0     -> EQ_O
  //   BoolTest::ne:  Case -1, 1 -> NEQ_U
  //   BoolTest::ge:  Case 0, 1  -> GE_O
  //   BoolTest::gt:  Case 1     -> GT_O
  //
  // But lt and le must be converted from unordered to ordered:
  //   BoolTest::lt:  Case -1    -> LT_U -> VectorMaskCmp would interpret lt as LT_O
  //   BoolTest::le:  Case -1, 0 -> LE_U -> VectorMaskCmp would interpret le as LE_O
  //
  // Negating the mask gives the negated result: all non-NaN cases are negated, and the
  // unordered (U) comparisons become ordered (O) comparisons.
  //        VectorMaskCmp(LT_U, in1_cmp, in2_cmp)
  //   <==> NOT VectorMaskCmp(GE_O, in1_cmp, in2_cmp)
  //        VectorMaskCmp(LE_U, in1_cmp, in2_cmp)
  //   <==> NOT VectorMaskCmp(GT_O, in1_cmp, in2_cmp)
  //
  // A VectorBlend that uses the negated mask can simply swap its blend-inputs:
  //        VectorBlend(    VectorMaskCmp(LT_U, in1_cmp, in2_cmp), in1_blend, in2_blend)
  //   <==> VectorBlend(NOT VectorMaskCmp(GE_O, in1_cmp, in2_cmp), in1_blend, in2_blend)
  //   <==> VectorBlend(    VectorMaskCmp(GE_O, in1_cmp, in2_cmp), in2_blend, in1_blend)
  //        VectorBlend(    VectorMaskCmp(LE_U, in1_cmp, in2_cmp), in1_blend, in2_blend)
  //   <==> VectorBlend(NOT VectorMaskCmp(GT_O, in1_cmp, in2_cmp), in1_blend, in2_blend)
  //   <==> VectorBlend(    VectorMaskCmp(GT_O, in1_cmp, in2_cmp), in2_blend, in1_blend)
  const bool is_float_or_double_cmp = cmp0->Opcode() == Op_CmpF || cmp0->Opcode() == Op_CmpD;
  const bool must_negate = is_float_or_double_cmp &&
                           (mask == BoolTest::lt || mask == BoolTest::le);
  if (must_negate) {
    mask = bol->_test.negate();
  }

  return VTransformBoolTest(mask, must_negate);
}
|
|
|
|
//------------------------------profitable---------------------------
|
|
// For pack p, are all operands and all uses (with in the block) vector?
|
|
bool SuperWord::profitable(const Node_List* p) const {
|
|
Node* p0 = p->at(0);
|
|
uint start, end;
|
|
VectorNode::vector_operands(p0, &start, &end);
|
|
|
|
// Return false if some inputs are not vectors or vectors with different
|
|
// size or alignment.
|
|
// Also, for now, return false if not scalar promotion case when inputs are
|
|
// the same. Later, implement PackNode and allow differing, non-vector inputs
|
|
// (maybe just the ones from outside the block.)
|
|
for (uint i = start; i < end; i++) {
|
|
if (!is_vector_use(p0, i)) {
|
|
return false;
|
|
}
|
|
}
|
|
// Check if reductions are connected
|
|
if (is_marked_reduction(p0)) {
|
|
Node* second_in = p0->in(2);
|
|
Node_List* second_pk = get_pack(second_in);
|
|
if ((second_pk == nullptr) || (_num_work_vecs == _num_reductions)) {
|
|
// No parent pack or not enough work
|
|
// to cover reduction expansion overhead
|
|
return false;
|
|
} else if (second_pk->size() != p->size()) {
|
|
return false;
|
|
}
|
|
}
|
|
if (VectorNode::is_shift(p0)) {
|
|
// For now, return false if shift count is vector or not scalar promotion
|
|
// case (different shift counts) because it is not supported yet.
|
|
Node* cnt = p0->in(2);
|
|
Node_List* cnt_pk = get_pack(cnt);
|
|
if (cnt_pk != nullptr || _packset.same_inputs_at_index_or_null(p, 2) == nullptr) {
|
|
return false;
|
|
}
|
|
}
|
|
if (!p0->is_Store()) {
|
|
// For now, return false if not all uses are vector.
|
|
// Later, implement ExtractNode and allow non-vector uses (maybe
|
|
// just the ones outside the block.)
|
|
for (uint i = 0; i < p->size(); i++) {
|
|
Node* def = p->at(i);
|
|
for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
|
|
Node* use = def->fast_out(j);
|
|
for (uint k = 0; k < use->req(); k++) {
|
|
Node* n = use->in(k);
|
|
if (def == n) {
|
|
// Reductions should only have a Phi use at the loop head or a non-phi use
|
|
// outside of the loop if it is the last element of the pack (e.g. SafePoint).
|
|
if (is_marked_reduction(def) &&
|
|
((use->is_Phi() && use->in(0) == lpt()->_head) ||
|
|
(!lpt()->is_member(phase()->get_loop(phase()->ctrl_or_self(use))) && i == p->size()-1))) {
|
|
continue;
|
|
}
|
|
if (!is_vector_use(use, k)) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (p0->is_Cmp()) {
|
|
// Verify that Cmp pack only has Bool pack uses
|
|
for (DUIterator_Fast jmax, j = p0->fast_outs(jmax); j < jmax; j++) {
|
|
Node* bol = p0->fast_out(j);
|
|
if (!bol->is_Bool() || bol->in(0) != nullptr || !is_vector_use(bol, 1)) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
if (p0->is_Bool()) {
|
|
// Verify that Bool pack only has CMove pack uses
|
|
for (DUIterator_Fast jmax, j = p0->fast_outs(jmax); j < jmax; j++) {
|
|
Node* cmove = p0->fast_out(j);
|
|
if (!cmove->is_CMove() || cmove->in(0) != nullptr || !is_vector_use(cmove, 1)) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
if (p0->is_CMove()) {
|
|
// Verify that CMove has a matching Bool pack
|
|
BoolNode* bol = p0->in(1)->as_Bool();
|
|
if (bol == nullptr || get_pack(bol) == nullptr) {
|
|
return false;
|
|
}
|
|
// Verify that Bool has a matching Cmp pack
|
|
CmpNode* cmp = bol->in(1)->as_Cmp();
|
|
if (cmp == nullptr || get_pack(cmp) == nullptr) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
#ifdef ASSERT
|
|
void SuperWord::verify_packs() const {
|
|
_packset.verify();
|
|
|
|
// All packs must be:
|
|
for (int i = 0; i < _packset.length(); i++) {
|
|
Node_List* pack = _packset.at(i);
|
|
|
|
// 1. Mutually independent (or a reduction).
|
|
if (!is_marked_reduction(pack->at(0)) &&
|
|
!mutually_independent(pack)) {
|
|
tty->print_cr("FAILURE: nodes not mutually independent in pack[%d]", i);
|
|
_packset.print_pack(pack);
|
|
assert(false, "pack nodes not mutually independent");
|
|
}
|
|
|
|
// 2. Implemented.
|
|
if (!implemented(pack, pack->size())) {
|
|
tty->print_cr("FAILURE: nodes not implementable in pack[%d]", i);
|
|
_packset.print_pack(pack);
|
|
assert(false, "pack not implementable");
|
|
}
|
|
|
|
// 3. Profitable.
|
|
if (!profitable(pack)) {
|
|
tty->print_cr("FAILURE: nodes not profitable in pack[%d]", i);
|
|
_packset.print_pack(pack);
|
|
assert(false, "pack not profitable");
|
|
}
|
|
}
|
|
}
|
|
|
|
void PackSet::verify() const {
|
|
// Verify all nodes in packset have pack set correctly.
|
|
ResourceMark rm;
|
|
Unique_Node_List processed;
|
|
for (int i = 0; i < _packs.length(); i++) {
|
|
Node_List* p = _packs.at(i);
|
|
for (uint k = 0; k < p->size(); k++) {
|
|
Node* n = p->at(k);
|
|
assert(_vloop.in_bb(n), "only nodes in bb can be in packset");
|
|
assert(!processed.member(n), "node should only occur once in packset");
|
|
assert(get_pack(n) == p, "n has consisten packset info");
|
|
processed.push(n);
|
|
}
|
|
}
|
|
|
|
// Check that no other node has pack set.
|
|
for (int i = 0; i < _body.body().length(); i++) {
|
|
Node* n = _body.body().at(i);
|
|
if (!processed.member(n)) {
|
|
assert(get_pack(n) == nullptr, "should not have pack if not in packset");
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
// The PacksetGraph combines the dependency graph with the packset. In the PackSet
|
|
// graph, we have two kinds of nodes:
|
|
// (1) pack-node: Represents all nodes of some pack p in a single node, which
|
|
// shall later become a vector node.
|
|
// (2) scalar-node: Represents a node that is not in any pack.
|
|
// For any edge (n1, n2) in the dependency graph, we add an edge to the PacksetGraph for
|
|
// the PacksetGraph nodes corresponding to n1 and n2.
|
|
// We work from the dependency graph, because it gives us all the data-dependencies,
|
|
// as well as more refined memory-dependencies than the C2 graph. The dependency graph
|
|
// does not have cycles. But packing nodes can introduce cyclic dependencies. Example:
|
|
//
|
|
// +--------+
|
|
// A -> X | v
|
|
// Pack [A,B] and [X,Y] [A,B] [X,Y]
|
|
// Y -> B ^ |
|
|
// +--------+
|
|
//
|
|
class PacksetGraph {
|
|
private:
|
|
// pid: packset graph node id.
|
|
GrowableArray<int> _pid; // bb_idx(n) -> pid
|
|
GrowableArray<Node*> _pid_to_node; // one node per pid, find rest via _packset.pack
|
|
GrowableArray<GrowableArray<int>> _out; // out-edges
|
|
GrowableArray<int> _incnt; // number of (implicit) in-edges
|
|
int _max_pid = 0;
|
|
|
|
bool _schedule_success;
|
|
|
|
SuperWord* _slp;
|
|
public:
|
|
PacksetGraph(SuperWord* slp)
|
|
: _pid(8, 0, /* default */ 0), _slp(slp) {
|
|
}
|
|
// Get pid, if there is a packset node that n belongs to. Else return 0.
|
|
int get_pid_or_zero(const Node* n) const {
|
|
if (!_slp->in_bb(n)) {
|
|
return 0;
|
|
}
|
|
int idx = _slp->bb_idx(n);
|
|
if (idx >= _pid.length()) {
|
|
return 0;
|
|
} else {
|
|
return _pid.at(idx);
|
|
}
|
|
}
|
|
int get_pid(const Node* n) {
|
|
int poz = get_pid_or_zero(n);
|
|
assert(poz != 0, "pid should not be zero");
|
|
return poz;
|
|
}
|
|
void set_pid(Node* n, int pid) {
|
|
assert(n != nullptr && pid > 0, "sane inputs");
|
|
assert(_slp->in_bb(n), "must be");
|
|
int idx = _slp->bb_idx(n);
|
|
_pid.at_put_grow(idx, pid);
|
|
_pid_to_node.at_put_grow(pid - 1, n, nullptr);
|
|
}
|
|
Node* get_node(int pid) {
|
|
assert(pid > 0 && pid <= _pid_to_node.length(), "pid must be mapped");
|
|
Node* n = _pid_to_node.at(pid - 1);
|
|
assert(n != nullptr, "sanity");
|
|
return n;
|
|
}
|
|
int new_pid() {
|
|
_incnt.push(0);
|
|
_out.push(GrowableArray<int>());
|
|
return ++_max_pid;
|
|
}
|
|
int incnt(int pid) { return _incnt.at(pid - 1); }
|
|
void incnt_set(int pid, int cnt) { return _incnt.at_put(pid - 1, cnt); }
|
|
GrowableArray<int>& out(int pid) { return _out.at(pid - 1); }
|
|
bool schedule_success() const { return _schedule_success; }
|
|
|
|
// Create nodes (from packs and scalar-nodes), and add edges, based on the dependency graph.
|
|
void build() {
|
|
const PackSet& packset = _slp->packset();
|
|
const GrowableArray<Node*>& body = _slp->body();
|
|
// Map nodes in packsets
|
|
for (int i = 0; i < packset.length(); i++) {
|
|
Node_List* p = packset.at(i);
|
|
int pid = new_pid();
|
|
for (uint k = 0; k < p->size(); k++) {
|
|
Node* n = p->at(k);
|
|
set_pid(n, pid);
|
|
assert(packset.get_pack(n) == p, "matching packset");
|
|
}
|
|
}
|
|
|
|
int max_pid_packset = _max_pid;
|
|
|
|
// Map nodes not in packset
|
|
for (int i = 0; i < body.length(); i++) {
|
|
Node* n = body.at(i);
|
|
if (n->is_Phi() || n->is_CFG()) {
|
|
continue; // ignore control flow
|
|
}
|
|
int pid = get_pid_or_zero(n);
|
|
if (pid == 0) {
|
|
pid = new_pid();
|
|
set_pid(n, pid);
|
|
assert(packset.get_pack(n) == nullptr, "no packset");
|
|
}
|
|
}
|
|
|
|
// Map edges for packset nodes
|
|
VectorSet set;
|
|
for (int i = 0; i < packset.length(); i++) {
|
|
Node_List* p = packset.at(i);
|
|
set.clear();
|
|
int pid = get_pid(p->at(0));
|
|
for (uint k = 0; k < p->size(); k++) {
|
|
Node* n = p->at(k);
|
|
assert(pid == get_pid(n), "all nodes in pack have same pid");
|
|
for (VLoopDependencyGraph::PredsIterator preds(_slp->dependency_graph(), n); !preds.done(); preds.next()) {
|
|
Node* pred = preds.current();
|
|
int pred_pid = get_pid_or_zero(pred);
|
|
if (pred_pid == pid && _slp->is_marked_reduction(n)) {
|
|
continue; // reduction -> self-cycle is not a cyclic dependency
|
|
}
|
|
// Only add edges once, and only for mapped nodes (in body)
|
|
if (pred_pid > 0 && !set.test_set(pred_pid)) {
|
|
incnt_set(pid, incnt(pid) + 1); // increment
|
|
out(pred_pid).push(pid);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Map edges for nodes not in packset
|
|
for (int i = 0; i < body.length(); i++) {
|
|
Node* n = body.at(i);
|
|
int pid = get_pid_or_zero(n); // zero for Phi or CFG
|
|
if (pid <= max_pid_packset) {
|
|
continue; // Only scalar-nodes
|
|
}
|
|
for (VLoopDependencyGraph::PredsIterator preds(_slp->dependency_graph(), n); !preds.done(); preds.next()) {
|
|
Node* pred = preds.current();
|
|
int pred_pid = get_pid_or_zero(pred);
|
|
// Only add edges for mapped nodes (in body)
|
|
if (pred_pid > 0) {
|
|
incnt_set(pid, incnt(pid) + 1); // increment
|
|
out(pred_pid).push(pid);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Schedule nodes of PacksetGraph to worklist, using topsort: schedule a node
|
|
// that has zero incnt. If a PacksetGraph node corresponds to memops, then add
|
|
// those to the memops_schedule. At the end, we return the memops_schedule, and
|
|
// note if topsort was successful.
|
|
Node_List schedule() {
|
|
Node_List memops_schedule;
|
|
GrowableArray<int> worklist;
|
|
// Directly schedule all nodes without precedence
|
|
for (int pid = 1; pid <= _max_pid; pid++) {
|
|
if (incnt(pid) == 0) {
|
|
worklist.push(pid);
|
|
}
|
|
}
|
|
// Continue scheduling via topological sort
|
|
for (int i = 0; i < worklist.length(); i++) {
|
|
int pid = worklist.at(i);
|
|
|
|
// Add memops to memops_schedule
|
|
Node* n = get_node(pid);
|
|
Node_List* p = _slp->packset().get_pack(n);
|
|
if (n->is_Mem()) {
|
|
if (p == nullptr) {
|
|
memops_schedule.push(n);
|
|
} else {
|
|
for (uint k = 0; k < p->size(); k++) {
|
|
memops_schedule.push(p->at(k));
|
|
assert(p->at(k)->is_Mem(), "only schedule memops");
|
|
}
|
|
}
|
|
}
|
|
|
|
// Decrement incnt for all successors
|
|
for (int j = 0; j < out(pid).length(); j++){
|
|
int pid_use = out(pid).at(j);
|
|
int incnt_use = incnt(pid_use) - 1;
|
|
incnt_set(pid_use, incnt_use);
|
|
// Did use lose its last input?
|
|
if (incnt_use == 0) {
|
|
worklist.push(pid_use);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Was every pid scheduled? If not, we found some cycles in the PacksetGraph.
|
|
_schedule_success = (worklist.length() == _max_pid);
|
|
return memops_schedule;
|
|
}
|
|
|
|
// Print the PacksetGraph.
|
|
// print_nodes = true: print all C2 nodes beloning to PacksetGrahp node.
|
|
// print_zero_incnt = false: do not print nodes that have no in-edges (any more).
|
|
void print(bool print_nodes, bool print_zero_incnt) {
|
|
const GrowableArray<Node*> &body = _slp->body();
|
|
tty->print_cr("PacksetGraph");
|
|
for (int pid = 1; pid <= _max_pid; pid++) {
|
|
if (incnt(pid) == 0 && !print_zero_incnt) {
|
|
continue;
|
|
}
|
|
tty->print("Node %d. incnt %d [", pid, incnt(pid));
|
|
for (int j = 0; j < out(pid).length(); j++) {
|
|
tty->print("%d ", out(pid).at(j));
|
|
}
|
|
tty->print_cr("]");
|
|
#ifndef PRODUCT
|
|
if (print_nodes) {
|
|
for (int i = 0; i < body.length(); i++) {
|
|
Node* n = body.at(i);
|
|
if (get_pid_or_zero(n) == pid) {
|
|
tty->print(" ");
|
|
n->dump();
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
}
|
|
};
|
|
|
|
// We want to replace the packed scalars from the PackSet and replace them
|
|
// with vector operations. This requires scheduling and re-ordering the memory
|
|
// graph. We take these steps:
|
|
// (1) Build the PacksetGraph. It combines the dependency graph with the
|
|
// packset. The PacksetGraph gives us the dependencies that must be
|
|
// respected after scheduling.
|
|
// (2) Schedule the PacksetGraph to the memops_schedule, which represents
|
|
// a linear order of all memops in the body. The order respects the
|
|
// dependencies of the PacksetGraph.
|
|
// (3) If the PacksetGraph has cycles, we cannot schedule. Abort.
|
|
// (4) Apply the vectorization, including re-ordering the memops and replacing
|
|
// packed scalars with vector operations.
|
|
bool SuperWord::schedule_and_apply() {
|
|
if (_packset.is_empty()) {
|
|
return false;
|
|
}
|
|
ResourceMark rm;
|
|
|
|
// (1) Build the PacksetGraph.
|
|
PacksetGraph graph(this);
|
|
graph.build();
|
|
|
|
// (2) Schedule the PacksetGraph.
|
|
Node_List memops_schedule = graph.schedule();
|
|
|
|
// (3) Check if the PacksetGraph schedule succeeded (had no cycles).
|
|
// We now know that we only have independent packs, see verify_packs.
|
|
// This is a necessary but not a sufficient condition for an acyclic
|
|
// graph (DAG) after scheduling. Thus, we must check if the packs have
|
|
// introduced a cycle. The SuperWord paper mentions the need for this
|
|
// in "3.7 Scheduling".
|
|
if (!graph.schedule_success()) {
|
|
#ifndef PRODUCT
|
|
if (is_trace_superword_rejections()) {
|
|
tty->print_cr("SuperWord::schedule found cycle in PacksetGraph:");
|
|
graph.print(true, false);
|
|
tty->print_cr("removing all packs from packset.");
|
|
}
|
|
#endif
|
|
_packset.clear();
|
|
return false;
|
|
}
|
|
|
|
// (4) Apply the vectorization, including re-ordering the memops.
|
|
return apply(memops_schedule);
|
|
}
|
|
|
|
// Run the graph-modifying steps in order: reorder the memops according to
// memops_schedule, adjust the pre-loop limit for alignment of the main-loop
// vectors, and replace the packs with vector nodes. The graph is handed to
// print_method before the first and after each step.
// Returns the result of apply_vectorization().
bool SuperWord::apply(Node_List& memops_schedule) {
  Compile* const compile = phase()->C;
  CountedLoopNode* const loop_head = lpt()->_head->as_CountedLoop();

  // Step 1: memory graph reordering.
  compile->print_method(PHASE_AUTO_VECTORIZATION1_BEFORE_APPLY, 4, loop_head);
  apply_memops_reordering_with_schedule(memops_schedule);
  compile->print_method(PHASE_AUTO_VECTORIZATION2_AFTER_REORDER, 4, loop_head);

  // Step 2: pre-loop limit adjustment.
  adjust_pre_loop_limit_to_align_main_loop_vectors();
  compile->print_method(PHASE_AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, 4, loop_head);

  // Step 3: scalar packs -> vector nodes.
  const bool vectorized = apply_vectorization();
  compile->print_method(PHASE_AUTO_VECTORIZATION4_AFTER_APPLY, 4, loop_head);

  return vectorized;
}
|
|
|
|
// Reorder the memory graph for all slices in parallel. We walk over the schedule once,
|
|
// and track the current memory state of each slice.
|
|
void SuperWord::apply_memops_reordering_with_schedule(Node_List& memops_schedule) {
|
|
#ifndef PRODUCT
|
|
if (is_trace_superword_info()) {
|
|
tty->print_cr("\nSuperWord::apply_memops_reordering_with_schedule:");
|
|
memops_schedule.dump();
|
|
}
|
|
#endif
|
|
|
|
int max_slices = phase()->C->num_alias_types();
|
|
// When iterating over the memops_schedule, we keep track of the current memory state,
|
|
// which is the Phi or a store in the loop.
|
|
GrowableArray<Node*> current_state_in_slice(max_slices, max_slices, nullptr);
|
|
// The memory state after the loop is the last store inside the loop. If we reorder the
|
|
// loop we may have a different last store, and we need to adjust the uses accordingly.
|
|
GrowableArray<Node*> old_last_store_in_slice(max_slices, max_slices, nullptr);
|
|
|
|
const GrowableArray<PhiNode*>& mem_slice_head = _vloop_analyzer.memory_slices().heads();
|
|
|
|
// (1) Set up the initial memory state from Phi. And find the old last store.
|
|
for (int i = 0; i < mem_slice_head.length(); i++) {
|
|
Node* phi = mem_slice_head.at(i);
|
|
assert(phi->is_Phi(), "must be phi");
|
|
int alias_idx = phase()->C->get_alias_index(phi->adr_type());
|
|
current_state_in_slice.at_put(alias_idx, phi);
|
|
|
|
// If we have a memory phi, we have a last store in the loop, find it over backedge.
|
|
StoreNode* last_store = phi->in(2)->as_Store();
|
|
old_last_store_in_slice.at_put(alias_idx, last_store);
|
|
}
|
|
|
|
// (2) Walk over memops_schedule, append memops to the current state
|
|
// of that slice. If it is a Store, we take it as the new state.
|
|
for (uint i = 0; i < memops_schedule.size(); i++) {
|
|
MemNode* n = memops_schedule.at(i)->as_Mem();
|
|
assert(n->is_Load() || n->is_Store(), "only loads or stores");
|
|
int alias_idx = phase()->C->get_alias_index(n->adr_type());
|
|
Node* current_state = current_state_in_slice.at(alias_idx);
|
|
if (current_state == nullptr) {
|
|
// If there are only loads in a slice, we never update the memory
|
|
// state in the loop, hence there is no phi for the memory state.
|
|
// We just keep the old memory state that was outside the loop.
|
|
assert(n->is_Load() && !in_bb(n->in(MemNode::Memory)),
|
|
"only loads can have memory state from outside loop");
|
|
} else {
|
|
igvn().replace_input_of(n, MemNode::Memory, current_state);
|
|
if (n->is_Store()) {
|
|
current_state_in_slice.at_put(alias_idx, n);
|
|
}
|
|
}
|
|
}
|
|
|
|
// (3) For each slice, we add the current state to the backedge
|
|
// in the Phi. Further, we replace uses of the old last store
|
|
// with uses of the new last store (current_state).
|
|
Node_List uses_after_loop;
|
|
for (int i = 0; i < mem_slice_head.length(); i++) {
|
|
Node* phi = mem_slice_head.at(i);
|
|
int alias_idx = phase()->C->get_alias_index(phi->adr_type());
|
|
Node* current_state = current_state_in_slice.at(alias_idx);
|
|
assert(current_state != nullptr, "slice is mapped");
|
|
assert(current_state != phi, "did some work in between");
|
|
assert(current_state->is_Store(), "sanity");
|
|
igvn().replace_input_of(phi, 2, current_state);
|
|
|
|
// Replace uses of old last store with current_state (new last store)
|
|
// Do it in two loops: first find all the uses, and change the graph
|
|
// in as second loop so that we do not break the iterator.
|
|
Node* last_store = old_last_store_in_slice.at(alias_idx);
|
|
assert(last_store != nullptr, "we have a old last store");
|
|
uses_after_loop.clear();
|
|
for (DUIterator_Fast kmax, k = last_store->fast_outs(kmax); k < kmax; k++) {
|
|
Node* use = last_store->fast_out(k);
|
|
if (!in_bb(use)) {
|
|
uses_after_loop.push(use);
|
|
}
|
|
}
|
|
for (uint k = 0; k < uses_after_loop.size(); k++) {
|
|
Node* use = uses_after_loop.at(k);
|
|
for (uint j = 0; j < use->req(); j++) {
|
|
Node* def = use->in(j);
|
|
if (def == last_store) {
|
|
igvn().replace_input_of(use, j, current_state);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Convert packs into vector node operations
|
|
// At this point, all correctness and profitability checks have passed.
|
|
// We start the irreversible process of editing the C2 graph. Should
|
|
// there be an unexpected situation (assert fails), then we can only
|
|
// bail out of the compilation, as the graph has already been partially
|
|
// modified. We bail out, and retry without SuperWord.
|
|
bool SuperWord::apply_vectorization() {
|
|
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
|
|
assert(cl->is_main_loop(), "SLP should only work on main loops");
|
|
Compile* C = phase()->C;
|
|
assert(!_packset.is_empty(), "vectorization requires non-empty packset");
|
|
|
|
#ifndef PRODUCT
|
|
if (TraceLoopOpts) {
|
|
tty->print("SuperWord::apply_vectorization ");
|
|
lpt()->dump_head();
|
|
}
|
|
#endif
|
|
|
|
uint max_vlen_in_bytes = 0;
|
|
uint max_vlen = 0;
|
|
|
|
for (int i = 0; i < body().length(); i++) {
|
|
Node* n = body().at(i);
|
|
Node_List* p = get_pack(n);
|
|
if (p != nullptr && n == p->at(p->size()-1)) {
|
|
// After apply_memops_reordering_with_schedule, we know that the memops have the same order in the pack
|
|
// as in the memory slice. Hence, "first" is the first memop in the slice from the pack,
|
|
// and "n" is the last node in the slice from the pack.
|
|
Node* first = p->at(0);
|
|
uint vlen = p->size();
|
|
uint vlen_in_bytes = 0;
|
|
Node* vn = nullptr;
|
|
int opc = n->Opcode();
|
|
if (n->is_Load()) {
|
|
Node* ctl = n->in(MemNode::Control);
|
|
Node* mem = first->in(MemNode::Memory);
|
|
// Set the memory dependency of the LoadVector as early as possible.
|
|
// Walk up the memory chain, and ignore any StoreVector that provably
|
|
// does not have any memory dependency.
|
|
while (mem->is_StoreVector()) {
|
|
VPointer p_store(mem->as_Mem(), _vloop);
|
|
if (p_store.overlap_possible_with_any_in(p)) {
|
|
break;
|
|
} else {
|
|
mem = mem->in(MemNode::Memory);
|
|
}
|
|
}
|
|
Node* adr = first->in(MemNode::Address);
|
|
const TypePtr* atyp = n->adr_type();
|
|
vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n), control_dependency(p));
|
|
vlen_in_bytes = vn->as_LoadVector()->memory_size();
|
|
} else if (n->is_Store()) {
|
|
// Promote value to be stored to vector
|
|
Node* val = vector_opd(p, MemNode::ValueIn);
|
|
if (val == nullptr) {
|
|
assert(false, "input to vector store was not created");
|
|
C->record_failure(C2Compiler::retry_no_superword());
|
|
return false; // bailout
|
|
}
|
|
|
|
Node* ctl = n->in(MemNode::Control);
|
|
Node* mem = first->in(MemNode::Memory);
|
|
Node* adr = first->in(MemNode::Address);
|
|
const TypePtr* atyp = n->adr_type();
|
|
vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
|
|
vlen_in_bytes = vn->as_StoreVector()->memory_size();
|
|
} else if (VectorNode::is_scalar_rotate(n)) {
|
|
Node* in1 = vector_opd(p, 1);
|
|
Node* in2 = first->in(2);
|
|
// If rotation count is non-constant or greater than 8bit value create a vector.
|
|
if (!in2->is_Con() || !Matcher::supports_vector_constant_rotates(in2->get_int())) {
|
|
in2 = vector_opd(p, 2);
|
|
}
|
|
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
|
|
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
|
} else if (VectorNode::is_roundopD(n)) {
|
|
Node* in1 = vector_opd(p, 1);
|
|
Node* in2 = first->in(2);
|
|
assert(in2->is_Con(), "Constant rounding mode expected.");
|
|
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
|
|
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
|
} else if (VectorNode::is_muladds2i(n)) {
|
|
assert(n->req() == 5u, "MulAddS2I should have 4 operands.");
|
|
Node* in1 = vector_opd(p, 1);
|
|
Node* in2 = vector_opd(p, 2);
|
|
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
|
|
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
|
} else if (opc == Op_SignumF || opc == Op_SignumD) {
|
|
assert(n->req() == 4, "four inputs expected");
|
|
Node* in = vector_opd(p, 1);
|
|
Node* zero = vector_opd(p, 2);
|
|
Node* one = vector_opd(p, 3);
|
|
vn = VectorNode::make(opc, in, zero, one, vlen, velt_basic_type(n));
|
|
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
|
} else if (n->is_Cmp()) {
|
|
// Bool + Cmp + CMove -> VectorMaskCmp + VectorBlend
|
|
continue;
|
|
} else if (n->is_Bool()) {
|
|
// Bool + Cmp + CMove -> VectorMaskCmp + VectorBlend
|
|
continue;
|
|
} else if (n->is_CMove()) {
|
|
// Bool + Cmp + CMove -> VectorMaskCmp + VectorBlend
|
|
|
|
BoolNode* bol = n->in(1)->as_Bool();
|
|
assert(bol != nullptr, "must have Bool above CMove");
|
|
Node_List* bool_pack = get_pack(bol);
|
|
assert(bool_pack != nullptr, "CMove must have matching Bool pack");
|
|
|
|
CmpNode* cmp = bol->in(1)->as_Cmp();
|
|
assert(cmp != nullptr, "must have cmp above CMove");
|
|
Node_List* cmp_pack = get_pack(cmp);
|
|
assert(cmp_pack != nullptr, "Bool must have matching Cmp pack");
|
|
|
|
Node* cmp_in1 = vector_opd(cmp_pack, 1);
|
|
Node* cmp_in2 = vector_opd(cmp_pack, 2);
|
|
|
|
Node* blend_in1 = vector_opd(p, 2);
|
|
Node* blend_in2 = vector_opd(p, 3);
|
|
|
|
VTransformBoolTest bool_test = _packset.get_bool_test(bool_pack);
|
|
BoolTest::mask test_mask = bool_test._mask;
|
|
if (bool_test._is_negated) {
|
|
// We can cancel out the negation by swapping the blend inputs.
|
|
swap(blend_in1, blend_in2);
|
|
}
|
|
|
|
// VectorMaskCmp
|
|
ConINode* test_mask_node = igvn().intcon((int)test_mask);
|
|
BasicType bt = velt_basic_type(cmp);
|
|
const TypeVect* vt = TypeVect::make(bt, vlen);
|
|
VectorNode* mask = new VectorMaskCmpNode(test_mask, cmp_in1, cmp_in2, test_mask_node, vt);
|
|
phase()->register_new_node_with_ctrl_of(mask, p->at(0));
|
|
igvn()._worklist.push(mask);
|
|
|
|
// VectorBlend
|
|
vn = new VectorBlendNode(blend_in1, blend_in2, mask);
|
|
} else if (n->req() == 3) {
|
|
// Promote operands to vector
|
|
Node* in1 = nullptr;
|
|
bool node_isa_reduction = is_marked_reduction(n);
|
|
if (node_isa_reduction) {
|
|
// the input to the first reduction operation is retained
|
|
in1 = first->in(1);
|
|
} else {
|
|
in1 = vector_opd(p, 1);
|
|
if (in1 == nullptr) {
|
|
assert(false, "input in1 to vector operand was not created");
|
|
C->record_failure(C2Compiler::retry_no_superword());
|
|
return false; // bailout
|
|
}
|
|
}
|
|
Node* in2 = vector_opd(p, 2);
|
|
if (in2 == nullptr) {
|
|
assert(false, "input in2 to vector operand was not created");
|
|
C->record_failure(C2Compiler::retry_no_superword());
|
|
return false; // bailout
|
|
}
|
|
if (in1->Opcode() == Op_Replicate && (node_isa_reduction == false) && (n->is_Add() || n->is_Mul())) {
|
|
// Move invariant vector input into second position to avoid register spilling.
|
|
Node* tmp = in1;
|
|
in1 = in2;
|
|
in2 = tmp;
|
|
}
|
|
if (node_isa_reduction) {
|
|
const Type *arith_type = n->bottom_type();
|
|
vn = ReductionNode::make(opc, nullptr, in1, in2, arith_type->basic_type());
|
|
if (in2->is_Load()) {
|
|
vlen_in_bytes = in2->as_LoadVector()->memory_size();
|
|
} else {
|
|
vlen_in_bytes = in2->as_Vector()->length_in_bytes();
|
|
}
|
|
} else {
|
|
if (VectorNode::can_use_RShiftI_instead_of_URShiftI(n, velt_basic_type(n))) {
|
|
opc = Op_RShiftI;
|
|
}
|
|
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
|
|
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
|
}
|
|
} else if (VectorNode::is_scalar_unary_op_with_equal_input_and_output_types(opc)) {
|
|
assert(n->req() == 2, "only one input expected");
|
|
Node* in = vector_opd(p, 1);
|
|
vn = VectorNode::make(opc, in, nullptr, vlen, velt_basic_type(n));
|
|
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
|
} else if (VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(opc)) {
|
|
assert(n->req() == 2, "only one input expected");
|
|
Node* in = vector_opd(p, 1);
|
|
Node* longval = VectorNode::make(opc, in, nullptr, vlen, T_LONG);
|
|
phase()->register_new_node_with_ctrl_of(longval, first);
|
|
// Requires extra vector long -> int conversion.
|
|
vn = VectorCastNode::make(Op_VectorCastL2X, longval, T_INT, vlen);
|
|
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
|
} else if (VectorNode::is_convert_opcode(opc)) {
|
|
assert(n->req() == 2, "only one input expected");
|
|
BasicType bt = velt_basic_type(n);
|
|
Node* in = vector_opd(p, 1);
|
|
int vopc = VectorCastNode::opcode(opc, in->bottom_type()->is_vect()->element_basic_type());
|
|
vn = VectorCastNode::make(vopc, in, bt, vlen);
|
|
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
|
} else if (opc == Op_FmaD || opc == Op_FmaF) {
|
|
// Promote operands to vector
|
|
Node* in1 = vector_opd(p, 1);
|
|
Node* in2 = vector_opd(p, 2);
|
|
Node* in3 = vector_opd(p, 3);
|
|
vn = VectorNode::make(opc, in1, in2, in3, vlen, velt_basic_type(n));
|
|
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
|
} else {
|
|
assert(false, "Unhandled scalar opcode (%s)", NodeClassNames[opc]);
|
|
C->record_failure(C2Compiler::retry_no_superword());
|
|
return false; // bailout
|
|
}
|
|
|
|
if (vn == nullptr) {
|
|
assert(false, "got null node instead of vector node");
|
|
C->record_failure(C2Compiler::retry_no_superword());
|
|
return false; // bailout
|
|
}
|
|
|
|
#ifdef ASSERT
|
|
// Mark Load/Store Vector for alignment verification
|
|
if (VerifyAlignVector) {
|
|
if (vn->Opcode() == Op_LoadVector) {
|
|
vn->as_LoadVector()->set_must_verify_alignment();
|
|
} else if (vn->Opcode() == Op_StoreVector) {
|
|
vn->as_StoreVector()->set_must_verify_alignment();
|
|
}
|
|
}
|
|
#endif
|
|
|
|
phase()->register_new_node_with_ctrl_of(vn, first);
|
|
for (uint j = 0; j < p->size(); j++) {
|
|
Node* pm = p->at(j);
|
|
igvn().replace_node(pm, vn);
|
|
}
|
|
igvn()._worklist.push(vn);
|
|
|
|
if (vlen > max_vlen) {
|
|
max_vlen = vlen;
|
|
}
|
|
if (vlen_in_bytes > max_vlen_in_bytes) {
|
|
max_vlen_in_bytes = vlen_in_bytes;
|
|
}
|
|
VectorNode::trace_new_vector(vn, "SuperWord");
|
|
}
|
|
}//for (int i = 0; i < body().length(); i++)
|
|
|
|
if (max_vlen_in_bytes > C->max_vector_size()) {
|
|
C->set_max_vector_size(max_vlen_in_bytes);
|
|
}
|
|
if (max_vlen_in_bytes > 0) {
|
|
cl->mark_loop_vectorized();
|
|
}
|
|
|
|
if (SuperWordLoopUnrollAnalysis) {
|
|
if (cl->has_passed_slp()) {
|
|
uint slp_max_unroll_factor = cl->slp_max_unroll();
|
|
if (slp_max_unroll_factor == max_vlen) {
|
|
#ifndef PRODUCT
|
|
if (TraceSuperWordLoopUnrollAnalysis) {
|
|
tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte);
|
|
}
|
|
#endif
|
|
// For atomic unrolled loops which are vector mapped, instigate more unrolling
|
|
cl->set_notpassed_slp();
|
|
// if vector resources are limited, do not allow additional unrolling
|
|
if (Matcher::float_pressure_limit() > 8) {
|
|
C->set_major_progress();
|
|
cl->mark_do_unroll_only();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
//------------------------------vector_opd---------------------------
|
|
// Create a vector operand for the nodes in pack p for operand: in(opd_idx)
|
|
Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
|
|
Node* p0 = p->at(0);
|
|
uint vlen = p->size();
|
|
Node* opd = p0->in(opd_idx);
|
|
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
|
|
Node* same_input = _packset.same_inputs_at_index_or_null(p, opd_idx);
|
|
|
|
// Insert index population operation to create a vector of increasing
|
|
// indices starting from the iv value. In some special unrolled loops
|
|
// (see JDK-8286125), we need scalar replications of the iv value if
|
|
// all inputs are the same iv, so we do a same inputs check here.
|
|
if (opd == iv() && same_input == nullptr) {
|
|
BasicType p0_bt = velt_basic_type(p0);
|
|
BasicType iv_bt = is_subword_type(p0_bt) ? p0_bt : T_INT;
|
|
assert(VectorNode::is_populate_index_supported(iv_bt), "Should support");
|
|
const TypeVect* vt = TypeVect::make(iv_bt, vlen);
|
|
Node* vn = new PopulateIndexNode(iv(), igvn().intcon(1), vt);
|
|
VectorNode::trace_new_vector(vn, "SuperWord");
|
|
phase()->register_new_node_with_ctrl_of(vn, opd);
|
|
return vn;
|
|
}
|
|
|
|
if (same_input != nullptr) {
|
|
if (opd->is_Vector() || opd->is_LoadVector()) {
|
|
if (opd_idx == 2 && VectorNode::is_shift(p0)) {
|
|
assert(false, "shift's count can't be vector");
|
|
return nullptr;
|
|
}
|
|
return opd; // input is matching vector
|
|
}
|
|
if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
|
|
Node* cnt = opd;
|
|
// Vector instructions do not mask shift count, do it here.
|
|
juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
|
|
const TypeInt* t = opd->find_int_type();
|
|
if (t != nullptr && t->is_con()) {
|
|
juint shift = t->get_con();
|
|
if (shift > mask) { // Unsigned cmp
|
|
cnt = igvn().intcon(shift & mask);
|
|
phase()->set_ctrl(cnt, phase()->C->root());
|
|
}
|
|
} else {
|
|
if (t == nullptr || t->_lo < 0 || t->_hi > (int)mask) {
|
|
cnt = igvn().intcon(mask);
|
|
cnt = new AndINode(opd, cnt);
|
|
phase()->register_new_node_with_ctrl_of(cnt, opd);
|
|
}
|
|
if (!opd->bottom_type()->isa_int()) {
|
|
assert(false, "int type only");
|
|
return nullptr;
|
|
}
|
|
}
|
|
// Move shift count into vector register.
|
|
cnt = VectorNode::shift_count(p0->Opcode(), cnt, vlen, velt_basic_type(p0));
|
|
phase()->register_new_node_with_ctrl_of(cnt, opd);
|
|
return cnt;
|
|
}
|
|
if (opd->is_StoreVector()) {
|
|
assert(false, "StoreVector is not expected here");
|
|
return nullptr;
|
|
}
|
|
// Convert scalar input to vector with the same number of elements as
|
|
// p0's vector. Use p0's type because size of operand's container in
|
|
// vector should match p0's size regardless operand's size.
|
|
const Type* p0_t = nullptr;
|
|
VectorNode* vn = nullptr;
|
|
if (opd_idx == 2 && VectorNode::is_scalar_rotate(p0)) {
|
|
Node* conv = opd;
|
|
p0_t = TypeInt::INT;
|
|
if (p0->bottom_type()->isa_long()) {
|
|
p0_t = TypeLong::LONG;
|
|
conv = new ConvI2LNode(opd);
|
|
phase()->register_new_node_with_ctrl_of(conv, opd);
|
|
}
|
|
vn = VectorNode::scalar2vector(conv, vlen, p0_t);
|
|
} else {
|
|
p0_t = velt_type(p0);
|
|
vn = VectorNode::scalar2vector(opd, vlen, p0_t);
|
|
}
|
|
|
|
phase()->register_new_node_with_ctrl_of(vn, opd);
|
|
VectorNode::trace_new_vector(vn, "SuperWord");
|
|
return vn;
|
|
}
|
|
|
|
// Insert pack operation
|
|
BasicType bt = velt_basic_type(p0);
|
|
PackNode* pk = PackNode::make(opd, vlen, bt);
|
|
DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); )
|
|
|
|
for (uint i = 1; i < vlen; i++) {
|
|
Node* pi = p->at(i);
|
|
Node* in = pi->in(opd_idx);
|
|
if (get_pack(in) != nullptr) {
|
|
assert(false, "Should already have been unpacked");
|
|
return nullptr;
|
|
}
|
|
assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
|
|
pk->add_opd(in);
|
|
if (VectorNode::is_muladds2i(pi)) {
|
|
Node* in2 = pi->in(opd_idx + 2);
|
|
if (get_pack(in2) != nullptr) {
|
|
assert(false, "Should already have been unpacked");
|
|
return nullptr;
|
|
}
|
|
assert(opd_bt == in2->bottom_type()->basic_type(), "all same type");
|
|
pk->add_opd(in2);
|
|
}
|
|
}
|
|
phase()->register_new_node_with_ctrl_of(pk, opd);
|
|
VectorNode::trace_new_vector(pk, "SuperWord");
|
|
return pk;
|
|
}
|
|
|
|
#ifdef ASSERT
|
|
// We check that every packset (name it p_def) only has vector uses (p_use),
|
|
// which are proper vector uses of def.
|
|
void SuperWord::verify_no_extract() {
|
|
for (int i = 0; i < _packset.length(); i++) {
|
|
Node_List* p_def = _packset.at(i);
|
|
|
|
// A vector store has no uses
|
|
if (p_def->at(0)->is_Store()) { continue; }
|
|
|
|
// for every def in p_def, and every use:
|
|
for (uint i = 0; i < p_def->size(); i++) {
|
|
Node* def = p_def->at(i);
|
|
for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
|
|
Node* use = def->fast_out(j);
|
|
// find every use->def edge:
|
|
for (uint k = 0; k < use->req(); k++) {
|
|
Node* maybe_def = use->in(k);
|
|
if (def == maybe_def) {
|
|
Node_List* p_use = get_pack(use);
|
|
if (is_marked_reduction(def)) { continue; }
|
|
assert(p_use != nullptr && is_vector_use(use, k), "all uses must be vector uses");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
// Check if n_super's pack uses are a superset of n_sub's pack uses.
|
|
bool SuperWord::has_use_pack_superset(const Node* n_super, const Node* n_sub) const {
|
|
Node_List* pack = get_pack(n_super);
|
|
assert(pack != nullptr && pack == get_pack(n_sub), "must have the same pack");
|
|
|
|
// For all uses of n_sub that are in a pack (use_sub) ...
|
|
for (DUIterator_Fast jmax, j = n_sub->fast_outs(jmax); j < jmax; j++) {
|
|
Node* use_sub = n_sub->fast_out(j);
|
|
Node_List* pack_use_sub = get_pack(use_sub);
|
|
if (pack_use_sub == nullptr) { continue; }
|
|
|
|
// ... and all input edges: use_sub->in(i) == n_sub.
|
|
uint start, end;
|
|
VectorNode::vector_operands(use_sub, &start, &end);
|
|
for (uint i = start; i < end; i++) {
|
|
if (use_sub->in(i) != n_sub) { continue; }
|
|
|
|
// Check if n_super has any use use_super in the same pack ...
|
|
bool found = false;
|
|
for (DUIterator_Fast kmax, k = n_super->fast_outs(kmax); k < kmax; k++) {
|
|
Node* use_super = n_super->fast_out(k);
|
|
Node_List* pack_use_super = get_pack(use_super);
|
|
if (pack_use_sub != pack_use_super) { continue; }
|
|
|
|
// ... and where there is an edge use_super->in(i) == n_super.
|
|
// For MulAddS2I it is expected to have defs over different input edges.
|
|
if (use_super->in(i) != n_super && !VectorNode::is_muladds2i(use_super)) { continue; }
|
|
|
|
found = true;
|
|
break;
|
|
}
|
|
if (!found) {
|
|
// n_sub has a use-edge (use_sub->in(i) == n_sub) with use_sub in a packset,
|
|
// but n_super does not have any edge (use_super->in(i) == n_super) with
|
|
// use_super in the same packset. Hence, n_super does not have a use pack
|
|
// superset of n_sub.
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
// n_super has all edges that n_sub has.
|
|
return true;
|
|
}
|
|
|
|
// Find a boundary in the pack, where left and right have different pack uses and defs.
|
|
// This is a natural boundary to split a pack, to ensure that use and def packs match.
|
|
// If no boundary is found, return zero.
|
|
uint SuperWord::find_use_def_boundary(const Node_List* pack) const {
|
|
Node* p0 = pack->at(0);
|
|
Node* p1 = pack->at(1);
|
|
|
|
const bool is_reduction_pack = reduction(p0, p1);
|
|
|
|
// Inputs range
|
|
uint start, end;
|
|
VectorNode::vector_operands(p0, &start, &end);
|
|
|
|
for (int i = pack->size() - 2; i >= 0; i--) {
|
|
// For all neighbours
|
|
Node* n0 = pack->at(i + 0);
|
|
Node* n1 = pack->at(i + 1);
|
|
|
|
|
|
// 1. Check for matching defs
|
|
for (uint j = start; j < end; j++) {
|
|
Node* n0_in = n0->in(j);
|
|
Node* n1_in = n1->in(j);
|
|
// No boundary if:
|
|
// 1) the same packs OR
|
|
// 2) reduction edge n0->n1 or n1->n0
|
|
if (get_pack(n0_in) != get_pack(n1_in) &&
|
|
!((n0 == n1_in || n1 == n0_in) && is_reduction_pack)) {
|
|
return i + 1;
|
|
}
|
|
}
|
|
|
|
// 2. Check for matching uses: equal if both are superset of the other.
|
|
// Reductions have no pack uses, so they match trivially on the use packs.
|
|
if (!is_reduction_pack &&
|
|
!(has_use_pack_superset(n0, n1) &&
|
|
has_use_pack_superset(n1, n0))) {
|
|
return i + 1;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
//------------------------------is_vector_use---------------------------
|
|
// Is use->in(u_idx) a vector use?
|
|
bool SuperWord::is_vector_use(Node* use, int u_idx) const {
|
|
Node_List* u_pk = get_pack(use);
|
|
if (u_pk == nullptr) return false;
|
|
|
|
// Reduction: first input is internal connection.
|
|
if (is_marked_reduction(use) && u_idx == 1) {
|
|
#ifdef ASSERT
|
|
for (uint i = 1; i < u_pk->size(); i++) {
|
|
assert(u_pk->at(i - 1) == u_pk->at(i)->in(1), "internal connection");
|
|
}
|
|
#endif
|
|
return true;
|
|
}
|
|
|
|
Node* def = use->in(u_idx);
|
|
Node_List* d_pk = get_pack(def);
|
|
if (d_pk == nullptr) {
|
|
Node* n = u_pk->at(0)->in(u_idx);
|
|
if (n == iv()) {
|
|
// check for index population
|
|
BasicType bt = velt_basic_type(use);
|
|
if (!VectorNode::is_populate_index_supported(bt)) return false;
|
|
for (uint i = 1; i < u_pk->size(); i++) {
|
|
// We can create a vector filled with iv indices if all other nodes
|
|
// in use pack have inputs of iv plus node index.
|
|
Node* use_in = u_pk->at(i)->in(u_idx);
|
|
if (!use_in->is_Add() || use_in->in(1) != n) return false;
|
|
const TypeInt* offset_t = use_in->in(2)->bottom_type()->is_int();
|
|
if (offset_t == nullptr || !offset_t->is_con() ||
|
|
offset_t->get_con() != (jint) i) return false;
|
|
}
|
|
} else {
|
|
// check for scalar promotion
|
|
for (uint i = 1; i < u_pk->size(); i++) {
|
|
if (u_pk->at(i)->in(u_idx) != n) return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
if (!is_velt_basic_type_compatible_use_def(use, def)) {
|
|
return false;
|
|
}
|
|
|
|
if (VectorNode::is_muladds2i(use)) {
|
|
// MulAddS2I takes shorts and produces ints.
|
|
if (u_pk->size() * 2 != d_pk->size()) {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
if (u_pk->size() != d_pk->size()) {
|
|
return false;
|
|
}
|
|
|
|
for (uint i = 0; i < u_pk->size(); i++) {
|
|
Node* ui = u_pk->at(i);
|
|
Node* di = d_pk->at(i);
|
|
if (ui->in(u_idx) != di) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Check if the output type of def is compatible with the input type of use, i.e. if the
|
|
// types have the same size.
|
|
bool SuperWord::is_velt_basic_type_compatible_use_def(Node* use, Node* def) const {
|
|
assert(in_bb(def) && in_bb(use), "both use and def are in loop");
|
|
|
|
// Conversions are trivially compatible.
|
|
if (VectorNode::is_convert_opcode(use->Opcode())) {
|
|
return true;
|
|
}
|
|
|
|
BasicType use_bt = velt_basic_type(use);
|
|
BasicType def_bt = velt_basic_type(def);
|
|
|
|
assert(is_java_primitive(use_bt), "sanity %s", type2name(use_bt));
|
|
assert(is_java_primitive(def_bt), "sanity %s", type2name(def_bt));
|
|
|
|
// Nodes like Long.bitCount: expect long input, and int output.
|
|
if (VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(use->Opcode())) {
|
|
return type2aelembytes(def_bt) == 8 &&
|
|
type2aelembytes(use_bt) == 4;
|
|
}
|
|
|
|
// MulAddS2I: expect short input, and int output.
|
|
if (VectorNode::is_muladds2i(use)) {
|
|
return type2aelembytes(def_bt) == 2 &&
|
|
type2aelembytes(use_bt) == 4;
|
|
}
|
|
|
|
// Default case: input size of use equals output size of def.
|
|
return type2aelembytes(use_bt) == type2aelembytes(def_bt);
|
|
}
|
|
|
|
// Return nullptr if success, else failure message
|
|
VStatus VLoopBody::construct() {
|
|
assert(_body.is_empty(), "body is empty");
|
|
|
|
// First pass over loop body:
|
|
// (1) Check that there are no unwanted nodes (LoadStore, MergeMem, data Proj).
|
|
// (2) Count number of nodes, and create a temporary map (_idx -> bb_idx).
|
|
// (3) Verify that all non-ctrl nodes have an input inside the loop.
|
|
int body_count = 0;
|
|
for (uint i = 0; i < _vloop.lpt()->_body.size(); i++) {
|
|
Node* n = _vloop.lpt()->_body.at(i);
|
|
set_bb_idx(n, i); // Create a temporary map
|
|
if (_vloop.in_bb(n)) {
|
|
body_count++;
|
|
|
|
if (n->is_LoadStore() || n->is_MergeMem() ||
|
|
(n->is_Proj() && !n->as_Proj()->is_CFG())) {
|
|
// Bailout if the loop has LoadStore, MergeMem or data Proj
|
|
// nodes. Superword optimization does not work with them.
|
|
#ifndef PRODUCT
|
|
if (_vloop.is_trace_body()) {
|
|
tty->print_cr("VLoopBody::construct: fails because of unhandled node:");
|
|
n->dump();
|
|
}
|
|
#endif
|
|
return VStatus::make_failure(VLoopBody::FAILURE_NODE_NOT_ALLOWED);
|
|
}
|
|
|
|
if (!n->is_CFG()) {
|
|
bool found = false;
|
|
for (uint j = 0; j < n->req(); j++) {
|
|
Node* def = n->in(j);
|
|
if (def != nullptr && _vloop.in_bb(def)) {
|
|
found = true;
|
|
break;
|
|
}
|
|
}
|
|
if (!found) {
|
|
// If all inputs to a data-node are outside the loop, the node itself should be outside the loop.
|
|
#ifndef PRODUCT
|
|
if (_vloop.is_trace_body()) {
|
|
tty->print_cr("VLoopBody::construct: fails because data node in loop has no input in loop:");
|
|
n->dump();
|
|
}
|
|
#endif
|
|
return VStatus::make_failure(VLoopBody::FAILURE_UNEXPECTED_CTRL);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Create a reverse-post-order list of nodes in body
|
|
ResourceMark rm;
|
|
GrowableArray<Node*> stack;
|
|
VectorSet visited;
|
|
VectorSet post_visited;
|
|
|
|
visited.set(bb_idx(_vloop.cl()));
|
|
stack.push(_vloop.cl());
|
|
|
|
// Do a depth first walk over out edges
|
|
int rpo_idx = body_count - 1;
|
|
while (!stack.is_empty()) {
|
|
Node* n = stack.top(); // Leave node on stack
|
|
if (!visited.test_set(bb_idx(n))) {
|
|
// forward arc in graph
|
|
} else if (!post_visited.test(bb_idx(n))) {
|
|
// cross or back arc
|
|
const int old_length = stack.length();
|
|
|
|
// If a Load depends on the same memory state as a Store, we must make sure that
|
|
// the Load is ordered before the Store.
|
|
//
|
|
// mem
|
|
// |
|
|
// +--+--+
|
|
// | |
|
|
// | Load (n)
|
|
// |
|
|
// Store (mem_use)
|
|
//
|
|
if (n->is_Load()) {
|
|
Node* mem = n->in(MemNode::Memory);
|
|
for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
|
|
Node* mem_use = mem->fast_out(i);
|
|
if (mem_use->is_Store() && _vloop.in_bb(mem_use) && !visited.test(bb_idx(mem_use))) {
|
|
stack.push(mem_use); // Ordering edge: Load (n) -> Store (mem_use)
|
|
}
|
|
}
|
|
}
|
|
|
|
for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
|
|
Node* use = n->fast_out(i);
|
|
if (_vloop.in_bb(use) && !visited.test(bb_idx(use)) &&
|
|
// Don't go around backedge
|
|
(!use->is_Phi() || n == _vloop.cl())) {
|
|
stack.push(use); // Ordering edge: n -> use
|
|
}
|
|
}
|
|
|
|
if (stack.length() == old_length) {
|
|
// There were no additional uses, post visit node now
|
|
stack.pop(); // Remove node from stack
|
|
assert(rpo_idx >= 0, "must still have idx to pass out");
|
|
_body.at_put_grow(rpo_idx, n);
|
|
rpo_idx--;
|
|
post_visited.set(bb_idx(n));
|
|
assert(rpo_idx >= 0 || stack.is_empty(), "still have idx left or are finished");
|
|
}
|
|
} else {
|
|
stack.pop(); // Remove post-visited node from stack
|
|
}
|
|
}
|
|
|
|
// Create real map of body indices for nodes
|
|
for (int j = 0; j < _body.length(); j++) {
|
|
Node* n = _body.at(j);
|
|
set_bb_idx(n, j);
|
|
}
|
|
|
|
#ifndef PRODUCT
|
|
if (_vloop.is_trace_body()) {
|
|
print();
|
|
}
|
|
#endif
|
|
|
|
assert(rpo_idx == -1 && body_count == _body.length(), "all body members found");
|
|
return VStatus::make_success();
|
|
}
|
|
|
|
// For a node that changes the element size between its first input and its output
// (a conversion, or a scalar op that returns int while the vector op returns long),
// return the basic type of the larger of the two element types. Returns T_ILLEGAL
// if n is no such node, if its input is outside the loop, if a non-primitive or
// unsigned subword type is involved, or if both sizes are equal.
BasicType SuperWord::longer_type_for_conversion(Node* n) const {
  const int opc = n->Opcode();
  const bool may_change_size =
      VectorNode::is_convert_opcode(opc) ||
      VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(opc);
  if (!may_change_size) {
    return T_ILLEGAL;
  }
  if (!in_bb(n->in(1))) {
    return T_ILLEGAL;
  }
  assert(in_bb(n), "must be in the bb");

  BasicType src_t = velt_basic_type(n->in(1));
  BasicType dst_t = velt_basic_type(n);

  // Do not use superword for non-primitives.
  // Superword does not support casting involving unsigned types.
  const bool src_ok = is_java_primitive(src_t) && !is_unsigned_subword_type(src_t);
  const bool dst_ok = is_java_primitive(dst_t) && !is_unsigned_subword_type(dst_t);
  if (!src_ok || !dst_ok) {
    return T_ILLEGAL;
  }

  const int src_size = type2aelembytes(src_t);
  const int dst_size = type2aelembytes(dst_t);
  if (src_size == dst_size) {
    return T_ILLEGAL;
  }
  return (src_size > dst_size) ? src_t : dst_t;
}
|
|
|
|
void VLoopTypes::compute_vector_element_type() {
|
|
#ifndef PRODUCT
|
|
if (_vloop.is_trace_vector_element_type()) {
|
|
tty->print_cr("\nVLoopTypes::compute_vector_element_type:");
|
|
}
|
|
#endif
|
|
|
|
const GrowableArray<Node*>& body = _body.body();
|
|
|
|
assert(_velt_type.is_empty(), "must not yet be computed");
|
|
// reserve space
|
|
_velt_type.at_put_grow(body.length()-1, nullptr);
|
|
|
|
// Initial type
|
|
for (int i = 0; i < body.length(); i++) {
|
|
Node* n = body.at(i);
|
|
set_velt_type(n, container_type(n));
|
|
}
|
|
|
|
// Propagate integer narrowed type backwards through operations
|
|
// that don't depend on higher order bits
|
|
for (int i = body.length() - 1; i >= 0; i--) {
|
|
Node* n = body.at(i);
|
|
// Only integer types need be examined
|
|
const Type* vtn = velt_type(n);
|
|
if (vtn->basic_type() == T_INT) {
|
|
uint start, end;
|
|
VectorNode::vector_operands(n, &start, &end);
|
|
|
|
for (uint j = start; j < end; j++) {
|
|
Node* in = n->in(j);
|
|
// Don't propagate through a memory
|
|
if (!in->is_Mem() &&
|
|
_vloop.in_bb(in) &&
|
|
velt_type(in)->basic_type() == T_INT &&
|
|
data_size(n) < data_size(in)) {
|
|
bool same_type = true;
|
|
for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
|
|
Node *use = in->fast_out(k);
|
|
if (!_vloop.in_bb(use) || !same_velt_type(use, n)) {
|
|
same_type = false;
|
|
break;
|
|
}
|
|
}
|
|
if (same_type) {
|
|
// In any Java arithmetic operation, operands of small integer types
|
|
// (boolean, byte, char & short) should be promoted to int first.
|
|
// During narrowed integer type backward propagation, for some operations
|
|
// like RShiftI, Abs, and ReverseBytesI,
|
|
// the compiler has to know the higher order bits of the 1st operand,
|
|
// which will be lost in the narrowed type. These operations shouldn't
|
|
// be vectorized if the higher order bits info is imprecise.
|
|
const Type* vt = vtn;
|
|
int op = in->Opcode();
|
|
if (VectorNode::is_shift_opcode(op) || op == Op_AbsI || op == Op_ReverseBytesI) {
|
|
Node* load = in->in(1);
|
|
if (load->is_Load() &&
|
|
_vloop.in_bb(load) &&
|
|
(velt_type(load)->basic_type() == T_INT)) {
|
|
// Only Load nodes distinguish signed (LoadS/LoadB) and unsigned
|
|
// (LoadUS/LoadUB) values. Store nodes only have one version.
|
|
vt = velt_type(load);
|
|
} else if (op != Op_LShiftI) {
|
|
// Widen type to int to avoid the creation of vector nodes. Note
|
|
// that left shifts work regardless of the signedness.
|
|
vt = TypeInt::INT;
|
|
}
|
|
}
|
|
set_velt_type(in, vt);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
for (int i = 0; i < body.length(); i++) {
|
|
Node* n = body.at(i);
|
|
Node* nn = n;
|
|
if (nn->is_Bool() && nn->in(0) == nullptr) {
|
|
nn = nn->in(1);
|
|
assert(nn->is_Cmp(), "always have Cmp above Bool");
|
|
}
|
|
if (nn->is_Cmp() && nn->in(0) == nullptr) {
|
|
assert(_vloop.in_bb(nn->in(1)) || _vloop.in_bb(nn->in(2)),
|
|
"one of the inputs must be in the loop, too");
|
|
if (_vloop.in_bb(nn->in(1))) {
|
|
set_velt_type(n, velt_type(nn->in(1)));
|
|
} else {
|
|
set_velt_type(n, velt_type(nn->in(2)));
|
|
}
|
|
}
|
|
}
|
|
#ifndef PRODUCT
|
|
if (_vloop.is_trace_vector_element_type()) {
|
|
for (int i = 0; i < body.length(); i++) {
|
|
Node* n = body.at(i);
|
|
velt_type(n)->dump();
|
|
tty->print("\t");
|
|
n->dump();
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
|
|
// Smallest type containing range of values
|
|
const Type* VLoopTypes::container_type(Node* n) const {
|
|
if (n->is_Mem()) {
|
|
BasicType bt = n->as_Mem()->memory_type();
|
|
if (n->is_Store() && (bt == T_CHAR)) {
|
|
// Use T_SHORT type instead of T_CHAR for stored values because any
|
|
// preceding arithmetic operation extends values to signed Int.
|
|
bt = T_SHORT;
|
|
}
|
|
if (n->Opcode() == Op_LoadUB) {
|
|
// Adjust type for unsigned byte loads, it is important for right shifts.
|
|
// T_BOOLEAN is used because there is no basic type representing type
|
|
// TypeInt::UBYTE. Use of T_BOOLEAN for vectors is fine because only
|
|
// size (one byte) and sign is important.
|
|
bt = T_BOOLEAN;
|
|
}
|
|
return Type::get_const_basic_type(bt);
|
|
}
|
|
const Type* t = _vloop.phase()->igvn().type(n);
|
|
if (t->basic_type() == T_INT) {
|
|
// A narrow type of arithmetic operations will be determined by
|
|
// propagating the type of memory operations.
|
|
return TypeInt::INT;
|
|
}
|
|
return t;
|
|
}
|
|
|
|
// Two memory nodes are in the same memory slice if the address types of both map
// to the same alias index.
bool VLoopMemorySlices::same_memory_slice(MemNode* m1, MemNode* m2) const {
  const int alias_idx1 = _vloop.phase()->C->get_alias_index(m1->adr_type());
  const int alias_idx2 = _vloop.phase()->C->get_alias_index(m2->adr_type());
  return alias_idx1 == alias_idx2;
}
|
|
|
|
// Compute the control dependency for a pack of loads. It starts out as
// DependsOnlyOnTest, and is strengthened by every load that does not depend only on
// the test: to UnknownControl if that load has an unknown control dependency and
// nothing was pinned so far, to Pinned otherwise.
LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) {
  LoadNode::ControlDependency dep = LoadNode::DependsOnlyOnTest;
  for (uint i = 0; i < p->size(); i++) {
    Node* n = p->at(i);
    assert(n->is_Load(), "only meaningful for loads");
    if (n->depends_only_on_test()) { continue; }

    // Upgrade to unknown control if possible, otherwise we must pin it.
    const bool upgrade_to_unknown = n->as_Load()->has_unknown_control_dependency() &&
                                    dep != LoadNode::Pinned;
    dep = upgrade_to_unknown ? LoadNode::UnknownControl : LoadNode::Pinned;
  }
  return dep;
}
|
|
|
|
// Find the memop pack with the maximum vector width, unless they were already
|
|
// determined by SuperWord::filter_packs_for_alignment().
|
|
void SuperWord::determine_mem_ref_and_aw_for_main_loop_alignment() {
|
|
if (_mem_ref_for_main_loop_alignment != nullptr) {
|
|
assert(VLoop::vectors_should_be_aligned(), "mem_ref only set if filtered for alignment");
|
|
return;
|
|
}
|
|
|
|
MemNode const* mem_ref = nullptr;
|
|
int max_aw = 0;
|
|
for (int i = 0; i < _packset.length(); i++) {
|
|
Node_List* pack = _packset.at(i);
|
|
MemNode* first = pack->at(0)->isa_Mem();
|
|
if (first == nullptr) { continue; }
|
|
|
|
int vw = first->memory_size() * pack->size();
|
|
if (vw > max_aw) {
|
|
max_aw = vw;
|
|
mem_ref = first;
|
|
}
|
|
}
|
|
assert(mem_ref != nullptr && max_aw > 0, "found mem_ref and aw");
|
|
_mem_ref_for_main_loop_alignment = mem_ref;
|
|
_aw_for_main_loop_alignment = max_aw;
|
|
}
|
|
|
|
// Tracing helper: if is_trace_align_vector() is set, print the spelling of the
// argument followed by a dump of the node. The body is wrapped in DEBUG_ONLY.
// The argument is parenthesized where it is evaluated, and the last line carries no
// line continuation, so that the macro cannot absorb the line that follows it.
#define TRACE_ALIGN_VECTOR_NODE(node) { \
  DEBUG_ONLY(                           \
    if (is_trace_align_vector()) {      \
      tty->print(" " #node ": ");       \
      (node)->dump();                   \
    }                                   \
  )                                     \
}
|
|
|
|
// Ensure that the main loop vectors are aligned by adjusting the pre loop limit. We memory-align
|
|
// the address of "_mem_ref_for_main_loop_alignment" to "_aw_for_main_loop_alignment", which is a
|
|
// sufficiently large alignment width. We adjust the pre-loop iteration count by adjusting the
|
|
// pre-loop limit.
|
|
void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
|
|
determine_mem_ref_and_aw_for_main_loop_alignment();
|
|
const MemNode* align_to_ref = _mem_ref_for_main_loop_alignment;
|
|
const int aw = _aw_for_main_loop_alignment;
|
|
assert(align_to_ref != nullptr && aw > 0, "must have alignment reference and aw");
|
|
assert(cl()->is_main_loop(), "can only do alignment for main loop");
|
|
|
|
// The opaque node for the limit, where we adjust the input
|
|
Opaque1Node* pre_opaq = _vloop.pre_loop_end()->limit()->as_Opaque1();
|
|
|
|
// Current pre-loop limit.
|
|
Node* old_limit = pre_opaq->in(1);
|
|
|
|
// Where we put new limit calculations.
|
|
Node* pre_ctrl = _vloop.pre_loop_head()->in(LoopNode::EntryControl);
|
|
|
|
// Ensure the original loop limit is available from the pre-loop Opaque1 node.
|
|
Node* orig_limit = pre_opaq->original_loop_limit();
|
|
assert(orig_limit != nullptr && igvn().type(orig_limit) != Type::TOP, "");
|
|
|
|
const VPointer& align_to_ref_p = vpointer(align_to_ref);
|
|
assert(align_to_ref_p.valid(), "sanity");
|
|
|
|
// For the main-loop, we want the address of align_to_ref to be memory aligned
|
|
// with some alignment width (aw, a power of 2). When we enter the main-loop,
|
|
// we know that iv is equal to the pre-loop limit. If we adjust the pre-loop
|
|
// limit by executing adjust_pre_iter many extra iterations, we can change the
|
|
// alignment of the address.
|
|
//
|
|
// adr = base + offset + invar + scale * iv (1)
|
|
// adr % aw = 0 (2)
|
|
//
|
|
// Note, that we are defining the modulo operator "%" such that the remainder is
|
|
// always positive, see AlignmentSolution::mod(i, q). Since we are only computing
|
|
// modulo with powers of 2, we can instead simply use the last log2(q) bits of
|
|
// a number i, to get "i % q". This is performed with a bitmask.
|
|
//
|
|
// The limit of the pre-loop needs to be adjusted:
|
|
//
|
|
// old_limit: current pre-loop limit
|
|
// new_limit: new pre-loop limit
|
|
// adjust_pre_iter: additional pre-loop iterations for alignment adjustment
|
|
//
|
|
// We want to find adjust_pre_iter, such that the address is aligned when entering
|
|
// the main-loop:
|
|
//
|
|
// iv = new_limit = old_limit + adjust_pre_iter (3a, stride > 0)
|
|
// iv = new_limit = old_limit - adjust_pre_iter (3b, stride < 0)
|
|
//
|
|
// We define boi as:
|
|
//
|
|
// boi = base + offset + invar (4)
|
|
//
|
|
// And now we can simplify the address using (1), (3), and (4):
|
|
//
|
|
// adr = boi + scale * new_limit
|
|
// adr = boi + scale * (old_limit + adjust_pre_iter) (5a, stride > 0)
|
|
// adr = boi + scale * (old_limit - adjust_pre_iter) (5b, stride < 0)
|
|
//
|
|
// And hence we can restate (2) with (5), and solve the equation for adjust_pre_iter:
|
|
//
|
|
// (boi + scale * (old_limit + adjust_pre_iter) % aw = 0 (6a, stride > 0)
|
|
// (boi + scale * (old_limit - adjust_pre_iter) % aw = 0 (6b, stride < 0)
|
|
//
|
|
// In most cases, scale is the element size, for example:
|
|
//
|
|
// for (i = 0; i < a.length; i++) { a[i] = ...; }
|
|
//
|
|
// It is thus reasonable to assume that both abs(scale) and abs(stride) are
|
|
// strictly positive powers of 2. Further, they can be assumed to be non-zero,
|
|
// otherwise the address does not depend on iv, and the alignment cannot be
|
|
// affected by adjusting the pre-loop limit.
|
|
//
|
|
// Further, if abs(scale) >= aw, then adjust_pre_iter has no effect on alignment, and
|
|
// we are not able to affect the alignment at all. Hence, we require abs(scale) < aw.
|
|
//
|
|
// Moreover, for alignment to be achievable, boi must be a multiple of scale. If strict
|
|
// alignment is required (i.e. -XX:+AlignVector), this is guaranteed by the filtering
|
|
// done with the AlignmentSolver / AlignmentSolution. If strict alignment is not
|
|
// required, then alignment is still preferable for performance, but not necessary.
|
|
// In many cases boi will be a multiple of scale, but if it is not, then the adjustment
|
|
// does not guarantee alignment, but the code is still correct.
|
|
//
|
|
// Hence, in what follows we assume that boi is a multiple of scale, and in fact all
|
|
// terms in (6) are multiples of scale. Therefore we divide all terms by scale:
|
|
//
|
|
// AW = aw / abs(scale) (power of 2) (7)
|
|
// BOI = boi / abs(scale) (8)
|
|
//
|
|
// and restate (6), using (7) and (8), i.e. we divide (6) by abs(scale):
|
|
//
|
|
// (BOI + sign(scale) * (old_limit + adjust_pre_iter) % AW = 0 (9a, stride > 0)
|
|
// (BOI + sign(scale) * (old_limit - adjust_pre_iter) % AW = 0 (9b, stride < 0)
|
|
//
|
|
// where: sign(scale) = scale / abs(scale) = (scale > 0 ? 1 : -1)
|
|
//
|
|
// Note, (9) allows for periodic solutions of adjust_pre_iter, with periodicity AW.
|
|
// But we would like to spend as few iterations in the pre-loop as possible,
|
|
// hence we want the smallest adjust_pre_iter, and so:
|
|
//
|
|
// 0 <= adjust_pre_iter < AW (10)
|
|
//
|
|
// We solve (9) for adjust_pre_iter, in the following 4 cases:
|
|
//
|
|
// Case A: scale > 0 && stride > 0 (i.e. sign(scale) = 1)
|
|
// (BOI + old_limit + adjust_pre_iter) % AW = 0
|
|
// adjust_pre_iter = (-BOI - old_limit) % AW (11a)
|
|
//
|
|
// Case B: scale < 0 && stride > 0 (i.e. sign(scale) = -1)
|
|
// (BOI - old_limit - adjust_pre_iter) % AW = 0
|
|
// adjust_pre_iter = (BOI - old_limit) % AW (11b)
|
|
//
|
|
// Case C: scale > 0 && stride < 0 (i.e. sign(scale) = 1)
|
|
// (BOI + old_limit - adjust_pre_iter) % AW = 0
|
|
// adjust_pre_iter = (BOI + old_limit) % AW (11c)
|
|
//
|
|
// Case D: scale < 0 && stride < 0 (i.e. sign(scale) = -1)
|
|
// (BOI - old_limit + adjust_pre_iter) % AW = 0
|
|
// adjust_pre_iter = (-BOI + old_limit) % AW (11d)
|
|
//
|
|
// We now generalize the equations (11*) by using:
|
|
//
|
|
// OP: (stride > 0) ? SUB : ADD
|
|
// XBOI: (stride * scale > 0) ? -BOI : BOI
|
|
//
|
|
// which gives us the final pre-loop limit adjustment:
|
|
//
|
|
// adjust_pre_iter = (XBOI OP old_limit) % AW (12)
|
|
//
|
|
// We can construct XBOI by additionally defining:
|
|
//
|
|
// xboi = (stride * scale > 0) ? -boi : boi (13)
|
|
//
|
|
// which gives us:
|
|
//
|
|
// XBOI = (stride * scale > 0) ? -BOI : BOI
|
|
// = (stride * scale > 0) ? -boi / abs(scale) : boi / abs(scale)
|
|
// = xboi / abs(scale) (14)
|
|
//
|
|
// When we have computed adjust_pre_iter, we update the pre-loop limit
|
|
// with (3a, b). However, we have to make sure that the adjust_pre_iter
|
|
// additional pre-loop iterations do not lead the pre-loop to execute
|
|
// iterations that would step over the original limit (orig_limit) of
|
|
// the loop. Hence, we must constrain the updated limit as follows:
|
|
//
|
|
// constrained_limit = MIN(old_limit + adjust_pre_iter, orig_limit)
|
|
// = MIN(new_limit, orig_limit) (15a, stride > 0)
|
|
// constrained_limit = MAX(old_limit - adjust_pre_iter, orig_limit)
|
|
// = MAX(new_limit, orig_limit) (15a, stride < 0)
|
|
//
|
|
const int stride = iv_stride();
|
|
const int scale = align_to_ref_p.scale_in_bytes();
|
|
const int offset = align_to_ref_p.offset_in_bytes();
|
|
Node* base = align_to_ref_p.adr();
|
|
Node* invar = align_to_ref_p.invar();
|
|
|
|
#ifdef ASSERT
|
|
if (is_trace_align_vector()) {
|
|
tty->print_cr("\nadjust_pre_loop_limit_to_align_main_loop_vectors:");
|
|
tty->print(" align_to_ref:");
|
|
align_to_ref->dump();
|
|
tty->print_cr(" aw: %d", aw);
|
|
tty->print_cr(" stride: %d", stride);
|
|
tty->print_cr(" scale: %d", scale);
|
|
tty->print_cr(" offset: %d", offset);
|
|
tty->print(" base:");
|
|
base->dump();
|
|
if (invar == nullptr) {
|
|
tty->print_cr(" invar: null");
|
|
} else {
|
|
tty->print(" invar:");
|
|
invar->dump();
|
|
}
|
|
tty->print(" old_limit: ");
|
|
old_limit->dump();
|
|
tty->print(" orig_limit: ");
|
|
orig_limit->dump();
|
|
}
|
|
#endif
|
|
|
|
if (stride == 0 || !is_power_of_2(abs(stride)) ||
|
|
scale == 0 || !is_power_of_2(abs(scale)) ||
|
|
abs(scale) >= aw) {
|
|
#ifdef ASSERT
|
|
if (is_trace_align_vector()) {
|
|
tty->print_cr(" Alignment cannot be affected by changing pre-loop limit because");
|
|
tty->print_cr(" stride or scale are not power of 2, or abs(scale) >= aw.");
|
|
}
|
|
#endif
|
|
// Cannot affect alignment, abort.
|
|
return;
|
|
}
|
|
|
|
assert(stride != 0 && is_power_of_2(abs(stride)) &&
|
|
scale != 0 && is_power_of_2(abs(scale)) &&
|
|
abs(scale) < aw, "otherwise we cannot affect alignment with pre-loop");
|
|
|
|
const int AW = aw / abs(scale);
|
|
|
|
#ifdef ASSERT
|
|
if (is_trace_align_vector()) {
|
|
tty->print_cr(" AW = aw(%d) / abs(scale(%d)) = %d", aw, scale, AW);
|
|
}
|
|
#endif
|
|
|
|
// 1: Compute (13a, b):
|
|
// xboi = -boi = (-base - offset - invar) (stride * scale > 0)
|
|
// xboi = +boi = (+base + offset + invar) (stride * scale < 0)
|
|
const bool is_sub = scale * stride > 0;
|
|
|
|
// 1.1: offset
|
|
Node* xboi = igvn().intcon(is_sub ? -offset : offset);
|
|
TRACE_ALIGN_VECTOR_NODE(xboi);
|
|
|
|
// 1.2: invar (if it exists)
|
|
if (invar != nullptr) {
|
|
if (igvn().type(invar)->isa_long()) {
|
|
// Computations are done % (vector width/element size) so it's
|
|
// safe to simply convert invar to an int and loose the upper 32
|
|
// bit half.
|
|
invar = new ConvL2INode(invar);
|
|
phase()->register_new_node(invar, pre_ctrl);
|
|
TRACE_ALIGN_VECTOR_NODE(invar);
|
|
}
|
|
if (is_sub) {
|
|
xboi = new SubINode(xboi, invar);
|
|
} else {
|
|
xboi = new AddINode(xboi, invar);
|
|
}
|
|
phase()->register_new_node(xboi, pre_ctrl);
|
|
TRACE_ALIGN_VECTOR_NODE(xboi);
|
|
}
|
|
|
|
// 1.3: base (unless base is guaranteed aw aligned)
|
|
if (aw > ObjectAlignmentInBytes || align_to_ref_p.base()->is_top()) {
|
|
// The base is only aligned with ObjectAlignmentInBytes with arrays.
|
|
// When the base() is top, we have no alignment guarantee at all.
|
|
// Hence, we must now take the base into account for the calculation.
|
|
Node* xbase = new CastP2XNode(nullptr, base);
|
|
phase()->register_new_node(xbase, pre_ctrl);
|
|
TRACE_ALIGN_VECTOR_NODE(xbase);
|
|
#ifdef _LP64
|
|
xbase = new ConvL2INode(xbase);
|
|
phase()->register_new_node(xbase, pre_ctrl);
|
|
TRACE_ALIGN_VECTOR_NODE(xbase);
|
|
#endif
|
|
if (is_sub) {
|
|
xboi = new SubINode(xboi, xbase);
|
|
} else {
|
|
xboi = new AddINode(xboi, xbase);
|
|
}
|
|
phase()->register_new_node(xboi, pre_ctrl);
|
|
TRACE_ALIGN_VECTOR_NODE(xboi);
|
|
}
|
|
|
|
// 2: Compute (14):
|
|
// XBOI = xboi / abs(scale)
|
|
// The division is executed as shift
|
|
Node* log2_abs_scale = igvn().intcon(exact_log2(abs(scale)));
|
|
Node* XBOI = new URShiftINode(xboi, log2_abs_scale);
|
|
phase()->register_new_node(XBOI, pre_ctrl);
|
|
TRACE_ALIGN_VECTOR_NODE(log2_abs_scale);
|
|
TRACE_ALIGN_VECTOR_NODE(XBOI);
|
|
|
|
// 3: Compute (12):
|
|
// adjust_pre_iter = (XBOI OP old_limit) % AW
|
|
//
|
|
// 3.1: XBOI_OP_old_limit = XBOI OP old_limit
|
|
Node* XBOI_OP_old_limit = nullptr;
|
|
if (stride > 0) {
|
|
XBOI_OP_old_limit = new SubINode(XBOI, old_limit);
|
|
} else {
|
|
XBOI_OP_old_limit = new AddINode(XBOI, old_limit);
|
|
}
|
|
phase()->register_new_node(XBOI_OP_old_limit, pre_ctrl);
|
|
TRACE_ALIGN_VECTOR_NODE(XBOI_OP_old_limit);
|
|
|
|
// 3.2: Compute:
|
|
// adjust_pre_iter = (XBOI OP old_limit) % AW
|
|
// = XBOI_OP_old_limit % AW
|
|
// = XBOI_OP_old_limit AND (AW - 1)
|
|
// Since AW is a power of 2, the modulo operation can be replaced with
|
|
// a bitmask operation.
|
|
Node* mask_AW = igvn().intcon(AW-1);
|
|
Node* adjust_pre_iter = new AndINode(XBOI_OP_old_limit, mask_AW);
|
|
phase()->register_new_node(adjust_pre_iter, pre_ctrl);
|
|
TRACE_ALIGN_VECTOR_NODE(mask_AW);
|
|
TRACE_ALIGN_VECTOR_NODE(adjust_pre_iter);
|
|
|
|
// 4: Compute (3a, b):
|
|
// new_limit = old_limit + adjust_pre_iter (stride > 0)
|
|
// new_limit = old_limit - adjust_pre_iter (stride < 0)
|
|
Node* new_limit = nullptr;
|
|
if (stride < 0) {
|
|
new_limit = new SubINode(old_limit, adjust_pre_iter);
|
|
} else {
|
|
new_limit = new AddINode(old_limit, adjust_pre_iter);
|
|
}
|
|
phase()->register_new_node(new_limit, pre_ctrl);
|
|
TRACE_ALIGN_VECTOR_NODE(new_limit);
|
|
|
|
// 5: Compute (15a, b):
|
|
// Prevent pre-loop from going past the original limit of the loop.
|
|
Node* constrained_limit =
|
|
(stride > 0) ? (Node*) new MinINode(new_limit, orig_limit)
|
|
: (Node*) new MaxINode(new_limit, orig_limit);
|
|
phase()->register_new_node(constrained_limit, pre_ctrl);
|
|
TRACE_ALIGN_VECTOR_NODE(constrained_limit);
|
|
|
|
// 6: Hack the pre-loop limit
|
|
igvn().replace_input_of(pre_opaq, 1, constrained_limit);
|
|
}
|
|
|
|
#ifndef PRODUCT
|
|
void PairSet::print() const {
|
|
tty->print_cr("\nPairSet::print: %d pairs", length());
|
|
int chain = 0;
|
|
int chain_index = 0;
|
|
for (PairSetIterator pair(*this); !pair.done(); pair.next()) {
|
|
Node* left = pair.left();
|
|
Node* right = pair.right();
|
|
if (is_left_in_a_left_most_pair(left)) {
|
|
chain_index = 0;
|
|
tty->print_cr(" Pair-chain %d:", chain++);
|
|
tty->print(" %3d: ", chain_index++);
|
|
left->dump();
|
|
}
|
|
tty->print(" %3d: ", chain_index++);
|
|
right->dump();
|
|
}
|
|
}
|
|
|
|
void PackSet::print() const {
|
|
tty->print_cr("\nPackSet::print: %d packs", _packs.length());
|
|
for (int i = 0; i < _packs.length(); i++) {
|
|
tty->print_cr(" Pack: %d", i);
|
|
Node_List* pack = _packs.at(i);
|
|
if (pack == nullptr) {
|
|
tty->print_cr(" nullptr");
|
|
} else {
|
|
print_pack(pack);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Print every node of the pack, prefixed with its position in the pack.
void PackSet::print_pack(Node_List* pack) {
  const uint pack_size = pack->size();
  for (uint pos = 0; pos < pack_size; pos++) {
    Node* member = pack->at(pos);
    tty->print(" %3d: ", pos);
    member->dump();
  }
}
|
|
#endif
|
|
|
|
#ifndef PRODUCT
|
|
void VLoopBody::print() const {
|
|
tty->print_cr("\nBlock");
|
|
for (int i = 0; i < body().length(); i++) {
|
|
Node* n = body().at(i);
|
|
tty->print("%d ", i);
|
|
if (n != nullptr) {
|
|
n->dump();
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
//
|
|
// --------------------------------- vectorization/simd -----------------------------------
|
|
//
|
|
// True if both nodes are non-null and the clone map reports the same idx for them.
bool SuperWord::same_origin_idx(Node* a, Node* b) const {
  if (a == nullptr || b == nullptr) {
    return false;
  }
  return _clone_map.same_idx(a->_idx, b->_idx);
}
|
|
// True if both nodes are non-null and the clone map reports the same generation for them.
bool SuperWord::same_generation(Node* a, Node* b) const {
  if (a == nullptr || b == nullptr) {
    return false;
  }
  return _clone_map.same_gen(a->_idx, b->_idx);
}
|