8302652: [SuperWord] Reduction should happen after loop, when possible

Reviewed-by: kvn, pli, jbhateja, sviswanathan
This commit is contained in:
Emanuel Peter 2023-05-23 08:05:13 +00:00
parent 69f508a2ac
commit 06b0a5e038
16 changed files with 1031 additions and 238 deletions

View File

@ -2844,12 +2844,7 @@ void Compile::process_logic_cone_root(PhaseIterGVN &igvn, Node *n, VectorSet &vi
if (mask == nullptr ||
Matcher::match_rule_supported_vector_masked(Op_MacroLogicV, vt->length(), vt->element_basic_type())) {
Node* macro_logic = xform_to_MacroLogicV(igvn, vt, partition, inputs);
#ifdef ASSERT
if (TraceNewVectors) {
tty->print("new Vector node: ");
macro_logic->dump();
}
#endif
VectorNode::trace_new_vector(macro_logic, "MacroLogic");
igvn.replace_node(n, macro_logic);
}
}

View File

@ -4634,6 +4634,16 @@ void PhaseIdealLoop::build_and_optimize() {
}
}
}
// Move UnorderedReduction out of counted loop. Can be introduced by SuperWord.
if (C->has_loops() && !C->major_progress()) {
for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
IdealLoopTree* lpt = iter.current();
if (lpt->is_counted() && lpt->is_innermost()) {
move_unordered_reduction_out_of_loop(lpt);
}
}
}
}
#ifndef PRODUCT

View File

@ -1486,6 +1486,9 @@ public:
bool partial_peel( IdealLoopTree *loop, Node_List &old_new );
bool duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old_new);
// Move UnorderedReduction out of loop if possible
void move_unordered_reduction_out_of_loop(IdealLoopTree* loop);
// Create a scheduled list of nodes control dependent on ctrl set.
void scheduled_nodelist( IdealLoopTree *loop, VectorSet& ctrl, Node_List &sched );
// Has a use in the vector set

View File

@ -41,6 +41,7 @@
#include "opto/rootnode.hpp"
#include "opto/subnode.hpp"
#include "opto/subtypenode.hpp"
#include "opto/vectornode.hpp"
#include "utilities/macros.hpp"
//=============================================================================
@ -4120,3 +4121,188 @@ bool PhaseIdealLoop::duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old
return true;
}
// Having ReductionNodes in the loop is expensive. They need to recursively
// fold together the vector values, for every vectorized loop iteration. If
// we encounter the following pattern, we can vector accumulate the values
// inside the loop, and only have a single UnorderedReduction after the loop.
//
// CountedLoop init
// | |
// +------+ | +-----------------------+
// | | | |
// PhiNode (s) |
// | |
// | Vector |
// | | |
// UnorderedReduction (first_ur) |
// | |
// ... Vector |
// | | |
// UnorderedReduction (last_ur) |
// | |
// +---------------------+
//
// We patch the graph to look like this:
//
// CountedLoop identity_vector
// | |
// +-------+ | +---------------+
// | | | |
// PhiNode (v) |
// | |
// | Vector |
// | | |
// VectorAccumulator |
// | |
// ... Vector |
// | | |
// init VectorAccumulator |
// | | | |
// UnorderedReduction +-----------+
//
// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we
// use vector_accumulators, which do the same reductions, but only element
// wise. This is a single operation per vector_accumulator, rather than many
// for a UnorderedReduction. We can then reduce the last vector_accumulator
// after the loop, and also reduce the init value into it.
// We can not do this with all reductions. Some reductions do not allow the
// reordering of operations (for example float addition).
void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
  // For every phi of the counted loop head whose backedge input is an
  // UnorderedReduction, this routine:
  //   1. checks that a vector op exists for the reduction's element-wise form,
  //   2. walks up the reduction chain to verify that it ends at the phi,
  //   3. rewrites the phi and the chain into element-wise vector accumulators,
  //   4. creates a single reduction that takes over the uses outside the loop.
  // A phi whose chain does not match is skipped and left untouched.
  assert(!C->major_progress() && loop->is_counted() && loop->is_innermost(), "sanity");

  // Find all Phi nodes with UnorderedReduction on backedge.
  CountedLoopNode* cl = loop->_head->as_CountedLoop();
  for (DUIterator_Fast jmax, j = cl->fast_outs(jmax); j < jmax; j++) {
    Node* phi = cl->fast_out(j);
    // We have a phi with a single use, and a UnorderedReduction on the backedge.
    if (!phi->is_Phi() || phi->outcnt() != 1 || !phi->in(2)->is_UnorderedReduction()) {
      continue;
    }

    UnorderedReductionNode* last_ur = phi->in(2)->as_UnorderedReduction();

    // Determine types
    const TypeVect* vec_t = last_ur->vect_type();
    uint vector_length = vec_t->length();
    BasicType bt = vec_t->element_basic_type();
    const Type* bt_t = Type::get_const_basic_type(bt);

    // Convert opcode from vector-reduction -> scalar -> normal-vector-op
    const int sopc = VectorNode::scalar_opcode(last_ur->Opcode(), bt);
    const int vopc = VectorNode::opcode(sopc, bt);
    if (!Matcher::match_rule_supported_vector(vopc, vector_length, bt)) {
      DEBUG_ONLY( last_ur->dump(); )
      assert(false, "do not have normal vector op for this reduction");
      continue; // not implemented -> fails
    }

    // Traverse up the chain of UnorderedReductions, checking that it loops back to
    // the phi. Check that all UnorderedReductions only have a single use, except for
    // the last (last_ur), which only has phi as a use in the loop, and all other uses
    // are outside the loop.
    // On success, current is set to nullptr and first_ur to the top of the chain.
    // On failure, current stays non-null.
    UnorderedReductionNode* current = last_ur;
    UnorderedReductionNode* first_ur = nullptr;
    while (true) {
      assert(current->is_UnorderedReduction(), "sanity");

      // Expect no ctrl and a vector_input from within the loop.
      Node* ctrl = current->in(0);
      Node* vector_input = current->in(2);
      if (ctrl != nullptr || get_ctrl(vector_input) != cl) {
        DEBUG_ONLY( current->dump(1); )
        assert(false, "reduction has ctrl or bad vector_input");
        break; // Chain traversal fails.
      }

      // Expect single use of UnorderedReduction, except for last_ur.
      if (current == last_ur) {
        // Expect all uses to be outside the loop, except phi.
        // A plain break inside the use scan would only leave the scan, so the
        // failure is recorded in a flag and the chain traversal is left below.
        bool has_use_inside_loop = false;
        for (DUIterator_Fast kmax, k = current->fast_outs(kmax); k < kmax; k++) {
          Node* use = current->fast_out(k);
          if (use != phi && ctrl_or_self(use) == cl) {
            DEBUG_ONLY( current->dump(-1); )
            assert(false, "reduction has use inside loop");
            has_use_inside_loop = true;
            break; // Stop scanning uses.
          }
        }
        if (has_use_inside_loop) {
          break; // Chain traversal fails.
        }
      } else {
        if (current->outcnt() != 1) {
          break; // Chain traversal fails.
        }
      }

      // Expect another UnorderedReduction or phi as the scalar input.
      Node* scalar_input = current->in(1);
      if (scalar_input->is_UnorderedReduction() &&
          scalar_input->Opcode() == current->Opcode()) {
        // Move up the UnorderedReduction chain.
        current = scalar_input->as_UnorderedReduction();
      } else if (scalar_input == phi) {
        // Chain terminates at phi.
        first_ur = current;
        current = nullptr;
        break; // Success.
      } else {
        DEBUG_ONLY( current->dump(1); )
        assert(false, "scalar_input is neither phi nor a matchin reduction");
        break; // Chain traversal fails.
      }
    }
    if (current != nullptr) {
      // Chain traversal was not successful.
      continue;
    }
    assert(first_ur != nullptr, "must have successfully terminated chain traversal");

    // Build the vector that becomes the new entry value of the phi: the scalar
    // identity constant of the reduction, broadcast to all lanes.
    Node* identity_scalar = ReductionNode::make_identity_con_scalar(_igvn, sopc, bt);
    set_ctrl(identity_scalar, C->root());
    VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector_length, bt_t);
    register_new_node(identity_vector, C->root());
    assert(vec_t == identity_vector->vect_type(), "matching vector type");
    VectorNode::trace_new_vector(identity_vector, "UnorderedReduction");

    // Turn the scalar phi into a vector phi.
    _igvn.rehash_node_delayed(phi);
    Node* init = phi->in(1); // Remember init before replacing it.
    phi->set_req_X(1, identity_vector, &_igvn);
    phi->as_Type()->set_type(vec_t);
    _igvn.set_type(phi, vec_t);

    // Traverse down the chain of UnorderedReductions, and replace them with vector_accumulators.
    current = first_ur;
    while (true) {
      // Create vector_accumulator to replace current.
      Node* last_vector_accumulator = current->in(1);
      Node* vector_input = current->in(2);
      VectorNode* vector_accumulator = VectorNode::make(vopc, last_vector_accumulator, vector_input, vec_t);
      register_new_node(vector_accumulator, cl);
      _igvn.replace_node(current, vector_accumulator);
      VectorNode::trace_new_vector(vector_accumulator, "UnorderedReduction");
      if (current == last_ur) {
        break;
      }
      // Every reduction except last_ur was checked above to have a single use,
      // so the accumulator that replaced it has exactly one out: the next reduction.
      current = vector_accumulator->unique_out()->as_UnorderedReduction();
    }

    // Create post-loop reduction.
    Node* last_accumulator = phi->in(2);
    Node* post_loop_reduction = ReductionNode::make(sopc, nullptr, init, last_accumulator, bt);

    // Take over uses of last_accumulator that are not in the loop.
    for (DUIterator i = last_accumulator->outs(); last_accumulator->has_out(i); i++) {
      Node* use = last_accumulator->out(i);
      if (use != phi && use != post_loop_reduction) {
        assert(ctrl_or_self(use) != cl, "use must be outside loop");
        use->replace_edge(last_accumulator, post_loop_reduction, &_igvn);
        --i; // The out edge was removed; revisit the same index.
      }
    }
    register_new_node(post_loop_reduction, get_late_ctrl(post_loop_reduction, cl));
    VectorNode::trace_new_vector(post_loop_reduction, "UnorderedReduction");

    assert(last_accumulator->outcnt() == 2, "last_accumulator has 2 uses: phi and post_loop_reduction");
    assert(post_loop_reduction->outcnt() > 0, "should have taken over all non loop uses of last_accumulator");
    assert(phi->outcnt() == 1, "accumulator is the only use of phi");
  }
}

View File

@ -151,6 +151,7 @@ class Pipeline;
class PopulateIndexNode;
class ProjNode;
class RangeCheckNode;
class ReductionNode;
class RegMask;
class RegionNode;
class RootNode;
@ -164,6 +165,7 @@ class SubTypeCheckNode;
class Type;
class TypeNode;
class UnlockNode;
class UnorderedReductionNode;
class VectorNode;
class LoadVectorNode;
class LoadVectorMaskedNode;
@ -718,6 +720,8 @@ public:
DEFINE_CLASS_ID(CompressV, Vector, 4)
DEFINE_CLASS_ID(ExpandV, Vector, 5)
DEFINE_CLASS_ID(CompressM, Vector, 6)
DEFINE_CLASS_ID(Reduction, Vector, 7)
DEFINE_CLASS_ID(UnorderedReduction, Reduction, 0)
DEFINE_CLASS_ID(Con, Type, 8)
DEFINE_CLASS_ID(ConI, Con, 0)
@ -941,6 +945,7 @@ public:
DEFINE_CLASS_QUERY(PCTable)
DEFINE_CLASS_QUERY(Phi)
DEFINE_CLASS_QUERY(Proj)
DEFINE_CLASS_QUERY(Reduction)
DEFINE_CLASS_QUERY(Region)
DEFINE_CLASS_QUERY(Root)
DEFINE_CLASS_QUERY(SafePoint)
@ -950,6 +955,7 @@ public:
DEFINE_CLASS_QUERY(Sub)
DEFINE_CLASS_QUERY(SubTypeCheck)
DEFINE_CLASS_QUERY(Type)
DEFINE_CLASS_QUERY(UnorderedReduction)
DEFINE_CLASS_QUERY(Vector)
DEFINE_CLASS_QUERY(VectorMaskCmp)
DEFINE_CLASS_QUERY(VectorUnbox)

View File

@ -3197,12 +3197,7 @@ bool SuperWord::output() {
if (vlen_in_bytes > max_vlen_in_bytes) {
max_vlen_in_bytes = vlen_in_bytes;
}
#ifdef ASSERT
if (TraceNewVectors) {
tty->print("new Vector node: ");
vn->dump();
}
#endif
VectorNode::trace_new_vector(vn, "SuperWord");
}
}//for (int i = 0; i < _block.length(); i++)
@ -3242,6 +3237,7 @@ bool SuperWord::output() {
if (do_reserve_copy()) {
make_reversable.use_new();
}
NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("\n Final loop after SuperWord"); print_loop(true);})
return true;
}
@ -3374,12 +3370,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
assert(VectorNode::is_populate_index_supported(iv_bt), "Should support");
const TypeVect* vt = TypeVect::make(iv_bt, vlen);
Node* vn = new PopulateIndexNode(iv(), _igvn.intcon(1), vt);
#ifdef ASSERT
if (TraceNewVectors) {
tty->print("new Vector node: ");
vn->dump();
}
#endif
VectorNode::trace_new_vector(vn, "SuperWord");
_igvn.register_new_node_with_optimizer(vn);
_phase->set_ctrl(vn, _phase->get_ctrl(opd));
return vn;
@ -3452,12 +3443,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
_igvn.register_new_node_with_optimizer(vn);
_phase->set_ctrl(vn, _phase->get_ctrl(opd));
#ifdef ASSERT
if (TraceNewVectors) {
tty->print("new Vector node: ");
vn->dump();
}
#endif
VectorNode::trace_new_vector(vn, "SuperWord");
return vn;
}
@ -3489,12 +3475,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
}
_igvn.register_new_node_with_optimizer(pk);
_phase->set_ctrl(pk, _phase->get_ctrl(opd));
#ifdef ASSERT
if (TraceNewVectors) {
tty->print("new Vector node: ");
pk->dump();
}
#endif
VectorNode::trace_new_vector(pk, "SuperWord");
return pk;
}

View File

@ -1536,7 +1536,7 @@ bool LibraryCallKit::inline_vector_reduction() {
}
}
Node* init = ReductionNode::make_reduction_input(gvn(), opc, elem_bt);
Node* init = ReductionNode::make_identity_con_scalar(gvn(), opc, elem_bt);
Node* value = nullptr;
if (mask == nullptr) {
assert(!is_masked_op, "Masked op needs the mask value never null");

View File

@ -34,7 +34,7 @@
//------------------------------VectorNode--------------------------------------
// Return the vector operator for the specified scalar operation
// and vector length.
// and basic type.
int VectorNode::opcode(int sopc, BasicType bt) {
switch (sopc) {
case Op_AddI:
@ -274,6 +274,117 @@ int VectorNode::opcode(int sopc, BasicType bt) {
}
}
// Return the scalar opcode for the specified vector opcode
// and basic type.
// Accepts both reduction opcodes (e.g. Op_AddReductionVI) and element-wise
// vector opcodes (e.g. Op_AddVI). For opcodes shared by several element types
// (And/Or/Xor/Min/Max), bt selects the scalar variant.
// Returns 0 (after asserting in debug builds) for unhandled combinations.
int VectorNode::scalar_opcode(int vopc, BasicType bt) {
  switch (vopc) {
    case Op_AddReductionVI:
    case Op_AddVI:
      return Op_AddI;
    case Op_AddReductionVL:
    case Op_AddVL:
      return Op_AddL;
    case Op_MulReductionVI:
    case Op_MulVI:
      return Op_MulI;
    case Op_MulReductionVL:
    case Op_MulVL:
      return Op_MulL;
    case Op_AndReductionV:
    case Op_AndV:
      switch (bt) {
        case T_BOOLEAN:
        case T_CHAR:
        case T_BYTE:
        case T_SHORT:
        case T_INT:
          return Op_AndI;
        case T_LONG:
          return Op_AndL;
        default:
          assert(false, "basic type not handled");
          return 0;
      }
    case Op_OrReductionV:
    case Op_OrV:
      switch (bt) {
        case T_BOOLEAN:
        case T_CHAR:
        case T_BYTE:
        case T_SHORT:
        case T_INT:
          return Op_OrI;
        case T_LONG:
          return Op_OrL;
        default:
          assert(false, "basic type not handled");
          return 0;
      }
    case Op_XorReductionV:
    case Op_XorV:
      switch (bt) {
        case T_BOOLEAN:
        case T_CHAR:
        case T_BYTE:
        case T_SHORT:
        case T_INT:
          return Op_XorI;
        case T_LONG:
          return Op_XorL;
        default:
          assert(false, "basic type not handled");
          return 0;
      }
    case Op_MinReductionV:
    case Op_MinV:
      switch (bt) {
        case T_BOOLEAN:
        case T_CHAR:
          // Rejected here: only the signed MinI mapping below is provided.
          assert(false, "boolean and char are unsigned, not implemented for Min");
          return 0;
        case T_BYTE:
        case T_SHORT:
        case T_INT:
          return Op_MinI;
        case T_LONG:
          return Op_MinL;
        case T_FLOAT:
          return Op_MinF;
        case T_DOUBLE:
          return Op_MinD;
        default:
          assert(false, "basic type not handled");
          return 0;
      }
    case Op_MaxReductionV:
    case Op_MaxV:
      switch (bt) {
        case T_BOOLEAN:
        case T_CHAR:
          // Rejected here: only the signed MaxI mapping below is provided.
          assert(false, "boolean and char are unsigned, not implemented for Max");
          return 0;
        case T_BYTE:
        case T_SHORT:
        case T_INT:
          return Op_MaxI;
        case T_LONG:
          return Op_MaxL;
        case T_FLOAT:
          return Op_MaxF;
        case T_DOUBLE:
          return Op_MaxD;
        default:
          assert(false, "basic type not handled");
          return 0;
      }
    default:
      assert(false,
             "Vector node %s is not handled in VectorNode::scalar_opcode",
             NodeClassNames[vopc]);
      return 0; // Unimplemented
  }
}
int VectorNode::replicate_opcode(BasicType bt) {
switch(bt) {
case T_BOOLEAN:
@ -1398,9 +1509,9 @@ Node* VectorCastNode::Identity(PhaseGVN* phase) {
return this;
}
Node* ReductionNode::make_reduction_input(PhaseGVN& gvn, int opc, BasicType bt) {
int vopc = opcode(opc, bt);
guarantee(vopc != opc, "Vector reduction for '%s' is not implemented", NodeClassNames[opc]);
Node* ReductionNode::make_identity_con_scalar(PhaseGVN& gvn, int sopc, BasicType bt) {
int vopc = opcode(sopc, bt);
guarantee(vopc != sopc, "Vector reduction for '%s' is not implemented", NodeClassNames[sopc]);
switch (vopc) {
case Op_AndReductionV:

View File

@ -25,6 +25,8 @@
#define SHARE_OPTO_VECTORNODE_HPP
#include "opto/callnode.hpp"
#include "opto/cfgnode.hpp"
#include "opto/loopnode.hpp"
#include "opto/matcher.hpp"
#include "opto/memnode.hpp"
#include "opto/node.hpp"
@ -90,7 +92,8 @@ class VectorNode : public TypeNode {
static bool is_rotate_opcode(int opc);
static int opcode(int opc, BasicType bt);
static int opcode(int sopc, BasicType bt); // scalar_opc -> vector_opc
static int scalar_opcode(int vopc, BasicType bt); // vector_opc -> scalar_opc
static int replicate_opcode(BasicType bt);
// Limits on vector size (number of elements) for auto-vectorization.
@ -130,6 +133,15 @@ class VectorNode : public TypeNode {
static bool is_vector_shift_count(Node* n) {
return is_vector_shift_count(n->Opcode());
}
// Debug-build tracing helper: when TraceNewVectors is set, prints a
// "TraceNewVectors [<context>]: " prefix followed by a dump of node n.
// Compiles to an empty body when ASSERT is not defined.
static void trace_new_vector(Node* n, const char* context) {
#ifdef ASSERT
  if (!TraceNewVectors) {
    return;
  }
  tty->print("TraceNewVectors [%s]: ", context);
  n->dump();
#endif
}
};
//===========================Vector=ALU=Operations=============================
@ -191,12 +203,15 @@ class ReductionNode : public Node {
public:
ReductionNode(Node *ctrl, Node* in1, Node* in2) : Node(ctrl, in1, in2),
_bottom_type(Type::get_const_basic_type(in1->bottom_type()->basic_type())),
_vect_type(in2->bottom_type()->is_vect()) {}
_vect_type(in2->bottom_type()->is_vect()) {
init_class_id(Class_Reduction);
}
static ReductionNode* make(int opc, Node *ctrl, Node* in1, Node* in2, BasicType bt);
static ReductionNode* make(int opc, Node* ctrl, Node* in1, Node* in2, BasicType bt);
static int opcode(int opc, BasicType bt);
static bool implemented(int opc, uint vlen, BasicType bt);
static Node* make_reduction_input(PhaseGVN& gvn, int opc, BasicType bt);
// Make an identity scalar (zero for add, one for mul, etc) for scalar opc.
static Node* make_identity_con_scalar(PhaseGVN& gvn, int sopc, BasicType bt);
virtual const Type* bottom_type() const {
return _bottom_type;
@ -216,19 +231,28 @@ class ReductionNode : public Node {
virtual uint size_of() const { return sizeof(*this); }
};
//---------------------------UnorderedReductionNode-------------------------------------
// A reduction for which the order of combining the elements does not matter.
// Integer add is an example; float add is not.
class UnorderedReductionNode : public ReductionNode {
 public:
  UnorderedReductionNode(Node* ctrl, Node* in1, Node* in2)
    : ReductionNode(ctrl, in1, in2)
  {
    // Tag the node so that is_UnorderedReduction() / as_UnorderedReduction() work.
    init_class_id(Class_UnorderedReduction);
  }
};
//------------------------------AddReductionVINode--------------------------------------
// Vector add byte, short and int as a reduction
class AddReductionVINode : public ReductionNode {
class AddReductionVINode : public UnorderedReductionNode {
public:
AddReductionVINode(Node * ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
AddReductionVINode(Node * ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
//------------------------------AddReductionVLNode--------------------------------------
// Vector add long as a reduction
class AddReductionVLNode : public ReductionNode {
class AddReductionVLNode : public UnorderedReductionNode {
public:
AddReductionVLNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
AddReductionVLNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
@ -386,17 +410,17 @@ public:
//------------------------------MulReductionVINode--------------------------------------
// Vector multiply byte, short and int as a reduction
class MulReductionVINode : public ReductionNode {
class MulReductionVINode : public UnorderedReductionNode {
public:
MulReductionVINode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
MulReductionVINode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
//------------------------------MulReductionVLNode--------------------------------------
// Vector multiply long as a reduction
class MulReductionVLNode : public ReductionNode {
class MulReductionVLNode : public UnorderedReductionNode {
public:
MulReductionVLNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
MulReductionVLNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
@ -737,9 +761,9 @@ class AndVNode : public VectorNode {
//------------------------------AndReductionVNode--------------------------------------
// Vector and byte, short, int, long as a reduction
class AndReductionVNode : public ReductionNode {
class AndReductionVNode : public UnorderedReductionNode {
public:
AndReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
AndReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
@ -754,17 +778,9 @@ class OrVNode : public VectorNode {
//------------------------------OrReductionVNode--------------------------------------
// Vector or byte, short, int, long as a reduction
class OrReductionVNode : public ReductionNode {
class OrReductionVNode : public UnorderedReductionNode {
public:
OrReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
//------------------------------XorReductionVNode--------------------------------------
// Vector and int, long as a reduction
class XorReductionVNode : public ReductionNode {
public:
XorReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
OrReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
@ -777,19 +793,27 @@ class XorVNode : public VectorNode {
virtual Node* Ideal(PhaseGVN* phase, bool can_reshape);
};
//------------------------------XorReductionVNode--------------------------------------
// Vector xor int, long as a reduction
class XorReductionVNode : public UnorderedReductionNode {
public:
XorReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
//------------------------------MinReductionVNode--------------------------------------
// Vector min byte, short, int, long, float, double as a reduction
class MinReductionVNode : public ReductionNode {
class MinReductionVNode : public UnorderedReductionNode {
public:
MinReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
MinReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};
//------------------------------MaxReductionVNode--------------------------------------
// Vector max byte, short, int, long, float, double as a reduction
class MaxReductionVNode : public ReductionNode {
class MaxReductionVNode : public UnorderedReductionNode {
public:
MaxReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
MaxReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {}
virtual int Opcode() const;
};

View File

@ -84,7 +84,7 @@ public class ProdRed_Int {
failOn = {IRNode.MUL_REDUCTION_VI})
@IR(applyIfCPUFeature = {"sse4.1", "true"},
applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
counts = {IRNode.MUL_REDUCTION_VI, ">= 1"})
counts = {IRNode.MUL_REDUCTION_VI, ">= 1", IRNode.MUL_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop
public static int prodReductionImplement(int[] a, int[] b, int total) {
for (int i = 0; i < a.length; i++) {
total *= a[i] + b[i];

View File

@ -134,7 +134,7 @@ public class RedTest_int {
failOn = {IRNode.ADD_REDUCTION_VI})
@IR(applyIfCPUFeature = {"sse4.1", "true"},
applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
counts = {IRNode.ADD_REDUCTION_VI, ">= 1"})
counts = {IRNode.ADD_REDUCTION_VI, ">= 1", IRNode.ADD_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop
public static int sumReductionImplement(
int[] a,
int[] b,
@ -151,7 +151,7 @@ public class RedTest_int {
failOn = {IRNode.OR_REDUCTION_V})
@IR(applyIfCPUFeature = {"sse4.1", "true"},
applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
counts = {IRNode.OR_REDUCTION_V, ">= 1"})
counts = {IRNode.OR_REDUCTION_V, ">= 1", IRNode.OR_REDUCTION_V, "<= 2"}) // one for main-loop, one for vector-post-loop
public static int orReductionImplement(
int[] a,
int[] b,
@ -168,7 +168,7 @@ public class RedTest_int {
failOn = {IRNode.AND_REDUCTION_V})
@IR(applyIfCPUFeature = {"sse4.1", "true"},
applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
counts = {IRNode.AND_REDUCTION_V, ">= 1"})
counts = {IRNode.AND_REDUCTION_V, ">= 1", IRNode.AND_REDUCTION_V, "<= 2"}) // one for main-loop, one for vector-post-loop
public static int andReductionImplement(
int[] a,
int[] b,
@ -185,7 +185,7 @@ public class RedTest_int {
failOn = {IRNode.XOR_REDUCTION_V})
@IR(applyIfCPUFeature = {"sse4.1", "true"},
applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
counts = {IRNode.XOR_REDUCTION_V, ">= 1"})
counts = {IRNode.XOR_REDUCTION_V, ">= 1", IRNode.XOR_REDUCTION_V, "<= 2"}) // one for main-loop, one for vector-post-loop
public static int xorReductionImplement(
int[] a,
int[] b,
@ -202,7 +202,7 @@ public class RedTest_int {
failOn = {IRNode.MUL_REDUCTION_VI})
@IR(applyIfCPUFeature = {"sse4.1", "true"},
applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
counts = {IRNode.MUL_REDUCTION_VI, ">= 1"})
counts = {IRNode.MUL_REDUCTION_VI, ">= 1", IRNode.MUL_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop
public static int mulReductionImplement(
int[] a,
int[] b,

View File

@ -137,7 +137,7 @@ public class RedTest_long {
failOn = {IRNode.ADD_REDUCTION_VL})
@IR(applyIfCPUFeature = {"avx2", "true"},
applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
counts = {IRNode.ADD_REDUCTION_VL, ">= 1"})
counts = {IRNode.ADD_REDUCTION_VL, ">= 1", IRNode.ADD_REDUCTION_VL, "<= 2"}) // one for main-loop, one for vector-post-loop
public static long sumReductionImplement(
long[] a,
long[] b,
@ -154,7 +154,7 @@ public class RedTest_long {
failOn = {IRNode.OR_REDUCTION_V})
@IR(applyIfCPUFeature = {"avx2", "true"},
applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
counts = {IRNode.OR_REDUCTION_V, ">= 1"})
counts = {IRNode.OR_REDUCTION_V, ">= 1", IRNode.OR_REDUCTION_V, "<= 2"}) // one for main-loop, one for vector-post-loop
public static long orReductionImplement(
long[] a,
long[] b,
@ -171,7 +171,7 @@ public class RedTest_long {
failOn = {IRNode.AND_REDUCTION_V})
@IR(applyIfCPUFeature = {"avx2", "true"},
applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
counts = {IRNode.AND_REDUCTION_V, ">= 1"})
counts = {IRNode.AND_REDUCTION_V, ">= 1", IRNode.AND_REDUCTION_V, "<= 2"}) // one for main-loop, one for vector-post-loop
public static long andReductionImplement(
long[] a,
long[] b,
@ -188,7 +188,7 @@ public class RedTest_long {
failOn = {IRNode.XOR_REDUCTION_V})
@IR(applyIfCPUFeature = {"avx2", "true"},
applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
counts = {IRNode.XOR_REDUCTION_V, ">= 1"})
counts = {IRNode.XOR_REDUCTION_V, ">= 1", IRNode.XOR_REDUCTION_V, "<= 2"}) // one for main-loop, one for vector-post-loop
public static long xorReductionImplement(
long[] a,
long[] b,
@ -205,7 +205,7 @@ public class RedTest_long {
failOn = {IRNode.MUL_REDUCTION_VL})
@IR(applyIfCPUFeature = {"avx512dq", "true"},
applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
counts = {IRNode.MUL_REDUCTION_VL, ">= 1"})
counts = {IRNode.MUL_REDUCTION_VL, ">= 1", IRNode.MUL_REDUCTION_VL, "<= 2"}) // one for main-loop, one for vector-post-loop
public static long mulReductionImplement(
long[] a,
long[] b,

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -21,242 +21,570 @@
* questions.
*/
/**
/*
* @test
* @bug 8074981
* @summary Add C2 x86 Superword support for scalar product reduction optimizations : int test
* @requires os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64" | os.arch=="aarch64" | os.arch=="riscv64"
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions
* -XX:LoopUnrollLimit=250 -XX:CompileThresholdScaling=0.1
* -XX:CompileCommand=exclude,compiler.loopopts.superword.ReductionPerf::main
* -XX:+SuperWordReductions
* compiler.loopopts.superword.ReductionPerf
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions
* -XX:LoopUnrollLimit=250 -XX:CompileThresholdScaling=0.1
* -XX:CompileCommand=exclude,compiler.loopopts.superword.ReductionPerf::main
* -XX:-SuperWordReductions
* compiler.loopopts.superword.ReductionPerf
* @bug 8074981 8302652
* @summary Test SuperWord Reduction Perf.
* @requires vm.compiler2.enabled
* @requires vm.simpleArch == "x86" | vm.simpleArch == "x64" | vm.simpleArch == "aarch64" | vm.simpleArch == "riscv64"
* @library /test/lib /
* @run main/othervm -Xbatch -XX:LoopUnrollLimit=250
* -XX:CompileCommand=exclude,compiler.loopopts.superword.ReductionPerf::main
* compiler.loopopts.superword.ReductionPerf
*/
package compiler.loopopts.superword;
import java.util.Random;
import jdk.test.lib.Utils;
public class ReductionPerf {
public static void main(String[] args) throws Exception {
int[] a1 = new int[8 * 1024];
int[] a2 = new int[8 * 1024];
int[] a3 = new int[8 * 1024];
long[] b1 = new long[8 * 1024];
long[] b2 = new long[8 * 1024];
long[] b3 = new long[8 * 1024];
float[] c1 = new float[8 * 1024];
float[] c2 = new float[8 * 1024];
float[] c3 = new float[8 * 1024];
double[] d1 = new double[8 * 1024];
double[] d2 = new double[8 * 1024];
double[] d3 = new double[8 * 1024];
static final int RANGE = 8192;
static Random rand = Utils.getRandomInstance();
ReductionInit(a1, a2, a3, b1, b2, b3, c1, c2, c3, d1, d2, d3);
public static void main(String args[]) {
// Please increase iterations for measurement to 2_000 and 100_000.
int iter_warmup = 100;
int iter_perf = 1_000;
int sumIv = sumInt(a1, a2, a3);
long sumLv = sumLong(b1, b2, b3);
float sumFv = sumFloat(c1, c2, c3);
double sumDv = sumDouble(d1, d2, d3);
int mulIv = prodInt(a1, a2, a3);
long mulLv = prodLong(b1, b2, b3);
float mulFv = prodFloat(c1, c2, c3);
double mulDv = prodDouble(d1, d2, d3);
double[] aDouble = new double[RANGE];
double[] bDouble = new double[RANGE];
double[] cDouble = new double[RANGE];
float[] aFloat = new float[RANGE];
float[] bFloat = new float[RANGE];
float[] cFloat = new float[RANGE];
int[] aInt = new int[RANGE];
int[] bInt = new int[RANGE];
int[] cInt = new int[RANGE];
long[] aLong = new long[RANGE];
long[] bLong = new long[RANGE];
long[] cLong = new long[RANGE];
int sumI = 0;
long sumL = 0;
float sumF = 0.f;
double sumD = 0.;
int mulI = 0;
long mulL = 0;
float mulF = 0.f;
double mulD = 0.;
long start, stop;
System.out.println("Warmup ...");
long start = System.currentTimeMillis();
for (int j = 0; j < 2000; j++) {
sumI = sumInt(a1, a2, a3);
sumL = sumLong(b1, b2, b3);
sumF = sumFloat(c1, c2, c3);
sumD = sumDouble(d1, d2, d3);
mulI = prodInt(a1, a2, a3);
mulL = prodLong(b1, b2, b3);
mulF = prodFloat(c1, c2, c3);
mulD = prodDouble(d1, d2, d3);
int startIntAdd = init(aInt, bInt, cInt);
int goldIntAdd = testIntAdd(aInt, bInt, cInt, startIntAdd);
for (int j = 0; j < iter_warmup; j++) {
int total = testIntAdd(aInt, bInt, cInt, startIntAdd);
verify("int add", total, goldIntAdd);
}
long stop = System.currentTimeMillis();
System.out.println(" Warmup is done in " + (stop - start) + " msec");
if (sumIv != sumI) {
System.out.println("sum int: " + sumIv + " != " + sumI);
}
if (sumLv != sumL) {
System.out.println("sum long: " + sumLv + " != " + sumL);
}
if (sumFv != sumF) {
System.out.println("sum float: " + sumFv + " != " + sumF);
}
if (sumDv != sumD) {
System.out.println("sum double: " + sumDv + " != " + sumD);
}
if (mulIv != mulI) {
System.out.println("prod int: " + mulIv + " != " + mulI);
}
if (mulLv != mulL) {
System.out.println("prod long: " + mulLv + " != " + mulL);
}
if (mulFv != mulF) {
System.out.println("prod float: " + mulFv + " != " + mulF);
}
if (mulDv != mulD) {
System.out.println("prod double: " + mulDv + " != " + mulD);
}
start = System.currentTimeMillis();
for (int j = 0; j < 5000; j++) {
sumI = sumInt(a1, a2, a3);
for (int j = 0; j < iter_perf; j++) {
testIntAdd(aInt, bInt, cInt, startIntAdd);
}
stop = System.currentTimeMillis();
System.out.println("sum int: " + (stop - start));
System.out.println("int add " + (stop - start));
int startIntMul = init(aInt, bInt, cInt);
int goldIntMul = testIntMul(aInt, bInt, cInt, startIntMul);
for (int j = 0; j < iter_warmup; j++) {
int total = testIntMul(aInt, bInt, cInt, startIntMul);
verify("int mul", total, goldIntMul);
}
start = System.currentTimeMillis();
for (int j = 0; j < 5000; j++) {
sumL = sumLong(b1, b2, b3);
for (int j = 0; j < iter_perf; j++) {
testIntMul(aInt, bInt, cInt, startIntMul);
}
stop = System.currentTimeMillis();
System.out.println("sum long: " + (stop - start));
System.out.println("int mul " + (stop - start));
int startIntMin = init(aInt, bInt, cInt);
int goldIntMin = testIntMin(aInt, bInt, cInt, startIntMin);
for (int j = 0; j < iter_warmup; j++) {
int total = testIntMin(aInt, bInt, cInt, startIntMin);
verify("int min", total, goldIntMin);
}
start = System.currentTimeMillis();
for (int j = 0; j < 5000; j++) {
sumF = sumFloat(c1, c2, c3);
for (int j = 0; j < iter_perf; j++) {
testIntMin(aInt, bInt, cInt, startIntMin);
}
stop = System.currentTimeMillis();
System.out.println("sum float: " + (stop - start));
System.out.println("int min " + (stop - start));
int startIntMax = init(aInt, bInt, cInt);
int goldIntMax = testIntMax(aInt, bInt, cInt, startIntMax);
for (int j = 0; j < iter_warmup; j++) {
int total = testIntMax(aInt, bInt, cInt, startIntMax);
verify("int max", total, goldIntMax);
}
start = System.currentTimeMillis();
for (int j = 0; j < 5000; j++) {
sumD = sumDouble(d1, d2, d3);
for (int j = 0; j < iter_perf; j++) {
testIntMax(aInt, bInt, cInt, startIntMax);
}
stop = System.currentTimeMillis();
System.out.println("sum double: " + (stop - start));
System.out.println("int max " + (stop - start));
int startIntAnd = init(aInt, bInt, cInt);
int goldIntAnd = testIntAnd(aInt, bInt, cInt, startIntAnd);
for (int j = 0; j < iter_warmup; j++) {
int total = testIntAnd(aInt, bInt, cInt, startIntAnd);
verify("int and", total, goldIntAnd);
}
start = System.currentTimeMillis();
for (int j = 0; j < 5000; j++) {
mulI = prodInt(a1, a2, a3);
for (int j = 0; j < iter_perf; j++) {
testIntAnd(aInt, bInt, cInt, startIntAnd);
}
stop = System.currentTimeMillis();
System.out.println("prod int: " + (stop - start));
System.out.println("int and " + (stop - start));
int startIntOr = init(aInt, bInt, cInt);
int goldIntOr = testIntOr(aInt, bInt, cInt, startIntOr);
for (int j = 0; j < iter_warmup; j++) {
int total = testIntOr(aInt, bInt, cInt, startIntOr);
verify("int or", total, goldIntOr);
}
start = System.currentTimeMillis();
for (int j = 0; j < 5000; j++) {
mulL = prodLong(b1, b2, b3);
for (int j = 0; j < iter_perf; j++) {
testIntOr(aInt, bInt, cInt, startIntOr);
}
stop = System.currentTimeMillis();
System.out.println("prod long: " + (stop - start));
System.out.println("int or " + (stop - start));
int startIntXor = init(aInt, bInt, cInt);
int goldIntXor = testIntXor(aInt, bInt, cInt, startIntXor);
for (int j = 0; j < iter_warmup; j++) {
int total = testIntXor(aInt, bInt, cInt, startIntXor);
verify("int xor", total, goldIntXor);
}
start = System.currentTimeMillis();
for (int j = 0; j < 5000; j++) {
mulF = prodFloat(c1, c2, c3);
for (int j = 0; j < iter_perf; j++) {
testIntXor(aInt, bInt, cInt, startIntXor);
}
stop = System.currentTimeMillis();
System.out.println("prod float: " + (stop - start));
System.out.println("int xor " + (stop - start));
long startLongAdd = init(aLong, bLong, cLong);
long goldLongAdd = testLongAdd(aLong, bLong, cLong, startLongAdd);
for (int j = 0; j < iter_warmup; j++) {
long total = testLongAdd(aLong, bLong, cLong, startLongAdd);
verify("long add", total, goldLongAdd);
}
start = System.currentTimeMillis();
for (int j = 0; j < 5000; j++) {
mulD = prodDouble(d1, d2, d3);
for (int j = 0; j < iter_perf; j++) {
testLongAdd(aLong, bLong, cLong, startLongAdd);
}
stop = System.currentTimeMillis();
System.out.println("prod double: " + (stop - start));
System.out.println("long add " + (stop - start));
long startLongMul = init(aLong, bLong, cLong);
long goldLongMul = testLongMul(aLong, bLong, cLong, startLongMul);
for (int j = 0; j < iter_warmup; j++) {
long total = testLongMul(aLong, bLong, cLong, startLongMul);
verify("long mul", total, goldLongMul);
}
start = System.currentTimeMillis();
for (int j = 0; j < iter_perf; j++) {
testLongMul(aLong, bLong, cLong, startLongMul);
}
stop = System.currentTimeMillis();
System.out.println("long mul " + (stop - start));
long startLongMin = init(aLong, bLong, cLong);
long goldLongMin = testLongMin(aLong, bLong, cLong, startLongMin);
for (int j = 0; j < iter_warmup; j++) {
long total = testLongMin(aLong, bLong, cLong, startLongMin);
verify("long min", total, goldLongMin);
}
start = System.currentTimeMillis();
for (int j = 0; j < iter_perf; j++) {
testLongMin(aLong, bLong, cLong, startLongMin);
}
stop = System.currentTimeMillis();
System.out.println("long min " + (stop - start));
long startLongMax = init(aLong, bLong, cLong);
long goldLongMax = testLongMax(aLong, bLong, cLong, startLongMax);
for (int j = 0; j < iter_warmup; j++) {
long total = testLongMax(aLong, bLong, cLong, startLongMax);
verify("long max", total, goldLongMax);
}
start = System.currentTimeMillis();
for (int j = 0; j < iter_perf; j++) {
testLongMax(aLong, bLong, cLong, startLongMax);
}
stop = System.currentTimeMillis();
System.out.println("long max " + (stop - start));
long startLongAnd = init(aLong, bLong, cLong);
long goldLongAnd = testLongAnd(aLong, bLong, cLong, startLongAnd);
for (int j = 0; j < iter_warmup; j++) {
long total = testLongAnd(aLong, bLong, cLong, startLongAnd);
verify("long and", total, goldLongAnd);
}
start = System.currentTimeMillis();
for (int j = 0; j < iter_perf; j++) {
testLongAnd(aLong, bLong, cLong, startLongAnd);
}
stop = System.currentTimeMillis();
System.out.println("long and " + (stop - start));
long startLongOr = init(aLong, bLong, cLong);
long goldLongOr = testLongOr(aLong, bLong, cLong, startLongOr);
for (int j = 0; j < iter_warmup; j++) {
long total = testLongOr(aLong, bLong, cLong, startLongOr);
verify("long or", total, goldLongOr);
}
start = System.currentTimeMillis();
for (int j = 0; j < iter_perf; j++) {
testLongOr(aLong, bLong, cLong, startLongOr);
}
stop = System.currentTimeMillis();
System.out.println("long or " + (stop - start));
long startLongXor = init(aLong, bLong, cLong);
long goldLongXor = testLongXor(aLong, bLong, cLong, startLongXor);
for (int j = 0; j < iter_warmup; j++) {
long total = testLongXor(aLong, bLong, cLong, startLongXor);
verify("long xor", total, goldLongXor);
}
start = System.currentTimeMillis();
for (int j = 0; j < iter_perf; j++) {
testLongXor(aLong, bLong, cLong, startLongXor);
}
stop = System.currentTimeMillis();
System.out.println("long xor " + (stop - start));
float startFloatAdd = init(aFloat, bFloat, cFloat);
float goldFloatAdd = testFloatAdd(aFloat, bFloat, cFloat, startFloatAdd);
for (int j = 0; j < iter_warmup; j++) {
float total = testFloatAdd(aFloat, bFloat, cFloat, startFloatAdd);
verify("float add", total, goldFloatAdd);
}
start = System.currentTimeMillis();
for (int j = 0; j < iter_perf; j++) {
testFloatAdd(aFloat, bFloat, cFloat, startFloatAdd);
}
stop = System.currentTimeMillis();
System.out.println("float add " + (stop - start));
float startFloatMul = init(aFloat, bFloat, cFloat);
float goldFloatMul = testFloatMul(aFloat, bFloat, cFloat, startFloatMul);
for (int j = 0; j < iter_warmup; j++) {
float total = testFloatMul(aFloat, bFloat, cFloat, startFloatMul);
verify("float mul", total, goldFloatMul);
}
start = System.currentTimeMillis();
for (int j = 0; j < iter_perf; j++) {
testFloatMul(aFloat, bFloat, cFloat, startFloatMul);
}
stop = System.currentTimeMillis();
System.out.println("float mul " + (stop - start));
float startFloatMin = init(aFloat, bFloat, cFloat);
float goldFloatMin = testFloatMin(aFloat, bFloat, cFloat, startFloatMin);
for (int j = 0; j < iter_warmup; j++) {
float total = testFloatMin(aFloat, bFloat, cFloat, startFloatMin);
verify("float min", total, goldFloatMin);
}
start = System.currentTimeMillis();
for (int j = 0; j < iter_perf; j++) {
testFloatMin(aFloat, bFloat, cFloat, startFloatMin);
}
stop = System.currentTimeMillis();
System.out.println("float min " + (stop - start));
float startFloatMax = init(aFloat, bFloat, cFloat);
float goldFloatMax = testFloatMax(aFloat, bFloat, cFloat, startFloatMax);
for (int j = 0; j < iter_warmup; j++) {
float total = testFloatMax(aFloat, bFloat, cFloat, startFloatMax);
verify("float max", total, goldFloatMax);
}
start = System.currentTimeMillis();
for (int j = 0; j < iter_perf; j++) {
testFloatMax(aFloat, bFloat, cFloat, startFloatMax);
}
stop = System.currentTimeMillis();
System.out.println("float max " + (stop - start));
double startDoubleAdd = init(aDouble, bDouble, cDouble);
double goldDoubleAdd = testDoubleAdd(aDouble, bDouble, cDouble, startDoubleAdd);
for (int j = 0; j < iter_warmup; j++) {
double total = testDoubleAdd(aDouble, bDouble, cDouble, startDoubleAdd);
verify("double add", total, goldDoubleAdd);
}
start = System.currentTimeMillis();
for (int j = 0; j < iter_perf; j++) {
testDoubleAdd(aDouble, bDouble, cDouble, startDoubleAdd);
}
stop = System.currentTimeMillis();
System.out.println("double add " + (stop - start));
double startDoubleMul = init(aDouble, bDouble, cDouble);
double goldDoubleMul = testDoubleMul(aDouble, bDouble, cDouble, startDoubleMul);
for (int j = 0; j < iter_warmup; j++) {
double total = testDoubleMul(aDouble, bDouble, cDouble, startDoubleMul);
verify("double mul", total, goldDoubleMul);
}
start = System.currentTimeMillis();
for (int j = 0; j < iter_perf; j++) {
testDoubleMul(aDouble, bDouble, cDouble, startDoubleMul);
}
stop = System.currentTimeMillis();
System.out.println("double mul " + (stop - start));
double startDoubleMin = init(aDouble, bDouble, cDouble);
double goldDoubleMin = testDoubleMin(aDouble, bDouble, cDouble, startDoubleMin);
for (int j = 0; j < iter_warmup; j++) {
double total = testDoubleMin(aDouble, bDouble, cDouble, startDoubleMin);
verify("double min", total, goldDoubleMin);
}
start = System.currentTimeMillis();
for (int j = 0; j < iter_perf; j++) {
testDoubleMin(aDouble, bDouble, cDouble, startDoubleMin);
}
stop = System.currentTimeMillis();
System.out.println("double min " + (stop - start));
double startDoubleMax = init(aDouble, bDouble, cDouble);
double goldDoubleMax = testDoubleMax(aDouble, bDouble, cDouble, startDoubleMax);
for (int j = 0; j < iter_warmup; j++) {
double total = testDoubleMax(aDouble, bDouble, cDouble, startDoubleMax);
verify("double max", total, goldDoubleMax);
}
start = System.currentTimeMillis();
for (int j = 0; j < iter_perf; j++) {
testDoubleMax(aDouble, bDouble, cDouble, startDoubleMax);
}
stop = System.currentTimeMillis();
System.out.println("double max " + (stop - start));
}
// Fills the twelve input arrays with simple index-derived ramps.
// For every index k below a1.length: the "1" arrays get k, the "2" arrays get k + 1
// and the "3" arrays get k + 2, converted to each array's element type.
// Only a1.length bounds the loop; all other arrays are indexed with the same k.
public static void ReductionInit(int[] a1, int[] a2, int[] a3,
                                 long[] b1, long[] b2, long[] b3,
                                 float[] c1, float[] c2, float[] c3,
                                 double[] d1, double[] d2, double[] d3) {
    final int count = a1.length;
    int k = 0;
    while (k < count) {
        // The offsets are computed in int arithmetic first, then widened on assignment.
        final int base = k + 0;
        final int plusOne = k + 1;
        final int plusTwo = k + 2;

        a1[k] = base;
        a2[k] = plusOne;
        a3[k] = plusTwo;

        b1[k] = base;
        b2[k] = plusOne;
        b3[k] = plusTwo;

        c1[k] = base;
        c2[k] = plusOne;
        c3[k] = plusTwo;

        d1[k] = base;
        d2[k] = plusOne;
        d3[k] = plusTwo;

        k++;
    }
}
// ------------------- Tests -------------------
public static int sumInt(int[] a1, int[] a2, int[] a3) {
int total = 0;
for (int i = 0; i < a1.length; i++) {
total += (a1[i] * a2[i]) + (a1[i] * a3[i]) + (a2[i] * a3[i]);
// int add-reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and adds it to the running total.
// Returns the total after the last iteration; 'total' supplies the start value.
static int testIntAdd(int[] a, int[] b, int[] c, int total) {
    int idx = 0;
    while (idx < RANGE) {
        final int ab = a[idx] * b[idx];
        final int ac = a[idx] * c[idx];
        final int bc = b[idx] * c[idx];
        total = total + ((ab + ac) + bc);
        idx++;
    }
    return total;
}
public static long sumLong(long[] b1, long[] b2, long[] b3) {
long total = 0;
for (int i = 0; i < b1.length; i++) {
total += (b1[i] * b2[i]) + (b1[i] * b3[i]) + (b2[i] * b3[i]);
// int multiply-reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and multiplies the running total by it.
// Returns the total after the last iteration; 'total' supplies the start value.
static int testIntMul(int[] a, int[] b, int[] c, int total) {
    int idx = 0;
    while (idx < RANGE) {
        final int ab = a[idx] * b[idx];
        final int ac = a[idx] * c[idx];
        final int bc = b[idx] * c[idx];
        total = total * ((ab + ac) + bc);
        idx++;
    }
    return total;
}
public static float sumFloat(float[] c1, float[] c2, float[] c3) {
float total = 0;
for (int i = 0; i < c1.length; i++) {
total += (c1[i] * c2[i]) + (c1[i] * c3[i]) + (c2[i] * c3[i]);
// int min-reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and keeps the smaller of it and the running total.
// Returns the total after the last iteration; 'total' supplies the start value.
static int testIntMin(int[] a, int[] b, int[] c, int total) {
    int idx = 0;
    while (idx < RANGE) {
        final int ab = a[idx] * b[idx];
        final int ac = a[idx] * c[idx];
        final int bc = b[idx] * c[idx];
        final int candidate = (ab + ac) + bc;
        total = Math.min(total, candidate);
        idx++;
    }
    return total;
}
public static double sumDouble(double[] d1, double[] d2, double[] d3) {
double total = 0;
for (int i = 0; i < d1.length; i++) {
total += (d1[i] * d2[i]) + (d1[i] * d3[i]) + (d2[i] * d3[i]);
// int max-reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and keeps the larger of it and the running total.
// Returns the total after the last iteration; 'total' supplies the start value.
static int testIntMax(int[] a, int[] b, int[] c, int total) {
    int idx = 0;
    while (idx < RANGE) {
        final int ab = a[idx] * b[idx];
        final int ac = a[idx] * c[idx];
        final int bc = b[idx] * c[idx];
        final int candidate = (ab + ac) + bc;
        total = Math.max(total, candidate);
        idx++;
    }
    return total;
}
public static int prodInt(int[] a1, int[] a2, int[] a3) {
int total = 1;
for (int i = 0; i < a1.length; i++) {
total *= (a1[i] * a2[i]) + (a1[i] * a3[i]) + (a2[i] * a3[i]);
// int bitwise-AND reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and ANDs it into the running total.
// Returns the total after the last iteration; 'total' supplies the start value.
static int testIntAnd(int[] a, int[] b, int[] c, int total) {
    int idx = 0;
    while (idx < RANGE) {
        final int ab = a[idx] * b[idx];
        final int ac = a[idx] * c[idx];
        final int bc = b[idx] * c[idx];
        total = total & ((ab + ac) + bc);
        idx++;
    }
    return total;
}
public static long prodLong(long[] b1, long[] b2, long[] b3) {
long total = 1;
for (int i = 0; i < b1.length; i++) {
total *= (b1[i] * b2[i]) + (b1[i] * b3[i]) + (b2[i] * b3[i]);
// int bitwise-OR reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and ORs it into the running total.
// Returns the total after the last iteration; 'total' supplies the start value.
static int testIntOr(int[] a, int[] b, int[] c, int total) {
    int idx = 0;
    while (idx < RANGE) {
        final int ab = a[idx] * b[idx];
        final int ac = a[idx] * c[idx];
        final int bc = b[idx] * c[idx];
        total = total | ((ab + ac) + bc);
        idx++;
    }
    return total;
}
public static float prodFloat(float[] c1, float[] c2, float[] c3) {
float total = 1;
for (int i = 0; i < c1.length; i++) {
total *= (c1[i] * c2[i]) + (c1[i] * c3[i]) + (c2[i] * c3[i]);
// int bitwise-XOR reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and XORs it into the running total.
// Returns the total after the last iteration; 'total' supplies the start value.
static int testIntXor(int[] a, int[] b, int[] c, int total) {
    int idx = 0;
    while (idx < RANGE) {
        final int ab = a[idx] * b[idx];
        final int ac = a[idx] * c[idx];
        final int bc = b[idx] * c[idx];
        total = total ^ ((ab + ac) + bc);
        idx++;
    }
    return total;
}
public static double prodDouble(double[] d1, double[] d2, double[] d3) {
double total = 1;
for (int i = 0; i < d1.length; i++) {
total *= (d1[i] * d2[i]) + (d1[i] * d3[i]) + (d2[i] * d3[i]);
// long add-reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and adds it to the running total.
// Returns the total after the last iteration; 'total' supplies the start value.
static long testLongAdd(long[] a, long[] b, long[] c, long total) {
    int idx = 0;
    while (idx < RANGE) {
        final long ab = a[idx] * b[idx];
        final long ac = a[idx] * c[idx];
        final long bc = b[idx] * c[idx];
        total = total + ((ab + ac) + bc);
        idx++;
    }
    return total;
}
// long multiply-reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and multiplies the running total by it.
// Returns the total after the last iteration; 'total' supplies the start value.
static long testLongMul(long[] a, long[] b, long[] c, long total) {
    int idx = 0;
    while (idx < RANGE) {
        final long ab = a[idx] * b[idx];
        final long ac = a[idx] * c[idx];
        final long bc = b[idx] * c[idx];
        total = total * ((ab + ac) + bc);
        idx++;
    }
    return total;
}
// long min-reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and keeps the smaller of it and the running total.
// Returns the total after the last iteration; 'total' supplies the start value.
static long testLongMin(long[] a, long[] b, long[] c, long total) {
    int idx = 0;
    while (idx < RANGE) {
        final long ab = a[idx] * b[idx];
        final long ac = a[idx] * c[idx];
        final long bc = b[idx] * c[idx];
        final long candidate = (ab + ac) + bc;
        total = Math.min(total, candidate);
        idx++;
    }
    return total;
}
// long max-reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and keeps the larger of it and the running total.
// Returns the total after the last iteration; 'total' supplies the start value.
static long testLongMax(long[] a, long[] b, long[] c, long total) {
    int idx = 0;
    while (idx < RANGE) {
        final long ab = a[idx] * b[idx];
        final long ac = a[idx] * c[idx];
        final long bc = b[idx] * c[idx];
        final long candidate = (ab + ac) + bc;
        total = Math.max(total, candidate);
        idx++;
    }
    return total;
}
// long bitwise-AND reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and ANDs it into the running total.
// Returns the total after the last iteration; 'total' supplies the start value.
static long testLongAnd(long[] a, long[] b, long[] c, long total) {
    int idx = 0;
    while (idx < RANGE) {
        final long ab = a[idx] * b[idx];
        final long ac = a[idx] * c[idx];
        final long bc = b[idx] * c[idx];
        total = total & ((ab + ac) + bc);
        idx++;
    }
    return total;
}
// long bitwise-OR reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and ORs it into the running total.
// Returns the total after the last iteration; 'total' supplies the start value.
static long testLongOr(long[] a, long[] b, long[] c, long total) {
    int idx = 0;
    while (idx < RANGE) {
        final long ab = a[idx] * b[idx];
        final long ac = a[idx] * c[idx];
        final long bc = b[idx] * c[idx];
        total = total | ((ab + ac) + bc);
        idx++;
    }
    return total;
}
// long bitwise-XOR reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and XORs it into the running total.
// Returns the total after the last iteration; 'total' supplies the start value.
static long testLongXor(long[] a, long[] b, long[] c, long total) {
    int idx = 0;
    while (idx < RANGE) {
        final long ab = a[idx] * b[idx];
        final long ac = a[idx] * c[idx];
        final long bc = b[idx] * c[idx];
        total = total ^ ((ab + ac) + bc);
        idx++;
    }
    return total;
}
// float add-reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and adds it to the running total.
// The three products are summed left to right, then added to the total, so the
// rounding sequence is fixed. Returns the total after the last iteration.
static float testFloatAdd(float[] a, float[] b, float[] c, float total) {
    int idx = 0;
    while (idx < RANGE) {
        final float ab = a[idx] * b[idx];
        final float ac = a[idx] * c[idx];
        final float bc = b[idx] * c[idx];
        total = total + ((ab + ac) + bc);
        idx++;
    }
    return total;
}
// float multiply-reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and multiplies the running total by it.
// The three products are summed left to right before the multiply, so the
// rounding sequence is fixed. Returns the total after the last iteration.
static float testFloatMul(float[] a, float[] b, float[] c, float total) {
    int idx = 0;
    while (idx < RANGE) {
        final float ab = a[idx] * b[idx];
        final float ac = a[idx] * c[idx];
        final float bc = b[idx] * c[idx];
        total = total * ((ab + ac) + bc);
        idx++;
    }
    return total;
}
// float min-reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and combines it with the running total
// through Math.min(total, value). Returns the total after the last iteration.
static float testFloatMin(float[] a, float[] b, float[] c, float total) {
    int idx = 0;
    while (idx < RANGE) {
        final float ab = a[idx] * b[idx];
        final float ac = a[idx] * c[idx];
        final float bc = b[idx] * c[idx];
        final float candidate = (ab + ac) + bc;
        total = Math.min(total, candidate);
        idx++;
    }
    return total;
}
// float max-reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and combines it with the running total
// through Math.max(total, value). Returns the total after the last iteration.
static float testFloatMax(float[] a, float[] b, float[] c, float total) {
    int idx = 0;
    while (idx < RANGE) {
        final float ab = a[idx] * b[idx];
        final float ac = a[idx] * c[idx];
        final float bc = b[idx] * c[idx];
        final float candidate = (ab + ac) + bc;
        total = Math.max(total, candidate);
        idx++;
    }
    return total;
}
// double add-reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and adds it to the running total.
// The three products are summed left to right, then added to the total, so the
// rounding sequence is fixed. Returns the total after the last iteration.
static double testDoubleAdd(double[] a, double[] b, double[] c, double total) {
    int idx = 0;
    while (idx < RANGE) {
        final double ab = a[idx] * b[idx];
        final double ac = a[idx] * c[idx];
        final double bc = b[idx] * c[idx];
        total = total + ((ab + ac) + bc);
        idx++;
    }
    return total;
}
// double multiply-reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and multiplies the running total by it.
// The three products are summed left to right before the multiply, so the
// rounding sequence is fixed. Returns the total after the last iteration.
static double testDoubleMul(double[] a, double[] b, double[] c, double total) {
    int idx = 0;
    while (idx < RANGE) {
        final double ab = a[idx] * b[idx];
        final double ac = a[idx] * c[idx];
        final double bc = b[idx] * c[idx];
        total = total * ((ab + ac) + bc);
        idx++;
    }
    return total;
}
// double min-reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and combines it with the running total
// through Math.min(total, value). Returns the total after the last iteration.
static double testDoubleMin(double[] a, double[] b, double[] c, double total) {
    int idx = 0;
    while (idx < RANGE) {
        final double ab = a[idx] * b[idx];
        final double ac = a[idx] * c[idx];
        final double bc = b[idx] * c[idx];
        final double candidate = (ab + ac) + bc;
        total = Math.min(total, candidate);
        idx++;
    }
    return total;
}
// double max-reduction kernel: for each index below RANGE, forms
// a[i]*b[i] + a[i]*c[i] + b[i]*c[i] and combines it with the running total
// through Math.max(total, value). Returns the total after the last iteration.
static double testDoubleMax(double[] a, double[] b, double[] c, double total) {
    int idx = 0;
    while (idx < RANGE) {
        final double ab = a[idx] * b[idx];
        final double ac = a[idx] * c[idx];
        final double bc = b[idx] * c[idx];
        final double candidate = (ab + ac) + bc;
        total = Math.max(total, candidate);
        idx++;
    }
    return total;
}
// ------------------- Initialization -------------------
// Fills the first RANGE slots of a, b and c with values drawn from rand.nextInt()
// (one draw per slot, in a/b/c order within each index), then returns one more
// draw for the caller to use as the starting accumulator.
static int init(int[] a, int[] b, int[] c) {
    int slot = 0;
    while (slot < RANGE) {
        a[slot] = rand.nextInt();
        b[slot] = rand.nextInt();
        c[slot] = rand.nextInt();
        slot++;
    }
    final int start = rand.nextInt();
    return start;
}
// Fills the first RANGE slots of a, b and c with values drawn from rand.nextLong()
// (one draw per slot, in a/b/c order within each index), then returns one more
// draw for the caller to use as the starting accumulator.
static long init(long[] a, long[] b, long[] c) {
    int slot = 0;
    while (slot < RANGE) {
        a[slot] = rand.nextLong();
        b[slot] = rand.nextLong();
        c[slot] = rand.nextLong();
        slot++;
    }
    final long start = rand.nextLong();
    return start;
}
// Fills the first RANGE slots of a, b and c with values drawn from rand.nextFloat()
// (one draw per slot, in a/b/c order within each index), then returns one more
// draw for the caller to use as the starting accumulator.
static float init(float[] a, float[] b, float[] c) {
    int slot = 0;
    while (slot < RANGE) {
        a[slot] = rand.nextFloat();
        b[slot] = rand.nextFloat();
        c[slot] = rand.nextFloat();
        slot++;
    }
    final float start = rand.nextFloat();
    return start;
}
// Fills the first RANGE slots of a, b and c with values drawn from rand.nextDouble()
// (one draw per slot, in a/b/c order within each index), then returns one more
// draw for the caller to use as the starting accumulator.
static double init(double[] a, double[] b, double[] c) {
    int slot = 0;
    while (slot < RANGE) {
        a[slot] = rand.nextDouble();
        b[slot] = rand.nextDouble();
        c[slot] = rand.nextDouble();
        slot++;
    }
    final double start = rand.nextDouble();
    return start;
}
// ------------------- Verification -------------------
// Checks a double result against its expected ("gold") value using ==.
// Returns silently on a match; otherwise throws a RuntimeException whose message
// names the context and both values. A NaN never compares equal, so it always fails.
static void verify(String context, double total, double gold) {
    if (total == gold) {
        return;
    }
    final String message = "Wrong result for " + context + ": " + total + " != " + gold;
    throw new RuntimeException(message);
}
// Checks a float result against its expected ("gold") value using ==.
// Returns silently on a match; otherwise throws a RuntimeException whose message
// names the context and both values. A NaN never compares equal, so it always fails.
static void verify(String context, float total, float gold) {
    if (total == gold) {
        return;
    }
    final String message = "Wrong result for " + context + ": " + total + " != " + gold;
    throw new RuntimeException(message);
}
// Checks an int result against its expected ("gold") value.
// Returns silently on a match; otherwise throws a RuntimeException whose message
// names the context and both values.
static void verify(String context, int total, int gold) {
    if (total == gold) {
        return;
    }
    final String message = "Wrong result for " + context + ": " + total + " != " + gold;
    throw new RuntimeException(message);
}
// Checks a long result against its expected ("gold") value.
// Returns silently on a match; otherwise throws a RuntimeException whose message
// names the context and both values.
static void verify(String context, long total, long gold) {
    if (total == gold) {
        return;
    }
    final String message = "Wrong result for " + context + ": " + total + " != " + gold;
    throw new RuntimeException(message);
}
}

View File

@ -91,7 +91,7 @@ public class SumRed_Int {
failOn = {IRNode.ADD_REDUCTION_VI})
@IR(applyIfCPUFeature = {"sse4.1", "true"},
applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
counts = {IRNode.ADD_REDUCTION_VI, ">= 1"})
counts = {IRNode.ADD_REDUCTION_VI, ">= 1", IRNode.ADD_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop
public static int sumReductionImplement(
int[] a,
int[] b,

View File

@ -95,7 +95,7 @@ public class SumRed_Long {
failOn = {IRNode.ADD_REDUCTION_VL})
@IR(applyIfCPUFeature = {"avx2", "true"},
applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"},
counts = {IRNode.ADD_REDUCTION_VL, ">= 1"})
counts = {IRNode.ADD_REDUCTION_VL, ">= 1", IRNode.ADD_REDUCTION_VL, "<= 2"}) // one for main-loop, one for vector-post-loop
public static long sumReductionImplement(
long[] a,
long[] b,

View File

@ -0,0 +1,149 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @bug 8302652
* @summary Special test cases for PhaseIdealLoop::move_unordered_reduction_out_of_loop
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestUnorderedReduction
*/
package compiler.loopopts.superword;
import compiler.lib.ir_framework.*;
// IR-framework test with two int add-reduction loops:
//  - test1: the running sum is also read in the middle of the loop body;
//  - test2: the running sum is only carried from one iteration to the next.
// Each @Test method has an un-annotated twin (ref1/ref2) with the same body, and
// runTests() compares the two results. The @IR rules check vector node counts.
// NOTE(review): the @IR expectations are tied to the exact statement shape of
// test1/test2 — do not restructure those loop bodies without re-checking the rules.
public class TestUnorderedReduction {
// Number of array elements processed by every loop in this class.
static final int RANGE = 1024;
// Number of test-vs-reference comparisons runTests() performs per test method.
static final int ITER = 10;
// Entry point: hands control to the IR TestFramework with batch compilation,
// a compileonly filter matching this class's test* methods, and MaxVectorSize=16.
public static void main(String[] args) {
TestFramework.runWithFlags("-Xbatch",
"-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestUnorderedReduction::test*",
"-XX:MaxVectorSize=16");
}
// Custom run method for test1 and test2: builds one input array, then calls each
// test ITER times with the loop counter as the initial sum and compares the result
// with the matching ref method. Throws RuntimeException on the first mismatch.
// NOTE(review): @Warmup(0) presumably tells the framework to skip warm-up
// invocations of this run method — confirm against the ir_framework docs.
@Run(test = {"test1", "test2"})
@Warmup(0)
public void runTests() throws Exception {
int[] data = new int[RANGE];
init(data);
for (int i = 0; i < ITER; i++) {
int r1 = test1(data, i);
int r2 = ref1(data, i);
if (r1 != r2) {
throw new RuntimeException("Wrong result test1: " + r1 + " != " + r2);
}
}
for (int i = 0; i < ITER; i++) {
int r1 = test2(data, i);
int r2 = ref2(data, i);
if (r1 != r2) {
throw new RuntimeException("Wrong result test2: " + r1 + " != " + r2);
}
}
}
// IR rule (applied when the CPU has sse4.1 or asimd): at least one LOAD_VECTOR,
// no ADD_VI, and at least one ADD_REDUCTION_VI with no upper bound on the count.
@Test
@IR(counts = {IRNode.LOAD_VECTOR, "> 0",
IRNode.ADD_VI, "= 0",
IRNode.ADD_REDUCTION_VI, "> 0"}, // count can be high
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
static int test1(int[] data, int sum) {
// Vectorizes, but the UnorderedReduction cannot be moved out of the loop,
// because we have a use inside the loop.
int x = 0;
for (int i = 0; i < RANGE; i+=8) {
sum += 11 * data[i+0]; // vec 1 (16 bytes)
sum += 11 * data[i+1];
sum += 11 * data[i+2];
sum += 11 * data[i+3];
x = sum + i; // vec 1 reduction has more than 1 use
sum += 11 * data[i+4]; // vec 2 (next 16 bytes)
sum += 11 * data[i+5];
sum += 11 * data[i+6];
sum += 11 * data[i+7];
}
return sum + x;
}
// Reference for test1: same body, no @Test/@IR annotations. Its name does not
// match the test* compileonly pattern passed in main().
// NOTE(review): presumably this keeps it out of the JIT so it acts as an
// independent oracle — confirm.
static int ref1(int[] data, int sum) {
int x = 0;
for (int i = 0; i < RANGE; i+=8) {
sum += 11 * data[i+0];
sum += 11 * data[i+1];
sum += 11 * data[i+2];
sum += 11 * data[i+3];
x = sum + i;
sum += 11 * data[i+4];
sum += 11 * data[i+5];
sum += 11 * data[i+6];
sum += 11 * data[i+7];
}
return sum + x;
}
// IR rule (applied when the CPU has sse4.1 or asimd): at least one LOAD_VECTOR,
// at least one ADD_VI, and between 1 and 2 ADD_REDUCTION_VI nodes.
@Test
@IR(counts = {IRNode.LOAD_VECTOR, "> 0",
IRNode.ADD_VI, "> 0",
IRNode.ADD_REDUCTION_VI, "> 0",
IRNode.ADD_REDUCTION_VI, "<= 2"}, // count must be low
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
static int test2(int[] data, int sum) {
for (int i = 0; i < RANGE; i+=8) {
// Vectorized, and UnorderedReduction moved outside loop.
sum += 11 * data[i+0]; // vec 1
sum += 11 * data[i+1];
sum += 11 * data[i+2];
sum += 11 * data[i+3];
sum += 11 * data[i+4]; // vec 2
sum += 11 * data[i+5];
sum += 11 * data[i+6];
sum += 11 * data[i+7];
}
return sum;
}
// Reference for test2: same body, no @Test/@IR annotations (see the note on ref1
// regarding the compileonly pattern).
static int ref2(int[] data, int sum) {
for (int i = 0; i < RANGE; i+=8) {
sum += 11 * data[i+0];
sum += 11 * data[i+1];
sum += 11 * data[i+2];
sum += 11 * data[i+3];
sum += 11 * data[i+4];
sum += 11 * data[i+5];
sum += 11 * data[i+6];
sum += 11 * data[i+7];
}
return sum;
}
// Fills the first RANGE elements of data with 1, 2, 3, ... (data[i] = i + 1).
static void init(int[] data) {
for (int i = 0; i < RANGE; i++) {
data[i] = i + 1;
}
}
}