mirror of
https://github.com/openjdk/jdk.git
synced 2026-02-20 07:15:31 +00:00
8129920: Vectorized loop unrolling
Optimize loop opts for vectorizable loops. Reviewed-by: kvn, roland
This commit is contained in:
parent
963c3852a3
commit
b5284a93ce
@ -280,6 +280,10 @@ bool IdealLoopTree::policy_peeling( PhaseIdealLoop *phase ) const {
|
||||
|| (body_size * body_size + phase->C->live_nodes()) > phase->C->max_node_limit() ) {
|
||||
return false; // too large to safely clone
|
||||
}
|
||||
|
||||
// check for vectorized loops, any peeling done was already applied
|
||||
if (_head->is_CountedLoop() && _head->as_CountedLoop()->do_unroll_only()) return false;
|
||||
|
||||
while( test != _head ) { // Scan till run off top of loop
|
||||
if( test->is_If() ) { // Test?
|
||||
Node *ctrl = phase->get_ctrl(test->in(1));
|
||||
@ -656,7 +660,12 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
|
||||
_local_loop_unroll_limit = LoopUnrollLimit;
|
||||
_local_loop_unroll_factor = 4;
|
||||
int future_unroll_ct = cl->unrolled_count() * 2;
|
||||
if (future_unroll_ct > LoopMaxUnroll) return false;
|
||||
if (!cl->do_unroll_only()) {
|
||||
if (future_unroll_ct > LoopMaxUnroll) return false;
|
||||
} else {
|
||||
// obey user constraints on vector mapped loops with additional unrolling applied
|
||||
if ((future_unroll_ct / cl->slp_max_unroll()) > LoopMaxUnroll) return false;
|
||||
}
|
||||
|
||||
// Check for initial stride being a small enough constant
|
||||
if (abs(cl->stride_con()) > (1<<2)*future_unroll_ct) return false;
|
||||
@ -759,13 +768,19 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
|
||||
if (LoopMaxUnroll > _local_loop_unroll_factor) {
|
||||
// Once policy_slp_analysis succeeds, mark the loop with the
|
||||
// maximal unroll factor so that we minimize analysis passes
|
||||
if ((future_unroll_ct > _local_loop_unroll_factor) ||
|
||||
(body_size > (uint)_local_loop_unroll_limit)) {
|
||||
if (future_unroll_ct >= _local_loop_unroll_factor) {
|
||||
policy_unroll_slp_analysis(cl, phase, future_unroll_ct);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int slp_max_unroll_factor = cl->slp_max_unroll();
|
||||
if (cl->has_passed_slp()) {
|
||||
if (slp_max_unroll_factor >= future_unroll_ct) return true;
|
||||
// Normal case: loop too big
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check for being too big
|
||||
if (body_size > (uint)_local_loop_unroll_limit) {
|
||||
if (xors_in_loop >= 4 && body_size < (uint)LoopUnrollLimit*4) return true;
|
||||
@ -773,6 +788,10 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if(cl->do_unroll_only()) {
|
||||
NOT_PRODUCT(if (TraceSuperWordLoopUnrollAnalysis) tty->print_cr("policy_unroll passed vector loop(vlen=%d,factor = %d)\n", slp_max_unroll_factor, future_unroll_ct));
|
||||
}
|
||||
|
||||
// Unroll once! (Each trip will soon do double iterations)
|
||||
return true;
|
||||
}
|
||||
@ -780,28 +799,24 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
|
||||
void IdealLoopTree::policy_unroll_slp_analysis(CountedLoopNode *cl, PhaseIdealLoop *phase, int future_unroll_ct) {
|
||||
// Enable this functionality target by target as needed
|
||||
if (SuperWordLoopUnrollAnalysis) {
|
||||
if (!cl->has_passed_slp()) {
|
||||
if (!cl->was_slp_analyzed()) {
|
||||
SuperWord sw(phase);
|
||||
sw.transform_loop(this, false);
|
||||
|
||||
// If the loop is slp canonical analyze it
|
||||
if (sw.early_return() == false) {
|
||||
sw.unrolling_analysis(cl, _local_loop_unroll_factor);
|
||||
sw.unrolling_analysis(_local_loop_unroll_factor);
|
||||
}
|
||||
}
|
||||
|
||||
int slp_max_unroll_factor = cl->slp_max_unroll();
|
||||
if ((slp_max_unroll_factor > 4) &&
|
||||
(slp_max_unroll_factor >= future_unroll_ct)) {
|
||||
int new_limit = cl->node_count_before_unroll() * slp_max_unroll_factor;
|
||||
if (new_limit > LoopUnrollLimit) {
|
||||
#ifndef PRODUCT
|
||||
if (TraceSuperWordLoopUnrollAnalysis) {
|
||||
tty->print_cr("slp analysis is applying unroll limit %d, the original limit was %d\n",
|
||||
new_limit, _local_loop_unroll_limit);
|
||||
if (cl->has_passed_slp()) {
|
||||
int slp_max_unroll_factor = cl->slp_max_unroll();
|
||||
if (slp_max_unroll_factor >= future_unroll_ct) {
|
||||
int new_limit = cl->node_count_before_unroll() * slp_max_unroll_factor;
|
||||
if (new_limit > LoopUnrollLimit) {
|
||||
NOT_PRODUCT(if (TraceSuperWordLoopUnrollAnalysis) tty->print_cr("slp analysis unroll=%d, default limit=%d\n", new_limit, _local_loop_unroll_limit));
|
||||
_local_loop_unroll_limit = new_limit;
|
||||
}
|
||||
#endif
|
||||
_local_loop_unroll_limit = new_limit;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -830,6 +845,9 @@ bool IdealLoopTree::policy_range_check( PhaseIdealLoop *phase ) const {
|
||||
if (cl->is_main_no_pre_loop()) return false; // Disallowed for now.
|
||||
Node *trip_counter = cl->phi();
|
||||
|
||||
// check for vectorized loops, some opts are no longer needed
|
||||
if (cl->do_unroll_only()) return false;
|
||||
|
||||
// Check loop body for tests of trip-counter plus loop-invariant vs
|
||||
// loop-invariant.
|
||||
for (uint i = 0; i < _body.size(); i++) {
|
||||
@ -880,6 +898,8 @@ bool IdealLoopTree::policy_range_check( PhaseIdealLoop *phase ) const {
|
||||
// Return TRUE or FALSE if the loop should NEVER be RCE'd or aligned. Useful
|
||||
// for unrolling loops with NO array accesses.
|
||||
bool IdealLoopTree::policy_peel_only( PhaseIdealLoop *phase ) const {
|
||||
// check for vectorized loops, any peeling done was already applied
|
||||
if (_head->is_CountedLoop() && _head->as_CountedLoop()->do_unroll_only()) return false;
|
||||
|
||||
for( uint i = 0; i < _body.size(); i++ )
|
||||
if( _body[i]->is_Mem() )
|
||||
|
||||
@ -61,6 +61,12 @@ bool IdealLoopTree::policy_unswitching( PhaseIdealLoop *phase ) const {
|
||||
if (!_head->is_Loop()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// check for vectorized loops, any unswitching was already applied
|
||||
if (_head->is_CountedLoop() && _head->as_CountedLoop()->do_unroll_only()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int nodes_left = phase->C->max_node_limit() - phase->C->live_nodes();
|
||||
if ((int)(2 * _body.size()) > nodes_left) {
|
||||
return false; // Too speculative if running low on nodes.
|
||||
|
||||
@ -2317,7 +2317,11 @@ void PhaseIdealLoop::build_and_optimize(bool do_split_ifs, bool skip_loop_opts)
|
||||
// Reassociate invariants and prep for split_thru_phi
|
||||
for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
|
||||
IdealLoopTree* lpt = iter.current();
|
||||
if (!lpt->is_counted() || !lpt->is_inner()) continue;
|
||||
bool is_counted = lpt->is_counted();
|
||||
if (!is_counted || !lpt->is_inner()) continue;
|
||||
|
||||
// check for vectorized loops, any reassociation of invariants was already done
|
||||
if (is_counted && lpt->_head->as_CountedLoop()->do_unroll_only()) continue;
|
||||
|
||||
lpt->reassociate_invariants(this);
|
||||
|
||||
|
||||
@ -64,7 +64,9 @@ protected:
|
||||
PartialPeelLoop=32,
|
||||
PartialPeelFailed=64,
|
||||
HasReductions=128,
|
||||
PassedSlpAnalysis=256 };
|
||||
WasSlpAnalyzed=256,
|
||||
PassedSlpAnalysis=512,
|
||||
DoUnrollOnly=1024 };
|
||||
char _unswitch_count;
|
||||
enum { _unswitch_max=3 };
|
||||
|
||||
@ -80,7 +82,9 @@ public:
|
||||
int partial_peel_has_failed() const { return _loop_flags & PartialPeelFailed; }
|
||||
void mark_partial_peel_failed() { _loop_flags |= PartialPeelFailed; }
|
||||
void mark_has_reductions() { _loop_flags |= HasReductions; }
|
||||
void mark_was_slp() { _loop_flags |= WasSlpAnalyzed; }
|
||||
void mark_passed_slp() { _loop_flags |= PassedSlpAnalysis; }
|
||||
void mark_do_unroll_only() { _loop_flags |= DoUnrollOnly; }
|
||||
|
||||
int unswitch_max() { return _unswitch_max; }
|
||||
int unswitch_count() { return _unswitch_count; }
|
||||
@ -212,7 +216,9 @@ public:
|
||||
int is_main_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Main; }
|
||||
int is_post_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Post; }
|
||||
int is_reduction_loop() const { return (_loop_flags&HasReductions) == HasReductions; }
|
||||
int was_slp_analyzed () const { return (_loop_flags&WasSlpAnalyzed) == WasSlpAnalyzed; }
|
||||
int has_passed_slp () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; }
|
||||
int do_unroll_only () const { return (_loop_flags&DoUnrollOnly) == DoUnrollOnly; }
|
||||
int is_main_no_pre_loop() const { return _loop_flags & MainHasNoPreLoop; }
|
||||
void set_main_no_pre_loop() { _loop_flags |= MainHasNoPreLoop; }
|
||||
|
||||
@ -235,6 +241,9 @@ public:
|
||||
void set_nonexact_trip_count() {
|
||||
_loop_flags &= ~HasExactTripCount;
|
||||
}
|
||||
void set_notpassed_slp() {
|
||||
_loop_flags &= ~PassedSlpAnalysis;
|
||||
}
|
||||
|
||||
void set_profile_trip_cnt(float ptc) { _profile_trip_cnt = ptc; }
|
||||
float profile_trip_cnt() { return _profile_trip_cnt; }
|
||||
|
||||
@ -100,6 +100,10 @@ void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
|
||||
return;
|
||||
}
|
||||
|
||||
// We only re-enter slp when we vector mapped a queried loop and we want to
|
||||
// continue unrolling, in this case, slp is not subsequently done.
|
||||
if (cl->do_unroll_only()) return;
|
||||
|
||||
// Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
|
||||
CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
|
||||
if (pre_end == NULL) return;
|
||||
@ -121,12 +125,13 @@ void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
|
||||
}
|
||||
|
||||
//------------------------------early unrolling analysis------------------------------
|
||||
void SuperWord::unrolling_analysis(CountedLoopNode *cl, int &local_loop_unroll_factor) {
|
||||
void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
|
||||
bool is_slp = true;
|
||||
ResourceMark rm;
|
||||
size_t ignored_size = lpt()->_body.size();
|
||||
int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
|
||||
Node_Stack nstack((int)ignored_size);
|
||||
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
|
||||
Node *cl_exit = cl->loopexit();
|
||||
|
||||
// First clear the entries
|
||||
@ -249,13 +254,9 @@ void SuperWord::unrolling_analysis(CountedLoopNode *cl, int &local_loop_unroll_f
|
||||
|
||||
// If a max vector exists which is not larger than _local_loop_unroll_factor
|
||||
// stop looking, we already have the max vector to map to.
|
||||
if (cur_max_vector <= local_loop_unroll_factor) {
|
||||
if (cur_max_vector < local_loop_unroll_factor) {
|
||||
is_slp = false;
|
||||
#ifndef PRODUCT
|
||||
if (TraceSuperWordLoopUnrollAnalysis) {
|
||||
tty->print_cr("slp analysis fails: unroll limit equals max vector\n");
|
||||
}
|
||||
#endif
|
||||
NOT_PRODUCT(if (TraceSuperWordLoopUnrollAnalysis) tty->print_cr("slp analysis fails: unroll limit greater than max vector\n"));
|
||||
break;
|
||||
}
|
||||
|
||||
@ -268,8 +269,9 @@ void SuperWord::unrolling_analysis(CountedLoopNode *cl, int &local_loop_unroll_f
|
||||
}
|
||||
if (is_slp) {
|
||||
local_loop_unroll_factor = max_vector;
|
||||
cl->mark_passed_slp();
|
||||
}
|
||||
cl->mark_passed_slp();
|
||||
cl->mark_was_slp();
|
||||
cl->set_slp_max_unroll(local_loop_unroll_factor);
|
||||
}
|
||||
}
|
||||
@ -1758,7 +1760,9 @@ void SuperWord::output() {
|
||||
}
|
||||
|
||||
Compile* C = _phase->C;
|
||||
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
|
||||
uint max_vlen_in_bytes = 0;
|
||||
uint max_vlen = 0;
|
||||
for (int i = 0; i < _block.length(); i++) {
|
||||
Node* n = _block.at(i);
|
||||
Node_List* p = my_pack(n);
|
||||
@ -1841,6 +1845,7 @@ void SuperWord::output() {
|
||||
_igvn._worklist.push(vn);
|
||||
|
||||
if (vlen_in_bytes > max_vlen_in_bytes) {
|
||||
max_vlen = vlen;
|
||||
max_vlen_in_bytes = vlen_in_bytes;
|
||||
}
|
||||
#ifdef ASSERT
|
||||
@ -1852,6 +1857,18 @@ void SuperWord::output() {
|
||||
}
|
||||
}
|
||||
C->set_max_vector_size(max_vlen_in_bytes);
|
||||
if (SuperWordLoopUnrollAnalysis) {
|
||||
if (cl->has_passed_slp()) {
|
||||
uint slp_max_unroll_factor = cl->slp_max_unroll();
|
||||
if (slp_max_unroll_factor == max_vlen) {
|
||||
NOT_PRODUCT(if (TraceSuperWordLoopUnrollAnalysis) tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte));
|
||||
// For atomic unrolled loops which are vector mapped, instigate more unrolling.
|
||||
cl->set_notpassed_slp();
|
||||
C->set_major_progress();
|
||||
cl->mark_do_unroll_only();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------vector_opd---------------------------
|
||||
|
||||
@ -241,7 +241,7 @@ class SuperWord : public ResourceObj {
|
||||
|
||||
void transform_loop(IdealLoopTree* lpt, bool do_optimization);
|
||||
|
||||
void unrolling_analysis(CountedLoopNode *cl, int &local_loop_unroll_factor);
|
||||
void unrolling_analysis(int &local_loop_unroll_factor);
|
||||
|
||||
// Accessors for SWPointer
|
||||
PhaseIdealLoop* phase() { return _phase; }
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user