8136445: Performance issue with Nashorn and C2's global code motion

Reviewed-by: kvn
This commit is contained in:
Martin Doerr 2015-12-04 16:23:39 +01:00
parent d60a09e9c5
commit 0fba365de2
2 changed files with 41 additions and 28 deletions

View File

@ -419,12 +419,12 @@ class PhaseCFG : public Phase {
void global_code_motion();
// Schedule Nodes early in their basic blocks.
bool schedule_early(VectorSet &visited, Node_List &roots);
bool schedule_early(VectorSet &visited, Node_Stack &roots);
// For each node, find the latest block it can be scheduled into
// and then select the cheapest block between the latest and earliest
// block to place the node.
void schedule_late(VectorSet &visited, Node_List &stack);
void schedule_late(VectorSet &visited, Node_Stack &stack);
// Compute the (backwards) latency of a node from a single use
int latency_from_use(Node *n, const Node *def, Node *use);
@ -433,7 +433,7 @@ class PhaseCFG : public Phase {
void partial_latency_of_defs(Node *n);
// Compute the instruction global latency with a backwards walk
void compute_latencies_backwards(VectorSet &visited, Node_List &stack);
void compute_latencies_backwards(VectorSet &visited, Node_Stack &stack);
// Pick a block between early and late that is a cheaper alternative
// to late. Helper for schedule_late.

View File

@ -228,18 +228,19 @@ static Block* find_deepest_input(Node* n, const PhaseCFG* cfg) {
// Find the earliest Block any instruction can be placed in. Some instructions
// are pinned into Blocks. Unpinned instructions can appear in the last block
// in which all their inputs occur.
bool PhaseCFG::schedule_early(VectorSet &visited, Node_List &roots) {
bool PhaseCFG::schedule_early(VectorSet &visited, Node_Stack &roots) {
// Allocate stack with enough space to avoid frequent realloc
Node_Stack nstack(roots.Size() + 8);
Node_Stack nstack(roots.size() + 8);
// _root will be processed among C->top() inputs
roots.push(C->top());
roots.push(C->top(), 0);
visited.set(C->top()->_idx);
while (roots.size() != 0) {
// Use local variables parent_node & input_index to cache values
// on stack's top.
Node* parent_node = roots.pop();
Node* parent_node = roots.node();
uint input_index = 0;
roots.pop();
while (true) {
if (input_index == 0) {
@ -286,7 +287,7 @@ bool PhaseCFG::schedule_early(VectorSet &visited, Node_List &roots) {
break;
} else if (!is_visited) {
// Visit this guy later, using worklist
roots.push(in);
roots.push(in, 0);
}
}
@ -791,23 +792,23 @@ private:
public:
// Constructor for the iterator
Node_Backward_Iterator(Node *root, VectorSet &visited, Node_List &stack, PhaseCFG &cfg);
Node_Backward_Iterator(Node *root, VectorSet &visited, Node_Stack &stack, PhaseCFG &cfg);
// Postincrement operator to iterate over the nodes
Node *next();
private:
VectorSet &_visited;
Node_List &_stack;
Node_Stack &_stack;
PhaseCFG &_cfg;
};
// Constructor for the Node_Backward_Iterator
Node_Backward_Iterator::Node_Backward_Iterator( Node *root, VectorSet &visited, Node_List &stack, PhaseCFG &cfg)
Node_Backward_Iterator::Node_Backward_Iterator( Node *root, VectorSet &visited, Node_Stack &stack, PhaseCFG &cfg)
: _visited(visited), _stack(stack), _cfg(cfg) {
// The stack should contain exactly the root
stack.clear();
stack.push(root);
stack.push(root, root->outcnt());
// Clear the visited bits
visited.Clear();
@ -820,12 +821,14 @@ Node *Node_Backward_Iterator::next() {
if ( !_stack.size() )
return NULL;
// '_stack' is emulating a real _stack. The 'visit-all-users' loop has been
// made stateless, so I do not need to record the index 'i' on my _stack.
// Instead I visit all users each time, scanning for unvisited users.
// I visit unvisited not-anti-dependence users first, then anti-dependent
// children next.
Node *self = _stack.pop();
// children next. I iterate backwards to support removal of nodes.
// The stack holds states consisting of 3 values:
// current Def node, flag which indicates 1st/2nd pass, index of current out edge
Node *self = (Node*)(((uintptr_t)_stack.node()) & ~1);
bool iterate_anti_dep = (((uintptr_t)_stack.node()) & 1);
uint idx = MIN2(_stack.index(), self->outcnt()); // Support removal of nodes.
_stack.pop();
// I cycle here when I am entering a deeper level of recursion.
// The key variable 'self' was set prior to jumping here.
@ -841,9 +844,9 @@ Node *Node_Backward_Iterator::next() {
Node *unvisited = NULL; // Unvisited anti-dependent Node, if any
// Scan for unvisited nodes
for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {
while (idx > 0) {
// For all uses, schedule late
Node* n = self->fast_out(i); // Use
Node* n = self->raw_out(--idx); // Use
// Skip already visited children
if ( _visited.test(n->_idx) )
@ -863,19 +866,31 @@ Node *Node_Backward_Iterator::next() {
unvisited = n; // Found unvisited
// Check for possible-anti-dependent
if( !n->needs_anti_dependence_check() )
break; // Not visited, not anti-dep; schedule it NOW
// 1st pass: pick only nodes that do not need an anti-dependence check,
// 2nd pass: pick only nodes that do.
if (n->needs_anti_dependence_check() == iterate_anti_dep) {
unvisited = n; // Found unvisited
break;
}
}
// Did I find an unvisited not-anti-dependent Node?
if ( !unvisited )
if (!unvisited) {
if (!iterate_anti_dep) {
// 2nd pass: Iterate over nodes for which needs_anti_dependence_check() is true.
iterate_anti_dep = true;
idx = self->outcnt();
continue;
}
break; // All done with children; post-visit 'self'
}
// Visit the unvisited Node. Contains the obvious push to
// indicate I'm entering a deeper level of recursion. I push the
// old state onto the _stack and set a new state and loop (recurse).
_stack.push(self);
_stack.push((Node*)((uintptr_t)self | (uintptr_t)iterate_anti_dep), idx);
self = unvisited;
iterate_anti_dep = false;
idx = self->outcnt();
} // End recursion loop
return self;
@ -883,7 +898,7 @@ Node *Node_Backward_Iterator::next() {
//------------------------------ComputeLatenciesBackwards----------------------
// Compute the latency of all the instructions.
void PhaseCFG::compute_latencies_backwards(VectorSet &visited, Node_List &stack) {
void PhaseCFG::compute_latencies_backwards(VectorSet &visited, Node_Stack &stack) {
#ifndef PRODUCT
if (trace_opto_pipelining())
tty->print("\n#---- ComputeLatenciesBackwards ----\n");
@ -1157,7 +1172,7 @@ Block* PhaseCFG::hoist_to_cheaper_block(Block* LCA, Block* early, Node* self) {
// dominator tree of all USES of a value. Pick the block with the least
// loop nesting depth that is lowest in the dominator tree.
extern const char must_clone[];
void PhaseCFG::schedule_late(VectorSet &visited, Node_List &stack) {
void PhaseCFG::schedule_late(VectorSet &visited, Node_Stack &stack) {
#ifndef PRODUCT
if (trace_opto_pipelining())
tty->print("\n#---- schedule_late ----\n");
@ -1313,9 +1328,7 @@ void PhaseCFG::global_code_motion() {
// instructions are pinned into Blocks. Unpinned instructions can
// appear in the last block in which all their inputs occur.
visited.Clear();
Node_List stack(arena);
// Pre-grow the list
stack.map((C->live_nodes() >> 1) + 16, NULL);
Node_Stack stack(arena, (C->live_nodes() >> 2) + 16); // pre-grow
if (!schedule_early(visited, stack)) {
// Bailout without retry
C->record_method_not_compilable("early schedule failed");