Merge

2026-02-16 21:35:25 +00:00 · 2010-07-21 12:45:42 -07:00 · 2010-07-21 12:45:42 -07:00 · f495cb2581
commit f495cb2581
parent eceef28d8c 5c3da1ee7e
16 changed files with 311 additions and 183 deletions
--- a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp
+++ b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp
@ -664,19 +664,14 @@ CMSCollector::CMSCollector(ConcurrentMarkSweepGeneration* cmsGen,
        return;
      }

-      // XXX use a global constant instead of 64!
-      typedef struct OopTaskQueuePadded {
-        OopTaskQueue work_queue;
-        char pad[64 - sizeof(OopTaskQueue)];  // prevent false sharing
-      } OopTaskQueuePadded;
-
+      typedef Padded<OopTaskQueue> PaddedOopTaskQueue;
      for (i = 0; i < num_queues; i++) {
-        OopTaskQueuePadded *q_padded = new OopTaskQueuePadded();
-        if (q_padded == NULL) {
+        PaddedOopTaskQueue *q = new PaddedOopTaskQueue();
+        if (q == NULL) {
          warning("work_queue allocation failure.");
          return;
        }
-        _task_queues->register_queue(i, &q_padded->work_queue);
+        _task_queues->register_queue(i, q);
      }
      for (i = 0; i < num_queues; i++) {
        _task_queues->queue(i)->initialize();
--- a/hotspot/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -271,21 +271,16 @@ jbyte* ConcurrentG1Refine::add_card_count(jbyte* card_ptr, int* count, bool* def
    if (cas_res == prev_epoch_entry) {
      // We successfully updated the card num value in the epoch entry
      count_ptr->_count = 0; // initialize counter for new card num
+      jbyte* old_card_ptr = card_num_2_ptr(old_card_num);

      // Even though the region containg the card at old_card_num was not
      // in the young list when old_card_num was recorded in the epoch
      // cache it could have been added to the free list and subsequently
-      // added to the young list in the intervening time. If the evicted
-      // card is in a young region just return the card_ptr and the evicted
-      // card will not be cleaned. See CR 6817995.
-
-      jbyte* old_card_ptr = card_num_2_ptr(old_card_num);
-      if (is_young_card(old_card_ptr)) {
-        *count = 0;
-        // We can defer the processing of card_ptr
-        *defer = true;
-        return card_ptr;
-      }
+      // added to the young list in the intervening time. See CR 6817995.
+      // We do not deal with this case here - it will be handled in
+      // HeapRegion::oops_on_card_seq_iterate_careful after it has been
+      // determined that the region containing the card has been allocated
+      // to, and it's safe to check the young type of the region.

      // We do not want to defer processing of card_ptr in this case
      // (we need to refine old_card_ptr and card_ptr)
@ -301,22 +296,22 @@ jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr, bool* defer) {
  jbyte* cached_ptr = add_card_count(card_ptr, &count, defer);
  assert(cached_ptr != NULL, "bad cached card ptr");

-  if (is_young_card(cached_ptr)) {
-    // The region containing cached_ptr has been freed during a clean up
-    // pause, reallocated, and tagged as young.
-    assert(cached_ptr != card_ptr, "shouldn't be");
+  // We've just inserted a card pointer into the card count cache
+  // and got back the card that we just inserted or (evicted) the
+  // previous contents of that count slot.

-    // We've just inserted a new old-gen card pointer into the card count
-    // cache and evicted the previous contents of that count slot.
-    // The evicted card pointer has been determined to be in a young region
-    // and so cannot be the newly inserted card pointer (that will be
-    // in an old region).
-    // The count for newly inserted card will be set to zero during the
-    // insertion, so we don't want to defer the cleaning of the newly
-    // inserted card pointer.
-    assert(*defer == false, "deferring non-hot card");
-    return NULL;
-  }
+  // The card we got back could be in a young region. When the
+  // returned card (if evicted) was originally inserted, we had
+  // determined that its containing region was not young. However
+  // it is possible for the region to be freed during a cleanup
+  // pause, then reallocated and tagged as young which will result
+  // in the returned card residing in a young region.
+  //
+  // We do not deal with this case here - the change from non-young
+  // to young could be observed at any time - it will be handled in
+  // HeapRegion::oops_on_card_seq_iterate_careful after it has been
+  // determined that the region containing the card has been allocated
+  // to.

  // The card pointer we obtained from card count cache is not hot
  // so do not store it in the cache; return it for immediate
@ -325,7 +320,7 @@ jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr, bool* defer) {
    return cached_ptr;
  }

-  // Otherwise, the pointer we got from the _card_counts is hot.
+  // Otherwise, the pointer we got from the _card_counts cache is hot.
  jbyte* res = NULL;
  MutexLockerEx x(HotCardCache_lock, Mutex::_no_safepoint_check_flag);
  if (_n_hot == _hot_cache_size) {
@ -338,17 +333,8 @@ jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr, bool* defer) {
  if (_hot_cache_idx == _hot_cache_size) _hot_cache_idx = 0;
  _n_hot++;

-  if (res != NULL) {
-    // Even though the region containg res was not in the young list
-    // when it was recorded in the hot cache it could have been added
-    // to the free list and subsequently added to the young list in
-    // the intervening time. If res is in a young region, return NULL
-    // so that res is not cleaned. See CR 6817995.
-
-    if (is_young_card(res)) {
-      res = NULL;
-    }
-  }
+  // The card obtained from the hot card cache could be in a young
+  // region. See above on how this can happen.

  return res;
 }
--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
@ -638,6 +638,11 @@ G1CollectedHeap::attempt_allocation_slow(size_t word_size,

    // Now retry the allocation.
    if (_cur_alloc_region != NULL) {
+      if (allocated_young_region != NULL) {
+        // We need to ensure that the store to top does not
+        // float above the setting of the young type.
+        OrderAccess::storestore();
+      }
      res = _cur_alloc_region->allocate(word_size);
    }
  }
--- a/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -676,9 +676,27 @@ void HRInto_G1RemSet::concurrentRefineOneCard_impl(jbyte* card_ptr, int worker_i
  // We must complete this write before we do any of the reads below.
  OrderAccess::storeload();
  // And process it, being careful of unallocated portions of TLAB's.
+
+  // The region for the current card may be a young region. The
+  // current card may have been a card that was evicted from the
+  // card cache. When the card was inserted into the cache, we had
+  // determined that its region was non-young. While in the cache,
+  // the region may have been freed during a cleanup pause, reallocated
+  // and tagged as young.
+  //
+  // We wish to filter out cards for such a region but the current
+  // thread, if we're running conucrrently, may "see" the young type
+  // change at any time (so an earlier "is_young" check may pass or
+  // fail arbitrarily). We tell the iteration code to perform this
+  // filtering when it has been determined that there has been an actual
+  // allocation in this region and making it safe to check the young type.
+  bool filter_young = true;
+
  HeapWord* stop_point =
    r->oops_on_card_seq_iterate_careful(dirtyRegion,
-                                        &filter_then_update_rs_oop_cl);
+                                        &filter_then_update_rs_oop_cl,
+                                        filter_young);
+
  // If stop_point is non-null, then we encountered an unallocated region
  // (perhaps the unfilled portion of a TLAB.)  For now, we'll dirty the
  // card and re-enqueue: if we put off the card until a GC pause, then the
@ -789,8 +807,14 @@ void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) {
      if (r == NULL) {
        assert(_g1->is_in_permanent(start), "Or else where?");
      } else {
-        guarantee(!r->is_young(), "It was evicted in the current minor cycle.");
-        // Process card pointer we get back from the hot card cache
+        // Checking whether the region we got back from the cache
+        // is young here is inappropriate. The region could have been
+        // freed, reallocated and tagged as young while in the cache.
+        // Hence we could see its young type change at any time.
+        //
+        // Process card pointer we get back from the hot card cache. This
+        // will check whether the region containing the card is young
+        // _after_ checking that the region has been allocated from.
        concurrentRefineOneCard_impl(res, worker_i);
      }
    }
--- a/hotspot/src/share/vm/gc_implementation/g1/heapRegion.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/heapRegion.cpp
@ -658,7 +658,8 @@ HeapRegion::object_iterate_mem_careful(MemRegion mr,
 HeapWord*
 HeapRegion::
 oops_on_card_seq_iterate_careful(MemRegion mr,
-                                     FilterOutOfRegionClosure* cl) {
+                                 FilterOutOfRegionClosure* cl,
+                                 bool filter_young) {
  G1CollectedHeap* g1h = G1CollectedHeap::heap();

  // If we're within a stop-world GC, then we might look at a card in a
@ -672,6 +673,16 @@ oops_on_card_seq_iterate_careful(MemRegion mr,
  if (mr.is_empty()) return NULL;
  // Otherwise, find the obj that extends onto mr.start().

+  // The intersection of the incoming mr (for the card) and the
+  // allocated part of the region is non-empty. This implies that
+  // we have actually allocated into this region. The code in
+  // G1CollectedHeap.cpp that allocates a new region sets the
+  // is_young tag on the region before allocating. Thus we
+  // safely know if this region is young.
+  if (is_young() && filter_young) {
+    return NULL;
+  }
+
  // We used to use "block_start_careful" here.  But we're actually happy
  // to update the BOT while we do this...
  HeapWord* cur = block_start(mr.start());
--- a/hotspot/src/share/vm/gc_implementation/g1/heapRegion.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/heapRegion.hpp
@ -252,7 +252,7 @@ class HeapRegion: public G1OffsetTableContigSpace {
                                // survivor
  };

-  YoungType _young_type;
+  volatile YoungType _young_type;
  int  _young_index_in_cset;
  SurvRateGroup* _surv_rate_group;
  int  _age_index;
@ -726,9 +726,12 @@ class HeapRegion: public G1OffsetTableContigSpace {
  HeapWord*
  object_iterate_mem_careful(MemRegion mr, ObjectClosure* cl);

+  // In this version - if filter_young is true and the region
+  // is a young region then we skip the iteration.
  HeapWord*
  oops_on_card_seq_iterate_careful(MemRegion mr,
-                                   FilterOutOfRegionClosure* cl);
+                                   FilterOutOfRegionClosure* cl,
+                                   bool filter_young);

  // The region "mr" is entirely in "this", and starts and ends at block
  // boundaries. The caller declares that all the contained blocks are
--- a/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp
+++ b/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp
@ -539,10 +539,9 @@ ParNewGeneration(ReservedSpace rs, size_t initial_byte_size, int level)
  guarantee(_task_queues != NULL, "task_queues allocation failure.");

  for (uint i1 = 0; i1 < ParallelGCThreads; i1++) {
-    ObjToScanQueuePadded *q_padded = new ObjToScanQueuePadded();
-    guarantee(q_padded != NULL, "work_queue Allocation failure.");
-
-    _task_queues->register_queue(i1, &q_padded->work_queue);
+    ObjToScanQueue *q = new ObjToScanQueue();
+    guarantee(q != NULL, "work_queue Allocation failure.");
+    _task_queues->register_queue(i1, q);
  }

  for (uint i2 = 0; i2 < ParallelGCThreads; i2++)
--- a/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp
+++ b/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -33,8 +33,8 @@ class ParEvacuateFollowersClosure;
 // but they must be here to allow ParScanClosure::do_oop_work to be defined
 // in genOopClosures.inline.hpp.

-typedef OopTaskQueue       ObjToScanQueue;
-typedef OopTaskQueueSet    ObjToScanQueueSet;
+typedef Padded<OopTaskQueue> ObjToScanQueue;
+typedef GenericTaskQueueSet<ObjToScanQueue> ObjToScanQueueSet;

 // Enable this to get push/pop/steal stats.
 const int PAR_STATS_ENABLED = 0;
@ -304,12 +304,6 @@ class ParNewGeneration: public DefNewGeneration {
  friend class ParEvacuateFollowersClosure;

 private:
-  // XXX use a global constant instead of 64!
-  struct ObjToScanQueuePadded {
-        ObjToScanQueue work_queue;
-        char pad[64 - sizeof(ObjToScanQueue)];  // prevent false sharing
-  };
-
  // The per-worker-thread work queues
  ObjToScanQueueSet* _task_queues;

--- a/hotspot/src/share/vm/gc_implementation/parNew/parOopClosures.hpp
+++ b/hotspot/src/share/vm/gc_implementation/parNew/parOopClosures.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -26,7 +26,8 @@

 class ParScanThreadState;
 class ParNewGeneration;
-typedef OopTaskQueueSet ObjToScanQueueSet;
+typedef Padded<OopTaskQueue> ObjToScanQueue;
+typedef GenericTaskQueueSet<ObjToScanQueue> ObjToScanQueueSet;
 class ParallelTaskTerminator;

 class ParScanClosure: public OopsInGenClosure {
--- a/hotspot/src/share/vm/gc_implementation/parallelScavenge/psPromotionManager.cpp
+++ b/hotspot/src/share/vm/gc_implementation/parallelScavenge/psPromotionManager.cpp
@ -90,10 +90,7 @@ void PSPromotionManager::pre_scavenge() {
 }

 void PSPromotionManager::post_scavenge() {
-#if PS_PM_STATS
-  print_stats();
-#endif // PS_PM_STATS
-
+  TASKQUEUE_STATS_ONLY(if (PrintGCDetails && ParallelGCVerbose) print_stats());
  for (uint i = 0; i < ParallelGCThreads + 1; i++) {
    PSPromotionManager* manager = manager_array(i);
    if (UseDepthFirstScavengeOrder) {
@ -105,37 +102,58 @@ void PSPromotionManager::post_scavenge() {
  }
 }

-#if PS_PM_STATS
+#if TASKQUEUE_STATS
+void
+PSPromotionManager::print_taskqueue_stats(uint i) const {
+  const TaskQueueStats& stats = depth_first() ?
+    _claimed_stack_depth.stats : _claimed_stack_breadth.stats;
+  tty->print("%3u ", i);
+  stats.print();
+  tty->cr();
+}

 void
-PSPromotionManager::print_stats(uint i) {
-  tty->print_cr("---- GC Worker %2d Stats", i);
-  tty->print_cr("    total pushes            %8d", _total_pushes);
-  tty->print_cr("    masked pushes           %8d", _masked_pushes);
-  tty->print_cr("    overflow pushes         %8d", _overflow_pushes);
-  tty->print_cr("    max overflow length     %8d", _max_overflow_length);
-  tty->print_cr("");
-  tty->print_cr("    arrays chunked          %8d", _arrays_chunked);
-  tty->print_cr("    array chunks processed  %8d", _array_chunks_processed);
-  tty->print_cr("");
-  tty->print_cr("    total steals            %8d", _total_steals);
-  tty->print_cr("    masked steals           %8d", _masked_steals);
-  tty->print_cr("");
+PSPromotionManager::print_local_stats(uint i) const {
+  #define FMT " " SIZE_FORMAT_W(10)
+  tty->print_cr("%3u" FMT FMT FMT FMT, i, _masked_pushes, _masked_steals,
+                _arrays_chunked, _array_chunks_processed);
+  #undef FMT
 }

+static const char* const pm_stats_hdr[] = {
+  "    --------masked-------     arrays      array",
+  "thr       push      steal    chunked     chunks",
+  "--- ---------- ---------- ---------- ----------"
+};
+
 void
 PSPromotionManager::print_stats() {
-  tty->print_cr("== GC Tasks Stats (%s), GC %3d",
-                (UseDepthFirstScavengeOrder) ? "Depth-First" : "Breadth-First",
+  const bool df = UseDepthFirstScavengeOrder;
+  tty->print_cr("== GC Task Stats (%s-First), GC %3d", df ? "Depth" : "Breadth",
                Universe::heap()->total_collections());

-  for (uint i = 0; i < ParallelGCThreads+1; ++i) {
-    PSPromotionManager* manager = manager_array(i);
-    manager->print_stats(i);
+  tty->print("thr "); TaskQueueStats::print_header(1); tty->cr();
+  tty->print("--- "); TaskQueueStats::print_header(2); tty->cr();
+  for (uint i = 0; i < ParallelGCThreads + 1; ++i) {
+    manager_array(i)->print_taskqueue_stats(i);
+  }
+
+  const uint hlines = sizeof(pm_stats_hdr) / sizeof(pm_stats_hdr[0]);
+  for (uint i = 0; i < hlines; ++i) tty->print_cr(pm_stats_hdr[i]);
+  for (uint i = 0; i < ParallelGCThreads + 1; ++i) {
+    manager_array(i)->print_local_stats(i);
  }
 }

-#endif // PS_PM_STATS
+void
+PSPromotionManager::reset_stats() {
+  TaskQueueStats& stats = depth_first() ?
+    claimed_stack_depth()->stats : claimed_stack_breadth()->stats;
+  stats.reset();
+  _masked_pushes = _masked_steals = 0;
+  _arrays_chunked = _array_chunks_processed = 0;
+}
+#endif // TASKQUEUE_STATS

 PSPromotionManager::PSPromotionManager() {
  ParallelScavengeHeap* heap = (ParallelScavengeHeap*)Universe::heap();
@ -189,16 +207,7 @@ void PSPromotionManager::reset() {

  _prefetch_queue.clear();

-#if PS_PM_STATS
-  _total_pushes = 0;
-  _masked_pushes = 0;
-  _overflow_pushes = 0;
-  _max_overflow_length = 0;
-  _arrays_chunked = 0;
-  _array_chunks_processed = 0;
-  _total_steals = 0;
-  _masked_steals = 0;
-#endif // PS_PM_STATS
+  TASKQUEUE_STATS_ONLY(reset_stats());
 }


@ -423,14 +432,9 @@ oop PSPromotionManager::copy_to_survivor_space(oop o, bool depth_first) {
            new_obj->is_objArray() &&
            PSChunkLargeArrays) {
          // we'll chunk it
-#if PS_PM_STATS
-          ++_arrays_chunked;
-#endif // PS_PM_STATS
          oop* const masked_o = mask_chunked_array_oop(o);
          push_depth(masked_o);
-#if PS_PM_STATS
-          ++_masked_pushes;
-#endif // PS_PM_STATS
+          TASKQUEUE_STATS_ONLY(++_arrays_chunked; ++_masked_pushes);
        } else {
          // we'll just push its contents
          new_obj->push_contents(this);
@ -494,9 +498,7 @@ void PSPromotionManager::process_array_chunk(oop old) {
  assert(old->is_objArray(), "invariant");
  assert(old->is_forwarded(), "invariant");

-#if PS_PM_STATS
-  ++_array_chunks_processed;
-#endif // PS_PM_STATS
+  TASKQUEUE_STATS_ONLY(++_array_chunks_processed);

  oop const obj = old->forwardee();

@ -508,9 +510,7 @@ void PSPromotionManager::process_array_chunk(oop old) {
    assert(start > 0, "invariant");
    arrayOop(old)->set_length(start);
    push_depth(mask_chunked_array_oop(old));
-#if PS_PM_STATS
-    ++_masked_pushes;
-#endif // PS_PM_STATS
+    TASKQUEUE_STATS_ONLY(++_masked_pushes);
  } else {
    // this is the final chunk for this array
    start = 0;
--- a/hotspot/src/share/vm/gc_implementation/parallelScavenge/psPromotionManager.hpp
+++ b/hotspot/src/share/vm/gc_implementation/parallelScavenge/psPromotionManager.hpp
@ -42,8 +42,6 @@ class MutableSpace;
 class PSOldGen;
 class ParCompactionManager;

-#define PS_PM_STATS         0
-
 class PSPromotionManager : public CHeapObj {
  friend class PSScavenge;
  friend class PSRefProcTaskExecutor;
@ -54,22 +52,18 @@ class PSPromotionManager : public CHeapObj {
  static PSOldGen*                    _old_gen;
  static MutableSpace*                _young_space;

-#if PS_PM_STATS
-  uint                                _total_pushes;
-  uint                                _masked_pushes;
+#if TASKQUEUE_STATS
+  size_t                              _masked_pushes;
+  size_t                              _masked_steals;
+  size_t                              _arrays_chunked;
+  size_t                              _array_chunks_processed;

-  uint                                _overflow_pushes;
-  uint                                _max_overflow_length;
-
-  uint                                _arrays_chunked;
-  uint                                _array_chunks_processed;
-
-  uint                                _total_steals;
-  uint                                _masked_steals;
-
-  void print_stats(uint i);
+  void print_taskqueue_stats(uint i) const;
+  void print_local_stats(uint i) const;
  static void print_stats();
-#endif // PS_PM_STATS
+
+  void reset_stats();
+#endif // TASKQUEUE_STATS

  PSYoungPromotionLAB                 _young_lab;
  PSOldPromotionLAB                   _old_lab;
@ -143,42 +137,12 @@ class PSPromotionManager : public CHeapObj {

  template <class T> void push_depth(T* p) {
    assert(depth_first(), "pre-condition");
-
-#if PS_PM_STATS
-    ++_total_pushes;
-    int stack_length = claimed_stack_depth()->overflow_stack()->length();
-#endif // PS_PM_STATS
-
    claimed_stack_depth()->push(p);
-
-#if PS_PM_STATS
-    if (claimed_stack_depth()->overflow_stack()->length() != stack_length) {
-      ++_overflow_pushes;
-      if ((uint)stack_length + 1 > _max_overflow_length) {
-        _max_overflow_length = (uint)stack_length + 1;
-      }
-    }
-#endif // PS_PM_STATS
  }

  void push_breadth(oop o) {
    assert(!depth_first(), "pre-condition");
-
-#if PS_PM_STATS
-    ++_total_pushes;
-    int stack_length = claimed_stack_breadth()->overflow_stack()->length();
-#endif // PS_PM_STATS
-
    claimed_stack_breadth()->push(o);
-
-#if PS_PM_STATS
-    if (claimed_stack_breadth()->overflow_stack()->length() != stack_length) {
-      ++_overflow_pushes;
-      if ((uint)stack_length + 1 > _max_overflow_length) {
-        _max_overflow_length = (uint)stack_length + 1;
-      }
-    }
-#endif // PS_PM_STATS
  }

 protected:
@ -256,12 +220,5 @@ class PSPromotionManager : public CHeapObj {
  template <class T> inline void claim_or_forward_depth(T* p);
  template <class T> inline void claim_or_forward_breadth(T* p);

-#if PS_PM_STATS
-  void increment_steals(oop* p = NULL) {
-    _total_steals += 1;
-    if (p != NULL && is_oop_masked(p)) {
-      _masked_steals += 1;
-    }
-  }
-#endif // PS_PM_STATS
+  TASKQUEUE_STATS_ONLY(inline void record_steal(StarTask& p);)
 };
--- a/hotspot/src/share/vm/gc_implementation/parallelScavenge/psPromotionManager.inline.hpp
+++ b/hotspot/src/share/vm/gc_implementation/parallelScavenge/psPromotionManager.inline.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002, 2008, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -124,3 +124,11 @@ inline void PSPromotionManager::process_popped_location_depth(StarTask p) {
    }
  }
 }
+
+#if TASKQUEUE_STATS
+void PSPromotionManager::record_steal(StarTask& p) {
+  if (is_oop_masked(p)) {
+    ++_masked_steals;
+  }
+}
+#endif // TASKQUEUE_STATS
--- a/hotspot/src/share/vm/gc_implementation/parallelScavenge/psTasks.cpp
+++ b/hotspot/src/share/vm/gc_implementation/parallelScavenge/psTasks.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002, 2008, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -148,9 +148,7 @@ void StealTask::do_it(GCTaskManager* manager, uint which) {
    while(true) {
      StarTask p;
      if (PSPromotionManager::steal_depth(which, &random_seed, p)) {
-#if PS_PM_STATS
-        pm->increment_steals(p);
-#endif // PS_PM_STATS
+        TASKQUEUE_STATS_ONLY(pm->record_steal(p));
        pm->process_popped_location_depth(p);
        pm->drain_stacks_depth(true);
      } else {
@ -163,9 +161,6 @@ void StealTask::do_it(GCTaskManager* manager, uint which) {
    while(true) {
      oop obj;
      if (PSPromotionManager::steal_breadth(which, &random_seed, obj)) {
-#if PS_PM_STATS
-        pm->increment_steals();
-#endif // PS_PM_STATS
        obj->copy_contents(pm);
        pm->drain_stacks_breadth(true);
      } else {
--- a/hotspot/src/share/vm/utilities/globalDefinitions.hpp
+++ b/hotspot/src/share/vm/utilities/globalDefinitions.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -345,6 +345,35 @@ inline intptr_t align_object_offset(intptr_t offset) {
  return align_size_up(offset, HeapWordsPerLong);
 }

+// The expected size in bytes of a cache line, used to pad data structures.
+#define DEFAULT_CACHE_LINE_SIZE 64
+
+// Bytes needed to pad type to avoid cache-line sharing; alignment should be the
+// expected cache line size (a power of two).  The first addend avoids sharing
+// when the start address is not a multiple of alignment; the second maintains
+// alignment of starting addresses that happen to be a multiple.
+#define PADDING_SIZE(type, alignment)                           \
+  ((alignment) + align_size_up_(sizeof(type), alignment))
+
+// Templates to create a subclass padded to avoid cache line sharing.  These are
+// effective only when applied to derived-most (leaf) classes.
+
+// When no args are passed to the base ctor.
+template <class T, size_t alignment = DEFAULT_CACHE_LINE_SIZE>
+class Padded: public T {
+private:
+  char _pad_buf_[PADDING_SIZE(T, alignment)];
+};
+
+// When either 0 or 1 args may be passed to the base ctor.
+template <class T, typename Arg1T, size_t alignment = DEFAULT_CACHE_LINE_SIZE>
+class Padded01: public T {
+public:
+  Padded01(): T() { }
+  Padded01(Arg1T arg1): T(arg1) { }
+private:
+  char _pad_buf_[PADDING_SIZE(T, alignment)];
+};

 //----------------------------------------------------------------------------------------------------
 // Utility macros for compilers
--- a/hotspot/src/share/vm/utilities/taskqueue.cpp
+++ b/hotspot/src/share/vm/utilities/taskqueue.cpp
@ -31,6 +31,48 @@ uint ParallelTaskTerminator::_total_spins = 0;
 uint ParallelTaskTerminator::_total_peeks = 0;
 #endif

+#if TASKQUEUE_STATS
+const char * const TaskQueueStats::_names[last_stat_id] = {
+  "qpush", "qpop", "qpop-s", "qattempt", "qsteal", "opush", "omax"
+};
+
+void TaskQueueStats::print_header(unsigned int line, outputStream* const stream,
+                                  unsigned int width)
+{
+  // Use a width w: 1 <= w <= max_width
+  const unsigned int max_width = 40;
+  const unsigned int w = MAX2(MIN2(width, max_width), 1U);
+
+  if (line == 0) { // spaces equal in width to the header
+    const unsigned int hdr_width = w * last_stat_id + last_stat_id - 1;
+    stream->print("%*s", hdr_width, " ");
+  } else if (line == 1) { // labels
+    stream->print("%*s", w, _names[0]);
+    for (unsigned int i = 1; i < last_stat_id; ++i) {
+      stream->print(" %*s", w, _names[i]);
+    }
+  } else if (line == 2) { // dashed lines
+    char dashes[max_width + 1];
+    memset(dashes, '-', w);
+    dashes[w] = '\0';
+    stream->print("%s", dashes);
+    for (unsigned int i = 1; i < last_stat_id; ++i) {
+      stream->print(" %s", dashes);
+    }
+  }
+}
+
+void TaskQueueStats::print(outputStream* stream, unsigned int width) const
+{
+  #define FMT SIZE_FORMAT_W(*)
+  stream->print(FMT, width, _stats[0]);
+  for (unsigned int i = 1; i < last_stat_id; ++i) {
+    stream->print(" " FMT, width, _stats[i]);
+  }
+  #undef FMT
+}
+#endif // TASKQUEUE_STATS
+
 int TaskQueueSetSuper::randomParkAndMiller(int *seed0) {
  const int a =      16807;
  const int m = 2147483647;
--- a/hotspot/src/share/vm/utilities/taskqueue.hpp
+++ b/hotspot/src/share/vm/utilities/taskqueue.hpp
@ -22,6 +22,72 @@
 *
 */

+// Simple TaskQueue stats that are collected by default in debug builds.
+
+#if !defined(TASKQUEUE_STATS) && defined(ASSERT)
+#define TASKQUEUE_STATS 1
+#elif !defined(TASKQUEUE_STATS)
+#define TASKQUEUE_STATS 0
+#endif
+
+#if TASKQUEUE_STATS
+#define TASKQUEUE_STATS_ONLY(code) code
+#else
+#define TASKQUEUE_STATS_ONLY(code)
+#endif // TASKQUEUE_STATS
+
+#if TASKQUEUE_STATS
+class TaskQueueStats {
+public:
+  enum StatId {
+    push,             // number of taskqueue pushes
+    pop,              // number of taskqueue pops
+    pop_slow,         // subset of taskqueue pops that were done slow-path
+    steal_attempt,    // number of taskqueue steal attempts
+    steal,            // number of taskqueue steals
+    overflow,         // number of overflow pushes
+    overflow_max_len, // max length of overflow stack
+    last_stat_id
+  };
+
+public:
+  inline TaskQueueStats()       { reset(); }
+
+  inline void record_push()     { ++_stats[push]; }
+  inline void record_pop()      { ++_stats[pop]; }
+  inline void record_pop_slow() { record_pop(); ++_stats[pop_slow]; }
+  inline void record_steal(bool success);
+  inline void record_overflow(size_t new_length);
+
+  inline size_t get(StatId id) const { return _stats[id]; }
+  inline const size_t* get() const   { return _stats; }
+
+  inline void reset();
+
+  static void print_header(unsigned int line, outputStream* const stream = tty,
+                           unsigned int width = 10);
+  void print(outputStream* const stream = tty, unsigned int width = 10) const;
+
+private:
+  size_t                    _stats[last_stat_id];
+  static const char * const _names[last_stat_id];
+};
+
+void TaskQueueStats::record_steal(bool success) {
+  ++_stats[steal_attempt];
+  if (success) ++_stats[steal];
+}
+
+void TaskQueueStats::record_overflow(size_t new_len) {
+  ++_stats[overflow];
+  if (new_len > _stats[overflow_max_len]) _stats[overflow_max_len] = new_len;
+}
+
+void TaskQueueStats::reset() {
+  memset(_stats, 0, sizeof(_stats));
+}
+#endif // TASKQUEUE_STATS
+
 template <unsigned int N>
 class TaskQueueSuper: public CHeapObj {
 protected:
@ -135,6 +201,8 @@ public:

  // Total size of queue.
  static const uint total_size() { return N; }
+
+  TASKQUEUE_STATS_ONLY(TaskQueueStats stats;)
 };

 template<class E, unsigned int N = TASKQUEUE_SIZE>
@ -152,6 +220,7 @@ protected:
 public:
  using TaskQueueSuper<N>::max_elems;
  using TaskQueueSuper<N>::size;
+  TASKQUEUE_STATS_ONLY(using TaskQueueSuper<N>::stats;)

 private:
  // Slow paths for push, pop_local.  (pop_global has no fast path.)
@ -224,14 +293,14 @@ bool GenericTaskQueue<E, N>::push_slow(E t, uint dirty_n_elems) {
    // g++ complains if the volatile result of the assignment is unused.
    const_cast<E&>(_elems[localBot] = t);
    OrderAccess::release_store(&_bottom, increment_index(localBot));
+    TASKQUEUE_STATS_ONLY(stats.record_push());
    return true;
  }
  return false;
 }

 template<class E, unsigned int N>
-bool GenericTaskQueue<E, N>::
-pop_local_slow(uint localBot, Age oldAge) {
+bool GenericTaskQueue<E, N>::pop_local_slow(uint localBot, Age oldAge) {
  // This queue was observed to contain exactly one element; either this
  // thread will claim it, or a competing "pop_global".  In either case,
  // the queue will be logically empty afterwards.  Create a new Age value
@ -251,6 +320,7 @@ pop_local_slow(uint localBot, Age oldAge) {
    if (tempAge == oldAge) {
      // We win.
      assert(dirty_size(localBot, _age.top()) != N - 1, "sanity");
+      TASKQUEUE_STATS_ONLY(stats.record_pop_slow());
      return true;
    }
  }
@ -306,6 +376,8 @@ public:
  typedef GrowableArray<E>       overflow_t;
  typedef GenericTaskQueue<E, N> taskqueue_t;

+  TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)
+
  OverflowTaskQueue();
  ~OverflowTaskQueue();
  void initialize();
@ -356,6 +428,7 @@ bool OverflowTaskQueue<E, N>::push(E t)
 {
  if (!taskqueue_t::push(t)) {
    overflow_stack()->push(t);
+    TASKQUEUE_STATS_ONLY(stats.record_overflow(overflow_stack()->length()));
  }
  return true;
 }
@ -424,9 +497,13 @@ GenericTaskQueueSet<T>::queue(uint i) {

 template<class T> bool
 GenericTaskQueueSet<T>::steal(uint queue_num, int* seed, E& t) {
-  for (uint i = 0; i < 2 * _n; i++)
-    if (steal_best_of_2(queue_num, seed, t))
+  for (uint i = 0; i < 2 * _n; i++) {
+    if (steal_best_of_2(queue_num, seed, t)) {
+      TASKQUEUE_STATS_ONLY(queue(queue_num)->stats.record_steal(true));
      return true;
+    }
+  }
+  TASKQUEUE_STATS_ONLY(queue(queue_num)->stats.record_steal(false));
  return false;
 }

@ -574,6 +651,7 @@ GenericTaskQueue<E, N>::push(E t) {
    // g++ complains if the volatile result of the assignment is unused.
    const_cast<E&>(_elems[localBot] = t);
    OrderAccess::release_store(&_bottom, increment_index(localBot));
+    TASKQUEUE_STATS_ONLY(stats.record_push());
    return true;
  } else {
    return push_slow(t, dirty_n_elems);
@ -603,6 +681,7 @@ GenericTaskQueue<E, N>::pop_local(E& t) {
  idx_t tp = _age.top();    // XXX
  if (size(localBot, tp) > 0) {
    assert(dirty_size(localBot, tp) != N - 1, "sanity");
+    TASKQUEUE_STATS_ONLY(stats.record_pop());
    return true;
  } else {
    // Otherwise, the queue contained exactly one element; we take the slow