mirror of https://github.com/openjdk/jdk.git
8231153: Improve concurrent refinement statistics
8043505: G1RemSet::_conc_refine_cards is incremented unsynchronized

Added refinement rates and counts, removed _conc_refine_cards

Reviewed-by: tschatzl, sjohanss
This commit is contained in:
parent 8edf64d236
commit d7fcd0ccac
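The heart of this change is a move from one shared, unsynchronized counter (G1RemSet::_conc_refine_cards, the subject of JDK-8043505) to per-thread counters that only their owning thread writes, summed on demand when the policy or a summary needs a total. Below is a minimal standalone sketch of that pattern, with invented names standing in for the Tickspan/size_t fields kept by each G1ConcurrentRefineThread; it is illustrative only, not the HotSpot code:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Each refinement worker owns its counters; only the owner writes them,
    // so the hot path needs no atomics or locks.
    struct Worker {
      uint64_t refinement_time_ns = 0;  // stands in for Tickspan _total_refinement_time
      size_t refined_cards = 0;         // stands in for size_t _total_refined_cards
    };

    struct RefinementStats {
      uint64_t time_ns;
      size_t cards;
    };

    // Analogous to G1ConcurrentRefine::total_refinement_stats(): visit every
    // worker and sum. A reader may observe a slightly stale value for a worker
    // that is running concurrently, which is acceptable for statistics.
    RefinementStats total_refinement_stats(const std::vector<Worker>& workers) {
      RefinementStats total = {0, 0};
      for (const Worker& w : workers) {
        total.time_ns += w.refinement_time_ns;
        total.cards += w.refined_cards;
      }
      return total;
    }

In HotSpot the iteration is done with a ThreadClosure (see total_refinement_stats() in the diff below), but the shape is the same.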
@@ -78,6 +78,8 @@ G1Analytics::G1Analytics(const G1Predictions* predictor) :
   _alloc_rate_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
   _prev_collection_pause_end_ms(0.0),
   _rs_length_diff_seq(new TruncatedSeq(TruncatedSeqLength)),
+  _concurrent_refine_rate_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
+  _logged_cards_rate_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
   _cost_per_logged_card_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
   _cost_scan_hcc_seq(new TruncatedSeq(TruncatedSeqLength)),
   _young_cards_per_entry_ratio_seq(new TruncatedSeq(TruncatedSeqLength)),
@@ -102,6 +104,10 @@ G1Analytics::G1Analytics(const G1Predictions* predictor) :
   int index = MIN2(ParallelGCThreads - 1, 7u);

   _rs_length_diff_seq->add(rs_length_diff_defaults[index]);
+  // Start with inverse of maximum STW cost.
+  _concurrent_refine_rate_ms_seq->add(1/cost_per_logged_card_ms_defaults[0]);
+  // Some applications have very low rates for logging cards.
+  _logged_cards_rate_ms_seq->add(0.0);
   _cost_per_logged_card_ms_seq->add(cost_per_logged_card_ms_defaults[index]);
   _cost_scan_hcc_seq->add(0.0);
   _young_cards_per_entry_ratio_seq->add(young_cards_per_entry_ratio_defaults[index]);
@@ -159,6 +165,14 @@ void G1Analytics::compute_pause_time_ratio(double interval_ms, double pause_time
     (pause_time_ms * _recent_prev_end_times_for_all_gcs_sec->num()) / interval_ms;
 }

+void G1Analytics::report_concurrent_refine_rate_ms(double cards_per_ms) {
+  _concurrent_refine_rate_ms_seq->add(cards_per_ms);
+}
+
+void G1Analytics::report_logged_cards_rate_ms(double cards_per_ms) {
+  _logged_cards_rate_ms_seq->add(cards_per_ms);
+}
+
 void G1Analytics::report_cost_per_logged_card_ms(double cost_per_logged_card_ms) {
   _cost_per_logged_card_ms_seq->add(cost_per_logged_card_ms);
 }
@@ -223,6 +237,14 @@ double G1Analytics::predict_alloc_rate_ms() const {
   return get_new_prediction(_alloc_rate_ms_seq);
 }

+double G1Analytics::predict_concurrent_refine_rate_ms() const {
+  return get_new_prediction(_concurrent_refine_rate_ms_seq);
+}
+
+double G1Analytics::predict_logged_cards_rate_ms() const {
+  return get_new_prediction(_logged_cards_rate_ms_seq);
+}
+
 double G1Analytics::predict_cost_per_logged_card_ms() const {
   return get_new_prediction(_cost_per_logged_card_ms_seq);
 }
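A note on the seeding above: cost_per_logged_card_ms is milliseconds per card, so its inverse is cards per millisecond, i.e. a refinement rate (for example, a hypothetical default cost of 0.01 ms/card would seed a rate of 100 cards/ms). TruncatedSeq itself is a bounded window of recent samples; a toy sketch of the idea follows (invented class name; the real class also tracks variance so G1Predictions can pad the prediction by a confidence factor):

    #include <cstddef>
    #include <deque>
    #include <numeric>

    // Toy stand-in for TruncatedSeq: keep the most recent N samples and
    // predict with their mean.
    class TruncatedSeqToy {
      std::deque<double> _samples;
      static const size_t _max_length = 10;  // mirrors G1's TruncatedSeqLength
    public:
      void add(double v) {
        if (_samples.size() == _max_length) {
          _samples.pop_front();  // drop the oldest sample
        }
        _samples.push_back(v);
      }
      double predict() const {
        if (_samples.empty()) return 0.0;
        return std::accumulate(_samples.begin(), _samples.end(), 0.0) / _samples.size();
      }
    };

Seeding one default sample, as the constructor hunk above does, keeps the very first prediction sane before any real samples arrive.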
@@ -46,6 +46,8 @@ class G1Analytics: public CHeapObj<mtGC> {
   double _prev_collection_pause_end_ms;

   TruncatedSeq* _rs_length_diff_seq;
+  TruncatedSeq* _concurrent_refine_rate_ms_seq;
+  TruncatedSeq* _logged_cards_rate_ms_seq;
   TruncatedSeq* _cost_per_logged_card_ms_seq;
   TruncatedSeq* _cost_scan_hcc_seq;
   TruncatedSeq* _young_cards_per_entry_ratio_seq;
@@ -99,6 +101,8 @@ public:
   void report_concurrent_mark_remark_times_ms(double ms);
   void report_concurrent_mark_cleanup_times_ms(double ms);
   void report_alloc_rate_ms(double alloc_rate);
+  void report_concurrent_refine_rate_ms(double cards_per_ms);
+  void report_logged_cards_rate_ms(double cards_per_ms);
   void report_cost_per_logged_card_ms(double cost_per_logged_card_ms);
   void report_cost_scan_hcc(double cost_scan_hcc);
   void report_cost_per_remset_card_ms(double cost_per_remset_card_ms, bool for_young_gc);
@@ -116,6 +120,8 @@ public:
   double predict_alloc_rate_ms() const;
   int num_alloc_rate_ms() const;

+  double predict_concurrent_refine_rate_ms() const;
+  double predict_logged_cards_rate_ms() const;
   double predict_cost_per_logged_card_ms() const;

   double predict_scan_hcc_ms() const;
@@ -409,7 +409,7 @@ double G1CollectionSet::finalize_young_part(double target_pause_time_ms, G1Survi
   guarantee(target_pause_time_ms > 0.0,
             "target_pause_time_ms = %1.6lf should be positive", target_pause_time_ms);

-  size_t pending_cards = _policy->pending_cards();
+  size_t pending_cards = _policy->pending_cards_at_gc_start();
   double base_time_ms = _policy->predict_base_elapsed_time_ms(pending_cards);
   double time_remaining_ms = MAX2(target_pause_time_ms - base_time_ms, 0.0);

@@ -412,6 +412,22 @@ void G1ConcurrentRefine::adjust(double logged_cards_scan_time,
   dcqs.notify_if_necessary();
 }

+G1ConcurrentRefine::RefinementStats G1ConcurrentRefine::total_refinement_stats() const {
+  struct CollectData : public ThreadClosure {
+    Tickspan _total_time;
+    size_t _total_cards;
+    CollectData() : _total_time(), _total_cards(0) {}
+    virtual void do_thread(Thread* t) {
+      G1ConcurrentRefineThread* crt = static_cast<G1ConcurrentRefineThread*>(t);
+      _total_time += crt->total_refinement_time();
+      _total_cards += crt->total_refined_cards();
+    }
+  } collector;
+  // Cast away const so we can call non-modifying closure on threads.
+  const_cast<G1ConcurrentRefine*>(this)->threads_do(&collector);
+  return RefinementStats(collector._total_time, collector._total_cards);
+}
+
 size_t G1ConcurrentRefine::activation_threshold(uint worker_id) const {
   Thresholds thresholds = calc_thresholds(_green_zone, _yellow_zone, worker_id);
   return activation_level(thresholds);
@@ -432,7 +448,8 @@ void G1ConcurrentRefine::maybe_activate_more_threads(uint worker_id, size_t num_
   }
 }

-bool G1ConcurrentRefine::do_refinement_step(uint worker_id) {
+bool G1ConcurrentRefine::do_refinement_step(uint worker_id,
+                                            size_t* total_refined_cards) {
   G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();

   size_t curr_cards = dcqs.num_cards();
@@ -448,5 +465,6 @@ bool G1ConcurrentRefine::do_refinement_step(uint worker_id) {

   // Process the next buffer, if there are enough left.
   return dcqs.refine_completed_buffer_concurrently(worker_id + worker_id_offset(),
-                                                   deactivation_threshold(worker_id));
+                                                   deactivation_threshold(worker_id),
+                                                   total_refined_cards);
 }
@@ -27,6 +27,7 @@

 #include "memory/allocation.hpp"
 #include "utilities/globalDefinitions.hpp"
+#include "utilities/ticks.hpp"

 // Forward decl
 class G1ConcurrentRefine;
@@ -118,11 +119,22 @@ public:
   // Adjust refinement thresholds based on work done during the pause and the goal time.
   void adjust(double logged_cards_scan_time, size_t processed_logged_cards, double goal_ms);

+  struct RefinementStats {
+    Tickspan _time;
+    size_t _cards;
+    RefinementStats(Tickspan time, size_t cards) : _time(time), _cards(cards) {}
+  };
+
+  RefinementStats total_refinement_stats() const;
+
   // Cards in the dirty card queue set.
   size_t activation_threshold(uint worker_id) const;
   size_t deactivation_threshold(uint worker_id) const;
-  // Perform a single refinement step. Called by the refinement threads when woken up.
-  bool do_refinement_step(uint worker_id);
+
+  // Perform a single refinement step; called by the refinement
+  // threads. Returns true if there was refinement work available.
+  // Increments *total_refined_cards.
+  bool do_refinement_step(uint worker_id, size_t* total_refined_cards);

   // Iterate over all concurrent refinement threads applying the given closure.
   void threads_do(ThreadClosure *tc);
@@ -37,6 +37,8 @@ G1ConcurrentRefineThread::G1ConcurrentRefineThread(G1ConcurrentRefine* cr, uint
   ConcurrentGCThread(),
   _vtime_start(0.0),
   _vtime_accum(0.0),
+  _total_refinement_time(),
+  _total_refined_cards(0),
   _worker_id(worker_id),
   _active(false),
   _monitor(NULL),
@@ -101,11 +103,12 @@ void G1ConcurrentRefineThread::run_service() {
       break;
     }

-    size_t buffers_processed = 0;
     log_debug(gc, refine)("Activated worker %d, on threshold: " SIZE_FORMAT ", current: " SIZE_FORMAT,
                           _worker_id, _cr->activation_threshold(_worker_id),
                           G1BarrierSet::dirty_card_queue_set().num_cards());

+    size_t start_total_refined_cards = _total_refined_cards; // For logging.
+
     {
       SuspendibleThreadSetJoiner sts_join;

@@ -115,20 +118,22 @@ void G1ConcurrentRefineThread::run_service() {
          continue; // Re-check for termination after yield delay.
        }

-        if (!_cr->do_refinement_step(_worker_id)) {
-          break;
+        Ticks start_time = Ticks::now();
+        if (!_cr->do_refinement_step(_worker_id, &_total_refined_cards)) {
+          break; // No cards to process.
        }
-        ++buffers_processed;
+        _total_refinement_time += (Ticks::now() - start_time);
      }
    }

    deactivate();
    log_debug(gc, refine)("Deactivated worker %d, off threshold: " SIZE_FORMAT
-                          ", current: " SIZE_FORMAT ", buffers processed: "
-                          SIZE_FORMAT,
+                          ", current: " SIZE_FORMAT ", refined cards: "
+                          SIZE_FORMAT ", total refined cards: " SIZE_FORMAT,
                          _worker_id, _cr->deactivation_threshold(_worker_id),
                          G1BarrierSet::dirty_card_queue_set().num_cards(),
-                          buffers_processed);
+                          _total_refined_cards - start_total_refined_cards,
+                          _total_refined_cards);

    if (os::supports_vtime()) {
      _vtime_accum = (os::elapsedVTime() - _vtime_start);
@@ -26,6 +26,7 @@
 #define SHARE_GC_G1_G1CONCURRENTREFINETHREAD_HPP

 #include "gc/shared/concurrentGCThread.hpp"
+#include "utilities/ticks.hpp"

 // Forward Decl.
 class G1ConcurrentRefine;
@@ -38,6 +39,10 @@ class G1ConcurrentRefineThread: public ConcurrentGCThread {

   double _vtime_start;  // Initial virtual time.
   double _vtime_accum;  // Accumulated virtual time.
+
+  Tickspan _total_refinement_time;
+  size_t _total_refined_cards;
+
   uint _worker_id;

   bool _active;
@@ -61,6 +66,9 @@ public:
   // Activate this thread.
   void activate();

+  Tickspan total_refinement_time() const { return _total_refinement_time; }
+  size_t total_refined_cards() const { return _total_refined_cards; }
+
   // Total virtual time so far.
   double vtime_accum() { return _vtime_accum; }
 };
@@ -37,6 +37,7 @@
 #include "runtime/atomic.hpp"
 #include "runtime/flags/flagSetting.hpp"
 #include "runtime/mutexLocker.hpp"
+#include "runtime/os.hpp"
 #include "runtime/safepoint.hpp"
 #include "runtime/thread.inline.hpp"
 #include "runtime/threadSMR.hpp"
@@ -62,6 +63,9 @@ void G1DirtyCardQueue::handle_completed_buffer() {
   }
 }

+// Assumed to be zero by concurrent threads.
+static uint par_ids_start() { return 0; }
+
 G1DirtyCardQueueSet::G1DirtyCardQueueSet(Monitor* cbl_mon,
                                          BufferNode::Allocator* allocator) :
   PtrQueueSet(allocator),
@@ -73,15 +77,16 @@ G1DirtyCardQueueSet::G1DirtyCardQueueSet(Monitor* cbl_mon,
   _process_completed_buffers(false),
   _max_cards(MaxCardsUnlimited),
   _max_cards_padding(0),
-  _free_ids(0, num_par_ids()),
-  _processed_buffers_mut(0),
-  _processed_buffers_rs_thread(0)
+  _free_ids(par_ids_start(), num_par_ids()),
+  _mutator_refined_cards_counters(NEW_C_HEAP_ARRAY(size_t, num_par_ids(), mtGC))
 {
+  ::memset(_mutator_refined_cards_counters, 0, num_par_ids() * sizeof(size_t));
   _all_active = true;
 }

 G1DirtyCardQueueSet::~G1DirtyCardQueueSet() {
   abandon_completed_buffers();
+  FREE_C_HEAP_ARRAY(size_t, _mutator_refined_cards_counters);
 }

 // Determines how many mutator threads can process the buffers in parallel.
@@ -89,6 +94,14 @@ uint G1DirtyCardQueueSet::num_par_ids() {
   return (uint)os::initial_active_processor_count();
 }

+size_t G1DirtyCardQueueSet::total_mutator_refined_cards() const {
+  size_t sum = 0;
+  for (uint i = 0; i < num_par_ids(); ++i) {
+    sum += _mutator_refined_cards_counters[i];
+  }
+  return sum;
+}
+
 void G1DirtyCardQueueSet::handle_zero_index_for_thread(Thread* t) {
   G1ThreadLocalData::dirty_card_queue(t).handle_zero_index();
 }
@@ -213,7 +226,9 @@ G1BufferNodeList G1DirtyCardQueueSet::take_all_completed_buffers() {
   return result;
 }

-bool G1DirtyCardQueueSet::refine_buffer(BufferNode* node, uint worker_id) {
+bool G1DirtyCardQueueSet::refine_buffer(BufferNode* node,
+                                        uint worker_id,
+                                        size_t* total_refined_cards) {
   G1RemSet* rem_set = G1CollectedHeap::heap()->rem_set();
   size_t size = buffer_size();
   void** buffer = BufferNode::make_buffer_from_node(node);
@@ -223,6 +238,7 @@ bool G1DirtyCardQueueSet::refine_buffer(BufferNode* node, uint worker_id) {
     CardTable::CardValue* cp = static_cast<CardTable::CardValue*>(buffer[i]);
     rem_set->refine_card_concurrently(cp, worker_id);
   }
+  *total_refined_cards += (i - node->index());
   node->set_index(i);
   return i == size;
 }
@@ -260,25 +276,27 @@ bool G1DirtyCardQueueSet::process_or_enqueue_completed_buffer(BufferNode* node)

 bool G1DirtyCardQueueSet::mut_process_buffer(BufferNode* node) {
   uint worker_id = _free_ids.claim_par_id(); // temporarily claim an id
-  bool result = refine_buffer(node, worker_id);
+  uint counter_index = worker_id - par_ids_start();
+  size_t* counter = &_mutator_refined_cards_counters[counter_index];
+  bool result = refine_buffer(node, worker_id, counter);
   _free_ids.release_par_id(worker_id); // release the id

   if (result) {
     assert_fully_consumed(node, buffer_size());
-    Atomic::inc(&_processed_buffers_mut);
   }
   return result;
 }

-bool G1DirtyCardQueueSet::refine_completed_buffer_concurrently(uint worker_id, size_t stop_at) {
+bool G1DirtyCardQueueSet::refine_completed_buffer_concurrently(uint worker_id,
+                                                               size_t stop_at,
+                                                               size_t* total_refined_cards) {
   BufferNode* node = get_completed_buffer(stop_at);
   if (node == NULL) {
     return false;
-  } else if (refine_buffer(node, worker_id)) {
+  } else if (refine_buffer(node, worker_id, total_refined_cards)) {
     assert_fully_consumed(node, buffer_size());
     // Done with fully processed buffer.
     deallocate_buffer(node);
-    Atomic::inc(&_processed_buffers_rs_thread);
     return true;
   } else {
     // Return partially processed buffer to the queue.
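Why the new mutator counters need no atomic updates: a mutator thread holds a par id exclusively between claim_par_id() and release_par_id(), so each slot of _mutator_refined_cards_counters has at most one writer at any moment, while total_mutator_refined_cards() merely sums the slots and may read a slightly stale total. A toy sketch of the claim/release discipline follows (invented names, not the G1FreeIdSet API; the real set blocks when ids run out, this toy assumes callers never exceed the pool):

    #include <cstddef>
    #include <mutex>
    #include <vector>

    // Toy id pool: an id is owned exclusively from claim() until release(),
    // so the counter slot it indexes has a single writer while held.
    class FreeIdSetToy {
      std::vector<unsigned> _free;
      std::mutex _lock;
    public:
      explicit FreeIdSetToy(unsigned n) {
        for (unsigned i = 0; i < n; ++i) _free.push_back(i);
      }
      unsigned claim() {
        std::lock_guard<std::mutex> g(_lock);
        unsigned id = _free.back();
        _free.pop_back();
        return id;
      }
      void release(unsigned id) {
        std::lock_guard<std::mutex> g(_lock);
        _free.push_back(id);
      }
    };

    std::vector<size_t> counters(8, 0);
    FreeIdSetToy ids(8);

    void count_refined_cards(size_t n) {
      unsigned id = ids.claim();
      counters[id] += n;  // race-free: this slot has one writer while claimed
      ids.release(id);
    }

This is exactly the property that the old single _conc_refine_cards counter lacked, which is what JDK-8043505 complained about.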
@@ -78,14 +78,15 @@ class G1DirtyCardQueueSet: public PtrQueueSet {

   void abandon_completed_buffers();

-  // Refine the cards in "node" from it's index to buffer_size.
+  // Refine the cards in "node" from its index to buffer_size.
   // Stops processing if SuspendibleThreadSet::should_yield() is true.
   // Returns true if the entire buffer was processed, false if there
   // is a pending yield request. The node's index is updated to exclude
   // the processed elements, e.g. up to the element before processing
   // stopped, or one past the last element if the entire buffer was
-  // processed.
-  bool refine_buffer(BufferNode* node, uint worker_id);
+  // processed. Increments *total_refined_cards by the number of cards
+  // processed and removed from the buffer.
+  bool refine_buffer(BufferNode* node, uint worker_id, size_t* total_refined_cards);

   bool mut_process_buffer(BufferNode* node);

@@ -97,10 +98,9 @@ class G1DirtyCardQueueSet: public PtrQueueSet {

   G1FreeIdSet _free_ids;

-  // The number of completed buffers processed by mutator and rs thread,
-  // respectively.
-  jint _processed_buffers_mut;
-  jint _processed_buffers_rs_thread;
+  // Array of cumulative dirty cards refined by mutator threads.
+  // Array has an entry per id in _free_ids.
+  size_t* _mutator_refined_cards_counters;

 public:
   G1DirtyCardQueueSet(Monitor* cbl_mon, BufferNode::Allocator* allocator);
@@ -158,7 +158,12 @@ public:
   // Stops processing a buffer if SuspendibleThreadSet::should_yield(),
   // returning the incompletely processed buffer to the completed buffer
   // list, for later processing of the remainder.
-  bool refine_completed_buffer_concurrently(uint worker_id, size_t stop_at);
+  //
+  // Increments *total_refined_cards by the number of cards processed and
+  // removed from the buffer.
+  bool refine_completed_buffer_concurrently(uint worker_id,
+                                            size_t stop_at,
+                                            size_t* total_refined_cards);

   // If a full collection is happening, reset partial logs, and release
   // completed ones: the full collection will make them all irrelevant.
@@ -181,13 +186,8 @@ public:
     return _max_cards_padding;
   }

-  jint processed_buffers_mut() {
-    return _processed_buffers_mut;
-  }
-  jint processed_buffers_rs_thread() {
-    return _processed_buffers_rs_thread;
-  }
-
+  // Total dirty cards refined by mutator threads.
+  size_t total_mutator_refined_cards() const;
 };

 inline G1DirtyCardQueueSet* G1DirtyCardQueue::dirty_card_qset() const {
@@ -70,7 +70,11 @@ G1Policy::G1Policy(STWGCTimer* gc_timer) :
   _free_regions_at_end_of_collection(0),
   _max_rs_length(0),
   _rs_length_prediction(0),
-  _pending_cards(0),
+  _pending_cards_at_gc_start(0),
+  _pending_cards_at_prev_gc_end(0),
+  _total_mutator_refined_cards(0),
+  _total_concurrent_refined_cards(0),
+  _total_concurrent_refinement_time(),
   _bytes_allocated_in_old_since_last_gc(0),
   _initial_mark_to_mixed(),
   _collection_set(NULL),
@@ -442,6 +446,7 @@ void G1Policy::record_full_collection_start() {
   collector_state()->set_in_young_only_phase(false);
   collector_state()->set_in_full_gc(true);
   _collection_set->clear_candidates();
+  record_concurrent_refinement_data(true /* is_full_collection */);
 }

 void G1Policy::record_full_collection_end() {
@@ -472,12 +477,67 @@ void G1Policy::record_full_collection_end() {
   _survivor_surv_rate_group->reset();
   update_young_list_max_and_target_length();
   update_rs_length_prediction();
+  _pending_cards_at_prev_gc_end = _g1h->pending_card_num();

   _bytes_allocated_in_old_since_last_gc = 0;

   record_pause(FullGC, _full_collection_start_sec, end_sec);
 }

+void G1Policy::record_concurrent_refinement_data(bool is_full_collection) {
+  _pending_cards_at_gc_start = _g1h->pending_card_num();
+
+  // Record info about concurrent refinement thread processing.
+  G1ConcurrentRefine* cr = _g1h->concurrent_refine();
+  G1ConcurrentRefine::RefinementStats cr_stats = cr->total_refinement_stats();
+
+  Tickspan cr_time = cr_stats._time - _total_concurrent_refinement_time;
+  _total_concurrent_refinement_time = cr_stats._time;
+
+  size_t cr_cards = cr_stats._cards - _total_concurrent_refined_cards;
+  _total_concurrent_refined_cards = cr_stats._cards;
+
+  // Don't update rate if full collection. We could be in an implicit full
+  // collection after a non-full collection failure, in which case there
+  // wasn't any mutator/cr-thread activity since last recording. And if
+  // we're in an explicit full collection, the time since the last GC can
+  // be arbitrarily short, so not a very good sample. Similarly, don't
+  // update the rate if the current sample is empty or time is zero.
+  if (!is_full_collection && (cr_cards > 0) && (cr_time > Tickspan())) {
+    double rate = cr_cards / (cr_time.seconds() * MILLIUNITS);
+    _analytics->report_concurrent_refine_rate_ms(rate);
+  }
+
+  // Record info about mutator thread processing.
+  G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
+  size_t mut_total_cards = dcqs.total_mutator_refined_cards();
+  size_t mut_cards = mut_total_cards - _total_mutator_refined_cards;
+  _total_mutator_refined_cards = mut_total_cards;
+
+  // Record mutator's card logging rate.
+  // Don't update if full collection; see above.
+  if (!is_full_collection) {
+    size_t total_cards = _pending_cards_at_gc_start + cr_cards + mut_cards;
+    assert(_pending_cards_at_prev_gc_end <= total_cards,
+           "untracked cards: last pending: " SIZE_FORMAT
+           ", pending: " SIZE_FORMAT ", conc refine: " SIZE_FORMAT
+           ", mut refine:" SIZE_FORMAT,
+           _pending_cards_at_prev_gc_end, _pending_cards_at_gc_start,
+           cr_cards, mut_cards);
+    size_t logged_cards = total_cards - _pending_cards_at_prev_gc_end;
+    double logging_start_time = _analytics->prev_collection_pause_end_ms();
+    double logging_end_time = Ticks::now().seconds() * MILLIUNITS;
+    double logging_time = logging_end_time - logging_start_time;
+    // Unlike above for conc-refine rate, here we should not require a
+    // non-empty sample, since an application could go some time with only
+    // young-gen or filtered out writes. But we'll ignore unusually short
+    // sample periods, as they may just pollute the predictions.
+    if (logging_time > 1.0) {   // Require > 1ms sample time.
+      _analytics->report_logged_cards_rate_ms(logged_cards / logging_time);
+    }
+  }
+}
+
 void G1Policy::record_collection_pause_start(double start_time_sec) {
   // We only need to do this here as the policy will only be applied
   // to the GC we're about to start. so, no point is calculating this
@@ -490,7 +550,8 @@ void G1Policy::record_collection_pause_start(double start_time_sec) {
   assert_used_and_recalculate_used_equal(_g1h);

   phase_times()->record_cur_collection_start_sec(start_time_sec);
-  _pending_cards = _g1h->pending_card_num();
+
+  record_concurrent_refinement_data(false /* is_full_collection */);

   _collection_set->reset_bytes_used_before();
   _bytes_copied_during_gc = 0;
@@ -744,7 +805,7 @@ void G1Policy::record_collection_pause_end(double pause_time_ms, size_t heap_use
     // after the mixed gc phase.
     // During mixed gc we do not use them for young gen sizing.
     if (this_pause_was_young_only) {
-      _analytics->report_pending_cards((double) _pending_cards);
+      _analytics->report_pending_cards((double) _pending_cards_at_gc_start);
       _analytics->report_rs_length((double) _max_rs_length);
     }
   }
@@ -798,6 +859,7 @@ void G1Policy::record_collection_pause_end(double pause_time_ms, size_t heap_use
     scan_logged_cards_time_goal_ms -= scan_hcc_time_ms;
   }

+  _pending_cards_at_prev_gc_end = _g1h->pending_card_num();
   double const logged_cards_time = logged_cards_processing_time();

   log_debug(gc, ergo, refine)("Concurrent refinement times: Logged Cards Scan time goal: %1.2fms Logged Cards Scan time: %1.2fms HCC time: %1.2fms",
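record_concurrent_refinement_data() above boils down to differencing cumulative counters between GCs and converting to per-millisecond rates. A hedged sketch of just that arithmetic, with invented function and parameter names (cr_cards/mut_cards are the cards refined since the previous sample, matching the deltas in the diff):

    #include <cstddef>

    // Refinement rate sample: cards refined per millisecond over the interval.
    double concurrent_refine_rate_ms(size_t cards_delta, double time_delta_ms) {
      return cards_delta / time_delta_ms;  // caller ensures time_delta_ms > 0
    }

    // Cards logged by mutators since the last GC: everything pending now plus
    // everything refined away since then, minus what was already pending at
    // the end of the previous GC.
    size_t logged_cards_since_last_gc(size_t pending_at_gc_start,
                                      size_t cr_cards,   // refined concurrently since last GC
                                      size_t mut_cards,  // refined by mutators since last GC
                                      size_t pending_at_prev_gc_end) {
      return pending_at_gc_start + cr_cards + mut_cards - pending_at_prev_gc_end;
    }

Dividing the logged-cards figure by the wall-clock milliseconds since the previous pause end gives the sample fed to _logged_cards_rate_ms_seq, guarded against sub-millisecond sample windows as in the code above.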
@@ -100,7 +100,11 @@ class G1Policy: public CHeapObj<mtGC> {

   size_t _rs_length_prediction;

-  size_t _pending_cards;
+  size_t _pending_cards_at_gc_start;
+  size_t _pending_cards_at_prev_gc_end;
+  size_t _total_mutator_refined_cards;
+  size_t _total_concurrent_refined_cards;
+  Tickspan _total_concurrent_refinement_time;

   // The amount of allocated bytes in old gen during the last mutator and the following
   // young GC phase.
@@ -244,7 +248,15 @@ private:
                                     uint base_free_regions, double target_pause_time_ms) const;

 public:
-  size_t pending_cards() const { return _pending_cards; }
+  size_t pending_cards_at_gc_start() const { return _pending_cards_at_gc_start; }
+
+  size_t total_concurrent_refined_cards() const {
+    return _total_concurrent_refined_cards;
+  }
+
+  size_t total_mutator_refined_cards() const {
+    return _total_mutator_refined_cards;
+  }

   // Calculate the minimum number of old regions we'll add to the CSet
   // during a mixed GC.
@@ -283,6 +295,9 @@ private:
   void record_pause(PauseKind kind, double start, double end);
   // Indicate that we aborted marking before doing any mixed GCs.
   void abort_time_to_mixed_tracking();
+
+  void record_concurrent_refinement_data(bool is_full_collection);
+
 public:

   G1Policy(STWGCTimer* gc_timer);
@@ -489,7 +489,6 @@ G1RemSet::G1RemSet(G1CollectedHeap* g1h,
   _scan_state(new G1RemSetScanState()),
   _prev_period_summary(),
   _g1h(g1h),
-  _num_conc_refined_cards(0),
   _ct(ct),
   _g1p(_g1h->policy()),
   _hot_card_cache(hot_card_cache) {
@@ -1377,7 +1376,6 @@ void G1RemSet::refine_card_concurrently(CardValue* card_ptr,

   G1ConcurrentRefineOopClosure conc_refine_cl(_g1h, worker_id);
   if (r->oops_on_memregion_seq_iterate_careful<false>(dirty_region, &conc_refine_cl) != NULL) {
-    _num_conc_refined_cards++; // Unsynchronized update, only used for logging.
     return;
   }

@@ -61,7 +61,6 @@ private:
   G1RemSetSummary _prev_period_summary;

   G1CollectedHeap* _g1h;
-  size_t _num_conc_refined_cards; // Number of cards refined concurrently to the mutator.

   G1CardTable* _ct;
   G1Policy* _g1p;
@@ -125,8 +124,6 @@ public:
   // Print accumulated summary info from the last time called.
   void print_periodic_summary_info(const char* header, uint period_count);

-  size_t num_conc_refined_cards() const { return _num_conc_refined_cards; }
-
   // Rebuilds the remembered set by scanning from bottom to TARS for all regions
   // using the given work gang.
   void rebuild_rem_set(G1ConcurrentMark* cm, WorkGang* workers, uint worker_id_offset);
@@ -27,6 +27,7 @@
 #include "gc/g1/g1ConcurrentRefine.hpp"
 #include "gc/g1/g1ConcurrentRefineThread.hpp"
 #include "gc/g1/g1DirtyCardQueue.hpp"
+#include "gc/g1/g1Policy.hpp"
 #include "gc/g1/g1RemSet.hpp"
 #include "gc/g1/g1RemSetSummary.hpp"
 #include "gc/g1/g1YoungRemSetSamplingThread.hpp"
@@ -53,18 +54,17 @@ public:
 };

 void G1RemSetSummary::update() {
-  _num_conc_refined_cards = _rem_set->num_conc_refined_cards();
-  G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
-  _num_processed_buf_mutator = dcqs.processed_buffers_mut();
-  _num_processed_buf_rs_threads = dcqs.processed_buffers_rs_thread();
+  G1CollectedHeap* g1h = G1CollectedHeap::heap();
+
+  const G1Policy* policy = g1h->policy();
+  _total_mutator_refined_cards = policy->total_mutator_refined_cards();
+  _total_concurrent_refined_cards = policy->total_concurrent_refined_cards();

   _num_coarsenings = HeapRegionRemSet::n_coarsenings();

-  G1CollectedHeap* g1h = G1CollectedHeap::heap();
-  G1ConcurrentRefine* cg1r = g1h->concurrent_refine();
   if (_rs_threads_vtimes != NULL) {
     GetRSThreadVTimeClosure p(this);
-    cg1r->threads_do(&p);
+    g1h->concurrent_refine()->threads_do(&p);
   }
   set_sampling_thread_vtime(g1h->sampling_thread()->vtime_accum());
 }
@@ -83,9 +83,8 @@ double G1RemSetSummary::rs_thread_vtime(uint thread) const {

 G1RemSetSummary::G1RemSetSummary() :
   _rem_set(NULL),
-  _num_conc_refined_cards(0),
-  _num_processed_buf_mutator(0),
-  _num_processed_buf_rs_threads(0),
+  _total_mutator_refined_cards(0),
+  _total_concurrent_refined_cards(0),
   _num_coarsenings(0),
   _num_vtimes(G1ConcurrentRefine::max_num_threads()),
   _rs_threads_vtimes(NEW_C_HEAP_ARRAY(double, _num_vtimes, mtGC)),
@@ -96,9 +95,8 @@ G1RemSetSummary::G1RemSetSummary() :

 G1RemSetSummary::G1RemSetSummary(G1RemSet* rem_set) :
   _rem_set(rem_set),
-  _num_conc_refined_cards(0),
-  _num_processed_buf_mutator(0),
-  _num_processed_buf_rs_threads(0),
+  _total_mutator_refined_cards(0),
+  _total_concurrent_refined_cards(0),
   _num_coarsenings(0),
   _num_vtimes(G1ConcurrentRefine::max_num_threads()),
   _rs_threads_vtimes(NEW_C_HEAP_ARRAY(double, _num_vtimes, mtGC)),
@@ -114,12 +112,10 @@ void G1RemSetSummary::set(G1RemSetSummary* other) {
   assert(other != NULL, "just checking");
   assert(_num_vtimes == other->_num_vtimes, "just checking");

-  _num_conc_refined_cards = other->num_conc_refined_cards();
+  _total_mutator_refined_cards = other->total_mutator_refined_cards();
+  _total_concurrent_refined_cards = other->total_concurrent_refined_cards();

-  _num_processed_buf_mutator = other->num_processed_buf_mutator();
-  _num_processed_buf_rs_threads = other->num_processed_buf_rs_threads();
-
-  _num_coarsenings = other->_num_coarsenings;
+  _num_coarsenings = other->num_coarsenings();

   memcpy(_rs_threads_vtimes, other->_rs_threads_vtimes, sizeof(double) * _num_vtimes);

@@ -130,10 +126,8 @@ void G1RemSetSummary::subtract_from(G1RemSetSummary* other) {
   assert(other != NULL, "just checking");
   assert(_num_vtimes == other->_num_vtimes, "just checking");

-  _num_conc_refined_cards = other->num_conc_refined_cards() - _num_conc_refined_cards;
-
-  _num_processed_buf_mutator = other->num_processed_buf_mutator() - _num_processed_buf_mutator;
-  _num_processed_buf_rs_threads = other->num_processed_buf_rs_threads() - _num_processed_buf_rs_threads;
+  _total_mutator_refined_cards = other->total_mutator_refined_cards() - _total_mutator_refined_cards;
+  _total_concurrent_refined_cards = other->total_concurrent_refined_cards() - _total_concurrent_refined_cards;

   _num_coarsenings = other->num_coarsenings() - _num_coarsenings;

@@ -356,16 +350,15 @@ public:

 void G1RemSetSummary::print_on(outputStream* out) {
   out->print_cr("  Recent concurrent refinement statistics");
-  out->print_cr("  Processed " SIZE_FORMAT " cards concurrently", num_conc_refined_cards());
-  out->print_cr("  Of " SIZE_FORMAT " completed buffers:", num_processed_buf_total());
-  out->print_cr("     " SIZE_FORMAT_W(8) " (%5.1f%%) by concurrent RS threads.",
-                num_processed_buf_total(),
-                percent_of(num_processed_buf_rs_threads(), num_processed_buf_total()));
+  out->print_cr("  Of " SIZE_FORMAT " refined cards:", total_refined_cards());
+  out->print_cr("     " SIZE_FORMAT_W(8) " (%5.1f%%) by concurrent refinement threads.",
+                total_concurrent_refined_cards(),
+                percent_of(total_concurrent_refined_cards(), total_refined_cards()));
   out->print_cr("     " SIZE_FORMAT_W(8) " (%5.1f%%) by mutator threads.",
-                num_processed_buf_mutator(),
-                percent_of(num_processed_buf_mutator(), num_processed_buf_total()));
+                total_mutator_refined_cards(),
+                percent_of(total_mutator_refined_cards(), total_refined_cards()));
   out->print_cr("  Did " SIZE_FORMAT " coarsenings.", num_coarsenings());
-  out->print_cr("  Concurrent RS threads times (s)");
+  out->print_cr("  Concurrent refinement threads times (s)");
   out->print("     ");
   for (uint i = 0; i < _num_vtimes; i++) {
     out->print("    %5.2f", rs_thread_vtime(i));
@@ -38,9 +38,8 @@ private:

   G1RemSet* _rem_set;

-  size_t _num_conc_refined_cards;
-  size_t _num_processed_buf_mutator;
-  size_t _num_processed_buf_rs_threads;
+  size_t _total_mutator_refined_cards;
+  size_t _total_concurrent_refined_cards;

   size_t _num_coarsenings;

@@ -76,20 +75,16 @@ public:
     return _sampling_thread_vtime;
   }

-  size_t num_conc_refined_cards() const {
-    return _num_conc_refined_cards;
+  size_t total_mutator_refined_cards() const {
+    return _total_mutator_refined_cards;
   }

-  size_t num_processed_buf_mutator() const {
-    return _num_processed_buf_mutator;
+  size_t total_concurrent_refined_cards() const {
+    return _total_concurrent_refined_cards;
   }

-  size_t num_processed_buf_rs_threads() const {
-    return _num_processed_buf_rs_threads;
-  }
-
-  size_t num_processed_buf_total() const {
-    return num_processed_buf_mutator() + num_processed_buf_rs_threads();
+  size_t total_refined_cards() const {
+    return total_mutator_refined_cards() + total_concurrent_refined_cards();
   }

   size_t num_coarsenings() const {
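The summary side is unchanged in shape: G1RemSetSummary keeps cumulative totals, and the periodic log output is produced by subtracting the previous snapshot (subtract_from() above). A compact sketch of that snapshot-and-diff pattern, with invented names and only the two card counters:

    #include <cstddef>

    struct SummaryToy {
      size_t mutator_cards = 0;
      size_t concurrent_cards = 0;

      // Periodic view: cumulative totals minus the previous snapshot,
      // mirroring G1RemSetSummary::subtract_from().
      SummaryToy since(const SummaryToy& prev) const {
        SummaryToy d;
        d.mutator_cards = mutator_cards - prev.mutator_cards;
        d.concurrent_cards = concurrent_cards - prev.concurrent_cards;
        return d;
      }

      size_t total() const { return mutator_cards + concurrent_cards; }
    };

Because the counters are monotonically increasing, the subtraction always yields the work done in the period between two snapshots.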
@@ -54,7 +54,7 @@ public class TestRemsetLoggingThreads {

        OutputAnalyzer output = new OutputAnalyzer(pb.start());

-        String pattern = "Concurrent RS threads times \\(s\\)$";
+        String pattern = "Concurrent refinement threads times \\(s\\)$";
        Matcher m = Pattern.compile(pattern, Pattern.MULTILINE).matcher(output.getStdout());

        if (!m.find()) {
@@ -110,7 +110,7 @@ public class TestRemsetLoggingTools {
    }

    public static void expectRSetSummaries(String result, int expectedCumulative, int expectedPeriodic) throws Exception {
-        int actualTotal = result.split("concurrent refinement").length - 1;
+        int actualTotal = result.split("concurrent refinement statistics").length - 1;
        int actualCumulative = result.split("Cumulative RS summary").length - 1;

        if (expectedCumulative != actualCumulative) {