From 55c1d3a449b4b89b546e57aed166a3b81178abe2 Mon Sep 17 00:00:00 2001
From: Matt Valentine-House <matt@eightbitraptor.com>
Date: Wed, 10 Jun 2026 11:12:38 +0100
Subject: [PATCH] Sample RSS across the benchmark window

The harness took a single RSS snapshot after a forced full GC, so the RSS column carried no variance and small differences read as signal when they were only noise.

Sample RSS once per iteration across the bench window (default, gc and warmup harnesses) and report it as mean +/- stddev% with a mean-based ratio. The raw per-iteration samples are stored under rss_samples in the JSON; the post-GC rss snapshot and lifetime maxrss are still recorded.
---
 README.md                          |  12 ++-
 harness-gc/harness.rb              |   4 +
 harness-warmup/harness.rb          |   6 +-
 harness/harness-common.rb          |  12 +++
 harness/harness.rb                 |   8 +-
 lib/results_table_builder.rb       |  52 +++++++++++--
 test/results_table_builder_test.rb | 121 +++++++++++++++++++++++++++++
 7 files changed, 206 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index bba0d1ef..a36d0955 100644
--- a/README.md
+++ b/README.md
@@ -284,12 +284,22 @@ after each iteration with the default harness.
 
 ## Measuring memory usage
 
-`--rss` option of `run_benchmarks.rb` allows you to measure RSS after benchmark iterations.
+`--rss` option of `run_benchmarks.rb` allows you to measure RSS (resident set size).
 
 ```
 ./run_benchmarks.rb --rss
 ```
 
+The harness samples RSS once per iteration across the benchmarking window (after
+warmup), so the `RSS (MiB)` column reports the mean working set during measurement
+along with its iteration-to-iteration variability (`mean ± stddev%`), and the `RSS` ratio is
+computed from those means. The raw per-iteration samples are stored in the JSON
+output under `rss_samples` (bytes).
+
+For reference, the JSON output also keeps `rss`, a single snapshot taken after a
+full GC at the end of the run (the retained set, a lower bound), and `maxrss`, the
+process's lifetime peak from `getrusage`.
+
 ## Rendering a graph
 
 `--graph` option of `run_benchmarks.rb` allows you to render benchmark results as a graph.
diff --git a/harness-gc/harness.rb b/harness-gc/harness.rb
index 70c113a9..9398fdcf 100644
--- a/harness-gc/harness.rb
+++ b/harness-gc/harness.rb
@@ -33,6 +33,7 @@ def gc_stat_heap_delta(before, after)
 
 def run_benchmark(_num_itrs_hint, **, &block)
   times = []
+  rss_samples = []
   marking_times = []
   sweeping_times = []
   gc_counts = []
@@ -82,6 +83,7 @@ def run_benchmark(_num_itrs_hint, **, &block)
     puts itr_str
 
     times << time
+    rss_samples << get_rss
     marking_times << mark_delta
     sweeping_times << sweep_delta
     gc_counts << count_delta
@@ -95,6 +97,8 @@ def run_benchmark(_num_itrs_hint, **, &block)
   bench_range = WARMUP_ITRS..-1
 
   extra = {}
+  rss_bench = rss_samples[bench_range] || []
+  extra["rss_samples"] = rss_bench unless rss_bench.empty?
   extra["gc_marking_time_warmup"] = marking_times[warmup_range]
   extra["gc_marking_time_bench"] = marking_times[bench_range]
   extra["gc_sweeping_time_warmup"] = sweeping_times[warmup_range]
diff --git a/harness-warmup/harness.rb b/harness-warmup/harness.rb
index 21045227..5f1050cc 100644
--- a/harness-warmup/harness.rb
+++ b/harness-warmup/harness.rb
@@ -36,10 +36,12 @@ def print_stats(bench, elapsed)
 def run_benchmark(num_itrs_hint, **)
   start = monotonic_time
   times = []
+  rss_samples = []
 
   begin
     time = Benchmark.realtime { yield }
     times << time
+    rss_samples << get_rss
 
     stats = Stats.new(times)
     median = stats.median
@@ -63,7 +65,9 @@ def run_benchmark(num_itrs_hint, **)
   end until times.size >= MIN_ITERS and elapsed >= MIN_TIME and mad <= threshold
 
   warmup, bench = times[0...times.size/2], times[times.size/2..-1]
-  return_results(warmup, bench)
+  rss_bench = rss_samples[times.size/2..-1] || []
+  extra = rss_bench.empty? ? {} : { "rss_samples" => rss_bench }
+  return_results(warmup, bench, **extra)
 
   print_stats(bench, elapsed)
 end
diff --git a/harness/harness-common.rb b/harness/harness-common.rb
index 096036d0..31035310 100644
--- a/harness/harness-common.rb
+++ b/harness/harness-common.rb
@@ -1,4 +1,5 @@
 require 'rbconfig'
+require_relative '../misc/stats'
 
 # Ensure the ruby in PATH is the ruby running this, so we can safely shell out to other commands
 ruby_in_path = `ruby -e 'print RbConfig.ruby'`
@@ -214,6 +215,17 @@ def return_results(warmup_iterations, bench_iterations, **extra)
     puts "MAXRSS: %.1fMiB" % (maxrss / 1024.0 / 1024.0)
   end
 
+  rss_samples = ruby_bench_results["rss_samples"]
+  if rss_samples.is_a?(Array) && !rss_samples.empty?
+    mib = rss_samples.map { |bytes| bytes / 1024.0 / 1024.0 }
+    stats = Stats.new(mib)
+    median = stats.median
+    mad = stats.median_absolute_deviation(median)
+    puts "RSS sampled (n=%d): median %.1fMiB \u00b1 %.1fMiB (MAD), range [%.1f, %.1f]MiB" % [
+      mib.size, median, mad, stats.min, stats.max
+    ]
+  end
+
   write_json_file(ruby_bench_results)
 end
 
diff --git a/harness/harness.rb b/harness/harness.rb
index 7c3b79a4..2fac8c3f 100644
--- a/harness/harness.rb
+++ b/harness/harness.rb
@@ -34,6 +34,7 @@ def realtime
 # Takes a block as input
 def run_benchmark(_num_itrs_hint, **, &block)
   times = []
+  rss_samples = []
   total_time = 0
   num_itrs = 0
   header = "itr:   time"
@@ -75,10 +76,15 @@ def run_benchmark(_num_itrs_hint, **, &block)
     # We internally save the time in seconds to avoid loss of precision
     times << time
     total_time += time
+    # Sample current RSS between iterations (outside the timed block) so we can
+    # report the working set across the window with variance.
+    rss_samples << get_rss
   end until num_itrs >= WARMUP_ITRS + MIN_BENCH_ITRS and total_time >= MIN_BENCH_TIME
 
   warmup, bench = times[0...WARMUP_ITRS], times[WARMUP_ITRS..-1]
-  return_results(warmup, bench)
+  rss_bench = rss_samples[WARMUP_ITRS..-1] || []
+  extra = rss_bench.empty? ? {} : { "rss_samples" => rss_bench }
+  return_results(warmup, bench, **extra)
 
   non_warmups = times[WARMUP_ITRS..-1]
   if non_warmups.size > 1
diff --git a/lib/results_table_builder.rb b/lib/results_table_builder.rb
index 32556016..d7ac004d 100644
--- a/lib/results_table_builder.rb
+++ b/lib/results_table_builder.rb
@@ -12,6 +12,7 @@ def initialize(executable_names:, bench_data:, include_rss: false, include_pvalu
     @include_pvalue = include_pvalue
     @zjit_stats = zjit_stats || []
     @include_gc = detect_gc_data(bench_data)
+    @rss_has_samples = @include_rss && detect_rss_samples(bench_data)
     @base_name = executable_names.first
     @other_names = executable_names[1..]
     @bench_names = compute_bench_names
@@ -86,7 +87,7 @@ def build_format
 
     @executable_names.each do |_name|
       format << "%s"
-      format << "%.1f" if @include_rss
+      format << (@rss_has_samples ? "%s" : "%.1f") if @include_rss
       @zjit_stats.each { format << "%s" }
       if @include_gc
         format << "%s"
@@ -125,11 +126,15 @@ def build_row(bench_name)
     t0s = extract_first_iteration_times(bench_name)
     times_no_warmup = extract_benchmark_times(bench_name)
     rsss = extract_rss_values(bench_name)
+    rss_series = @rss_has_samples ? extract_rss_series(bench_name) : nil
 
     base_t0, *other_t0s = t0s
     base_t, *other_ts = times_no_warmup
     base_rss, *other_rsss = rsss
 
+    base_rss_cell = rss_cell(base_rss, rss_series && rss_series[0])
+    other_rss_cells = other_rsss.each_index.map { |i| rss_cell(other_rsss[i], rss_series && rss_series[i + 1]) }
+
     # Extract zjit stats: { stat_name => [base_val, other1_val, ...] }
     zjit_stat_values = @zjit_stats.map do |stat|
       [stat, extract_zjit_stat(bench_name, stat)]
@@ -143,8 +148,8 @@ def build_row(bench_name)
     end
 
     row = [bench_name]
-    build_base_columns(row, base_t, base_rss, zjit_stat_values, 0, base_mark, base_sweep)
-    build_comparison_columns(row, other_ts, other_rsss, zjit_stat_values, other_marks, other_sweeps)
+    build_base_columns(row, base_t, base_rss_cell, zjit_stat_values, 0, base_mark, base_sweep)
+    build_comparison_columns(row, other_ts, other_rss_cells, zjit_stat_values, other_marks, other_sweeps)
     build_ratio_columns(row, base_t0, other_t0s, base_t, other_ts)
     build_rss_ratio_columns(row, base_rss, other_rsss)
     build_gc_ratio_columns(row, base_mark, other_marks, base_sweep, other_sweeps)
@@ -162,10 +167,10 @@ def build_base_columns(row, base_t, base_rss, zjit_stat_values, exe_index, base_
     end
   end
 
-  def build_comparison_columns(row, other_ts, other_rsss, zjit_stat_values, other_marks, other_sweeps)
+  def build_comparison_columns(row, other_ts, other_rss_cells, zjit_stat_values, other_marks, other_sweeps)
     other_ts.each_with_index do |other_t, i|
       row << format_time_with_stddev(other_t)
-      row << other_rsss[i] if @include_rss
+      row << other_rss_cells[i] if @include_rss
       zjit_stat_values.each { |_stat, values| row << format_stat(values[i + 1]) }
       if @include_gc
         row << format_time_with_stddev(other_marks[i])
@@ -283,9 +288,38 @@ def extract_benchmark_times(bench_name)
     end
   end
 
+  # Numeric RSS (MiB) per executable, used for the RSS ratio. When per-iteration
+  # samples are present we use their mean so the ratio matches the displayed value.
   def extract_rss_values(bench_name)
     @executable_names.map do |name|
-      bench_data_for(name, bench_name)['rss'] / BYTES_TO_MIB
+      data = bench_data_for(name, bench_name)
+      samples = data['rss_samples']
+      if samples.is_a?(Array) && !samples.empty?
+        mean(samples) / BYTES_TO_MIB
+      else
+        data['rss'] / BYTES_TO_MIB
+      end
+    end
+  end
+
+  # Per-iteration RSS samples (MiB), one entry per executable; an entry is nil when that run lacks them.
+  def extract_rss_series(bench_name)
+    @executable_names.map do |name|
+      samples = bench_data_for(name, bench_name)['rss_samples']
+      next nil unless samples.is_a?(Array) && !samples.empty?
+      samples.map { |bytes| bytes / BYTES_TO_MIB }
+    end
+  end
+
+  # Display value for an RSS column: mean ± stddev% when samples exist (matching
+  # the timing columns), otherwise a plain MiB value. Returns a Float when no run
+  # in the suite has samples, preserving the legacy "%.1f" formatting.
+  def rss_cell(mean_value, series)
+    return mean_value unless @rss_has_samples
+    if series && !series.empty?
+      format_time_with_stddev(series)
+    else
+      "%.1f" % mean_value
     end
   end
 
@@ -305,6 +339,12 @@ def detect_gc_data(bench_data)
     bench_data.values.any? { |benchmarks| benchmarks.values.any? { |d| d.is_a?(Hash) && d.key?('gc_marking_time_bench') } }
   end
 
+  def detect_rss_samples(bench_data)
+    bench_data.values.any? do |benchmarks|
+      benchmarks.values.any? { |d| d.is_a?(Hash) && d['rss_samples'].is_a?(Array) && !d['rss_samples'].empty? }
+    end
+  end
+
   def bench_data_for(name, bench_name)
     @bench_data[name][bench_name]
   end
diff --git a/test/results_table_builder_test.rb b/test/results_table_builder_test.rb
index a79d5288..303b84f5 100644
--- a/test/results_table_builder_test.rb
+++ b/test/results_table_builder_test.rb
@@ -549,4 +549,125 @@
       assert_equal 'fib', bench_names[4]
     end
   end
+
+  describe 'RSS sampling (rss_samples)' do
+    MIB = 1024 * 1024
+
+    it 'shows mean ± stddev% and uses %s format when samples are present' do
+      bench_data = {
+        'ruby' => {
+          'fib' => {
+            'warmup' => [0.1],
+            'bench' => [0.1, 0.1, 0.1],
+            'rss' => 10 * MIB,
+            'rss_samples' => [9 * MIB, 10 * MIB, 11 * MIB]
+          }
+        }
+      }
+
+      builder = ResultsTableBuilder.new(
+        executable_names: ['ruby'],
+        bench_data: bench_data,
+        include_rss: true
+      )
+
+      table, format = builder.build
+
+      assert_equal ['bench', 'ruby (ms)', 'RSS (MiB)'], table[0]
+      assert_equal ['%s', '%s', '%s'], format
+
+      m = table[1][2].match(/\A(\d+\.\d) ± (\d+\.\d)%\z/)
+      assert m, "expected mean ± stddev%, got #{table[1][2].inspect}"
+      assert_in_delta 10.0, m[1].to_f, 0.1
+      assert_operator m[2].to_f, :>, 0.0
+    end
+
+    it 'computes the RSS ratio from the mean of samples' do
+      bench_data = {
+        'ruby' => {
+          'fib' => {
+            'warmup' => [0.1],
+            'bench' => [0.1, 0.1, 0.1],
+            'rss' => 99 * MIB, # should be ignored in favour of samples
+            'rss_samples' => [10 * MIB, 10 * MIB, 10 * MIB]
+          }
+        },
+        'ruby-yjit' => {
+          'fib' => {
+            'warmup' => [0.05],
+            'bench' => [0.05, 0.05, 0.05],
+            'rss' => 1 * MIB,
+            'rss_samples' => [18 * MIB, 20 * MIB, 22 * MIB]
+          }
+        }
+      }
+
+      builder = ResultsTableBuilder.new(
+        executable_names: ['ruby', 'ruby-yjit'],
+        bench_data: bench_data,
+        include_rss: true
+      )
+
+      table, _format = builder.build
+
+      # ratio = mean(ruby samples) / mean(yjit samples) = 10 / 20 = 0.5
+      assert_in_delta 0.5, table[1].last, 0.001
+    end
+
+    it 'falls back to a plain MiB value for runs without samples in a mixed suite' do
+      bench_data = {
+        'ruby' => {
+          'fib' => {
+            'warmup' => [0.1],
+            'bench' => [0.1, 0.1],
+            'rss' => 10 * MIB,
+            'rss_samples' => [10 * MIB, 10 * MIB]
+          },
+          'loop' => {
+            'warmup' => [0.2],
+            'bench' => [0.2, 0.2],
+            'rss' => 15 * MIB
+            # no rss_samples for this benchmark
+          }
+        }
+      }
+
+      builder = ResultsTableBuilder.new(
+        executable_names: ['ruby'],
+        bench_data: bench_data,
+        include_rss: true
+      )
+
+      table, format = builder.build
+
+      # Suite has samples somewhere, so the RSS column is string-formatted.
+      assert_equal ['%s', '%s', '%s'], format
+
+      rows = table[1..].each_with_object({}) { |row, h| h[row[0]] = row }
+      assert_match(/\A\d+\.\d ± \d+\.\d%\z/, rows['fib'][2])
+      # The sample-less benchmark still renders as a bare MiB value.
+      assert_equal '15.0', rows['loop'][2]
+    end
+
+    it 'keeps %.1f formatting when no run in the suite has samples' do
+      bench_data = {
+        'ruby' => {
+          'fib' => {
+            'warmup' => [0.1],
+            'bench' => [0.1],
+            'rss' => 10 * MIB
+          }
+        }
+      }
+
+      builder = ResultsTableBuilder.new(
+        executable_names: ['ruby'],
+        bench_data: bench_data,
+        include_rss: true
+      )
+
+      _table, format = builder.build
+      assert_equal ['%s', '%s', '%.1f'], format
+    end
+  end
 end