From 55c1d3a449b4b89b546e57aed166a3b81178abe2 Mon Sep 17 00:00:00 2001 From: Matt Valentine-House Date: Wed, 10 Jun 2026 11:12:38 +0100 Subject: [PATCH] Sample RSS across the benchmark window The harness took a single RSS snapshot after a forced full GC, so the RSS column carried no variance and small differences read as signal when they were only noise. Sample RSS once per iteration across the bench window (default, gc and warmup harnesses) and report it as mean +/- stddev% with a mean-based ratio. The raw per-iteration samples are stored under rss_samples in the JSON; the post-GC rss snapshot and lifetime maxrss are still recorded. --- README.md | 12 ++- harness-gc/harness.rb | 4 + harness-warmup/harness.rb | 6 +- harness/harness-common.rb | 12 +++ harness/harness.rb | 8 +- lib/results_table_builder.rb | 52 +++++++++++-- test/results_table_builder_test.rb | 121 +++++++++++++++++++++++++++++ 7 files changed, 206 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index bba0d1ef..a36d0955 100644 --- a/README.md +++ b/README.md @@ -284,12 +284,22 @@ after each iteration with the default harness. ## Measuring memory usage -`--rss` option of `run_benchmarks.rb` allows you to measure RSS after benchmark iterations. +`--rss` option of `run_benchmarks.rb` allows you to measure RSS (resident set size). ``` ./run_benchmarks.rb --rss ``` +The harness samples RSS once per iteration across the benchmarking window (after +warmup), so the `RSS (MiB)` column reports the mean working set during measurement +along with its iteration-to-iteration variability (`mean ± stddev%`), and the `RSS` ratio is +computed from those means. The raw per-iteration samples are stored in the JSON +output under `rss_samples` (bytes). + +For reference, the JSON output also keeps `rss`, a single snapshot taken after a +full GC at the end of the run (the retained set, a lower bound), and `maxrss`, the +process's lifetime peak from `getrusage`. 
+ ## Rendering a graph `--graph` option of `run_benchmarks.rb` allows you to render benchmark results as a graph. diff --git a/harness-gc/harness.rb b/harness-gc/harness.rb index 70c113a9..9398fdcf 100644 --- a/harness-gc/harness.rb +++ b/harness-gc/harness.rb @@ -33,6 +33,7 @@ def gc_stat_heap_delta(before, after) def run_benchmark(_num_itrs_hint, **, &block) times = [] + rss_samples = [] marking_times = [] sweeping_times = [] gc_counts = [] @@ -82,6 +83,7 @@ def run_benchmark(_num_itrs_hint, **, &block) puts itr_str times << time + rss_samples << get_rss marking_times << mark_delta sweeping_times << sweep_delta gc_counts << count_delta @@ -95,6 +97,8 @@ def run_benchmark(_num_itrs_hint, **, &block) bench_range = WARMUP_ITRS..-1 extra = {} + rss_bench = rss_samples[bench_range] || [] + extra["rss_samples"] = rss_bench unless rss_bench.empty? extra["gc_marking_time_warmup"] = marking_times[warmup_range] extra["gc_marking_time_bench"] = marking_times[bench_range] extra["gc_sweeping_time_warmup"] = sweeping_times[warmup_range] diff --git a/harness-warmup/harness.rb b/harness-warmup/harness.rb index 21045227..5f1050cc 100644 --- a/harness-warmup/harness.rb +++ b/harness-warmup/harness.rb @@ -36,10 +36,12 @@ def print_stats(bench, elapsed) def run_benchmark(num_itrs_hint, **) start = monotonic_time times = [] + rss_samples = [] begin time = Benchmark.realtime { yield } times << time + rss_samples << get_rss stats = Stats.new(times) median = stats.median @@ -63,7 +65,9 @@ def run_benchmark(num_itrs_hint, **) end until times.size >= MIN_ITERS and elapsed >= MIN_TIME and mad <= threshold warmup, bench = times[0...times.size/2], times[times.size/2..-1] - return_results(warmup, bench) + rss_bench = rss_samples[times.size/2..-1] || [] + extra = rss_bench.empty? ? 
{} : { "rss_samples" => rss_bench } + return_results(warmup, bench, **extra) print_stats(bench, elapsed) end diff --git a/harness/harness-common.rb b/harness/harness-common.rb index 096036d0..31035310 100644 --- a/harness/harness-common.rb +++ b/harness/harness-common.rb @@ -1,4 +1,5 @@ require 'rbconfig' +require_relative '../misc/stats' # Ensure the ruby in PATH is the ruby running this, so we can safely shell out to other commands ruby_in_path = `ruby -e 'print RbConfig.ruby'` @@ -214,6 +215,17 @@ def return_results(warmup_iterations, bench_iterations, **extra) puts "MAXRSS: %.1fMiB" % (maxrss / 1024.0 / 1024.0) end + rss_samples = ruby_bench_results["rss_samples"] + if rss_samples.is_a?(Array) && !rss_samples.empty? + mib = rss_samples.map { |bytes| bytes / 1024.0 / 1024.0 } + stats = Stats.new(mib) + median = stats.median + mad = stats.median_absolute_deviation(median) + puts "RSS sampled (n=%d): median %.1fMiB \u00b1 %.1fMiB (MAD), range [%.1f, %.1f]MiB" % [ + mib.size, median, mad, stats.min, stats.max + ] + end + write_json_file(ruby_bench_results) end diff --git a/harness/harness.rb b/harness/harness.rb index 7c3b79a4..2fac8c3f 100644 --- a/harness/harness.rb +++ b/harness/harness.rb @@ -34,6 +34,7 @@ def realtime # Takes a block as input def run_benchmark(_num_itrs_hint, **, &block) times = [] + rss_samples = [] total_time = 0 num_itrs = 0 header = "itr: time" @@ -75,10 +76,15 @@ def run_benchmark(_num_itrs_hint, **, &block) # We internally save the time in seconds to avoid loss of precision times << time total_time += time + # Sample current RSS between iterations (outside the timed block) so we can + # report the working set across the window with variance. + rss_samples << get_rss end until num_itrs >= WARMUP_ITRS + MIN_BENCH_ITRS and total_time >= MIN_BENCH_TIME warmup, bench = times[0...WARMUP_ITRS], times[WARMUP_ITRS..-1] - return_results(warmup, bench) + rss_bench = rss_samples[WARMUP_ITRS..-1] || [] + extra = rss_bench.empty? ? 
{} : { "rss_samples" => rss_bench } + return_results(warmup, bench, **extra) non_warmups = times[WARMUP_ITRS..-1] if non_warmups.size > 1 diff --git a/lib/results_table_builder.rb b/lib/results_table_builder.rb index 32556016..d7ac004d 100644 --- a/lib/results_table_builder.rb +++ b/lib/results_table_builder.rb @@ -12,6 +12,7 @@ def initialize(executable_names:, bench_data:, include_rss: false, include_pvalu @include_pvalue = include_pvalue @zjit_stats = zjit_stats || [] @include_gc = detect_gc_data(bench_data) + @rss_has_samples = @include_rss && detect_rss_samples(bench_data) @base_name = executable_names.first @other_names = executable_names[1..] @bench_names = compute_bench_names @@ -86,7 +87,7 @@ def build_format @executable_names.each do |_name| format << "%s" - format << "%.1f" if @include_rss + format << (@rss_has_samples ? "%s" : "%.1f") if @include_rss @zjit_stats.each { format << "%s" } if @include_gc format << "%s" @@ -125,11 +126,15 @@ def build_row(bench_name) t0s = extract_first_iteration_times(bench_name) times_no_warmup = extract_benchmark_times(bench_name) rsss = extract_rss_values(bench_name) + rss_series = @rss_has_samples ? extract_rss_series(bench_name) : nil base_t0, *other_t0s = t0s base_t, *other_ts = times_no_warmup base_rss, *other_rsss = rsss + base_rss_cell = rss_cell(base_rss, rss_series && rss_series[0]) + other_rss_cells = other_rsss.each_index.map { |i| rss_cell(other_rsss[i], rss_series && rss_series[i + 1]) } + # Extract zjit stats: { stat_name => [base_val, other1_val, ...] 
} zjit_stat_values = @zjit_stats.map do |stat| [stat, extract_zjit_stat(bench_name, stat)] @@ -143,8 +148,8 @@ def build_row(bench_name) end row = [bench_name] - build_base_columns(row, base_t, base_rss, zjit_stat_values, 0, base_mark, base_sweep) - build_comparison_columns(row, other_ts, other_rsss, zjit_stat_values, other_marks, other_sweeps) + build_base_columns(row, base_t, base_rss_cell, zjit_stat_values, 0, base_mark, base_sweep) + build_comparison_columns(row, other_ts, other_rss_cells, zjit_stat_values, other_marks, other_sweeps) build_ratio_columns(row, base_t0, other_t0s, base_t, other_ts) build_rss_ratio_columns(row, base_rss, other_rsss) build_gc_ratio_columns(row, base_mark, other_marks, base_sweep, other_sweeps) @@ -162,10 +167,10 @@ def build_base_columns(row, base_t, base_rss, zjit_stat_values, exe_index, base_ end end - def build_comparison_columns(row, other_ts, other_rsss, zjit_stat_values, other_marks, other_sweeps) + def build_comparison_columns(row, other_ts, other_rss_cells, zjit_stat_values, other_marks, other_sweeps) other_ts.each_with_index do |other_t, i| row << format_time_with_stddev(other_t) - row << other_rsss[i] if @include_rss + row << other_rss_cells[i] if @include_rss zjit_stat_values.each { |_stat, values| row << format_stat(values[i + 1]) } if @include_gc row << format_time_with_stddev(other_marks[i]) @@ -283,9 +288,38 @@ def extract_benchmark_times(bench_name) end end + # Numeric RSS (MiB) per executable, used for the RSS ratio. When per-iteration + # samples are present we use their mean so the ratio matches the displayed value. def extract_rss_values(bench_name) @executable_names.map do |name| - bench_data_for(name, bench_name)['rss'] / BYTES_TO_MIB + data = bench_data_for(name, bench_name) + samples = data['rss_samples'] + if samples.is_a?(Array) && !samples.empty? 
+ mean(samples) / BYTES_TO_MIB + else + data['rss'] / BYTES_TO_MIB + end + end + end + + # Per-iteration RSS samples (MiB) per executable, or nil when a run lacks them. + def extract_rss_series(bench_name) + @executable_names.map do |name| + samples = bench_data_for(name, bench_name)['rss_samples'] + next nil unless samples.is_a?(Array) && !samples.empty? + samples.map { |bytes| bytes / BYTES_TO_MIB } + end + end + + # Display value for an RSS column: mean ± stddev% when samples exist (matching + # the timing columns), otherwise a plain MiB value. Returns a Float when no run + # in the suite has samples, preserving the legacy "%.1f" formatting. + def rss_cell(mean_value, series) + return mean_value unless @rss_has_samples + if series && !series.empty? + format_time_with_stddev(series) + else + "%.1f" % mean_value end end @@ -305,6 +339,12 @@ def detect_gc_data(bench_data) bench_data.values.any? { |benchmarks| benchmarks.values.any? { |d| d.is_a?(Hash) && d.key?('gc_marking_time_bench') } } end + def detect_rss_samples(bench_data) + bench_data.values.any? do |benchmarks| + benchmarks.values.any? { |d| d.is_a?(Hash) && d['rss_samples'].is_a?(Array) && !d['rss_samples'].empty? 
} + end + end + def bench_data_for(name, bench_name) @bench_data[name][bench_name] end diff --git a/test/results_table_builder_test.rb b/test/results_table_builder_test.rb index a79d5288..303b84f5 100644 --- a/test/results_table_builder_test.rb +++ b/test/results_table_builder_test.rb @@ -549,4 +549,125 @@ assert_equal 'fib', bench_names[4] end end + + describe 'RSS sampling (rss_samples)' do + MIB = 1024 * 1024 + + it 'shows mean ± stddev% and uses %s format when samples are present' do + bench_data = { + 'ruby' => { + 'fib' => { + 'warmup' => [0.1], + 'bench' => [0.1, 0.1, 0.1], + 'rss' => 10 * MIB, + 'rss_samples' => [9 * MIB, 10 * MIB, 11 * MIB] + } + } + } + + builder = ResultsTableBuilder.new( + executable_names: ['ruby'], + bench_data: bench_data, + include_rss: true + ) + + table, format = builder.build + + assert_equal ['bench', 'ruby (ms)', 'RSS (MiB)'], table[0] + assert_equal ['%s', '%s', '%s'], format + + m = table[1][2].match(/\A(\d+\.\d) ± (\d+\.\d)%\z/) + assert m, "expected mean ± stddev%, got #{table[1][2].inspect}" + assert_in_delta 10.0, m[1].to_f, 0.1 + assert_operator m[2].to_f, :>, 0.0 + end + + it 'computes the RSS ratio from the mean of samples' do + bench_data = { + 'ruby' => { + 'fib' => { + 'warmup' => [0.1], + 'bench' => [0.1, 0.1, 0.1], + 'rss' => 99 * MIB, # should be ignored in favour of samples + 'rss_samples' => [10 * MIB, 10 * MIB, 10 * MIB] + } + }, + 'ruby-yjit' => { + 'fib' => { + 'warmup' => [0.05], + 'bench' => [0.05, 0.05, 0.05], + 'rss' => 1 * MIB, + 'rss_samples' => [18 * MIB, 20 * MIB, 22 * MIB] + } + } + } + + builder = ResultsTableBuilder.new( + executable_names: ['ruby', 'ruby-yjit'], + bench_data: bench_data, + include_rss: true + ) + + table, _format = builder.build + + # ratio = mean(ruby samples) / mean(yjit samples) = 10 / 20 = 0.5 + assert_in_delta 0.5, table[1].last, 0.001 + end + + it 'falls back to a plain MiB value for runs without samples in a mixed suite' do + bench_data = { + 'ruby' => { + 'fib' => { + 
'warmup' => [0.1], + 'bench' => [0.1, 0.1], + 'rss' => 10 * MIB, + 'rss_samples' => [10 * MIB, 10 * MIB] + }, + 'loop' => { + 'warmup' => [0.2], + 'bench' => [0.2, 0.2], + 'rss' => 15 * MIB + # no rss_samples for this benchmark + } + } + } + + builder = ResultsTableBuilder.new( + executable_names: ['ruby'], + bench_data: bench_data, + include_rss: true + ) + + table, format = builder.build + + # Suite has samples somewhere, so the RSS column is string-formatted. + assert_equal ['%s', '%s', '%s'], format + + rows = table[1..].each_with_object({}) { |row, h| h[row[0]] = row } + assert_match(/\A\d+\.\d ± \d+\.\d%\z/, rows['fib'][2]) + # The sample-less benchmark still renders as a bare MiB value. + assert_equal '15.0', rows['loop'][2] + end + + it 'keeps %.1f formatting when no run in the suite has samples' do + bench_data = { + 'ruby' => { + 'fib' => { + 'warmup' => [0.1], + 'bench' => [0.1], + 'rss' => 10 * MIB + } + } + } + + builder = ResultsTableBuilder.new( + executable_names: ['ruby'], + bench_data: bench_data, + include_rss: true + ) + + _table, format = builder.build + assert_equal ['%s', '%s', '%.1f'], format + end + end end