ruby · eightbitraptor · Mar 17, 2026 · Mar 17, 2026
diff --git a/lib/benchmark_runner.rb b/lib/benchmark_runner.rb
@@ -48,7 +48,7 @@ def write_csv(output_path, ruby_descriptions, table)
     end
 
     # Build output text string with metadata, table, and legend
-    def build_output_text(ruby_descriptions, table, format, bench_failures, include_rss: false, include_gc: false)
+    def build_output_text(ruby_descriptions, table, format, bench_failures, include_rss: false, include_gc: false, include_pvalue: false)
       base_name, *other_names = ruby_descriptions.keys
 
       output_str = +""
@@ -73,7 +73,9 @@ def build_output_text(ruby_descriptions, table, format, bench_failures, include_
             output_str << "- sweep #{base_name}/#{name}: ratio of GC sweeping time. Higher is better for #{name}. Above 1 represents faster sweeping.\n"
           end
         end
-        output_str << "- ***: p < 0.001, **: p < 0.01, *: p < 0.05 (Welch's t-test)\n"
+        if include_pvalue
+          output_str << "- ***: p < 0.001, **: p < 0.01, *: p < 0.05 (Welch's t-test)\n"
+        end
       end
 
       output_str

diff --git a/lib/benchmark_runner/cli.rb b/lib/benchmark_runner/cli.rb
@@ -109,7 +109,7 @@ def run
       BenchmarkRunner.write_csv(output_path, ruby_descriptions, table)
 
       # Save the output in a text file that we can easily refer to
-      output_str = BenchmarkRunner.build_output_text(ruby_descriptions, table, format, bench_failures, include_rss: args.rss, include_gc: builder.include_gc?)
+      output_str = BenchmarkRunner.build_output_text(ruby_descriptions, table, format, bench_failures, include_rss: args.rss, include_gc: builder.include_gc?, include_pvalue: args.pvalue)
       out_txt_path = output_path + ".txt"
       File.open(out_txt_path, "w") { |f| f.write output_str }
 

diff --git a/lib/results_table_builder.rb b/lib/results_table_builder.rb
@@ -174,7 +174,7 @@ def build_ratio_columns(row, base_t0, other_t0s, base_t, other_ts)
     row.concat(ratio_1sts)
 
     other_ts.each do |other_t|
-      pval = Stats.welch_p_value(base_t, other_t)
+      pval = @include_pvalue ? Stats.welch_p_value(base_t, other_t) : nil
       row << format_ratio(mean(base_t) / mean(other_t), pval)
       if @include_pvalue
         row << format_p_value(pval)
@@ -207,7 +207,7 @@ def gc_ratio(base, other)
         mean(other) == 0.0
       return "N/A"
     end
-    pval = Stats.welch_p_value(base, other)
+    pval = @include_pvalue ? Stats.welch_p_value(base, other) : nil
     format_ratio(mean(base) / mean(other), pval)
   end
 

diff --git a/test/benchmark_runner_test.rb b/test/benchmark_runner_test.rb
@@ -387,6 +387,25 @@
       assert_includes result, 'Legend:'
       assert_includes result, '- ruby-yjit 1st itr: ratio of ruby-base/ruby-yjit time for the first benchmarking iteration.'
       assert_includes result, '- ruby-base/ruby-yjit: ratio of ruby-base/ruby-yjit time. Higher is better for ruby-yjit. Above 1 represents a speedup.'
+      refute_includes result, "p < 0.001"
+    end
+
+    it 'includes p-value legend when include_pvalue is true' do
+      ruby_descriptions = {
+        'ruby-base' => 'ruby 3.3.0',
+        'ruby-yjit' => 'ruby 3.3.0 +YJIT'
+      }
+      table = [
+        ['bench', 'ruby-base (ms)', 'stddev (%)', 'ruby-yjit (ms)', 'stddev (%)'],
+        ['fib', '100.0', '5.0', '50.0', '3.0']
+      ]
+      format = ['%s', '%.1f', '%.1f', '%.1f', '%.1f']
+      bench_failures = {}
+
+      result = BenchmarkRunner.build_output_text(
+        ruby_descriptions, table, format, bench_failures, include_pvalue: true
+      )
+
       assert_includes result, "- ***: p < 0.001, **: p < 0.01, *: p < 0.05 (Welch's t-test)"
     end
 

diff --git a/test/results_table_builder_test.rb b/test/results_table_builder_test.rb
@@ -425,7 +425,7 @@
       assert_equal '', table[1].last
     end
 
-    it 'always shows significance symbol but omits verbose columns without --pvalue' do
+    it 'omits significance symbols and p-value columns without --pvalue' do
       executable_names = ['ruby', 'ruby-yjit']
       bench_data = {
         'ruby' => {
@@ -452,7 +452,41 @@
       table, _format = builder.build
       refute_includes table[0], 'p-value'
       refute_includes table[0], 'sig'
-      assert_match(/\(\*{1,3}\)$/, table[1].last)
+      ratio_cell = table[1].last
+      refute_match(/\*/, ratio_cell)
+      assert_match(/\A\d+\.\d+\s*\z/, ratio_cell)
+    end
+
+    it 'shows significance symbols and p-value columns with --pvalue' do
+      executable_names = ['ruby', 'ruby-yjit']
+      bench_data = {
+        'ruby' => {
+          'fib' => {
+            'warmup' => [0.1],
+            'bench' => [0.100, 0.101, 0.099],
+            'rss' => 1024 * 1024 * 10
+          }
+        },
+        'ruby-yjit' => {
+          'fib' => {
+            'warmup' => [0.05],
+            'bench' => [0.050, 0.051, 0.049],
+            'rss' => 1024 * 1024 * 12
+          }
+        }
+      }
+
+      builder = ResultsTableBuilder.new(
+        executable_names: executable_names,
+        bench_data: bench_data,
+        include_pvalue: true
+      )
+
+      table, _format = builder.build
+      assert_includes table[0], 'p-value'
+      assert_includes table[0], 'sig'
+      ratio_col_idx = table[0].index('ruby/ruby-yjit')
+      assert_match(/\(\*{1,3}\)/, table[1][ratio_col_idx])
     end
 
     it 'handles only headline benchmarks' do