Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
2039b99
bench: add sqlstorm module skeleton and query loader
mprammer May 27, 2026
96b4336
bench: add SqlstormBenchmark + registration (TPC-DS-shaped)
mprammer May 27, 2026
b1cf8c7
bench: implement StackOverflow dba data generation for SQLStorm
mprammer May 27, 2026
979bbcd
bench-orchestrator: register sqlstorm benchmark
mprammer May 27, 2026
285d2d1
bench: make sqlstorm pattern origin-aware for tpch shards
mprammer May 27, 2026
b72c665
bench: vendor 100 confirmed SQLStorm queries (25 per origin)
mprammer May 27, 2026
f950866
bench: self-contained sqlstorm data-gen (job + tpch/tpcds delegation)
mprammer May 27, 2026
ca56b53
ci: run sqlstorm in nightly benchmarks (overnight only)
mprammer May 27, 2026
35dbe4b
bench: vendor 500 SQLStorm queries (125/origin) + full pass/fail log
mprammer May 27, 2026
47cea5a
bench: sqlstorm cleanup and tpch q5862 -> q1261 swap
mprammer May 28, 2026
7af6478
bench: inline sqlstorm SO DDL and route JOB extraction through locate…
mprammer May 28, 2026
02f1e96
bench: unify sqlstorm data-gen behind generate_origin + add drift guards
mprammer May 29, 2026
d741723
bench: add sqlstorm README, drop the queries.csv provenance log
mprammer May 29, 2026
54a6fb9
bench: resample sqlstorm queries to short random samples
mprammer May 29, 2026
62bfe29
bench: document that sqlstorm has no scale factor
mprammer May 29, 2026
732d996
bench: run sqlstorm TPC-H/TPC-DS origins at SF10
mprammer May 29, 2026
5ec0d27
bench: switch sqlstorm StackOverflow origin to the math (12GB) tier
mprammer May 29, 2026
fe5b7bb
bench: tolerate non-RFC-compliant quoting in sqlstorm StackOverflow m…
mprammer May 29, 2026
f8dc1bd
bench: re-curate sqlstorm tpch/tpcds/stackoverflow queries for the ne…
mprammer May 29, 2026
320a071
bench: document sqlstorm scale-up (TPC SF10, StackOverflow math)
mprammer May 29, 2026
6faf2cc
Merge origin/develop into mp/benchmark-sqlstorm
mprammer May 29, 2026
d566d21
bench: fix rustdoc private-intra-doc-link errors in sqlstorm data docs
mprammer May 29, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
80 changes: 80 additions & 0 deletions .github/workflows/nightly-bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,86 @@ jobs:
{"engine": "duckdb", "format": "vortex"}
],
"scale_factor": "100.0"
},
{
"id": "sqlstorm-stackoverflow-nvme",
"subcommand": "sqlstorm",
"name": "SQLStorm (stackoverflow) on NVME",
"origin": "stackoverflow",
"data_formats": ["parquet", "vortex"],
"pr_targets": [
{"engine": "datafusion", "format": "parquet"},
{"engine": "datafusion", "format": "vortex"},
{"engine": "duckdb", "format": "parquet"},
{"engine": "duckdb", "format": "vortex"}
],
"develop_targets": [
{"engine": "datafusion", "format": "parquet"},
{"engine": "datafusion", "format": "vortex"},
{"engine": "duckdb", "format": "parquet"},
{"engine": "duckdb", "format": "vortex"}
],
"iterations": "3"
},
{
"id": "sqlstorm-job-nvme",
"subcommand": "sqlstorm",
"name": "SQLStorm (job) on NVME",
"origin": "job",
"data_formats": ["parquet", "vortex"],
"pr_targets": [
{"engine": "datafusion", "format": "parquet"},
{"engine": "datafusion", "format": "vortex"},
{"engine": "duckdb", "format": "parquet"},
{"engine": "duckdb", "format": "vortex"}
],
"develop_targets": [
{"engine": "datafusion", "format": "parquet"},
{"engine": "datafusion", "format": "vortex"},
{"engine": "duckdb", "format": "parquet"},
{"engine": "duckdb", "format": "vortex"}
],
"iterations": "3"
},
{
"id": "sqlstorm-tpch-nvme",
"subcommand": "sqlstorm",
"name": "SQLStorm (tpch) on NVME",
"origin": "tpch",
"data_formats": ["parquet", "vortex"],
"pr_targets": [
{"engine": "datafusion", "format": "parquet"},
{"engine": "datafusion", "format": "vortex"},
{"engine": "duckdb", "format": "parquet"},
{"engine": "duckdb", "format": "vortex"}
],
"develop_targets": [
{"engine": "datafusion", "format": "parquet"},
{"engine": "datafusion", "format": "vortex"},
{"engine": "duckdb", "format": "parquet"},
{"engine": "duckdb", "format": "vortex"}
],
"iterations": "3"
},
{
"id": "sqlstorm-tpcds-nvme",
"subcommand": "sqlstorm",
"name": "SQLStorm (tpcds) on NVME",
"origin": "tpcds",
"data_formats": ["parquet", "vortex"],
"pr_targets": [
{"engine": "datafusion", "format": "parquet"},
{"engine": "datafusion", "format": "vortex"},
{"engine": "duckdb", "format": "parquet"},
{"engine": "duckdb", "format": "vortex"}
],
"develop_targets": [
{"engine": "datafusion", "format": "parquet"},
{"engine": "datafusion", "format": "vortex"},
{"engine": "duckdb", "format": "parquet"},
{"engine": "duckdb", "format": "vortex"}
],
"iterations": "3"
}
]
strategy:
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/sql-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ jobs:
run: |
uv run --project bench-orchestrator vx-bench prepare-data "${{ matrix.subcommand }}" \
--formats-json '${{ toJSON(matrix.data_formats) }}' \
${{ matrix.origin && format('--opt origin={0}', matrix.origin) || '' }} \
${{ matrix.scale_factor && format('--opt scale-factor={0}', matrix.scale_factor) || '' }}

- name: Setup AWS CLI
Expand Down Expand Up @@ -403,6 +404,7 @@ jobs:
--no-build \
--runner "ec2_${{ inputs.machine_type }}" \
${{ matrix.iterations && format('--iterations {0}', matrix.iterations) || '' }} \
${{ matrix.origin && format('--opt origin={0}', matrix.origin) || '' }} \
${{ matrix.scale_factor && format('--opt scale-factor={0}', matrix.scale_factor) || '' }}

- name: Run ${{ matrix.name }} benchmark (remote)
Expand All @@ -424,6 +426,7 @@ jobs:
--runner "ec2_${{ inputs.machine_type }}" \
${{ matrix.iterations && format('--iterations {0}', matrix.iterations) || '' }} \
--opt remote-data-dir=${{ matrix.remote_storage }} \
${{ matrix.origin && format('--opt origin={0}', matrix.origin) || '' }} \
${{ matrix.scale_factor && format('--opt scale-factor={0}', matrix.scale_factor) || '' }}

- name: Capture file sizes
Expand Down
2 changes: 1 addition & 1 deletion bench-orchestrator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ vx-bench run <benchmark> [options]

**Arguments:**

- `benchmark`: Benchmark suite to run (`appian`, `tpch`, `tpcds`, `clickbench`, `fineweb`, `gh-archive`, `polarsignals`, `public-bi`, `statpopgen`)
- `benchmark`: Benchmark suite to run (`appian`, `tpch`, `tpcds`, `clickbench`, `fineweb`, `gh-archive`, `polarsignals`, `public-bi`, `statpopgen`, `sqlstorm`)

**Options:**

Expand Down
1 change: 1 addition & 0 deletions bench-orchestrator/bench_orchestrator/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class Benchmark(Enum):
POLARSIGNALS = "polarsignals"
PUBLIC_BI = "public-bi"
STATPOPGEN = "statpopgen"
SQLSTORM = "sqlstorm"


# Engine to supported formats mapping.
Expand Down
86 changes: 86 additions & 0 deletions vortex-bench/sqlstorm/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# vortex-bench SQLStorm queries

[SQLStorm] is an LLM-generated SQL stress suite — ~62k queries across four
schemas, broad enough to exercise SQL surface that TPC-H and TPC-DS don't.
This directory holds a small, confirmed-working sample (125 queries per
schema, 500 total) that the nightly bench runs against TPC-H and TPC-DS data
generated at SF10, plus two downloaded datasets (StackOverflow and IMDB/JOB)
for the non-TPC schemas. Queries are pinned at SHA
[`b3bb0b96794a6afe9bb8f3ff2b243562b779c40d`][pinned-sqlstorm].

[SQLStorm]: https://github.com/SQL-Storm/SQLStorm
[pinned-sqlstorm]: https://github.com/SQL-Storm/SQLStorm/tree/b3bb0b96794a6afe9bb8f3ff2b243562b779c40d

## Layout

- `<origin>/<id>.sql` — 125 queries per origin, 4 origins, 500 total.
`<id>` is the upstream SQLStorm query id (sparse, non-sequential).

| Origin | Source data | Upstream SQLStorm dir |
| --- | --- | --- |
| `tpch` | TPC-H generated at SF10 (`data/tpch/10.0/`) | `v1.0/tpch/` |
| `tpcds` | TPC-DS generated at SF10 (`data/tpcds/10.0/`) | `v1.0/tpcds/` |
| `stackoverflow` | `stackoverflow_math.tar.gz` (~12 GB) from `db.in.tum.de` | `v1.0/stackoverflow/` |
| `job` | `imdb.tzst` from `db.in.tum.de` | `v1.0/job/` |

The benchmark runs in strict mode — a query failure aborts the run rather than
silently dropping a row, so any regression that breaks a query in the nightly
run is loud. The vendored set was curated to be the intersection of queries
that pass DuckDB and DataFusion against the source data; that is why a
small, confirmed-working sample lives in-tree and the full ~62k SQLStorm
corpus does not.

## Data size (fixed scale)

**There is no SQLStorm scale factor.** Each origin runs at a single fixed
size, and `vx-bench run sqlstorm` does **not** read `--opt scale-factor` —
passing one is silently ignored (it is not an error and changes nothing). The
four origins are sized to sit within the same order of magnitude as JOB:

| Origin | Fixed size | ~Rows (all tables) | ~Parquet |
| --- | --- | --- | --- |
| `stackoverflow` | the `math` tier, ~12 GB | 40 M | 6.1 GB |
| `job` | the full IMDB/JOB snapshot (fixed real dataset) | 74 M | 1.7 GB |
| `tpch` | SF 10 | 87 M | 3.5 GB |
| `tpcds` | SF 10 | 192 M | 3.9 GB |

This mirrors upstream: SQLStorm has no uniform scale knob either. OLAPBench
(the canonical runner) selects size *per origin* — StackOverflow ships at
0 / 1 GB (`dba`) / 12 GB (`math`) / 222 GB, TPC-H/TPC-DS scale via their own
generators, and JOB is fixed. Query *validity* is scale-independent; only row
counts change with size. The fixed points above are set in code — the TPC
scale by `SQLSTORM_TPC_SCALE_FACTOR` (`sqlstorm_benchmark.rs`) and the
StackOverflow tier by the `STACKOVERFLOW` recipe's tarball URL (`data.rs`) —
so changing them means editing those consts (and re-curating, since the
vendored queries are selected to stay short at the configured scale), not
passing a runtime scale factor.

## Refreshing the vendored set

Swaps happen by hand against the pinned SHA above: clone the SQLStorm
corpus at that SHA, pick candidates from `v1.0/<origin>/queries/`, and
verify each runs cleanly on both DuckDB and DataFusion **at the configured
scale** (SF10 / `math`) before vendoring. Candidates must also stay short
— the vendored set is curated to keep each query under ~5 s/engine at scale
so the nightly stays bounded; drop anything slower and refill. One gotcha:
verify against the bench's own DataFusion `SessionContext`, **not**
`datafusion-cli` — the cli decorrelates more subqueries than the harness can
physically plan and reports false-positive passes on queries the harness then
can't actually run.

## Running

The four origins are nightly-only matrix entries in
`.github/workflows/nightly-bench.yml`. Locally:

```
vx-bench run sqlstorm --opt origin=tpch # tpch | tpcds | stackoverflow | job
```

TPC-H / TPC-DS generate their own SF10 datasets under
`vortex-bench/data/tpch/10.0/` and `vortex-bench/data/tpcds/10.0/` (separate
from the data used by the standalone SF1 benchmarks). StackOverflow / JOB download and
convert their upstream tarballs to Parquet under
`vortex-bench/data/sqlstorm/<origin>/parquet/` on first run (idempotent via a
`.success` marker). The StackOverflow `math` tarball is ~12 GB and needs
~30 GB of scratch to extract and load.
24 changes: 24 additions & 0 deletions vortex-bench/sqlstorm/job/10088.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
-- SQLStorm JOB query 10088.
-- Joins aka_name to cast_info on person_id, then to aka_title, movie_info,
-- movie_keyword, keyword and comp_cast_type, keeping only aka_title rows
-- with production_year = 2020. Output is ordered by name, then title.
-- NOTE(review): movie_info and movie_keyword are matched on aka_title.id
-- (not aka_title.movie_id), and cast_info.role_id is matched against
-- comp_cast_type.id. Presumably intentional in the generated upstream
-- query - confirm before changing.
SELECT
    an.name     AS actor_name,
    akt.title   AS movie_title,
    cct.kind    AS cast_type,
    mi.info     AS movie_info,
    kw.keyword  AS movie_keyword
FROM aka_name AS an
INNER JOIN cast_info AS ci
    ON an.person_id = ci.person_id
INNER JOIN aka_title AS akt
    ON ci.movie_id = akt.movie_id
INNER JOIN movie_info AS mi
    ON akt.id = mi.movie_id
INNER JOIN movie_keyword AS mk
    ON akt.id = mk.movie_id
INNER JOIN keyword AS kw
    ON mk.keyword_id = kw.id
INNER JOIN comp_cast_type AS cct
    ON ci.role_id = cct.id
WHERE akt.production_year = 2020
ORDER BY
    an.name,
    akt.title;
29 changes: 29 additions & 0 deletions vortex-bench/sqlstorm/job/10166.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
-- SQLStorm JOB query 10166.
-- Joins aka_name to cast_info on person_id, then to title, name, role_type,
-- movie_info, movie_keyword and keyword, keeping only titles with
-- production_year = 2020. Output is ordered by title, then cast order.
-- NOTE(review): name is matched via aka_name.person_id = name.imdb_id.
-- Presumably intentional in the generated upstream query - confirm before
-- changing.
SELECT
    an.name      AS aka_name,
    ttl.title    AS movie_title,
    ci.note      AS cast_note,
    ci.nr_order  AS cast_order,
    nm.name      AS person_name,
    rt.role      AS role,
    mi.info      AS movie_info,
    kw.keyword   AS movie_keyword
FROM aka_name AS an
INNER JOIN cast_info AS ci
    ON an.person_id = ci.person_id
INNER JOIN title AS ttl
    ON ci.movie_id = ttl.id
INNER JOIN name AS nm
    ON an.person_id = nm.imdb_id
INNER JOIN role_type AS rt
    ON ci.role_id = rt.id
INNER JOIN movie_info AS mi
    ON ttl.id = mi.movie_id
INNER JOIN movie_keyword AS mk
    ON ttl.id = mk.movie_id
INNER JOIN keyword AS kw
    ON mk.keyword_id = kw.id
WHERE ttl.production_year = 2020
ORDER BY
    ttl.title,
    ci.nr_order;
24 changes: 24 additions & 0 deletions vortex-bench/sqlstorm/job/10176.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
-- SQLStorm JOB query 10176.
-- Joins aka_name to cast_info on person_id, then to aka_title,
-- movie_companies, company_name, movie_keyword and keyword, keeping
-- aka_title rows with production_year >= 2000. Output is ordered by
-- production_year, newest first.
-- NOTE(review): movie_companies and movie_keyword are matched on
-- aka_title.id rather than aka_title.movie_id. Presumably intentional in the
-- generated upstream query - confirm before changing.
SELECT
    an.name     AS aka_name,
    akt.title   AS movie_title,
    ci.note     AS cast_note,
    cn.name     AS company_name,
    kw.keyword  AS movie_keyword
FROM aka_name AS an
INNER JOIN cast_info AS ci
    ON an.person_id = ci.person_id
INNER JOIN aka_title AS akt
    ON ci.movie_id = akt.movie_id
INNER JOIN movie_companies AS mc
    ON akt.id = mc.movie_id
INNER JOIN company_name AS cn
    ON mc.company_id = cn.id
INNER JOIN movie_keyword AS mk
    ON akt.id = mk.movie_id
INNER JOIN keyword AS kw
    ON mk.keyword_id = kw.id
WHERE akt.production_year >= 2000
ORDER BY
    akt.production_year DESC;
25 changes: 25 additions & 0 deletions vortex-bench/sqlstorm/job/10194.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
-- SQLStorm JOB query 10194.
-- Joins aka_name to cast_info on person_id, then to title, name,
-- person_info, movie_keyword and keyword, keeping titles with
-- production_year >= 2000. Output is ordered by production_year (newest
-- first), then cast order.
-- NOTE(review): name is matched via cast_info.person_id = name.imdb_id while
-- person_info is matched via name.id. Presumably intentional in the
-- generated upstream query - confirm before changing.
SELECT
    an.name      AS aka_name,
    ttl.title    AS movie_title,
    ci.nr_order  AS cast_order,
    nm.name      AS person_name,
    pinf.info    AS person_info,
    kw.keyword   AS movie_keyword
FROM aka_name AS an
INNER JOIN cast_info AS ci
    ON an.person_id = ci.person_id
INNER JOIN title AS ttl
    ON ci.movie_id = ttl.id
INNER JOIN name AS nm
    ON ci.person_id = nm.imdb_id
INNER JOIN person_info AS pinf
    ON nm.id = pinf.person_id
INNER JOIN movie_keyword AS mk
    ON ttl.id = mk.movie_id
INNER JOIN keyword AS kw
    ON mk.keyword_id = kw.id
WHERE ttl.production_year >= 2000
ORDER BY
    ttl.production_year DESC,
    ci.nr_order;
22 changes: 22 additions & 0 deletions vortex-bench/sqlstorm/job/10228.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
-- SQLStorm JOB query 10228.
-- Joins aka_name to cast_info on person_id, then to aka_title,
-- comp_cast_type, movie_keyword and keyword, keeping aka_title rows with
-- production_year > 2000. Output is ordered by production_year (newest
-- first), then name.
-- The alias m is kept as-is because m.production_year is projected without
-- an output alias.
-- NOTE(review): cast_info.movie_id is matched against aka_title.id and
-- cast_info.person_role_id against comp_cast_type.id. Presumably intentional
-- in the generated upstream query - confirm before changing.
SELECT
    an.name     AS actor_name,
    m.title     AS movie_title,
    m.production_year,
    cct.kind    AS cast_type,
    kw.keyword  AS movie_keyword
FROM aka_name AS an
INNER JOIN cast_info AS ci
    ON an.person_id = ci.person_id
INNER JOIN aka_title AS m
    ON ci.movie_id = m.id
INNER JOIN comp_cast_type AS cct
    ON ci.person_role_id = cct.id
INNER JOIN movie_keyword AS mk
    ON m.id = mk.movie_id
INNER JOIN keyword AS kw
    ON mk.keyword_id = kw.id
WHERE m.production_year > 2000
ORDER BY
    m.production_year DESC,
    an.name;
32 changes: 32 additions & 0 deletions vortex-bench/sqlstorm/job/10231.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
-- SQLStorm JOB query 10231.
-- Starts from title and joins cast_info, aka_name, comp_cast_type,
-- movie_companies, company_name, movie_keyword, keyword, movie_info and
-- info_type. Keeps titles with production_year >= 2000 whose company has
-- country_code = 'USA'. Output is ordered by title, then name.
-- The aliases t, k and i are kept as-is because their columns are projected
-- without output aliases.
-- NOTE(review): cast_info.role_id is matched against comp_cast_type.id.
-- Presumably intentional in the generated upstream query - confirm before
-- changing.
SELECT
    t.title,
    an.name   AS actor_name,
    cct.kind  AS comp_cast_type,
    cn.name   AS company_name,
    k.keyword,
    i.info
FROM title AS t
INNER JOIN cast_info AS ci
    ON t.id = ci.movie_id
INNER JOIN aka_name AS an
    ON ci.person_id = an.person_id
INNER JOIN comp_cast_type AS cct
    ON ci.role_id = cct.id
INNER JOIN movie_companies AS mc
    ON t.id = mc.movie_id
INNER JOIN company_name AS cn
    ON mc.company_id = cn.id
INNER JOIN movie_keyword AS mk
    ON t.id = mk.movie_id
INNER JOIN keyword AS k
    ON mk.keyword_id = k.id
INNER JOIN movie_info AS mi
    ON t.id = mi.movie_id
INNER JOIN info_type AS i
    ON mi.info_type_id = i.id
WHERE t.production_year >= 2000
  AND cn.country_code = 'USA'
ORDER BY
    t.title,
    an.name;
22 changes: 22 additions & 0 deletions vortex-bench/sqlstorm/job/10489.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
-- SQLStorm JOB query 10489.
-- Joins aka_name to cast_info on person_id, then to title, movie_companies,
-- company_name and role_type, keeping only titles with
-- production_year = 2022. Output is ordered by title, then name.
SELECT
    an.name    AS aka_name,
    ttl.title  AS movie_title,
    ci.note    AS cast_note,
    rt.role    AS person_role,
    cn.name    AS company_name
FROM aka_name AS an
INNER JOIN cast_info AS ci
    ON an.person_id = ci.person_id
INNER JOIN title AS ttl
    ON ci.movie_id = ttl.id
INNER JOIN movie_companies AS mc
    ON ttl.id = mc.movie_id
INNER JOIN company_name AS cn
    ON mc.company_id = cn.id
INNER JOIN role_type AS rt
    ON ci.role_id = rt.id
WHERE ttl.production_year = 2022
ORDER BY
    ttl.title,
    an.name;
Loading
Loading