vortex-data · mprammer · May 27, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/.github/workflows/nightly-bench.yml b/.github/workflows/nightly-bench.yml
@@ -64,6 +64,86 @@ jobs:
               {"engine": "duckdb", "format": "vortex"}
             ],
             "scale_factor": "100.0"
+          },
+          {
+            "id": "sqlstorm-stackoverflow-nvme",
+            "subcommand": "sqlstorm",
+            "name": "SQLStorm (stackoverflow) on NVME",
+            "origin": "stackoverflow",
+            "data_formats": ["parquet", "vortex"],
+            "pr_targets": [
+              {"engine": "datafusion", "format": "parquet"},
+              {"engine": "datafusion", "format": "vortex"},
+              {"engine": "duckdb", "format": "parquet"},
+              {"engine": "duckdb", "format": "vortex"}
+            ],
+            "develop_targets": [
+              {"engine": "datafusion", "format": "parquet"},
+              {"engine": "datafusion", "format": "vortex"},
+              {"engine": "duckdb", "format": "parquet"},
+              {"engine": "duckdb", "format": "vortex"}
+            ],
+            "iterations": "3"
+          },
+          {
+            "id": "sqlstorm-job-nvme",
+            "subcommand": "sqlstorm",
+            "name": "SQLStorm (job) on NVME",
+            "origin": "job",
+            "data_formats": ["parquet", "vortex"],
+            "pr_targets": [
+              {"engine": "datafusion", "format": "parquet"},
+              {"engine": "datafusion", "format": "vortex"},
+              {"engine": "duckdb", "format": "parquet"},
+              {"engine": "duckdb", "format": "vortex"}
+            ],
+            "develop_targets": [
+              {"engine": "datafusion", "format": "parquet"},
+              {"engine": "datafusion", "format": "vortex"},
+              {"engine": "duckdb", "format": "parquet"},
+              {"engine": "duckdb", "format": "vortex"}
+            ],
+            "iterations": "3"
+          },
+          {
+            "id": "sqlstorm-tpch-nvme",
+            "subcommand": "sqlstorm",
+            "name": "SQLStorm (tpch) on NVME",
+            "origin": "tpch",
+            "data_formats": ["parquet", "vortex"],
+            "pr_targets": [
+              {"engine": "datafusion", "format": "parquet"},
+              {"engine": "datafusion", "format": "vortex"},
+              {"engine": "duckdb", "format": "parquet"},
+              {"engine": "duckdb", "format": "vortex"}
+            ],
+            "develop_targets": [
+              {"engine": "datafusion", "format": "parquet"},
+              {"engine": "datafusion", "format": "vortex"},
+              {"engine": "duckdb", "format": "parquet"},
+              {"engine": "duckdb", "format": "vortex"}
+            ],
+            "iterations": "3"
+          },
+          {
+            "id": "sqlstorm-tpcds-nvme",
+            "subcommand": "sqlstorm",
+            "name": "SQLStorm (tpcds) on NVME",
+            "origin": "tpcds",
+            "data_formats": ["parquet", "vortex"],
+            "pr_targets": [
+              {"engine": "datafusion", "format": "parquet"},
+              {"engine": "datafusion", "format": "vortex"},
+              {"engine": "duckdb", "format": "parquet"},
+              {"engine": "duckdb", "format": "vortex"}
+            ],
+            "develop_targets": [
+              {"engine": "datafusion", "format": "parquet"},
+              {"engine": "datafusion", "format": "vortex"},
+              {"engine": "duckdb", "format": "parquet"},
+              {"engine": "duckdb", "format": "vortex"}
+            ],
+            "iterations": "3"
           }
         ]
     strategy:

diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml
@@ -358,6 +358,7 @@ jobs:
         run: |
           uv run --project bench-orchestrator vx-bench prepare-data "${{ matrix.subcommand }}" \
             --formats-json '${{ toJSON(matrix.data_formats) }}' \
+            ${{ matrix.origin && format('--opt origin={0}', matrix.origin) || '' }} \
             ${{ matrix.scale_factor && format('--opt scale-factor={0}', matrix.scale_factor) || '' }}
 
       - name: Setup AWS CLI
@@ -403,6 +404,7 @@ jobs:
             --no-build \
             --runner "ec2_${{ inputs.machine_type }}" \
             ${{ matrix.iterations && format('--iterations {0}', matrix.iterations) || '' }} \
+            ${{ matrix.origin && format('--opt origin={0}', matrix.origin) || '' }} \
             ${{ matrix.scale_factor && format('--opt scale-factor={0}', matrix.scale_factor) || '' }}
 
       - name: Run ${{ matrix.name }} benchmark (remote)
@@ -424,6 +426,7 @@ jobs:
             --runner "ec2_${{ inputs.machine_type }}" \
             ${{ matrix.iterations && format('--iterations {0}', matrix.iterations) || '' }} \
             --opt remote-data-dir=${{ matrix.remote_storage }} \
+            ${{ matrix.origin && format('--opt origin={0}', matrix.origin) || '' }} \
             ${{ matrix.scale_factor && format('--opt scale-factor={0}', matrix.scale_factor) || '' }}
 
       - name: Capture file sizes

diff --git a/bench-orchestrator/README.md b/bench-orchestrator/README.md
@@ -41,7 +41,7 @@ vx-bench run <benchmark> [options]
 
 **Arguments:**
 
-- `benchmark`: Benchmark suite to run (`appian`, `tpch`, `tpcds`, `clickbench`, `fineweb`, `gh-archive`, `polarsignals`, `public-bi`, `statpopgen`)
+- `benchmark`: Benchmark suite to run (`appian`, `tpch`, `tpcds`, `clickbench`, `fineweb`, `gh-archive`, `polarsignals`, `public-bi`, `statpopgen`, `sqlstorm`)
 
 **Options:**
 

diff --git a/bench-orchestrator/bench_orchestrator/config.py b/bench-orchestrator/bench_orchestrator/config.py
@@ -51,6 +51,7 @@ class Benchmark(Enum):
     POLARSIGNALS = "polarsignals"
     PUBLIC_BI = "public-bi"
     STATPOPGEN = "statpopgen"
+    SQLSTORM = "sqlstorm"
 
 
 # Engine to supported formats mapping.

diff --git a/vortex-bench/sqlstorm/README.md b/vortex-bench/sqlstorm/README.md
@@ -0,0 +1,86 @@
+# vortex-bench SQLStorm queries
+
+[SQLStorm] is an LLM-generated SQL stress suite — ~62k queries across four
+schemas, broad enough to exercise SQL surface that TPC-H and TPC-DS don't.
+This directory holds a small, confirmed-working sample (125 queries per
+schema, 500 total) that the nightly bench runs against TPC-H and TPC-DS data
+generated at SF10 plus two fixed-size datasets we download for the non-TPC
+schemas. Queries are pinned at SHA
+[`b3bb0b96794a6afe9bb8f3ff2b243562b779c40d`][pinned-sqlstorm].
+
+[SQLStorm]: https://github.com/SQL-Storm/SQLStorm
+[pinned-sqlstorm]: https://github.com/SQL-Storm/SQLStorm/tree/b3bb0b96794a6afe9bb8f3ff2b243562b779c40d
+
+## Layout
+
+- `<origin>/<id>.sql` — 125 queries per origin, 4 origins, 500 total.
+  `<id>` is the upstream SQLStorm query id (sparse, non-sequential).
+
+| Origin | Source data | Upstream SQLStorm dir |
+| --- | --- | --- |
+| `tpch` | TPC-H generated at SF10 (`data/tpch/10.0/`) | `v1.0/tpch/` |
+| `tpcds` | TPC-DS generated at SF10 (`data/tpcds/10.0/`) | `v1.0/tpcds/` |
+| `stackoverflow` | `stackoverflow_math.tar.gz` (~12 GB) from `db.in.tum.de` | `v1.0/stackoverflow/` |
+| `job` | `imdb.tzst` from `db.in.tum.de` | `v1.0/job/` |
+
+The benchmark runs strict — a query failure aborts the run rather than
+silently dropping a row, so any regression that breaks a query in nightly
+is loud. The vendored set was curated to be the intersection of queries
+that pass DuckDB and DataFusion against the source data; that is why a
+small, confirmed-working sample lives in-tree and the full ~62k SQLStorm
+corpus does not.
+
+## Data size (fixed scale)
+
+**There is no SQLStorm scale factor.** Each origin runs at a single fixed
+size, and `vx-bench run sqlstorm` does **not** read `--opt scale-factor` —
+passing one is silently ignored (it is not an error and changes nothing). The
+four origins are sized to sit within the same order of magnitude as JOB:
+
+| Origin | Fixed size | ~Rows (all tables) | ~Parquet |
+| --- | --- | --- | --- |
+| `stackoverflow` | the `math` tier, ~12 GB | 40 M | 6.1 GB |
+| `job` | the full IMDB/JOB snapshot (fixed real dataset) | 74 M | 1.7 GB |
+| `tpch` | SF 10 | 87 M | 3.5 GB |
+| `tpcds` | SF 10 | 192 M | 3.9 GB |
+
+This mirrors upstream: SQLStorm has no uniform scale knob either. OLAPBench
+(the canonical runner) selects size *per origin* — StackOverflow ships at
+0 / 1 GB (`dba`) / 12 GB (`math`) / 222 GB, TPC-H/TPC-DS scale via their own
+generators, and JOB is fixed. Query *validity* is scale-independent; only row
+counts change with size. The fixed points above are set in code — the TPC
+scale by `SQLSTORM_TPC_SCALE_FACTOR` (`sqlstorm_benchmark.rs`) and the
+StackOverflow tier by the `STACKOVERFLOW` recipe's tarball URL (`data.rs`) —
+so changing them means editing those consts (and re-curating, since the
+vendored queries are selected to stay short at the configured scale), not
+passing a runtime scale factor.
+
+## Refreshing the vendored set
+
+Swaps happen by hand against the pinned SHA above: clone the SQLStorm
+corpus at that SHA, pick candidates from `v1.0/<origin>/queries/`, and
+verify each runs cleanly on both DuckDB and DataFusion **at the configured
+scale** (SF10 / `math`) before vendoring. Candidates must also stay short
+— the vendored set is curated to keep each query under ~5 s/engine at scale
+so the nightly stays bounded; drop anything slower and refill. One gotcha:
+verify against the bench's own DataFusion `SessionContext`, **not**
+`datafusion-cli` — the cli decorrelates more subqueries than the harness can
+physically plan and reports false-positive passes on queries the harness then
+can't actually run.
+
+## Running
+
+The four origins are nightly-only matrix entries in
+`.github/workflows/nightly-bench.yml`. Locally:
+
+```sh
+vx-bench run sqlstorm --opt origin=tpch       # tpch | tpcds | stackoverflow | job
+```
+
+TPC-H / TPC-DS generate their own SF10 datasets under
+`vortex-bench/data/tpch/10.0/` and `vortex-bench/data/tpcds/10.0/` (kept
+separate from the standalone SF1 benchmarks' data). StackOverflow / JOB download and
+convert their upstream tarballs to Parquet under
+`vortex-bench/data/sqlstorm/<origin>/parquet/` on first run (idempotent via a
+`.success` marker). The StackOverflow `math` tarball is ~12 GB and needs
+~30 GB of scratch to extract and load.
diff --git a/vortex-bench/sqlstorm/job/10088.sql b/vortex-bench/sqlstorm/job/10088.sql
@@ -0,0 +1,24 @@
+SELECT -- Alias names paired with alternate titles from 2020, plus cast type, movie info and keyword.
+    a.name AS actor_name,
+    t.title AS movie_title,
+    c.kind AS cast_type,
+    m.info AS movie_info,
+    k.keyword AS movie_keyword
+FROM 
+    aka_name a
+JOIN 
+    cast_info ci ON a.person_id = ci.person_id
+JOIN 
+    aka_title t ON ci.movie_id = t.movie_id -- alternate titles matched to the cast row's movie_id
+JOIN 
+    movie_info m ON t.id = m.movie_id -- NOTE(review): keyed on aka_title.id here but on aka_title.movie_id in the join above; vendored as-is — confirm intent
+JOIN 
+    movie_keyword mk ON t.id = mk.movie_id -- same aka_title.id key as the movie_info join
+JOIN 
+    keyword k ON mk.keyword_id = k.id
+JOIN 
+    comp_cast_type c ON ci.role_id = c.id -- NOTE(review): 10166.sql joins role_id to role_type.id instead; presumably an upstream quirk — re-curate before changing
+WHERE 
+    t.production_year = 2020 -- filters on the aka_title row's production_year
+ORDER BY 
+    a.name, t.title;
diff --git a/vortex-bench/sqlstorm/job/10166.sql b/vortex-bench/sqlstorm/job/10166.sql
@@ -0,0 +1,29 @@
+SELECT -- Cast credits of titles with production_year = 2020: alias, person, role, movie info and keyword per row.
+    a.name AS aka_name,
+    t.title AS movie_title,
+    c.note AS cast_note,
+    c.nr_order AS cast_order,
+    n.name AS person_name,
+    rt.role AS role,
+    m.info AS movie_info,
+    k.keyword AS movie_keyword
+FROM 
+    aka_name a
+JOIN 
+    cast_info c ON a.person_id = c.person_id -- here `c` is cast_info (other vendored queries use `c` for comp_cast_type)
+JOIN 
+    title t ON c.movie_id = t.id
+JOIN 
+    name n ON a.person_id = n.imdb_id -- NOTE(review): person_id matched to name.imdb_id, not name.id (10194.sql pairs n.id with person_id); vendored as-is — confirm
+JOIN 
+    role_type rt ON c.role_id = rt.id
+JOIN 
+    movie_info m ON t.id = m.movie_id -- one output row per (movie_info, movie_keyword) pair of the title
+JOIN 
+    movie_keyword mk ON t.id = mk.movie_id
+JOIN 
+    keyword k ON mk.keyword_id = k.id
+WHERE 
+    t.production_year = 2020
+ORDER BY 
+    t.title, c.nr_order;
diff --git a/vortex-bench/sqlstorm/job/10176.sql b/vortex-bench/sqlstorm/job/10176.sql
@@ -0,0 +1,24 @@
+SELECT -- Alias names with alternate titles from 2000 onward, plus cast note, company name and keyword; newest year first.
+    a.name AS aka_name,
+    t.title AS movie_title,
+    c.note AS cast_note,
+    co.name AS company_name,
+    k.keyword AS movie_keyword
+FROM 
+    aka_name a
+JOIN 
+    cast_info c ON a.person_id = c.person_id
+JOIN 
+    aka_title t ON c.movie_id = t.movie_id -- alternate titles matched to the cast row's movie_id
+JOIN 
+    movie_companies mc ON t.id = mc.movie_id -- NOTE(review): keyed on aka_title.id here but on aka_title.movie_id in the join above; vendored as-is — confirm intent
+JOIN 
+    company_name co ON mc.company_id = co.id
+JOIN 
+    movie_keyword mk ON t.id = mk.movie_id -- same aka_title.id key as the movie_companies join
+JOIN 
+    keyword k ON mk.keyword_id = k.id
+WHERE 
+    t.production_year >= 2000 -- filters on the aka_title row's production_year
+ORDER BY 
+    t.production_year DESC;
diff --git a/vortex-bench/sqlstorm/job/10194.sql b/vortex-bench/sqlstorm/job/10194.sql
@@ -0,0 +1,25 @@
+SELECT -- Cast credits of titles from 2000 onward with alias, person name, person info and keyword; newest year first.
+    a.name AS aka_name,
+    t.title AS movie_title,
+    c.nr_order AS cast_order,
+    n.name AS person_name,
+    p.info AS person_info,
+    k.keyword AS movie_keyword
+FROM 
+    aka_name a
+JOIN 
+    cast_info c ON a.person_id = c.person_id
+JOIN 
+    title t ON c.movie_id = t.id
+JOIN 
+    name n ON c.person_id = n.imdb_id -- NOTE(review): person_id matched to name.imdb_id, while the next join pairs name.id with person_id; vendored as-is — confirm
+JOIN 
+    person_info p ON n.id = p.person_id -- one output row per person_info entry of the matched name
+JOIN 
+    movie_keyword mk ON t.id = mk.movie_id
+JOIN 
+    keyword k ON mk.keyword_id = k.id
+WHERE 
+    t.production_year >= 2000
+ORDER BY 
+    t.production_year DESC, c.nr_order;
diff --git a/vortex-bench/sqlstorm/job/10228.sql b/vortex-bench/sqlstorm/job/10228.sql
@@ -0,0 +1,22 @@
+SELECT -- Alias names with alternate titles after 2000, plus cast type and keyword; newest year first, then by name.
+    a.name AS actor_name,
+    m.title AS movie_title,
+    m.production_year,
+    c.kind AS cast_type,
+    k.keyword AS movie_keyword
+FROM 
+    aka_name a
+JOIN 
+    cast_info ci ON a.person_id = ci.person_id
+JOIN 
+    aka_title m ON ci.movie_id = m.id -- NOTE(review): 10088.sql/10176.sql join cast_info.movie_id to aka_title.movie_id, this one to aka_title.id; vendored as-is — confirm
+JOIN 
+    comp_cast_type c ON ci.person_role_id = c.id -- NOTE(review): person_role_id paired with comp_cast_type.id; verify against the JOB schema before relying on the result
+JOIN 
+    movie_keyword mk ON m.id = mk.movie_id -- keyed on aka_title.id, consistent with the aka_title join above
+JOIN 
+    keyword k ON mk.keyword_id = k.id
+WHERE 
+    m.production_year > 2000 -- strictly after 2000 (other vendored queries use >= 2000)
+ORDER BY 
+    m.production_year DESC, a.name;
diff --git a/vortex-bench/sqlstorm/job/10231.sql b/vortex-bench/sqlstorm/job/10231.sql
@@ -0,0 +1,32 @@
+SELECT -- Titles from 2000 onward filtered to company country_code 'USA', with alias, cast type, company, keyword and info-type label.
+    t.title,
+    a.name AS actor_name,
+    c.kind AS comp_cast_type,
+    m.name AS company_name,
+    k.keyword,
+    i.info -- comes from info_type (alias i), not from movie_info
+FROM 
+    title t
+JOIN 
+    cast_info ci ON t.id = ci.movie_id
+JOIN 
+    aka_name a ON ci.person_id = a.person_id
+JOIN 
+    comp_cast_type c ON ci.role_id = c.id -- NOTE(review): 10166.sql joins role_id to role_type.id instead; presumably an upstream quirk — re-curate before changing
+JOIN 
+    movie_companies mc ON t.id = mc.movie_id
+JOIN 
+    company_name m ON mc.company_id = m.id -- here `m` is company_name, not a movie table
+JOIN 
+    movie_keyword mk ON t.id = mk.movie_id
+JOIN 
+    keyword k ON mk.keyword_id = k.id
+JOIN 
+    movie_info mi ON t.id = mi.movie_id -- used only as a bridge to info_type; no movie_info column is selected
+JOIN 
+    info_type i ON mi.info_type_id = i.id
+WHERE 
+    t.production_year >= 2000
+    AND m.country_code = 'USA' -- NOTE(review): JOB country codes are presumably stored like '[us]'; if so this matches no rows — confirm
+ORDER BY 
+    t.title, a.name;
diff --git a/vortex-bench/sqlstorm/job/10489.sql b/vortex-bench/sqlstorm/job/10489.sql
@@ -0,0 +1,22 @@
+SELECT -- Cast credits of titles with production_year = 2022: alias, cast note, role and producing company per row.
+    a.name AS aka_name, 
+    t.title AS movie_title, 
+    c.note AS cast_note, 
+    ri.role AS person_role, 
+    m.name AS company_name
+FROM 
+    aka_name a
+JOIN 
+    cast_info c ON a.person_id = c.person_id -- here `c` is cast_info
+JOIN 
+    title t ON c.movie_id = t.id
+JOIN 
+    movie_companies mc ON t.id = mc.movie_id
+JOIN 
+    company_name m ON mc.company_id = m.id -- here `m` is company_name
+JOIN 
+    role_type ri ON c.role_id = ri.id
+WHERE 
+    t.production_year = 2022 -- NOTE(review): the fixed JOB snapshot presumably predates 2022, so this may select no rows — confirm
+ORDER BY 
+    t.title, a.name;