From 35e71bea4ffca169ef05254a5eb1309bb74df483 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 3 May 2026 17:14:21 +0900 Subject: [PATCH] Delete source dataset files before sync to make cold-run prep fair Most systems run `sync && echo 3 > /proc/sys/vm/drop_caches` at the start of run.sh to prepare for a cold first run of each query. This sync also flushes any dirty pages of the *source* dataset files (hits.tsv, hits.csv, hits.parquet, etc.) that were downloaded and loaded into the system but are no longer needed once ingest is done. The sync's flush of those unrelated source pages adds time and disk I/O that varies wildly across systems (uncompressed size ~70 GB for TSV/CSV vs ~14 GB for Parquet, and some systems decompress in-place while others move to a separate dir). That's effectively a hidden violation of benchmark rules: cold-run prep cost ends up depending on what input format the system happened to use, not on the system itself. Fix: in benchmark.sh of every system that ingests the dataset into its own storage format, delete the downloaded source files (.csv, .tsv, .parquet, .json.gz - both compressed and uncompressed forms) immediately after the load step (and after any data_size measurement that depends on those files) and before run.sh is invoked. The unlinked files' dirty pages are dropped by the kernel without being flushed to disk, so the subsequent sync only covers the database's own writes. Systems that query the parquet/csv/tsv file directly as their storage (clickhouse-parquet, duckdb-parquet, datafusion, polars, sail, etc.) are intentionally NOT modified - those files ARE the data and must remain. Skipped: locustdb (script panics during load and never reaches run.sh), mongodb (uses run.js, not run.sh). Co-Authored-By: Claude Opus 4.7 (1M context) --- byconity/benchmark.sh | 4 ++++ cedardb/benchmark.sh | 4 ++++ chdb/benchmark.sh | 4 ++++ citus/benchmark.sh | 4 ++++ clickhouse/benchmark.sh | 4 ++++ cloudberry/benchmark.sh | 3 +++ cockroachdb/benchmark.sh | 4 ++++ cratedb/benchmark.sh | 4 ++++ databend/benchmark.sh | 4 ++++ doris/benchmark.sh | 4 ++++ druid/benchmark.sh | 4 ++++ duckdb-vortex/benchmark.sh | 4 ++++ duckdb/benchmark.sh | 4 ++++ elasticsearch/benchmark.sh | 4 ++++ greenplum/benchmark.sh | 3 +++ heavyai/benchmark.sh | 4 ++++ hologres/benchmark.sh | 4 ++++ hyper/benchmark.sh | 4 ++++ infobright/benchmark.sh | 4 ++++ kinetica/benchmark.sh | 4 ++++ mariadb-columnstore/benchmark.sh | 4 ++++ mariadb/benchmark.sh | 4 ++++ monetdb/benchmark.sh | 4 ++++ mysql-myisam/benchmark.sh | 4 ++++ mysql/benchmark.sh | 4 ++++ oxla/benchmark.sh | 4 ++++ pg_duckdb-indexed/benchmark.sh | 4 ++++ pg_duckdb/benchmark.sh | 4 ++++ pgpro_tam/benchmark.sh | 4 ++++ pinot/benchmark.sh | 4 ++++ selectdb/benchmark.sh | 4 ++++ starrocks/benchmark.sh | 4 ++++ umbra/benchmark.sh | 4 ++++ 33 files changed, 130 insertions(+) diff --git a/byconity/benchmark.sh b/byconity/benchmark.sh index af59bdff2..a7c00995a 100755 --- a/byconity/benchmark.sh +++ b/byconity/benchmark.sh @@ -36,6 +36,10 @@ byconity --database bench --query "INSERT INTO hits FORMAT TSV" < hits.tsv END=$(date +%s) echo "Load time: $(echo "$END - $START" | bc)" +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. 
+rm -f hits.tsv hits.tsv.gz + # NOTE: sometimes may hung due to docker-compose, using docker directly may help ./run.sh diff --git a/cedardb/benchmark.sh b/cedardb/benchmark.sh index 530be23dc..1cc728ed6 100755 --- a/cedardb/benchmark.sh +++ b/cedardb/benchmark.sh @@ -37,6 +37,10 @@ PGPASSWORD=test command time -f '%e' psql -h localhost -U postgres -q -t -c "COP echo -n "Data size: " PGPASSWORD=test psql -h localhost -U postgres -q -t -c "SELECT pg_total_relation_size('hits');" +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f data/hits.tsv + # run benchmark echo "running benchmark..." ./run.sh 2>&1 | tee log.txt diff --git a/chdb/benchmark.sh b/chdb/benchmark.sh index 2f5daed52..532d29ef0 100755 --- a/chdb/benchmark.sh +++ b/chdb/benchmark.sh @@ -16,6 +16,10 @@ pigz -d -f hits.csv.gz echo -n "Load time: " command time -f '%e' ./load.py +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f hits.csv + # Run the queries ./run.sh 2>&1 | tee log.txt diff --git a/citus/benchmark.sh b/citus/benchmark.sh index d23b3bf64..4d88ecda6 100755 --- a/citus/benchmark.sh +++ b/citus/benchmark.sh @@ -26,6 +26,10 @@ command time -f '%e' psql -U postgres -h localhost -d postgres test -q -t -c "\\ # COPY 99997497 # Time: 1579203.482 ms (26:19.203) +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f hits.tsv + ./run.sh 2>&1 | tee log.txt echo -n "Data size: " diff --git a/clickhouse/benchmark.sh b/clickhouse/benchmark.sh index 34c19b4f0..edb355852 100755 --- a/clickhouse/benchmark.sh +++ b/clickhouse/benchmark.sh @@ -44,6 +44,10 @@ sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files/hits_*.parquet echo -n "Load time: " clickhouse-client --time --query "INSERT INTO hits SELECT * FROM file('hits_*.parquet')" --max-insert-threads $(( $(nproc) / 4 )) +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +sudo rm -f /var/lib/clickhouse/user_files/hits_*.parquet + # Run the queries ./run.sh "$1" diff --git a/cloudberry/benchmark.sh b/cloudberry/benchmark.sh index 4271c6030..cda1d9fa1 100755 --- a/cloudberry/benchmark.sh +++ b/cloudberry/benchmark.sh @@ -120,6 +120,9 @@ elif [[ $1 == 'test' ]]; then if [[ $2 != 'no_dl' ]]; then echo -n "Load time: " command time -f '%e' sudo -iu gpadmin psql -d postgres -t -c "ANALYZE hits;"; fi du -sh /data0* + # Drop the downloaded source files so the sync at the top of run.sh + # doesn't flush their pages and inflate cold-run prep time. 
+ sudo -u gpadmin rm -f ~gpadmin/hits.tsv ~gpadmin/hits.tsv.gz sudo -iu gpadmin /home/gpadmin/run.sh 2>&1 | tee log.txt cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' |awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' diff --git a/cockroachdb/benchmark.sh b/cockroachdb/benchmark.sh index 4951f6cfe..ca714c1c9 100755 --- a/cockroachdb/benchmark.sh +++ b/cockroachdb/benchmark.sh @@ -33,6 +33,10 @@ cockroach sql --insecure --host=localhost --database=test --execute="IMPORT INTO END=$(date +%s) echo "Load time: $(echo "$END - $START" | bc)" +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +sudo rm -f /tmp/hits.csv.gz $CRDBDATADIR/extern/hits.csv + ./run.sh 2>&1 | tee log.txt echo -n "Data size: " diff --git a/cratedb/benchmark.sh b/cratedb/benchmark.sh index 7d5307b29..011f981e1 100755 --- a/cratedb/benchmark.sh +++ b/cratedb/benchmark.sh @@ -68,6 +68,10 @@ if [[ $MODE == "tuned" ]]; then psql -U crate -h localhost --no-password -t -c "REFRESH TABLE hits; OPTIMIZE TABLE hits;" fi; +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +sudo rm -f /tmp/hits.tsv + # Some queries don't fit into the available heap space and raise an CircuitBreakingException ./run.sh "$MODE" 2>&1 | tee log.txt diff --git a/databend/benchmark.sh b/databend/benchmark.sh index 6fedd2978..29d5dc783 100755 --- a/databend/benchmark.sh +++ b/databend/benchmark.sh @@ -73,4 +73,8 @@ du -bcs _data | grep total # curl 'http://default@localhost:8124/' --data-binary "select humanize_size(bytes_compressed) from fuse_snapshot('default', 'hits') order by timestamp desc limit 1" # 18.48 GiB +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f hits.tsv + ./run.sh 2>&1 | tee log.txt diff --git a/doris/benchmark.sh b/doris/benchmark.sh index b57738670..5ccc0a07f 100755 --- a/doris/benchmark.sh +++ b/doris/benchmark.sh @@ -228,6 +228,10 @@ echo "$LOADTIME" > loadtime du -bs "$DORIS_HOME"/be/storage/ | cut -f1 | tee storage_size echo "Data size: $(cat storage_size)" +# Drop the downloaded source files so the per-query sync below +# doesn't flush their pages and inflate cold-run prep time. +rm -f "$BE_DATA_DIR/user_files_secure"/hits_*.parquet + mysql -h 127.0.0.1 -P9030 -uroot hits -e "set global enable_sql_cache = false" # Dataset contains 99997497 rows, storage size is about 13319588503 bytes mysql -h 127.0.0.1 -P9030 -uroot hits -e "SELECT count(*) FROM hits" diff --git a/druid/benchmark.sh b/druid/benchmark.sh index 6a3139784..72c2fea66 100755 --- a/druid/benchmark.sh +++ b/druid/benchmark.sh @@ -36,6 +36,10 @@ command time -f '%e' ./apache-druid-${VERSION}/bin/post-index-task --file ingest # The command above will fail due to timeout but still continue to run in background. # The loading time should be checked from the logs. +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. 
+rm -f hits.tsv + # Run the queries ./run.sh diff --git a/duckdb-vortex/benchmark.sh b/duckdb-vortex/benchmark.sh index cd25a6409..f8100ae6c 100755 --- a/duckdb-vortex/benchmark.sh +++ b/duckdb-vortex/benchmark.sh @@ -22,6 +22,10 @@ command time -f '%e' duckdb hits-single.db -f create.sql echo 'single' +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f hits.parquet + ./run.sh 'hits-single.db' 2>&1 | tee log-s.txt cat log-s.txt | grep -P '^\d|Killed|Segmentation|^Run Time \(s\): real' | diff --git a/duckdb/benchmark.sh b/duckdb/benchmark.sh index 7042b968f..5cf30bb75 100755 --- a/duckdb/benchmark.sh +++ b/duckdb/benchmark.sh @@ -11,6 +11,10 @@ wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compat echo -n "Load time: " command time -f '%e' duckdb hits.db -storage_version latest -f create.sql -f load.sql +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f hits.parquet + # Run the queries ./run.sh 2>&1 | tee log.txt diff --git a/elasticsearch/benchmark.sh b/elasticsearch/benchmark.sh index e50248e12..f7445e644 100755 --- a/elasticsearch/benchmark.sh +++ b/elasticsearch/benchmark.sh @@ -76,5 +76,9 @@ echo "Data size: $(jq -r '._all.total.store.total_data_set_size_in_bytes' stats. END=$(date +%s) echo "Load time: $(echo "$END - $START" | bc)" +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f hits.json.gz + ###### Run the queries ./run.sh diff --git a/greenplum/benchmark.sh b/greenplum/benchmark.sh index ec5a52604..503b966dd 100755 --- a/greenplum/benchmark.sh +++ b/greenplum/benchmark.sh @@ -78,5 +78,8 @@ command time -f '%e' psql -d postgres -t -c "insert into hits select * from hits echo -n "Load time: " command time -f '%e' psql -d postgres -t -c "ANALYZE hits;" du -sh /gpdata* +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f hits.tsv ./run.sh 2>&1 | tee log.txt cat log.txt | grep -oP 'Time: \d+\.\d+ ms|psql: error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/; s/^.*psql: error.*$/null/' |awk '{ if (i % 3 == 0) { printf "[" }; if ($1 == "null") { printf $1 } else { printf $1 / 1000 }; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' diff --git a/heavyai/benchmark.sh b/heavyai/benchmark.sh index fe6c9758b..37a48ce2e 100755 --- a/heavyai/benchmark.sh +++ b/heavyai/benchmark.sh @@ -48,6 +48,10 @@ command time -f '%e' /opt/heavyai/bin/heavysql -q -t -p HyperInteractive <<< "CO # Loaded: 99997497 recs, Rejected: 0 recs in 572.633 secs +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f hits.csv + ./run.sh 2>&1 | tee log.txt echo -n "Data size: " diff --git a/hologres/benchmark.sh b/hologres/benchmark.sh index 74a821111..741d15eea 100755 --- a/hologres/benchmark.sh +++ b/hologres/benchmark.sh @@ -56,6 +56,10 @@ for file in hits_part_*; do PGUSER=$PG_USER PGPASSWORD=$PG_PASSWORD command time -f '%e' psql -h $HOST_NAME -p $PORT -d $DATABASE -t -c "\\copy hits FROM '$file'" done +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f hits.tsv hits_part_* + # run clickbench test with queries echo "Starting to run queries..." 
diff --git a/hyper/benchmark.sh b/hyper/benchmark.sh index 20583879b..9f2bb55a9 100755 --- a/hyper/benchmark.sh +++ b/hyper/benchmark.sh @@ -13,6 +13,10 @@ pigz -d -f hits.csv.gz echo -n "Load time: " command time -f '%e' ./load.py +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f hits.csv + ./run.sh | tee log.txt cat log.txt | diff --git a/infobright/benchmark.sh b/infobright/benchmark.sh index 9c0b72d21..77bd60020 100755 --- a/infobright/benchmark.sh +++ b/infobright/benchmark.sh @@ -32,6 +32,10 @@ sudo docker exec mysql_ib du -bcs /mnt/mysql_data/ /usr/local/infobright-4.0.7-x # 13 760 341 294 +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f hits.tsv hits90m.tsv + ./run.sh 2>&1 | tee log.txt cat log.txt | diff --git a/kinetica/benchmark.sh b/kinetica/benchmark.sh index 046fa0b30..8612d01c6 100755 --- a/kinetica/benchmark.sh +++ b/kinetica/benchmark.sh @@ -31,5 +31,9 @@ LOADTIME=$(echo "$END - $START" | bc) echo "Load time: $LOADTIME" echo "Data size: $(du -bcs ./kinetica-persist/gpudb | grep total)" +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +sudo rm -f ./kinetica-persist/hits.tsv.gz + # run the queries ./run.sh diff --git a/mariadb-columnstore/benchmark.sh b/mariadb-columnstore/benchmark.sh index d050c705a..e0e6fe0b9 100755 --- a/mariadb-columnstore/benchmark.sh +++ b/mariadb-columnstore/benchmark.sh @@ -32,6 +32,10 @@ command time -f '%e' mysql --password="${PASSWORD}" --host 127.0.0.1 clickbench # 41m47.856s +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f hits.tsv + ./run.sh 2>&1 | tee log.txt echo -n "Data size: " diff --git a/mariadb/benchmark.sh b/mariadb/benchmark.sh index 3363dd20a..483763266 100755 --- a/mariadb/benchmark.sh +++ b/mariadb/benchmark.sh @@ -24,6 +24,10 @@ sudo mariadb test < create.sql echo -n "Load time: " command time -f '%e' split -l 10000 --filter="sudo mariadb test -e \"SET sql_log_bin = 0; LOAD DATA LOCAL INFILE '/dev/stdin' INTO TABLE hits;\"" hits.tsv +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f hits.tsv + ./run.sh 2>&1 | tee log.txt echo -n "Data size: " diff --git a/monetdb/benchmark.sh b/monetdb/benchmark.sh index 529851954..ec9e7bd5c 100755 --- a/monetdb/benchmark.sh +++ b/monetdb/benchmark.sh @@ -33,6 +33,10 @@ command time -f '%e' ./query.expect "COPY INTO hits FROM '$(pwd)/hits.tsv' USING # 99997497 affected rows # clk: 15:39 min +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f hits.tsv + ./run.sh 2>&1 | tee log.txt echo -n "Data size: " diff --git a/mysql-myisam/benchmark.sh b/mysql-myisam/benchmark.sh index 3c41141a4..a99bc4c5d 100755 --- a/mysql-myisam/benchmark.sh +++ b/mysql-myisam/benchmark.sh @@ -20,6 +20,10 @@ command time -f '%e' sudo mysql test -e "SET sql_log_bin = 0; LOAD DATA LOCAL IN # 41m8.979s +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. 
+rm -f hits.tsv + ./run.sh 2>&1 | tee log.txt echo -n "Data size: " diff --git a/mysql/benchmark.sh b/mysql/benchmark.sh index 1a04d06bb..1d33a1827 100755 --- a/mysql/benchmark.sh +++ b/mysql/benchmark.sh @@ -20,6 +20,10 @@ command time -f '%e' sudo mysql test -e "SET sql_log_bin = 0; LOAD DATA LOCAL IN # 2:37:52 elapsed +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f hits.tsv + ./run.sh 2>&1 | tee log.txt echo -n "Data size: " diff --git a/oxla/benchmark.sh b/oxla/benchmark.sh index e9ad00536..a3add3c4c 100755 --- a/oxla/benchmark.sh +++ b/oxla/benchmark.sh @@ -38,6 +38,10 @@ PGPASSWORD=oxla command time -f '%e' psql -h localhost -U oxla -q -t -c "COPY hi echo -n "Data size: " PGPASSWORD=oxla psql -h localhost -U oxla -q -t -c "SELECT pg_total_relation_size('hits');" +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +sudo rm -f data/hits.csv + # run benchmark echo "running benchmark..." ./run.sh diff --git a/pg_duckdb-indexed/benchmark.sh b/pg_duckdb-indexed/benchmark.sh index 608d29794..ed894caaa 100755 --- a/pg_duckdb-indexed/benchmark.sh +++ b/pg_duckdb-indexed/benchmark.sh @@ -74,6 +74,10 @@ fi echo -n "Load time: " command time -f '%e' ./load.sh +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f hits.tsv + psql $CONNECTION -c "ALTER DATABASE postgres SET duckdb.force_execution = true;" ./run.sh 2>&1 | tee log.txt diff --git a/pg_duckdb/benchmark.sh b/pg_duckdb/benchmark.sh index 14704efbc..51acdf64c 100755 --- a/pg_duckdb/benchmark.sh +++ b/pg_duckdb/benchmark.sh @@ -74,6 +74,10 @@ fi echo -n "Load time: " command time -f '%e' ./load.sh +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f hits.tsv + psql $CONNECTION -c "ALTER DATABASE postgres SET duckdb.force_execution = true;" ./run.sh 2>&1 | tee log.txt diff --git a/pgpro_tam/benchmark.sh b/pgpro_tam/benchmark.sh index 07fa462f7..cc28d02c4 100755 --- a/pgpro_tam/benchmark.sh +++ b/pgpro_tam/benchmark.sh @@ -60,6 +60,10 @@ else command time -f '%e' psql -h 127.0.0.1 -U postgres -t -c "COPY hits FROM '/tmp/hits.tsv'" fi +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +sudo docker exec pgpro_tam rm -f /tmp/hits.tsv + #run benchmark ./run.sh 2>&1 | tee log.txt diff --git a/pinot/benchmark.sh b/pinot/benchmark.sh index afe663f63..41fbd1c7f 100755 --- a/pinot/benchmark.sh +++ b/pinot/benchmark.sh @@ -38,6 +38,10 @@ command time -f '%e' ./apache-pinot-$PINOT_VERSION-bin/bin/pinot-admin.sh Launch # After upload it shows 94465149 rows instead of 99997497 in the dataset +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f hits.tsv parts*.tsv + # Run the queries ./run.sh diff --git a/selectdb/benchmark.sh b/selectdb/benchmark.sh index ca9a30d42..820a4a9fe 100755 --- a/selectdb/benchmark.sh +++ b/selectdb/benchmark.sh @@ -119,6 +119,10 @@ du -bs "$DORIS_HOME"/be/storage/ | cut -f1 | tee storage_size echo "Data size: $(cat storage_size)" +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. 
+rm -f hits.tsv + # Run queries ./run.sh 2>&1 | tee -a log.txt diff --git a/starrocks/benchmark.sh b/starrocks/benchmark.sh index f6d73ddf7..2da562919 100755 --- a/starrocks/benchmark.sh +++ b/starrocks/benchmark.sh @@ -69,6 +69,10 @@ du -bcs StarRocks-${VERSION}/storage/ | grep total # Dataset contains 99997497 rows mysql -h 127.0.0.1 -P9030 -uroot hits -e "SELECT count(*) FROM hits" +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f hits.tsv + ./run.sh 2>&1 | tee -a log.txt cat log.txt | diff --git a/umbra/benchmark.sh b/umbra/benchmark.sh index 73b434ec9..92550c1cf 100755 --- a/umbra/benchmark.sh +++ b/umbra/benchmark.sh @@ -34,6 +34,10 @@ then fi echo "Load time: $(( (end - start) / 1000 ))" +# Drop the downloaded source files so the sync at the top of run.sh +# doesn't flush their pages and inflate cold-run prep time. +rm -f data/hits.tsv + ./run.sh 2>&1 | tee log.txt # Calculate persistence size
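
For reference, each modified benchmark.sh ends up following the same shape. The sketch below is a minimal illustration of that pattern, not a copy of any one script: the dataset URL, ./load.sh, and the ./data directory are placeholders for whatever the system actually downloads, loads, and stores.

    #!/bin/bash
    # Download and decompress the source dataset (URL is a placeholder).
    wget --continue "$DATASET_URL"
    pigz -d -f hits.tsv.gz

    # Ingest into the system's own storage format, then measure load time and data size.
    echo -n "Load time: "
    command time -f '%e' ./load.sh        # placeholder for the system's load step
    echo -n "Data size: "
    du -bs ./data | cut -f1               # placeholder for the system's size check

    # Delete the source files after the load and the size measurement, but before run.sh.
    # The files are unlinked, so the kernel discards their dirty page-cache pages
    # without writing them back, and the sync at the top of run.sh only has to
    # flush the database's own writes.
    rm -f hits.tsv hits.tsv.gz

    # Run the queries; run.sh performs the cold-run cache drop per query.
    ./run.sh 2>&1 | tee log.txt

The rm is deliberately placed after any step that still reads the source files (the load itself and data-size checks such as cloudberry's du), and before run.sh is invoked, so the reported load time and data size are unchanged.

The underlying kernel behaviour can be checked directly on any Linux box; the snippet below is purely illustrative, the file name is arbitrary, and the exact Dirty numbers will vary with writeback settings:

    # Write ~1 GiB of dirty page-cache data, then unlink it before syncing.
    dd if=/dev/zero of=scratch.bin bs=1M count=1024
    grep Dirty /proc/meminfo    # large Dirty value while the file exists
    rm -f scratch.bin
    grep Dirty /proc/meminfo    # drops once the unlinked inode is released
    time sync                   # near-instant: nothing left to write back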