diff --git a/documentation/configuration/configuration-utils/_cairo.config.json b/documentation/configuration/configuration-utils/_cairo.config.json index 20af3db59..e7c70fe92 100644 --- a/documentation/configuration/configuration-utils/_cairo.config.json +++ b/documentation/configuration/configuration-utils/_cairo.config.json @@ -490,5 +490,9 @@ "cairo.partition.encoder.parquet.data.page.size": { "default": "1048576", "description": "Sets the default page size for parquet-encoded partitions." + }, + "cairo.partition.encoder.parquet.min.compression.ratio": { + "default": "1.2", + "description": "Minimum compression ratio (uncompressed_size / compressed_size) for Parquet pages. When a compressed page does not meet this threshold, it is stored uncompressed instead. A value of 0.0 disables the check." } } diff --git a/documentation/query/export-parquet.md b/documentation/query/export-parquet.md index 21a22d574..a0b84e174 100644 --- a/documentation/query/export-parquet.md +++ b/documentation/query/export-parquet.md @@ -184,3 +184,32 @@ cairo.partition.encoder.parquet.compression.level=0 When using `ZSTD`, the level ranges from 1 (fastest) to 22, with a default of 9. For COPY exports, you can also override compression per-query. See [Overriding compression](#overriding-compression). + +### Minimum compression ratio + +The `cairo.partition.encoder.parquet.min.compression.ratio` property controls +whether compressed Parquet pages are worth keeping. After compressing a page, +QuestDB checks the ratio of `uncompressed_size / compressed_size`. If the ratio +falls below the threshold, the compressed output is discarded and the page is +stored uncompressed instead. + +```ini +# Default: 1.2 (keep compressed output only if it achieves ~17% size reduction) +cairo.partition.encoder.parquet.min.compression.ratio=1.2 +``` + +A value of `0.0` (or any value `<= 1.0`) disables the check, always keeping +compressed output. 
+ +The ratio check applies to both data pages and dictionary pages and works with +all compression codecs. It runs after compression, so the CPU cost of +compression is still incurred -- this setting only avoids paying the +decompression cost on every read for pages that barely compress. + +### Per-column overrides + +Individual columns can override the global encoding and compression settings. +See [CREATE TABLE - Per-column Parquet encoding and compression](/docs/query/sql/create-table/#per-column-parquet-encoding-and-compression) +for defining overrides at table creation, or +[ALTER TABLE ALTER COLUMN SET/DROP PARQUET](/docs/query/sql/alter-table-alter-column-parquet-encoding/) +for modifying existing tables. diff --git a/documentation/query/sql/alter-table-alter-column-parquet-encoding.md b/documentation/query/sql/alter-table-alter-column-parquet-encoding.md new file mode 100644 index 000000000..37d0bf73e --- /dev/null +++ b/documentation/query/sql/alter-table-alter-column-parquet-encoding.md @@ -0,0 +1,42 @@ +--- +title: ALTER TABLE ALTER COLUMN SET/DROP PARQUET +sidebar_label: PARQUET ENCODING/COMPRESSION +description: ALTER TABLE ALTER COLUMN SET/DROP PARQUET SQL keyword reference documentation. +--- + +Sets or removes per-column Parquet encoding and compression configuration on +existing tables. These settings only affect +[Parquet partitions](/docs/query/export-parquet/#in-place-conversion) and are +ignored for native partitions. + +## SET + +Override the default Parquet encoding, compression, or both for a column. +The syntax is `SET PARQUET(encoding [, compression[(level)]])`. Use `default` +for the encoding when specifying compression only. 
+ +```questdb-sql title="Set encoding only" +ALTER TABLE sensors ALTER COLUMN temperature SET PARQUET(rle_dictionary); +``` + +```questdb-sql title="Set compression only (with optional level)" +ALTER TABLE sensors ALTER COLUMN temperature SET PARQUET(default, zstd(3)); +``` + +```questdb-sql title="Set both encoding and compression" +ALTER TABLE sensors ALTER COLUMN temperature SET PARQUET(rle_dictionary, zstd(3)); +``` + +## DROP + +Reset per-column overrides back to the server defaults. + +```questdb-sql title="Reset to defaults" +ALTER TABLE sensors ALTER COLUMN temperature DROP PARQUET; +``` + +## Supported encodings and codecs + +See the [CREATE TABLE](/docs/query/sql/create-table/#supported-encodings) +reference for the full list of supported encodings, compression codecs, and +their valid column types. diff --git a/documentation/query/sql/create-table.md b/documentation/query/sql/create-table.md index ad5db8404..e07a8567f 100644 --- a/documentation/query/sql/create-table.md +++ b/documentation/query/sql/create-table.md @@ -361,6 +361,99 @@ CREATE TABLE trades ( ) TIMESTAMP(timestamp); ``` +### Per-column Parquet encoding and compression + +![Flow chart showing the syntax of per-column Parquet encoding and compression](/images/docs/diagrams/parquetEncodingDef.svg) + +Column definitions may include an optional `PARQUET(encoding [, compression[(level)]])` +clause. These settings only affect +[Parquet partitions](/docs/query/export-parquet/#in-place-conversion) and are +ignored for native partitions. The encoding comes first and the compression codec is optional — use +`default` for the encoding when specifying compression only. 
+ +```questdb-sql title="CREATE TABLE with per-column Parquet config" +CREATE TABLE sensors ( + ts TIMESTAMP, + temperature DOUBLE PARQUET(rle_dictionary, zstd(3)), + humidity FLOAT PARQUET(rle_dictionary), + device_id VARCHAR PARQUET(default, lz4_raw), + status INT +) TIMESTAMP(ts) PARTITION BY DAY; +``` + +When omitted, columns use the global defaults: a type-appropriate encoding and +the server-wide compression codec +(`cairo.partition.encoder.parquet.compression.codec`). + +#### Supported encodings + +| Encoding | SQL keyword | Valid column types | +| ----------------------- | ------------------------- | ---------------------------- | +| Plain | `plain` | All | +| RLE Dictionary | `rle_dictionary` | All except BOOLEAN and ARRAY | +| Delta Length Byte Array | `delta_length_byte_array` | STRING, BINARY, VARCHAR | +| Delta Binary Packed | `delta_binary_packed` | INT, LONG, DATE, TIMESTAMP | + +- **Plain** — stores values as-is with no transformation. Simplest encoding + with no overhead. Use as a fallback when data has high cardinality and no + exploitable patterns (e.g. random floats or UUIDs). +- **RLE Dictionary** — builds a dictionary of unique values and replaces each + value with a short integer key. The keys are then encoded with a hybrid of + run-length encoding (for repeated consecutive keys) and bit-packing (for + non-repeating sequences). Best for low-to-medium cardinality columns (status + codes, device IDs, symbols). The lower the cardinality, the greater the + compression. +- **Delta Length Byte Array** — delta-encodes the lengths of consecutive + string/binary values, then stores the raw bytes back-to-back. This is the + Parquet-recommended encoding for byte array columns and is always preferred + over `plain` for STRING, BINARY, and VARCHAR. +- **Delta Binary Packed** — delta-encodes integer values and packs the deltas + into a compact binary representation. Effective for monotonically increasing + or slowly changing integer/timestamp columns (e.g. 
sequential IDs, event + timestamps). + +For the full specification of each encoding, see the +[Apache Parquet encodings documentation](https://parquet.apache.org/docs/file-format/data-pages/encodings/). + +When no encoding is specified, QuestDB picks a type-appropriate default: +`rle_dictionary` for SYMBOL and VARCHAR, `delta_length_byte_array` for STRING +and BINARY, and `plain` for everything else. + +#### Supported compression codecs + +| Codec | SQL keyword | Level range | +| ------------ | -------------- | ----------- | +| LZ4 Raw | `lz4_raw` | -- | +| Zstd | `zstd` | 1-22 | +| Snappy | `snappy` | -- | +| Gzip | `gzip` | 1-9 | +| Brotli | `brotli` | 0-11 | +| Uncompressed | `uncompressed` | -- | + +- **LZ4 Raw** — extremely fast compression and decompression with a moderate + ratio. No tunable level. This is the QuestDB default and a good choice for + most workloads where query throughput matters. +- **Zstd** — excellent balance of compression ratio and speed across its level + range. Lower levels (1-3) approach LZ4 speed with better ratios; higher + levels (up to 22) rival Brotli ratios. A strong general-purpose choice when + storage savings justify slightly slower decompression. +- **Snappy** — very fast compression and decompression with moderate ratio. No + tunable level. Similar trade-offs to LZ4 Raw. +- **Gzip** — widely supported, higher compression ratio than Snappy or LZ4 at + the cost of slower decompression, which reduces query throughput. Higher + levels (up to 9) improve ratio but further increase CPU time. +- **Brotli** — achieves some of the highest compression ratios, especially at + higher levels, but decompression is significantly slower. Best suited for + cold/archival data where storage savings outweigh query throughput. +- **Uncompressed** — no compression. Fastest decompression (none needed) but + largest file size. Useful when data is already incompressible. 
+ +For more details on Parquet compression, see the +[Apache Parquet compression documentation](https://parquet.apache.org/docs/file-format/data-pages/compression/). + +To modify encoding or compression on existing tables, see +[ALTER TABLE ALTER COLUMN SET/DROP PARQUET](/docs/query/sql/alter-table-alter-column-parquet-encoding/). + ### Casting types `castDef` - casts the type of a specific column. `columnRef` must reference diff --git a/documentation/query/sql/show.md b/documentation/query/sql/show.md index 51b8a2de5..4669e4d8c 100644 --- a/documentation/query/sql/show.md +++ b/documentation/query/sql/show.md @@ -88,6 +88,21 @@ CREATE TABLE trades ( WITH maxUncommittedRows=500000, o3MaxLag=600000000us; ``` +#### Per-column Parquet encoding + +When columns have Parquet encoding or compression overrides, the overrides +appear in the `SHOW CREATE TABLE` output: + +```questdb-sql +CREATE TABLE sensors ( + ts TIMESTAMP, + temperature DOUBLE PARQUET(rle_dictionary, zstd(3)), + humidity FLOAT PARQUET(rle_dictionary), + device_id VARCHAR PARQUET(default, lz4_raw), + status INT +) timestamp(ts) PARTITION BY DAY BYPASS WAL; +``` + #### Enterprise variant [QuestDB Enterprise](/enterprise/) will include an additional `OWNED BY` clause populated with the current user. 
diff --git a/documentation/sidebars.js b/documentation/sidebars.js index 645e259ef..f96b2e32d 100644 --- a/documentation/sidebars.js +++ b/documentation/sidebars.js @@ -264,6 +264,7 @@ module.exports = { "query/sql/alter-table-alter-column-cache", "query/sql/alter-table-change-column-type", "query/sql/alter-table-alter-column-drop-index", + "query/sql/alter-table-alter-column-parquet-encoding", "query/sql/alter-table-change-symbol-capacity", ], }, diff --git a/static/images/docs/diagrams/.railroad b/static/images/docs/diagrams/.railroad index e0536140e..7e1f10612 100644 --- a/static/images/docs/diagrams/.railroad +++ b/static/images/docs/diagrams/.railroad @@ -379,6 +379,9 @@ refreshMatView dropMatView ::= 'DROP' 'MATERIALIZED' 'VIEW' ('IF' 'EXISTS')? viewName +parquetEncodingDef + ::= 'PARQUET' '(' encoding ( ',' compression-codec ( '(' level ')' )? )? ')' + pivot ::= ( '(' selectQuery ')' | tableName ) ('WHERE' condition)? diff --git a/static/images/docs/diagrams/parquetEncodingDef.svg b/static/images/docs/diagrams/parquetEncodingDef.svg new file mode 100644 index 000000000..b352e5d6c --- /dev/null +++ b/static/images/docs/diagrams/parquetEncodingDef.svg @@ -0,0 +1,60 @@ + + + + + + + + + PARQUET + + + ( + + + encoding + + , + + + compression-codec + + ( + + + level + + ) + + + ) + + + + \ No newline at end of file