Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .config/nextest.toml
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
[[profile.default.overrides]]
filter = 'test(compress_large_int)'
priority = 100

# Long-running (~3 min) regression test that pushes cumulative FSST output past i32::MAX. Schedule
# it first so its runtime overlaps with the rest of the suite instead of trailing at the end.
[[profile.default.overrides]]
filter = 'test(fsst_compress_offsets_overflow_i32)'
priority = 100
59 changes: 33 additions & 26 deletions encodings/fsst/src/tests.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use rand::SeedableRng;
use rand::rngs::StdRng;
use rand::seq::IndexedRandom;
use fsst::CompressorBuilder;
use vortex_array::ArrayRef;
use vortex_array::IntoArray;
use vortex_array::LEGACY_SESSION;
Expand Down Expand Up @@ -118,11 +116,20 @@ fn test_fsst_array_ops() {
/// [`VarBinBuilder<i32>`] for the FSST output and panicked in
/// [`VarBinBuilder::append_value`] once cumulative compressed bytes crossed the boundary.
///
/// The input is built with [`VarBinBuilder<i64>`] so the input itself does not panic, which
/// confirms the overflow is on the FSST output side. After the fix the test must succeed
/// with the row count preserved.
/// We force the output past [`i32::MAX`] with an empty FSST compressor: it has no symbols, so
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this test the best documented part of the codebase?

/// every input byte is emitted as a two-byte escape and the compressed output is
/// deterministically twice the input size. This crosses the boundary with ~1.1 GiB of input
/// rather than the ~2.5 GiB of incompressible data a trained compressor would require, which
/// roughly halves the compression work and removes random-data generation entirely while
/// exercising the same output-offset path. Escaping every byte is the worst-case expansion FSST
/// can produce (`2 * len + 7` bytes), so this is also the cheapest way to reach the boundary.
///
/// Allocates ~2.5 GiB for the input and ~2.5 GiB for the FSST output (~5 GiB total), so it
/// The input offsets stay below [`i32::MAX`], so the input build never overflows; only the FSST
/// output crosses the boundary, isolating the regression to the output side. The test asserts
/// the actual compressed byte size exceeds [`i32::MAX`] so it cannot silently stop covering the
/// regression if FSST's escape behavior ever changes.
///
/// Allocates ~1.1 GiB for the input and ~2.25 GiB for the FSST output (~3.4 GiB total), so it
/// is gated to CI runs and skipped when `VORTEX_SKIP_SLOW_TESTS` is set. To run it locally:
///
/// ```text
Expand All @@ -133,38 +140,38 @@ fn test_fsst_array_ops() {
#[test_with::env(CI)]
#[test_with::no_env(VORTEX_SKIP_SLOW_TESTS)]
fn fsst_compress_offsets_overflow_i32() {
// High-entropy ASCII strings sliced from a random pool. FSST is a symbol-table
// compressor; pseudo-random data with no recurring byte sequences resists compression,
// so the compressed output stays close to input size and crosses the i32 boundary.
const STRING_LEN: usize = 64 * 1024;
const TOTAL_BYTES: usize = (1usize << 31) + (512 << 20); // ~2.5 GiB
const N: usize = TOTAL_BYTES / STRING_LEN;
const POOL_LEN: usize = 64 * 1024 * 1024;

// Printable ASCII alphabet so the result is valid UTF-8.
const ALPHABET: &[u8; 95] =
b" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
// An empty compressor escapes every byte, so each 64 KiB string compresses to exactly
// 128 KiB of output. Target ~2.25 GiB of output (2^31 + 256 MiB margin) so cumulative
// compressed bytes comfortably exceed i32::MAX.
const TOTAL_OUTPUT_BYTES: usize = (1usize << 31) + (256 << 20);
const N: usize = TOTAL_OUTPUT_BYTES / (2 * STRING_LEN);

let mut rng = StdRng::seed_from_u64(0xC0DE_C011_B711);
let pool: Vec<u8> = (0..POOL_LEN)
.map(|_| *ALPHABET.choose(&mut rng).unwrap())
.collect();
// The content is irrelevant because the empty compressor escapes every byte; `b'a'` keeps
// the input valid UTF-8. A single reused buffer avoids per-row allocation.
let string = vec![b'a'; STRING_LEN];

println!("building large VarBinArray");
let mut builder = VarBinBuilder::<i64>::with_capacity(N);
for i in 0..N {
let off = i.wrapping_mul(31337) % (POOL_LEN - STRING_LEN);
builder.append_value(&pool[off..off + STRING_LEN]);
for _ in 0..N {
builder.append_value(&string);
}
let array = builder.finish(DType::Utf8(Nullability::NonNullable));

println!("training FSST compressor");
let compressor = fsst_train_compressor(&array);
// Empty symbol table -> every byte is escaped -> 2x expansion.
let compressor = CompressorBuilder::default().build();
let len = array.len();
let dtype = array.dtype().clone();
let mut ctx = LEGACY_SESSION.create_execution_ctx();

println!("compressing to FSST");
let compressed = fsst_compress(array, len, &dtype, &compressor, &mut ctx);
assert_eq!(compressed.len(), len);

// The regression is only exercised if cumulative compressed bytes truly exceed i32::MAX.
let compressed_bytes = compressed.codes_bytes().len();
assert!(
compressed_bytes > i32::MAX as usize,
"compressed output ({compressed_bytes} bytes) must exceed i32::MAX to require i64 offsets",
);
}
Loading