From b140b80f439ba817982fd220223d16d01143a11e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20Heres?=
Date: Tue, 19 May 2026 08:51:01 +0200
Subject: [PATCH] perf: generic literal-prefix fast path for regexp_replace `${1}`
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Generalizes the existing `^...(capture).*$` -> `${1}` extraction in
`OptimizedRegex` for the common subset where the regex up to the capture
reduces to a finite set of literal byte prefixes and the capture has the
form `([^X]+)X` for a single ASCII byte X.

For patterns of that shape:

- `^https?://(?:www\.)?([^/]+)/.*$` (ClickBench Q28)
- `^foo:([^,]+),.*$` (single literal prefix)
- `^(?:foo|bar|baz):([^/]+)/.*$` (alternation prefix)

the recognizer parses the pattern's HIR once via `regex-syntax`,
enumerates the literal prefix variants (bounded by 32 alternatives), and
dispatches each row to a `memchr`-based extractor instead of the regex
engine.

Longest-matching-prefix is tried first; on empty-capture failure the
extractor falls back to shorter prefixes. That preserves the regex's
backtracking semantics for cases like `http://www./path` against the Q28
pattern, where the full regex prefers to leave `www.` outside the
optional so the capture is non-empty.

Patterns that don't match the literal-prefix shape continue through the
existing `ShortenedRegex` path (strip trailing `.*$`, use `captures_read`
against reusable `CaptureLocations`). Recognition is strict — `(?i)`,
`(?m)`, non-ASCII terminators, and unbounded prefix constructs all fall
back.

Measured on ClickBench Q28
(`REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1')`,
partitioned dataset, dfbench `--iterations 5 --query 28`, same machine):

| Build                                | Avg ms   |
| ------------------------------------ | -------: |
| Upstream main (shortened-regex only) | 4577.52  |
| Literal-prefix fast path             | 2225.00  |

Delta: -51.4% on Q28.

Validation: - cargo test -p datafusion-functions --lib regex::regexpreplace - cargo clippy -p datafusion-functions --all-targets --all-features -- -D warnings --- Cargo.lock | 1 + Cargo.toml | 1 + datafusion/functions/Cargo.toml | 3 +- .../functions/src/regex/regexpreplace.rs | 498 ++++++++++++++++-- 4 files changed, 464 insertions(+), 39 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 010dd315c44ca..16e64ae72e044 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2194,6 +2194,7 @@ dependencies = [ "num-traits", "rand 0.9.4", "regex", + "regex-syntax", "sha2", "tokio", "uuid", diff --git a/Cargo.toml b/Cargo.toml index 78c271d524fb8..026c6075e80b8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -189,6 +189,7 @@ prost = "0.14.1" rand = "0.9" recursive = "0.1.1" regex = "1.12" +regex-syntax = "0.8" rstest = "0.26.1" serde_json = "1" sha2 = "^0.11.0" diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index d6a6693d862cc..5e34768d5801c 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -55,7 +55,7 @@ encoding_expressions = ["base64", "hex"] # enable math functions math_expressions = [] # enable regular expressions -regex_expressions = ["regex"] +regex_expressions = ["regex", "regex-syntax"] # enable string functions string_expressions = ["uuid"] # enable unicode functions @@ -87,6 +87,7 @@ memchr = { workspace = true } num-traits = { workspace = true } rand = { workspace = true } regex = { workspace = true, optional = true } +regex-syntax = { workspace = true, optional = true } sha2 = { workspace = true, optional = true } uuid = { workspace = true, features = ["v4"], optional = true } diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index 215dd33324375..07ad8a5af9311 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -202,12 +202,27 @@ fn regex_replace_posix_groups(replacement: 
&str) -> String { .into_owned() } -struct ShortRegex { - /// Shortened anchored regex used to extract capture group 1 directly. - /// See [`try_build_short_extract_regex`] for details. - short_re: Regex, - /// Reusable capture locations for `short_re` to avoid per-row allocation. - locs: CaptureLocations, +/// Fast-path state used in place of [`Regex::replacen`] for patterns matching +/// [`OptimizedRegex`]'s direct-extraction shape. +enum ShortRegex { + /// Generic shortened-regex extractor for patterns of the form + /// `^...(capture)....*$` with replacement `${1}`. The trailing `.*$` is + /// stripped and `captures_read` is used to find capture group 1 directly, + /// avoiding `expand()` and any `String` allocation. + ShortenedRegex { + re: Regex, + locs: CaptureLocations, + }, + /// Specialized memchr-based extractor for the common subset of the above + /// where the regex up to the capture is a finite set of literal byte + /// prefixes, the capture is `[^X]+` for a single ASCII byte `X`, and the + /// capture is followed by `X.*$`. See + /// [`try_recognize_literal_prefix_capture`] for the recognized shape. + /// + /// ClickBench Q28's `^https?://(?:www\.)?([^/]+)/.*$` is the motivating + /// instance: `prefixes = ["https://www.", "http://www.", "https://", "http://"]`, + /// `terminator = b'/'`. + LiteralPrefixCapture(LiteralPrefixCaptureSpec), } /// Holds the normal compiled regex together with the optional fast path used @@ -231,11 +246,6 @@ impl OptimizedRegex { None }; - let short_re = short_re.map(|short_re| { - let locs = short_re.capture_locations(); - ShortRegex { short_re, locs } - }); - Self { re, short_re } } @@ -248,50 +258,337 @@ impl OptimizedRegex { replacement: &str, ) -> Cow<'a, str> { // If this pattern is not eligible for direct extraction, use the full regex. 
- let Some(ShortRegex { short_re, locs }) = self.short_re.as_mut() else { + let Some(short_re) = self.short_re.as_mut() else { return self.re.replacen(val, limit, replacement); }; - // If the shortened regex does not match, the original anchored regex would - // also leave the input unchanged. - if short_re.captures_read(locs, val).is_none() { - return Cow::Borrowed(val); - }; + match short_re { + ShortRegex::ShortenedRegex { re: short_re, locs } => { + // If the shortened regex does not match, the original anchored regex would + // also leave the input unchanged. + if short_re.captures_read(locs, val).is_none() { + return Cow::Borrowed(val); + }; + + // `captures_read` succeeded, so the overall shortened match is present. + let match_end = locs.get(0).unwrap().1; + if memchr(b'\n', &val.as_bytes()[match_end..]).is_some() { + // If there is a newline after the match, we can't use the short + // regex since it won't match across lines. Fall back to the full + // regex replacement. + return self.re.replacen(val, limit, replacement); + }; + // The fast path only applies to `${1}` replacements, so the result is + // either capture group 1 or the empty string if that group did not match. + if let Some((start, end)) = locs.get(1) { + Cow::Borrowed(&val[start..end]) + } else { + Cow::Borrowed("") + } + } + ShortRegex::LiteralPrefixCapture(spec) => { + let Some((start, end, match_end)) = + spec.extract(val.as_bytes()) + else { + return Cow::Borrowed(val); + }; + + if memchr(b'\n', &val.as_bytes()[match_end..]).is_some() { + // Same single-line safety as the shortened-regex path: `.*$` + // in the original anchored pattern doesn't cross `\n`, so + // hand off to the full regex when trailing content has one. + return self.re.replacen(val, limit, replacement); + } - // `captures_read` succeeded, so the overall shortened match is present. 
- let match_end = locs.get(0).unwrap().1; - if memchr(b'\n', &val.as_bytes()[match_end..]).is_some() { - // If there is a newline after the match, we can't use the short - // regex since it won't match across lines. Fall back to the full - // regex replacement. - return self.re.replacen(val, limit, replacement); - }; - // The fast path only applies to `${1}` replacements, so the result is - // either capture group 1 or the empty string if that group did not match. - if let Some((start, end)) = locs.get(1) { - Cow::Borrowed(&val[start..end]) - } else { - Cow::Borrowed("") + Cow::Borrowed(&val[start..end]) + } } } } /// For anchored patterns like `^...(capture)....*$` where the replacement -/// is `\1`, build a shorter regex (stripping trailing `.*$`) and use -/// `captures_read` with `CaptureLocations` for direct extraction — no -/// `expand()`, no `String` allocation. -/// This pattern appears in ClickBench Q28: which uses a regexp like -/// `^https?://(?:www\.)?([^/]+)/.*$` -fn try_build_short_extract_regex(pattern: &str, replacement: &str) -> Option { +/// is `\1`, build the fastest available direct-extraction state. +/// +/// Two shapes are recognized: +/// +/// 1. `^([^X]+)X.*$` with replacement `${1}`, where the +/// prefix reduces to a finite set of byte literals and `X` is a single +/// ASCII byte. Handled by [`LiteralPrefixCaptureSpec`] with a `memchr` +/// terminator scan — no regex engine involvement per row. +/// 2. Any other `^...(capture)....*$` with replacement `${1}`: the trailing +/// `.*$` is stripped and the resulting shortened regex is run with +/// `captures_read` against reusable `CaptureLocations`. +/// +/// ClickBench Q28's `^https?://(?:www\.)?([^/]+)/.*$` matches shape 1. 
+fn try_build_short_extract_regex(pattern: &str, replacement: &str) -> Option { if replacement != "${1}" || !pattern.starts_with('^') || !pattern.ends_with(".*$") { return None; } + + if let Some(spec) = try_recognize_literal_prefix_capture(pattern) { + return Some(ShortRegex::LiteralPrefixCapture(spec)); + } + let short = &pattern[..pattern.len() - 3]; let re = Regex::new(short).ok()?; if re.captures_len() != 2 { return None; } - Some(re) + let locs = re.capture_locations(); + Some(ShortRegex::ShortenedRegex { re, locs }) +} + +/// Bound on enumerated prefix variants. Each `(?:...)?` doubles the count and +/// each `(a|b|c)` multiplies it, so this caps the explosion for adversarial +/// patterns. ClickBench Q28's pattern produces 4 variants. +const MAX_PREFIX_VARIANTS: usize = 32; + +/// Precomputed state for the `^([^X]+)X.*$` → `${1}` fast path. +#[derive(Debug)] +struct LiteralPrefixCaptureSpec { + /// Distinct literal byte prefixes the input must start with, sorted + /// longest-first so a single linear scan acts greedily. + prefixes: Vec>, + /// Single ASCII byte that ends the capture (also the literal that must + /// follow the capture in the original regex). + terminator: u8, +} + +impl LiteralPrefixCaptureSpec { + /// Returns `(capture_start, capture_end, match_end)` where `match_end` + /// points past the terminator, or `None` if the input doesn't match. + /// + /// Prefixes are tried longest-first to mimic greedy matching, but we + /// fall back to shorter alternatives if the greedy choice leaves no + /// room for the capture. That mirrors the full regex's backtracking + /// behavior for cases like `http://www./path` against + /// `^https?://(?:www\.)?([^/]+)/.*$`, where the regex prefers to leave + /// `www.` outside the optional so the capture is non-empty. 
+ fn extract(&self, bytes: &[u8]) -> Option<(usize, usize, usize)> { + for prefix in &self.prefixes { + if !bytes.starts_with(prefix) { + continue; + } + let capture_start = prefix.len(); + let Some(terminator_offset) = + memchr(self.terminator, &bytes[capture_start..]) + else { + continue; + }; + if terminator_offset == 0 { + // `[^X]+` requires at least one byte; try a shorter prefix. + continue; + } + let capture_end = capture_start + terminator_offset; + return Some((capture_start, capture_end, capture_end + 1)); + } + None + } +} + +/// Recognizes the `^([^X]+)X.*$` shape with default flags (no +/// `(?i)`, `(?m)`, etc. — those would alter the byte-level interpretation +/// we rely on). +fn try_recognize_literal_prefix_capture( + pattern: &str, +) -> Option { + use regex_syntax::hir::{Hir, HirKind, Look}; + + let hir = regex_syntax::parse(pattern).ok()?; + let HirKind::Concat(parts) = hir.kind() else { + return None; + }; + + let mut iter = parts.iter(); + + // 1. `^` (start of text — multiline `(?m)` would give StartLF instead). + if !matches!(iter.next()?.kind(), HirKind::Look(Look::Start)) { + return None; + } + + // 2. Literal prefix segments up to (but not including) the capture. + let mut prefixes: Vec> = vec![Vec::new()]; + let capture: &Hir = loop { + let part = iter.next()?; + if matches!(part.kind(), HirKind::Capture(_)) { + break part; + } + prefix_extend_variants(&mut prefixes, part)?; + if prefixes.len() > MAX_PREFIX_VARIANTS { + return None; + } + }; + + // 3. Capture must be group 1 wrapping `[^X]+` (greedy, 1+) for a single + // ASCII byte X. + let HirKind::Capture(cap) = capture.kind() else { + unreachable!() + }; + if cap.index != 1 { + return None; + } + let terminator = capture_terminator_byte(&cap.sub)?; + + // 4. Literal terminator matching the excluded byte. + let HirKind::Literal(lit) = iter.next()?.kind() else { + return None; + }; + if lit.0.as_ref() != [terminator] { + return None; + } + + // 5. 
`.*` (any byte except `\n`, zero or more). + if !is_dot_star(iter.next()?) { + return None; + } + + // 6. `$` and nothing after. + if !matches!(iter.next()?.kind(), HirKind::Look(Look::End)) { + return None; + } + if iter.next().is_some() { + return None; + } + + // Dedupe + sort longest-first so the runtime probe is greedy. + prefixes.sort(); + prefixes.dedup(); + let mut prefixes: Vec> = prefixes + .into_iter() + .map(Vec::into_boxed_slice) + .collect(); + prefixes.sort_by_key(|p| std::cmp::Reverse(p.len())); + + Some(LiteralPrefixCaptureSpec { + prefixes, + terminator, + }) +} + +/// Extend the accumulator with one prefix segment. Returns `None` if the +/// segment isn't a finite literal shape (literal / concat / alternation / +/// `?`-optional combination of those). +fn prefix_extend_variants( + variants: &mut Vec>, + hir: ®ex_syntax::hir::Hir, +) -> Option<()> { + use regex_syntax::hir::HirKind; + + match hir.kind() { + HirKind::Literal(lit) => { + for v in variants.iter_mut() { + v.extend_from_slice(&lit.0); + } + Some(()) + } + HirKind::Concat(parts) => { + for part in parts { + prefix_extend_variants(variants, part)?; + if variants.len() > MAX_PREFIX_VARIANTS { + return None; + } + } + Some(()) + } + HirKind::Repetition(rep) if rep.min == 0 && rep.max == Some(1) => { + // `X?` → either nothing or X. Duplicate the accumulator and + // append X to one copy. + let mut with_x = variants.clone(); + prefix_extend_variants(&mut with_x, &rep.sub)?; + if variants.len() + with_x.len() > MAX_PREFIX_VARIANTS { + return None; + } + variants.extend(with_x); + Some(()) + } + HirKind::Alternation(branches) => { + let base = std::mem::take(variants); + for branch in branches { + let mut local = base.clone(); + prefix_extend_variants(&mut local, branch)?; + if variants.len() + local.len() > MAX_PREFIX_VARIANTS { + return None; + } + variants.extend(local); + } + Some(()) + } + _ => None, + } +} + +/// Capture must be a greedy `[^X]+` over a single ASCII byte X. 
+fn capture_terminator_byte(hir: ®ex_syntax::hir::Hir) -> Option { + use regex_syntax::hir::HirKind; + + let HirKind::Repetition(rep) = hir.kind() else { + return None; + }; + if rep.min < 1 || rep.max.is_some() || !rep.greedy { + return None; + } + let HirKind::Class(class) = rep.sub.kind() else { + return None; + }; + single_excluded_ascii_byte(class) +} + +/// `.*` for default-flag regexes: any byte except `\n`, zero or more, greedy. +fn is_dot_star(hir: ®ex_syntax::hir::Hir) -> bool { + use regex_syntax::hir::HirKind; + + let HirKind::Repetition(rep) = hir.kind() else { + return false; + }; + if rep.min != 0 || rep.max.is_some() || !rep.greedy { + return false; + } + let HirKind::Class(class) = rep.sub.kind() else { + return false; + }; + single_excluded_ascii_byte(class) == Some(b'\n') +} + +/// Returns `Some(b)` iff `class` matches every Unicode codepoint or byte +/// except a single ASCII byte `b`. We require ASCII because the runtime +/// matcher uses `memchr` over byte slices. +fn single_excluded_ascii_byte(class: ®ex_syntax::hir::Class) -> Option { + use regex_syntax::hir::Class; + + match class { + Class::Unicode(uc) => { + let ranges = uc.ranges(); + if ranges.len() != 2 { + return None; + } + let (r0, r1) = (&ranges[0], &ranges[1]); + if (r0.start() as u32) != 0 || (r1.end() as u32) != 0x10FFFF { + return None; + } + let gap_start = r0.end() as u32 + 1; + let gap_end = r1.start() as u32 - 1; + if gap_start != gap_end || gap_start > 0x7F { + return None; + } + Some(gap_start as u8) + } + Class::Bytes(bc) => { + let ranges = bc.ranges(); + if ranges.len() != 2 { + return None; + } + let (r0, r1) = (&ranges[0], &ranges[1]); + if r0.start() != 0 || r1.end() != 0xFF { + return None; + } + let gap_start = r0.end() as u16 + 1; + let gap_end = r1.start() as u16 - 1; + if gap_start != gap_end || gap_start > 0x7F { + return None; + } + Some(gap_start as u8) + } + } } /// Replaces substring(s) matching a PCRE-like regular expression. 
@@ -1012,4 +1309,129 @@ mod tests { assert_eq!(re.as_ref(), &expected); assert_eq!(re.null_count(), 3); } + + /// Assert that the fast path was selected for `pattern`. + fn assert_literal_prefix_capture( + pattern: &str, + expected_prefixes: &[&[u8]], + expected_terminator: u8, + ) { + let spec = + try_recognize_literal_prefix_capture(pattern).unwrap_or_else(|| { + panic!("expected literal-prefix recognizer to accept {pattern}") + }); + assert_eq!(spec.terminator, expected_terminator, "pattern {pattern}"); + let actual: Vec<&[u8]> = spec.prefixes.iter().map(|p| &p[..]).collect(); + assert_eq!(actual, expected_prefixes, "pattern {pattern}"); + } + + /// Run the optimized path and the full regex on every value and confirm they + /// agree. Acts as a generic differential test for the fast path. + fn assert_optimized_matches_regex(pattern: &str, values: &[&str]) { + let re = Regex::new(pattern).unwrap(); + let mut opt = OptimizedRegex::new(re.clone(), 1, pattern, "${1}"); + for value in values { + let expected = re.replacen(value, 1, "${1}"); + let actual = opt.replacen(value, 1, "${1}"); + assert_eq!(actual, expected, "pattern {pattern}, value {value:?}"); + } + } + + #[test] + fn literal_prefix_recognizer_accepts_clickbench_q28() { + assert_literal_prefix_capture( + r"^https?://(?:www\.)?([^/]+)/.*$", + &[ + b"https://www.", + b"http://www.", + b"https://", + b"http://", + ], + b'/', + ); + } + + #[test] + fn literal_prefix_recognizer_accepts_single_literal() { + assert_literal_prefix_capture(r"^foo:([^,]+),.*$", &[b"foo:"], b','); + } + + #[test] + fn literal_prefix_recognizer_accepts_alternation() { + // Equal-length prefixes end up in dedupe (alphabetical) order after + // the descending-length sort, which is stable. 
+ assert_literal_prefix_capture( + r"^(?:foo|bar|baz):([^/]+)/.*$", + &[b"bar:", b"baz:", b"foo:"], + b'/', + ); + } + + #[test] + fn literal_prefix_recognizer_rejects_non_anchored() { + assert!(try_recognize_literal_prefix_capture(r"https?://([^/]+)/.*$").is_none()); + } + + #[test] + fn literal_prefix_recognizer_rejects_unbounded_prefix() { + assert!( + try_recognize_literal_prefix_capture(r"^.+?:([^/]+)/.*$").is_none() + ); + } + + #[test] + fn literal_prefix_recognizer_rejects_non_ascii_terminator() { + assert!( + try_recognize_literal_prefix_capture(r"^a([^\u{0080}]+)\u{0080}.*$") + .is_none() + ); + } + + #[test] + fn literal_prefix_recognizer_rejects_case_insensitive() { + // `(?i)` folds the negated class to two bytes, which would break the + // single-byte memchr terminator assumption. + assert!( + try_recognize_literal_prefix_capture(r"^(?i)foo:([^a]+)a.*$") + .is_none() + ); + } + + #[test] + fn literal_prefix_fast_path_matches_full_regex_for_q28_pattern() { + assert_optimized_matches_regex( + r"^https?://(?:www\.)?([^/]+)/.*$", + &[ + "http://example.com/path", + "https://www.example.com/path", + "http://www./path", + "http://wwww.example.com/path", + "http://example.com/path\nnext", + "http://example.com/path\n", + "http://exa\nmple.com/path", + "https://example.com", + "ftp://example.com/path", + "http:///path", + "", + "/", + "http://example.com/", + ], + ); + } + + #[test] + fn literal_prefix_fast_path_matches_full_regex_for_alternation_pattern() { + assert_optimized_matches_regex( + r"^(?:foo|bar|baz):([^/]+)/.*$", + &[ + "foo:one/two", + "bar:value/", + "baz:x/y/z", + "qux:one/two", + "foo:/empty", + "foo:abc", + "", + ], + ); + } }