From 9df6801f5275e6097117edde307bd16f2b6c72ae Mon Sep 17 00:00:00 2001 From: Mark Jubenville Date: Wed, 27 May 2026 11:50:39 -0400 Subject: [PATCH 1/3] feat(data-sanitization): pattern groups, strict matching, cookieAndFormEncodedMatcher (#321) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add PatternEntry type (string | { match: string; strict?: boolean }) for exact vs. substring field-name control - Add credentialPatterns, headerPatterns, piiPatterns, phiPatterns constants; export all from index - defaultPatterns now includes headerPatterns (authorization, api-key) in addition to credentialPatterns - Rename formEncodedMatcher → cookieAndFormEncodedMatcher; stops at both & and ; so it handles URL form-encoded and HTTP Cookie headers in one matcher - Add strict param to all three matchers; cookie matcher uses a negative lookbehind (? --- CLAUDE.md | 12 +- packages/data-sanitization/README.md | 88 +++++-- packages/data-sanitization/src/constants.ts | 108 +++++++- packages/data-sanitization/src/index.ts | 9 + packages/data-sanitization/src/matchers.ts | 60 +++-- packages/data-sanitization/src/replacers.ts | 66 +++-- packages/data-sanitization/src/types.ts | 29 ++- .../data-sanitization/test/constants.test.ts | 118 +++++++++ .../data-sanitization/test/matchers.test.ts | 235 ++++++++++++++++-- 9 files changed, 629 insertions(+), 96 deletions(-) create mode 100644 packages/data-sanitization/test/constants.test.ts diff --git a/CLAUDE.md b/CLAUDE.md index 92cad95..46128a6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -29,14 +29,16 @@ a single function, `sanitizeData`, exported from `packages/data-sanitization/src **Key modules:** - `packages/data-sanitization/src/matchers.ts` — Three built-in `DataSanitizationMatcher` - factories (`jsonMatcher`, `escapedJsonMatcher`, `formEncodedMatcher`). Each takes a pattern string - and optional `remove` flag and returns a `RegExp`. Custom matchers must produce a global, - case-insensitive regex using capture groups `$1`/`$2` for value replacement. + factories (`cookieAndFormEncodedMatcher`, `jsonMatcher`, `escapedJsonMatcher`). Each takes a + pattern string, optional `remove` flag, and optional `strict` flag and returns a `RegExp`. Custom + matchers must produce a global, case-insensitive regex using capture groups `$1`/`$2` for value + replacement. - `packages/data-sanitization/src/replacers.ts` — `stringReplacer` and `objectReplacer`. String replacer iterates all (pattern × matcher) combinations. Object replacer builds `RegExp` key matchers once, then recurses with a `WeakSet` to detect circular references. -- `packages/data-sanitization/src/constants.ts` — Default field-name patterns (`apikey`, - `api_key`, `password`, `secret`, `token`) and default mask (`**********`). +- `packages/data-sanitization/src/constants.ts` — Pattern groups: `credentialPatterns`, + `headerPatterns`, `piiPatterns`, `phiPatterns`, and `defaultPatterns` (credentials + headers). + Also exports `DEFAULT_PATTERN_MASK` and `DEFAULT_NUMERIC_MASK`. - `packages/data-sanitization/src/types.ts` — All exported TypeScript types (`DataSanitizationMatcher`, `DataSanitizationReplacer`, `DataSanitizationReplacerOptions`, etc.). - `packages/data-sanitization/src/errors.ts` — `DataSanitizationError` with a `details` property; diff --git a/packages/data-sanitization/README.md b/packages/data-sanitization/README.md index 46b1bb9..069ac37 100644 --- a/packages/data-sanitization/README.md +++ b/packages/data-sanitization/README.md @@ -246,25 +246,11 @@ sanitizeData( ### Sanitize PII and PHI with custom patterns -Use `customPatterns` to mask fields that are sensitive for your domain, such as -PII or PHI fields. +Use the exported `piiPatterns` and `phiPatterns` constants — or build your own +list — and pass them via `customPatterns`. ```typescript -import { sanitizeData } from 'data-sanitization'; - -const sensitivePatterns = [ - 'address', - 'date_of_birth', - 'email', - 'emergency_contact', - 'full_name', - 'health_card', - 'ip_address', - 'medications', - 'phone', - 'postal_code', - 'ssn', -]; +import { sanitizeData, piiPatterns, phiPatterns } from 'data-sanitization'; const patient = { accountId: 'acct_123', @@ -277,7 +263,7 @@ const patient = { }; sanitizeData(patient, { - customPatterns: sensitivePatterns, + customPatterns: [...piiPatterns, ...phiPatterns], useDefaultPatterns: false, }); // => { @@ -296,7 +282,7 @@ masking them. ```typescript sanitizeData(patient, { - customPatterns: sensitivePatterns, + customPatterns: [...piiPatterns, ...phiPatterns], removeMatches: true, useDefaultPatterns: false, }); @@ -351,7 +337,7 @@ sanitizeData({ tags }, { sanitizeCollections: true }); | `sanitizeCollections` | `boolean` | `false` | Sanitize `Map` and `Set` instances by traversing their entries and returning a new sanitized copy. When false, these pass through unchanged like other non-plain object instances. | | `scanStringValues` | `boolean` | `true` | Scan string values on non-sensitive keys for embedded patterns. Applies to object input and to string input when `parseJsonStrings` is enabled; has no effect on raw string input. | | `parseJsonStrings` | `boolean` | `false` | Parse valid JSON string inputs as structured data and sanitize by field name. Re-serializes with `JSON.stringify`, discarding original whitespace. | -| `customPatterns` | `string[]` | `[]` | Additional field name patterns to match | +| `customPatterns` | `PatternEntry[]` | `[]` | Additional field name patterns to match. Each entry is a pattern string (substring match) or `{ match: string; strict?: boolean }` for an exact match. | | `customMatchers` | `DataSanitizationMatcher[]` | `[]` | Additional regex matchers for custom string formats | | `useDefaultPatterns` | `boolean` | `true` | Set to `false` to use only your custom patterns, ignoring the built-in defaults. | | `useDefaultMatchers` | `boolean` | `true` | Set to `false` to use only your custom matchers, ignoring the built-in defaults. | @@ -359,8 +345,10 @@ sanitizeData({ tags }, { sanitizeCollections: true }); ## Default patterns -The following field name patterns are matched by default using a -case-insensitive substring match: +The following field name patterns are matched by default. All use +case-insensitive substring matching unless noted as exact. + +**Credentials** (`credentialPatterns`): - `apikey` - `api_key` @@ -368,9 +356,57 @@ case-insensitive substring match: - `secret` - `token` -A field named `db_password` or `client_secret_key` would also match because +**HTTP authentication headers** (`headerPatterns`): + +- `authorization` +- `api-key` + +A field named `db_password` or `x-authorization` would also match because these patterns match as substrings. +Two additional pattern groups are exported but not included by default: + +- **`piiPatterns`** — Personally Identifiable Information: names, contact + details, government IDs, and digital identifiers. Ambiguous single-word + terms such as `address`, `city`, `state`, and `zip` use exact matching to + avoid false positives (e.g. `email_address` is not masked when only `address` + is in `piiPatterns`). +- **`phiPatterns`** — Protected Health Information under HIPAA: medical record + identifiers, healthcare dates, clinical data, and biometrics. + +Use them via `customPatterns`: + +```typescript +import { sanitizeData, piiPatterns, phiPatterns } from 'data-sanitization'; + +sanitizeData(patient, { + customPatterns: [...piiPatterns, ...phiPatterns], +}); +``` + +### Exact vs. substring matching + +Each pattern in `customPatterns` is a `PatternEntry`: either a plain string +(substring match) or an object with `strict: true` for an exact field-name +match. + +```typescript +// Substring: matches 'token', 'access_token', 'session_token', ... +sanitizeData(data, { customPatterns: ['token'] }); + +// Exact: matches only 'token', not 'access_token' +sanitizeData(data, { customPatterns: [{ match: 'token', strict: true }] }); +``` + +Use exact matching when a pattern is a common English word that would produce +false positives as a substring — for example, `state` would otherwise mask +`statement` or `stateCode`. + +> **`ignorePatterns` and exact matching:** `ignorePatterns` is a `string[]` +> matched against the `match` string of each active pattern. To suppress an +> exact-match entry such as `{ match: 'state', strict: true }`, pass +> `ignorePatterns: ['state']`. + ## Default matchers Three matchers are included by default: @@ -379,8 +415,10 @@ Three matchers are included by default: JSON-like strings - **Escaped JSON matcher**: matches `\"fieldName\":\"value\"` patterns in JSON embedded inside JSON string values -- **Form-encoded matcher**: matches `fieldName=value` and `fieldName:value` - patterns in URL-encoded and similarly delimited strings +- **Cookie and form-encoded matcher**: matches `fieldName=value` and + `fieldName:value` patterns in URL form-encoded strings and HTTP Cookie + headers. Values stop at `&`, `;`, `\r`, or `\n` so neither format's + separator is consumed as part of a value. ## Custom patterns and matchers diff --git a/packages/data-sanitization/src/constants.ts b/packages/data-sanitization/src/constants.ts index 95ff048..155beb9 100644 --- a/packages/data-sanitization/src/constants.ts +++ b/packages/data-sanitization/src/constants.ts @@ -1,8 +1,10 @@ +import { PatternEntry } from './types'; + /** - * These are some default patterns to search within field - * names used to determine what data is sanitized. + * Field-name patterns for credentials commonly present in any application + * that performs authentication or calls external APIs. */ -const DEFAULT_FIELD_NAME_PATTERNS = [ +const credentialPatterns: PatternEntry[] = [ 'apikey', 'api_key', 'password', @@ -10,6 +12,100 @@ const DEFAULT_FIELD_NAME_PATTERNS = [ 'token', ]; +/** + * Field-name patterns for HTTP headers that carry authentication or + * API-key material. Substring matching covers common variants: + * `authorization` matches `x-authorization` and `proxy-authorization`; + * `api-key` matches `x-api-key`. + */ +const headerPatterns: PatternEntry[] = ['authorization', 'api-key']; + +/** + * Field-name patterns for Personally Identifiable Information (PII). + * Opt-in — not included in `defaultPatterns`. Single-word terms that + * would produce false positives as substrings use `strict: true`. + */ +const piiPatterns: PatternEntry[] = [ + // Names + 'first_name', + 'last_name', + 'middle_name', + 'full_name', + 'date_of_birth', + 'dob', + 'birth_date', + // Contact + 'email', + 'phone', + 'mobile', + // Address — single-word terms use strict to avoid false positives + // (e.g. 'email_address', 'ip_address') + { match: 'address', strict: true }, + 'street_address', + 'address_line', + 'postal_code', + { match: 'city', strict: true }, + { match: 'state', strict: true }, + { match: 'zip', strict: true }, + // Government IDs + 'ssn', + 'social_security', + 'social_insurance_number', + 'national_id', + 'passport', + 'drivers_license', + 'tax_id', + // Digital identifiers (GDPR-relevant) + 'ip_address', +]; + +/** + * Field-name patterns for Protected Health Information (PHI) under HIPAA. + * Opt-in — not included in `defaultPatterns`. + */ +const phiPatterns: PatternEntry[] = [ + // Medical record identifiers + 'mrn', + 'medical_record_number', + 'patient_id', + 'chart_number', + 'member_id', + 'beneficiary_id', + 'subscriber_id', + 'insurance_id', + 'claim_number', + 'encounter_id', + // Healthcare-specific dates + 'admission_date', + 'discharge_date', + 'service_date', + 'appointment_date', + 'death_date', + // Clinical data + 'diagnosis_code', + 'diagnosis', + 'condition', + 'medication', + 'prescription', + 'procedure_code', + // Provider / facility + 'provider_npi', + 'provider_id', + // Biometrics + 'fingerprint', + 'biometric_id', +]; + +/** + * The default set of field-name patterns applied when no options override + * them. Covers credentials and common authentication headers. PII and PHI + * patterns are opt-in via `piiPatterns` and `phiPatterns`. + */ +const defaultPatterns: PatternEntry[] = [ + ...credentialPatterns, + ...headerPatterns, +]; + /** * A default mask used when replacing string field values. */ @@ -21,7 +117,11 @@ const DEFAULT_PATTERN_MASK = '**********'; const DEFAULT_NUMERIC_MASK = 9999999999; export { - DEFAULT_FIELD_NAME_PATTERNS, + credentialPatterns, + defaultPatterns, + headerPatterns, + phiPatterns, + piiPatterns, DEFAULT_NUMERIC_MASK, DEFAULT_PATTERN_MASK, }; diff --git a/packages/data-sanitization/src/index.ts b/packages/data-sanitization/src/index.ts index e79f77e..b5d4746 100644 --- a/packages/data-sanitization/src/index.ts +++ b/packages/data-sanitization/src/index.ts @@ -8,8 +8,17 @@ export type { DataSanitizationOutput, DataSanitizationReplacer, DataSanitizationReplacerOptions, + PatternEntry, } from './types'; +export { + credentialPatterns, + defaultPatterns, + headerPatterns, + phiPatterns, + piiPatterns, +} from './constants'; + /** * Returns a safe type label for data passed to the sanitizer. * diff --git a/packages/data-sanitization/src/matchers.ts b/packages/data-sanitization/src/matchers.ts index 659a6e4..77b14da 100644 --- a/packages/data-sanitization/src/matchers.ts +++ b/packages/data-sanitization/src/matchers.ts @@ -16,42 +16,55 @@ const escapePattern = (pattern: string): string => pattern.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); /** - * Matches field names in url form encoded data, or other types of - * data similarly character delimited + * Matches field names in cookie header strings (`key=value; key=value`) and + * URL form-encoded strings (`key=value&key=value`). Values stop at `&`, `;`, + * `\r`, or `\n`, so neither format's separator is consumed as part of a value. * * @example - * // Masking - * formEncodedMatcher('password') + * // Form-encoded masking + * cookieAndFormEncodedMatcher('password') * // 'password=secret&user=alice' → 'password=**********&user=alice' * * @example + * // Cookie masking + * cookieAndFormEncodedMatcher('token') + * // 'session_token=abc; user=alice' → 'session_token=**********; user=alice' + * + * @example * // Removal - * formEncodedMatcher('password', true) + * cookieAndFormEncodedMatcher('password', true) * // 'password=secret&user=alice' → 'user=alice' * - * @param pattern - Pattern in url form encoded like data used to match against field names. + * @param pattern - Pattern used to match against field names. * @param remove - Whether to create a matcher for removing matched fields instead of masking values. - * @returns A global, case-insensitive regular expression for matching form-like fields. + * @param strict - When true, matches only the exact field name rather than as a substring. + * @returns A global, case-insensitive regular expression for matching cookie and form-encoded fields. */ -const formEncodedMatcher: DataSanitizationMatcher = ( +const cookieAndFormEncodedMatcher: DataSanitizationMatcher = ( pattern, remove = false, + strict = false, ) => { const escaped = escapePattern(pattern); - const fieldName = `[\\w-]*${escaped}[\\w-]*`; + // Strict mode uses a negative lookbehind to reject substring matches (e.g. + // 'token' must not match inside 'session_token'). Non-strict mode wraps the + // pattern with [\w-]* on both sides to intentionally match substrings. + const fieldName = strict + ? `(? { +const jsonMatcher: DataSanitizationMatcher = ( + pattern, + remove = false, + strict = false, +) => { const escaped = escapePattern(pattern); - const fieldName = `[\\w-]*${escaped}[\\w-]*`; + const fieldName = strict ? escaped : `[\\w-]*${escaped}[\\w-]*`; if (remove) { const fieldPrefix = `"${fieldName}"\\s*:\\s*"`; @@ -108,14 +126,16 @@ const jsonMatcher: DataSanitizationMatcher = (pattern, remove = false) => { * * @param pattern - Pattern in escaped json data used to match against field names. * @param remove - Whether to create a matcher for removing matched fields instead of masking values. + * @param strict - When true, matches only the exact field name rather than as a substring. * @returns A global, case-insensitive regular expression for matching escaped JSON fields. */ const escapedJsonMatcher: DataSanitizationMatcher = ( pattern, remove = false, + strict = false, ) => { const escaped = escapePattern(pattern); - const fieldName = `[\\w-]*${escaped}[\\w-]*`; + const fieldName = strict ? escaped : `[\\w-]*${escaped}[\\w-]*`; const fieldPrefix = `\\\\"${fieldName}\\\\"\\s*:\\s*\\\\"`; if (remove) { @@ -131,13 +151,17 @@ const escapedJsonMatcher: DataSanitizationMatcher = ( return new RegExp(maskField, MATCHER_FLAGS); }; -const defaultMatchers = [formEncodedMatcher, jsonMatcher, escapedJsonMatcher]; +const defaultMatchers = [ + cookieAndFormEncodedMatcher, + jsonMatcher, + escapedJsonMatcher, +]; export { + cookieAndFormEncodedMatcher, defaultMatchers, escapedJsonMatcher, escapePattern, - formEncodedMatcher, jsonMatcher, }; diff --git a/packages/data-sanitization/src/replacers.ts b/packages/data-sanitization/src/replacers.ts index cba1838..31a51fb 100644 --- a/packages/data-sanitization/src/replacers.ts +++ b/packages/data-sanitization/src/replacers.ts @@ -1,27 +1,44 @@ -import { DataSanitizationMatcher, DataSanitizationReplacer } from './types'; import { - DEFAULT_FIELD_NAME_PATTERNS, + DataSanitizationMatcher, + DataSanitizationReplacer, + PatternEntry, +} from './types'; +import { + defaultPatterns, DEFAULT_NUMERIC_MASK, DEFAULT_PATTERN_MASK, } from './constants'; import defaultMatchers, { escapePattern } from './matchers'; +/** + * Normalizes a `PatternEntry` to a plain object with `match` and `strict` fields. + * + * @param entry - Pattern string or object to normalize. + * @returns Normalized entry with explicit `match` and `strict` values. + */ +const normalizeEntry = ( + entry: PatternEntry, +): { match: string; strict: boolean } => + typeof entry === 'string' + ? { match: entry, strict: false } + : { match: entry.match, strict: entry.strict ?? false }; + /** * Builds the active pattern list from defaults and any caller-supplied patterns, * minus any patterns listed in `ignorePatterns`. * * @param useDefaultPatterns - Whether to include the built-in default patterns. * @param customPatterns - Additional patterns to append to the active list. - * @param ignorePatterns - Patterns to remove from the assembled list. + * @param ignorePatterns - Pattern match strings to remove from the assembled list. * @returns Combined array of field-name patterns to match against. */ const buildPatterns = ( useDefaultPatterns: boolean, - customPatterns?: string[], + customPatterns?: PatternEntry[], ignorePatterns?: string[], -): string[] => { - const patterns = [ - ...(useDefaultPatterns ? DEFAULT_FIELD_NAME_PATTERNS : []), +): PatternEntry[] => { + const patterns: PatternEntry[] = [ + ...(useDefaultPatterns ? defaultPatterns : []), ...(customPatterns ?? []), ]; @@ -30,7 +47,9 @@ const buildPatterns = ( } const ignored = new Set(ignorePatterns.map((p) => p.toLowerCase())); - return patterns.filter((p) => !ignored.has(p.toLowerCase())); + return patterns.filter( + (p) => !ignored.has(normalizeEntry(p).match.toLowerCase()), + ); }; /** @@ -94,13 +113,17 @@ const getMatcherId = (matcher: DataSanitizationMatcher): string => { */ const buildStringScanRegexes = ( matchers: DataSanitizationMatcher[], - patterns: string[], + patterns: PatternEntry[], removeMatches: boolean, ): StringScanRegexes => { + const normalizedPatterns = patterns.map(normalizeEntry); + const key = matchers.map(getMatcherId).join('\x00') + '\x01' + - patterns.join('\x00') + + normalizedPatterns + .map(({ match, strict }) => `${match}:${strict}`) + .join('\x00') + '\x01' + removeMatches; @@ -114,11 +137,16 @@ const buildStringScanRegexes = ( const result: StringScanRegexes = { preFilter: - patterns.length > 0 - ? new RegExp(patterns.map(escapePattern).join('|'), 'i') + normalizedPatterns.length > 0 + ? new RegExp( + normalizedPatterns + .map(({ match }) => escapePattern(match)) + .join('|'), + 'i', + ) : null, - regexes: patterns.flatMap((pattern) => - matchers.map((matcher) => matcher(pattern, removeMatches)), + regexes: normalizedPatterns.flatMap(({ match, strict }) => + matchers.map((matcher) => matcher(match, removeMatches, strict)), ), }; @@ -240,9 +268,13 @@ const objectReplacer: DataSanitizationReplacer = (data, options = {}) => { customPatterns, ignorePatterns, ); - const keyMatchers = patterns.map( - (pattern) => new RegExp(`\\w*${escapePattern(pattern)}\\w*`, 'i'), - ); + const keyMatchers = patterns.map((entry) => { + const { match, strict } = normalizeEntry(entry); + const escaped = escapePattern(match); + return strict + ? new RegExp(`^${escaped}$`, 'i') + : new RegExp(`[\\w-]*${escaped}[\\w-]*`, 'i'); + }); const { preFilter: patternPreFilter, regexes: stringRegexes } = scanStringValues ? buildStringScanRegexes(matchers, patterns, removeMatches) diff --git a/packages/data-sanitization/src/types.ts b/packages/data-sanitization/src/types.ts index 3a4769f..5cdbf01 100644 --- a/packages/data-sanitization/src/types.ts +++ b/packages/data-sanitization/src/types.ts @@ -1,3 +1,17 @@ +/** + * A pattern entry for field-name matching. A plain string uses substring + * matching; the object form allows strict (exact) matching via `strict: true`. + * + * @example + * // Substring match — matches 'address', 'street_address', 'email_address' + * const p1: PatternEntry = 'address'; + * + * @example + * // Strict match — matches only the exact field name 'address' + * const p2: PatternEntry = { match: 'address', strict: true }; + */ +type PatternEntry = string | { match: string; strict?: boolean }; + /** * DataSanitizationMatchers are regex matchers to test against field names in data. * @@ -6,6 +20,7 @@ * * @param pattern - Field-name pattern used to create the matcher. * @param remove - Whether the matcher should support removal instead of masking. + * @param strict - When true, matches only the exact field name rather than as a substring. * @returns A regular expression that matches sensitive fields for the pattern. * @throws {Error} If the matcher cannot create a regular expression for the pattern. * @@ -14,7 +29,11 @@ * matcher('password').test('password=secret'); * // => true */ -type DataSanitizationMatcher = (pattern: string, remove?: boolean) => RegExp; +type DataSanitizationMatcher = ( + pattern: string, + remove?: boolean, + strict?: boolean, +) => RegExp; interface DataSanitizationReplacerOptions { /** @@ -23,10 +42,11 @@ interface DataSanitizationReplacerOptions { */ customMatchers?: DataSanitizationMatcher[]; /** - * Array of patterns to use in addition or in place - * of the built-in default patterns + * Array of patterns to use in addition or in place of the built-in default + * patterns. Accepts plain strings (substring match) or `PatternEntry` objects + * with `strict: true` for exact field-name matching. */ - customPatterns?: string[]; + customPatterns?: PatternEntry[]; /** * A number to use as a mask for number-typed field values in place of the * built-in default numeric mask @@ -145,4 +165,5 @@ export { DataSanitizationOutput, DataSanitizationReplacer, DataSanitizationReplacerOptions, + PatternEntry, }; diff --git a/packages/data-sanitization/test/constants.test.ts b/packages/data-sanitization/test/constants.test.ts new file mode 100644 index 0000000..8a97b4d --- /dev/null +++ b/packages/data-sanitization/test/constants.test.ts @@ -0,0 +1,118 @@ +import { describe, it, expect } from 'vitest'; +import { + credentialPatterns, + defaultPatterns, + headerPatterns, + phiPatterns, + piiPatterns, +} from '../src/constants'; +import { PatternEntry } from '../src/types'; + +const toMatchString = (entry: PatternEntry): string => + typeof entry === 'string' ? entry : entry.match; + +describe('constants', () => { + describe('credentialPatterns', () => { + it('should be a non-empty array', () => { + expect(credentialPatterns.length).toBeGreaterThan(0); + }); + + it('should contain common credential field names', () => { + const matches = credentialPatterns.map(toMatchString); + expect(matches).toContain('password'); + expect(matches).toContain('token'); + expect(matches).toContain('secret'); + }); + }); + + describe('headerPatterns', () => { + it('should be a non-empty array', () => { + expect(headerPatterns.length).toBeGreaterThan(0); + }); + + it('should contain common HTTP authentication header names', () => { + const matches = headerPatterns.map(toMatchString); + expect(matches).toContain('authorization'); + expect(matches).toContain('api-key'); + }); + }); + + describe('piiPatterns', () => { + it('should be a non-empty array', () => { + expect(piiPatterns.length).toBeGreaterThan(0); + }); + + it('should contain common PII field names', () => { + const matches = piiPatterns.map(toMatchString); + expect(matches).toContain('email'); + expect(matches).toContain('phone'); + expect(matches).toContain('ssn'); + }); + + it('should use strict matching for ambiguous single-word terms', () => { + const strictEntries = piiPatterns.filter( + (entry): entry is { match: string; strict?: boolean } => + typeof entry === 'object' && entry.strict === true, + ); + const strictMatches = strictEntries.map((e) => e.match); + expect(strictMatches).toContain('address'); + expect(strictMatches).toContain('city'); + expect(strictMatches).toContain('state'); + }); + }); + + describe('phiPatterns', () => { + it('should be a non-empty array', () => { + expect(phiPatterns.length).toBeGreaterThan(0); + }); + + it('should contain common PHI field names', () => { + const matches = phiPatterns.map(toMatchString); + expect(matches).toContain('patient_id'); + expect(matches).toContain('diagnosis'); + expect(matches).toContain('medication'); + }); + }); + + describe('defaultPatterns', () => { + it('should be a non-empty array', () => { + expect(defaultPatterns.length).toBeGreaterThan(0); + }); + + it('should include all credential patterns', () => { + const defaultMatches = defaultPatterns.map(toMatchString); + for (const entry of credentialPatterns) { + expect(defaultMatches).toContain(toMatchString(entry)); + } + }); + + it('should include all header patterns', () => { + const defaultMatches = defaultPatterns.map(toMatchString); + for (const entry of headerPatterns) { + expect(defaultMatches).toContain(toMatchString(entry)); + } + }); + + it('should not include PII patterns', () => { + const defaultMatches = defaultPatterns.map(toMatchString); + const piiMatches = piiPatterns.map(toMatchString); + for (const match of piiMatches) { + expect(defaultMatches).not.toContain(match); + } + }); + + it('should not include PHI patterns', () => { + const defaultMatches = defaultPatterns.map(toMatchString); + const phiMatches = phiPatterns.map(toMatchString); + for (const match of phiMatches) { + expect(defaultMatches).not.toContain(match); + } + }); + + it('should have no duplicate match strings', () => { + const matches = defaultPatterns.map(toMatchString); + const unique = new Set(matches); + expect(unique.size).toBe(matches.length); + }); + }); +}); diff --git a/packages/data-sanitization/test/matchers.test.ts b/packages/data-sanitization/test/matchers.test.ts index ef9e3b8..84e98c9 100644 --- a/packages/data-sanitization/test/matchers.test.ts +++ b/packages/data-sanitization/test/matchers.test.ts @@ -6,12 +6,12 @@ import { describe, expect, it } from 'vitest'; import { escapedJsonMatcher, escapePattern, - formEncodedMatcher, + cookieAndFormEncodedMatcher, jsonMatcher, } from '../src/matchers'; describe('DataSanitizationMatchers', () => { - describe('formEncodedMatcher', () => { + describe('cookieAndFormEncodedMatcher', () => { it('should find fields that have names that match the pattern', () => { // Arrange const testPattern = 'password'; @@ -21,7 +21,7 @@ describe('DataSanitizationMatchers', () => { username: 'bar', }; const testData = queryString.stringify(testObject); - const matcher = formEncodedMatcher(testPattern); + const matcher = cookieAndFormEncodedMatcher(testPattern); const allMatches: Array = []; // Act @@ -38,7 +38,7 @@ describe('DataSanitizationMatchers', () => { it('should match colon-separated field values', () => { // Arrange - const matcher = formEncodedMatcher('password'); + const matcher = cookieAndFormEncodedMatcher('password'); const testData = 'password:secret'; // Act @@ -51,7 +51,7 @@ describe('DataSanitizationMatchers', () => { it('should match form values containing non-delimiter punctuation', () => { // Arrange - const matcher = formEncodedMatcher('password'); + const matcher = cookieAndFormEncodedMatcher('password'); const testData = 'password=abc-123%2Ba/b.c:z+q&username=mark'; // Act @@ -66,7 +66,7 @@ describe('DataSanitizationMatchers', () => { it('should match case-insensitively', () => { // Arrange - const matcher = formEncodedMatcher('password'); + const matcher = cookieAndFormEncodedMatcher('password'); const testData = 'PASSWORD=foo&Password=bar'; // Act @@ -78,7 +78,7 @@ describe('DataSanitizationMatchers', () => { it('should match fields with the pattern as a substring', () => { // Arrange - const matcher = formEncodedMatcher('secret'); + const matcher = cookieAndFormEncodedMatcher('secret'); const testData = 'client_secret_key=abc&name=bob'; // Act @@ -91,7 +91,7 @@ describe('DataSanitizationMatchers', () => { it('should not match fields that do not contain the pattern', () => { // Arrange - const matcher = formEncodedMatcher('password'); + const matcher = cookieAndFormEncodedMatcher('password'); const testData = 'username=foo&email=bar'; // Act @@ -103,7 +103,7 @@ describe('DataSanitizationMatchers', () => { it('should remove matched fields and their values from the string', () => { // Arrange - const matcher = formEncodedMatcher('password', true); + const matcher = cookieAndFormEncodedMatcher('password', true); const testData = 'db_password=baz&username=bar&password=foo'; // Act @@ -115,7 +115,7 @@ describe('DataSanitizationMatchers', () => { it('should remove the field when it is the only entry in the string', () => { // Arrange - const matcher = formEncodedMatcher('token', true); + const matcher = cookieAndFormEncodedMatcher('token', true); const testData = 'token=abc'; // Act @@ -127,7 +127,7 @@ describe('DataSanitizationMatchers', () => { it('should remove the field even when its value contains special characters', () => { // Arrange - const matcher = formEncodedMatcher('password', true); + const matcher = cookieAndFormEncodedMatcher('password', true); const testData = 'password=abc-123%2Ba/b.c:z+q&username=mark'; // Act @@ -139,7 +139,7 @@ describe('DataSanitizationMatchers', () => { it('should stop matching at a newline without consuming lines that follow', () => { // Arrange - const matcher = formEncodedMatcher('api_key'); + const matcher = cookieAndFormEncodedMatcher('api_key'); const testData = 'api_key=hunter2\n at authenticate (/app/src/auth.js:89:15)'; @@ -154,7 +154,7 @@ describe('DataSanitizationMatchers', () => { it('should mask a field value and preserve lines that follow it', () => { // Arrange - const matcher = formEncodedMatcher('api_key'); + const matcher = cookieAndFormEncodedMatcher('api_key'); const testData = 'api_key=hunter2\n at authenticate (/app/src/auth.js:89:15)'; const mask = '**********'; @@ -170,7 +170,7 @@ describe('DataSanitizationMatchers', () => { it('should mask a field value when & follows on the same line and preserve subsequent lines', () => { // Arrange - const matcher = formEncodedMatcher('api_key'); + const matcher = cookieAndFormEncodedMatcher('api_key'); const testData = 'api_key=hunter2®ion=us-east-1\n at authenticate (/app/src/auth.js:89:15)'; const mask = '**********'; @@ -186,7 +186,7 @@ describe('DataSanitizationMatchers', () => { it('should remove the field and preserve lines that follow it', () => { // Arrange - const matcher = formEncodedMatcher('api_key', true); + const matcher = cookieAndFormEncodedMatcher('api_key', true); const testData = 'api_key=hunter2\n at authenticate (/app/src/auth.js:89:15)'; @@ -199,7 +199,7 @@ describe('DataSanitizationMatchers', () => { it('should stop at Windows-style line endings without including the carriage return', () => { // Arrange - const matcher = formEncodedMatcher('api_key'); + const matcher = cookieAndFormEncodedMatcher('api_key'); const testData = 'api_key=hunter2\r\n at authenticate (/app/src/auth.js:89:15)'; @@ -213,7 +213,7 @@ describe('DataSanitizationMatchers', () => { it('should mask a field value and preserve Windows-style lines that follow it', () => { // Arrange - const matcher = formEncodedMatcher('api_key'); + const matcher = cookieAndFormEncodedMatcher('api_key'); const testData = 'api_key=hunter2\r\n at authenticate (/app/src/auth.js:89:15)'; const mask = '**********'; @@ -229,7 +229,7 @@ describe('DataSanitizationMatchers', () => { it('should match a field with an empty value', () => { // Arrange - const matcher = formEncodedMatcher('password'); + const matcher = cookieAndFormEncodedMatcher('password'); const testData = 'password=&username=mark'; // Act @@ -241,7 +241,7 @@ describe('DataSanitizationMatchers', () => { it('should treat a URL-encoded ampersand as part of the value and not as a delimiter', () => { // Arrange - const matcher = formEncodedMatcher('password'); + const matcher = cookieAndFormEncodedMatcher('password'); const testData = 'password=a%26b&username=mark'; // Act @@ -254,7 +254,7 @@ describe('DataSanitizationMatchers', () => { it('should match a value containing base64 padding characters', () => { // Arrange - const matcher = formEncodedMatcher('token'); + const matcher = cookieAndFormEncodedMatcher('token'); const testData = 'token=abc123==&username=mark'; // Act @@ -267,7 +267,7 @@ describe('DataSanitizationMatchers', () => { it('should not treat a semicolon as a field delimiter', () => { // Arrange - const matcher = formEncodedMatcher('password'); + const matcher = cookieAndFormEncodedMatcher('password'); const testData = 'password=secret;username=mark'; // Act @@ -280,7 +280,7 @@ describe('DataSanitizationMatchers', () => { it('should match a field with a very long value', () => { // Arrange - const matcher = formEncodedMatcher('password'); + const matcher = cookieAndFormEncodedMatcher('password'); const longValue = 'x'.repeat(10_000); const testData = `password=${longValue}&username=mark`; @@ -294,7 +294,7 @@ describe('DataSanitizationMatchers', () => { it('should match a field whose value contains a tab character', () => { // Arrange - const matcher = formEncodedMatcher('password'); + const matcher = cookieAndFormEncodedMatcher('password'); const testData = 'password=sec\tret&username=mark'; // Act @@ -303,6 +303,119 @@ describe('DataSanitizationMatchers', () => { // Assert — tab is not in the stop-character set; should be captured as part of value expect(allMatches.length).toBe(1); }); + + describe('with cookie-style semicolon-separated input', () => { + it('should mask a sensitive field in a cookie string', () => { + // Arrange + const matcher = cookieAndFormEncodedMatcher('token'); + const testData = 'session_token=abc123; user=mark'; + + // Act + const result = testData.replace(matcher, '$1**********$2'); + + // Assert + expect(result).toBe('session_token=**********; user=mark'); + }); + + it('should preserve non-sensitive cookie pairs when masking', () => { + // Arrange + const matcher = cookieAndFormEncodedMatcher('token'); + const testData = 'user=mark; session_token=abc; theme=dark'; + + // Act + const result = testData.replace(matcher, '$1**********$2'); + + // Assert + expect(result).toBe('user=mark; session_token=**********; theme=dark'); + }); + + it('should remove a sensitive cookie field and leave others intact', () => { + // Arrange + const matcher = cookieAndFormEncodedMatcher('token', true); + const testData = 'user=mark; session_token=abc; theme=dark'; + + // Act + const result = testData.replace(matcher, ''); + + // Assert + expect(result).toBe('user=mark; theme=dark'); + }); + + it('should match a cookie field at the end of the string with no trailing semicolon', () => { + // Arrange + const matcher = cookieAndFormEncodedMatcher('token'); + const testData = 'user=mark; auth_token=xyz'; + + // Act + const result = testData.replace(matcher, '$1**********$2'); + + // Assert + expect(result).toBe('user=mark; auth_token=**********'); + }); + + it('should match a cookie field with an empty value', () => { + // Arrange + const matcher = cookieAndFormEncodedMatcher('token'); + const testData = 'session_token=; user=mark'; + + // Act + const allMatches = [...testData.matchAll(matcher)]; + + // Assert + expect(allMatches.length).toBe(1); + }); + + it('should not consume an ampersand as part of a cookie value', () => { + // Arrange — form-encoded field follows a cookie-style masked field + const matcher = cookieAndFormEncodedMatcher('token'); + const testData = 'session_token=abc; user=mark&extra=data'; + + // Act + const result = testData.replace(matcher, '$1**********$2'); + + // Assert — &extra=data is preserved, not consumed + expect(result).toContain('user=mark'); + expect(result).toContain('extra=data'); + }); + }); + + describe('with strict matching', () => { + it('should match only the exact field name when strict is true', () => { + // Arrange + const matcher = cookieAndFormEncodedMatcher('token', false, true); + const testData = 'token=abc&username=mark'; + + // Act + const result = testData.replace(matcher, '$1**********$2'); + + // Assert + expect(result).toBe('token=**********&username=mark'); + }); + + it('should not match a field whose name only contains the pattern as a substring when strict is true', () => { + // Arrange + const matcher = cookieAndFormEncodedMatcher('token', false, true); + const testData = 'session_token=abc&username=mark'; + + // Act + const allMatches = [...testData.matchAll(matcher)]; + + // Assert + expect(allMatches.length).toBe(0); + }); + + it('should match substring field names when strict is false (default)', () => { + // Arrange + const matcher = cookieAndFormEncodedMatcher('token'); + const testData = 'session_token=abc&username=mark'; + + // Act + const allMatches = [...testData.matchAll(matcher)]; + + // Assert + expect(allMatches.length).toBe(1); + }); + }); }); describe('jsonMatcher', () => { @@ -630,6 +743,44 @@ describe('DataSanitizationMatchers', () => { // Assert expect(allMatches.length).toBe(0); }); + + describe('with strict matching', () => { + it('should match only the exact field name when strict is true', () => { + // Arrange + const matcher = jsonMatcher('password', false, true); + const testData = '{"password":"secret"}'; + + // Act + const allMatches = [...testData.matchAll(matcher)]; + + // Assert + expect(allMatches.length).toBe(1); + }); + + it('should not match a field whose name contains the pattern as a substring when strict is true', () => { + // Arrange + const matcher = jsonMatcher('password', false, true); + const testData = '{"db_password":"secret"}'; + + // Act + const allMatches = [...testData.matchAll(matcher)]; + + // Assert + expect(allMatches.length).toBe(0); + }); + + it('should match substring field names when strict is false', () => { + // Arrange + const matcher = jsonMatcher('password'); + const testData = '{"db_password":"secret"}'; + + // Act + const allMatches = [...testData.matchAll(matcher)]; + + // Assert + expect(allMatches.length).toBe(1); + }); + }); }); describe('escapedJsonMatcher', () => { @@ -795,6 +946,44 @@ describe('DataSanitizationMatchers', () => { // Assert expect(allMatches.length).toBe(0); }); + + describe('with strict matching', () => { + it('should match only the exact field name when strict is true', () => { + // Arrange + const matcher = escapedJsonMatcher('password', false, true); + const testData = '\\"password\\":\\"secret\\"'; + + // Act + const allMatches = [...testData.matchAll(matcher)]; + + // Assert + expect(allMatches.length).toBe(1); + }); + + it('should not match a field whose name only contains the pattern as a substring when strict is true', () => { + // Arrange + const matcher = escapedJsonMatcher('password', false, true); + const testData = '\\"db_password\\":\\"secret\\"'; + + // Act + const allMatches = [...testData.matchAll(matcher)]; + + // Assert + expect(allMatches.length).toBe(0); + }); + + it('should match substring field names when strict is false (default)', () => { + // Arrange + const matcher = escapedJsonMatcher('password'); + const testData = '\\"db_password\\":\\"secret\\"'; + + // Act + const allMatches = [...testData.matchAll(matcher)]; + + // Assert + expect(allMatches.length).toBe(1); + }); + }); }); describe('escapePattern', () => { From 9b5281be8f9005bd5a91d9af6fc4800bf63350e1 Mon Sep 17 00:00:00 2001 From: Mark Jubenville Date: Wed, 27 May 2026 12:00:23 -0400 Subject: [PATCH 2/3] docs(data-sanitization): update performance.md for cookieAndFormEncodedMatcher - Rename 'Form-encoded matcher and multiline strings' section to 'Cookie and form-encoded matcher and multiline strings' - Update value stop-char description to mention & and ; both - Remove outdated claim that string removal is 10-20% slower than masking; benchmarks show cost is comparable - Widen cold-start ratio to 15-32x range to reflect hardware variance Co-Authored-By: Claude Sonnet 4.6 --- .../data-sanitization/docs/performance.md | 35 ++++++++++--------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/packages/data-sanitization/docs/performance.md b/packages/data-sanitization/docs/performance.md index c9056db..ac2d38e 100644 --- a/packages/data-sanitization/docs/performance.md +++ b/packages/data-sanitization/docs/performance.md @@ -236,10 +236,11 @@ reuse the cache and pay no compile cost. | Warm cache (same options each call) | ~451,000 | ~0.002 | | Cold start (unique options per call) | ~14,000 | ~0.070 | -The first call is ~32× slower than a warm call due to regex compilation. -In steady-state server usage this cost is paid once per process lifetime and -is negligible. It becomes visible only in tests or scripts that create many -distinct option configurations (e.g. per-request custom patterns). +The first call is significantly slower than a warm call due to regex +compilation (typically 15–32×, hardware-dependent). In steady-state server +usage this cost is paid once per process lifetime and is negligible. It becomes +visible only in tests or scripts that create many distinct option +configurations (e.g. per-request custom patterns). See [Cache memory growth](#cache-memory-growth) below for the memory implication of many distinct configurations. @@ -248,8 +249,7 @@ implication of many distinct configurations. `removeMatches: true` deletes matched fields from objects and matched key=value pairs from strings instead of masking them. The cost is similar to -masking for objects but slightly higher for string inputs due to regex -replacement pattern differences. +masking for both objects and strings. @@ -295,9 +295,9 @@ replacement pattern differences.
For objects, removal and masking are nearly equivalent — both write a result -object with the same traversal cost. For strings, removal is 10–20% slower -because the match-and-remove regex path involves different replacement -semantics than the `$1$2` substitution. +object with the same traversal cost. For strings, removal cost is comparable +to masking; the exact relative overhead varies with input and is within +benchmark noise at typical payload sizes. ## String workloads @@ -416,16 +416,17 @@ In steady-state usage — a fixed configuration, possibly with a static list of If `customPatterns` vary per call (e.g. injected from user input or request data), entries will cycle through the cache and every call will pay the -cold-start regex compilation cost (~32× slower than a warm call). In that -scenario, prebuild the options object once (or a small set of them) and reuse -it across calls. Or set `scanStringValues: false`, which bypasses the cache -entirely. +cold-start regex compilation cost (typically 15–32× slower than a warm call, +depending on pattern count and hardware). In that scenario, prebuild the +options object once (or a small set of them) and reuse it across calls. Or set +`scanStringValues: false`, which bypasses the cache entirely. -### Form-encoded matcher and multiline strings +### Cookie and form-encoded matcher and multiline strings -The built-in form-encoded matcher uses `[^\n&]*` to match a field value — -stopping at either an `&` delimiter or a newline. This means content on lines -after a matched value is preserved: +The built-in `cookieAndFormEncodedMatcher` uses `[^\r\n&;]*` to match a field +value — stopping at `&`, `;`, `\r`, or `\n`. This means content on lines after +a matched value is preserved, and the two separator styles (URL form-encoded +`&` and HTTP Cookie `;`) do not bleed into each other: ```text Input: "Error: auth failed — api_key=hunter2\n at foo (bar.js:10)" From 1b04d5212fb313f46af1d514b506b5f63c502e19 Mon Sep 17 00:00:00 2001 From: Mark Jubenville Date: Wed, 27 May 2026 12:05:21 -0400 Subject: [PATCH 3/3] test(data-sanitization): cover strict PatternEntry branches in objectReplacer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add two tests for object-form customPatterns entries: - strict: true → exact key match only (covers the ^pattern$ branch) - strict omitted → substring match (covers the strict ?? false branch) Restores 100% branch coverage. Co-Authored-By: Claude Sonnet 4.6 --- .../data-sanitization/test/replacers.test.ts | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/packages/data-sanitization/test/replacers.test.ts b/packages/data-sanitization/test/replacers.test.ts index 0e18c7b..9b54e89 100644 --- a/packages/data-sanitization/test/replacers.test.ts +++ b/packages/data-sanitization/test/replacers.test.ts @@ -1223,6 +1223,46 @@ describe('DataSanitizationReplacers', () => { }); }); + it('should mask only the exact field name when a strict object-form pattern is used', () => { + // Arrange + const testData = { + state: 'secret-state', + statement: 'not-sensitive', + username: 'mark', + }; + + // Act + const result = objectReplacer(testData, { + customPatterns: [{ match: 'state', strict: true }], + useDefaultPatterns: false, + }) as typeof testData; + + // Assert + expect(result.state).toBe(DEFAULT_PATTERN_MASK); + expect(result.statement).toBe('not-sensitive'); + expect(result.username).toBe('mark'); + }); + + it('should mask substring field names when an object-form pattern omits strict', () => { + // Arrange + const testData = { + state: 'secret-state', + statement: 'also-sensitive', + username: 'mark', + }; + + // Act + const result = objectReplacer(testData, { + customPatterns: [{ match: 'state' }], + useDefaultPatterns: false, + }) as typeof testData; + + // Assert + expect(result.state).toBe(DEFAULT_PATTERN_MASK); + expect(result.statement).toBe(DEFAULT_PATTERN_MASK); + expect(result.username).toBe('mark'); + }); + it('should leave class instances unchanged while masking sensitive fields in plain objects', () => { // Arrange const date = new Date('2024-01-01');