diff --git a/CLAUDE.md b/CLAUDE.md index a20a863b..8a90f985 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -161,7 +161,7 @@ Source is TypeScript in `src/`, compiled via `tsup`. The Rust native engine live **Configuration:** All tunable behavioral constants live in `DEFAULTS` in `src/infrastructure/config.ts`, grouped by concern (`analysis`, `risk`, `search`, `display`, `community`, `structure`, `mcp`, `check`, `coChange`, `manifesto`). Users override via `.codegraphrc.json` — `mergeConfig` deep-merges recursively so partial overrides preserve sibling keys. Env vars override LLM settings (`CODEGRAPH_LLM_*`). When adding new behavioral constants, **always add them to `DEFAULTS`** and wire them through config — never introduce new hardcoded magic numbers in individual modules. Category F values (safety boundaries, standard formulas, platform concerns) are the only exception. -**Database:** SQLite at `.codegraph/graph.db` with tables: `nodes`, `edges`, `metadata`, `embeddings`, `function_complexity` +**Database:** SQLite at `.codegraph/graph.db` with tables: `nodes`, `edges`, `metadata`, `embeddings`, `function_complexity`, `ast_nodes` (stored `new`/`throw`/`await` nodes and `string`/`regex` literals, queryable via `codegraph ast`). Both engines must extract `ast_nodes` for every language they parse — per-language node-type maps live in `src/ast-analysis/rules/index.ts` (`AST_TYPE_MAPS`, `AST_STRING_CONFIGS`) and mirror the native `LangAstConfig` constants in `crates/codegraph-core/src/extractors/helpers.rs`. Adding a new language requires a matching entry on both sides (the TypeScript maps and the Rust constants). 
## Test Structure diff --git a/crates/codegraph-core/src/extractors/javascript.rs b/crates/codegraph-core/src/extractors/javascript.rs index e5ca31d2..a3819f2a 100644 --- a/crates/codegraph-core/src/extractors/javascript.rs +++ b/crates/codegraph-core/src/extractors/javascript.rs @@ -551,7 +551,12 @@ fn walk_ast_nodes_depth(node: &Node, source: &[u8], ast_nodes: &mut Vec let content = raw .trim_start_matches(|c| c == '\'' || c == '"' || c == '`') .trim_end_matches(|c| c == '\'' || c == '"' || c == '`'); - if content.len() < 2 { + // Count Unicode code points, not UTF-8 bytes, so the filter matches + // helpers.rs `build_string_node` and the WASM visitor — a single non- + // ASCII glyph like `─` (3 bytes / 1 code point) must be treated as one + // character, otherwise we emit "excess" string nodes the WASM engine + // skips (see parity issue #1010). + if content.chars().count() < 2 { // Still recurse children (template_string may have nested expressions) for i in 0..node.child_count() { if let Some(child) = node.child(i) { diff --git a/src/ast-analysis/engine.ts b/src/ast-analysis/engine.ts index f92af4a9..b623a8cc 100644 --- a/src/ast-analysis/engine.ts +++ b/src/ast-analysis/engine.ts @@ -43,7 +43,9 @@ import type { } from '../types.js'; import { computeLOCMetrics, computeMaintainabilityIndex } from './metrics.js'; import { + AST_STRING_CONFIGS, AST_TYPE_MAPS, + astStopRecurseKinds, CFG_RULES, COMPLEXITY_RULES, DATAFLOW_RULES, @@ -458,7 +460,15 @@ function setupAstVisitor( for (const row of bulkNodeIdsByFile(db, relPath)) { nodeIdMap.set(`${row.name}|${row.kind}|${row.line}`, row.id); } - return createAstStoreVisitor(astTypeMap, symbols.definitions || [], relPath, nodeIdMap); + const stringConfig = AST_STRING_CONFIGS.get(langId); + return createAstStoreVisitor( + astTypeMap, + symbols.definitions || [], + relPath, + nodeIdMap, + stringConfig, + astStopRecurseKinds(langId), + ); } /** Set up complexity visitor if any definitions need WASM complexity analysis. 
*/ diff --git a/src/ast-analysis/rules/csharp.ts b/src/ast-analysis/rules/csharp.ts index 8c470907..e500db20 100644 --- a/src/ast-analysis/rules/csharp.ts +++ b/src/ast-analysis/rules/csharp.ts @@ -200,4 +200,11 @@ export const dataflow: DataflowRulesConfig = makeDataflowRules({ // ─── AST Node Types ─────────────────────────────────────────────────────── -export const astTypes: Record | null = null; +export const astTypes: Record | null = { + object_creation_expression: 'new', + throw_statement: 'throw', + throw_expression: 'throw', + await_expression: 'await', + string_literal: 'string', + verbatim_string_literal: 'string', +}; diff --git a/src/ast-analysis/rules/go.ts b/src/ast-analysis/rules/go.ts index b2792084..66099fde 100644 --- a/src/ast-analysis/rules/go.ts +++ b/src/ast-analysis/rules/go.ts @@ -181,4 +181,7 @@ export const dataflow: DataflowRulesConfig = makeDataflowRules({ // ─── AST Node Types ─────────────────────────────────────────────────────── -export const astTypes: Record | null = null; +export const astTypes: Record | null = { + interpreted_string_literal: 'string', + raw_string_literal: 'string', +}; diff --git a/src/ast-analysis/rules/index.ts b/src/ast-analysis/rules/index.ts index dac941b9..653cbd59 100644 --- a/src/ast-analysis/rules/index.ts +++ b/src/ast-analysis/rules/index.ts @@ -73,10 +73,187 @@ export const DATAFLOW_RULES: Map = new Map([ ['ruby', ruby.dataflow], ]); -// ─── AST Type Maps ─────────────────────────────────────────────────────── +// ─── AST Node Type Maps ────────────────────────────────────────────────── +// +// These mirror the per-language `LangAstConfig` constants in the native Rust +// engine (`crates/codegraph-core/src/extractors/helpers.rs`). WASM and native +// must agree on which tree-sitter node types to emit as `ast_nodes` rows. +// Languages without a dedicated rules/*.ts file have their maps inlined here. 
+ +const JS_AST_TYPES = javascript.astTypes as Record; +const PY_AST_TYPES = python.astTypes as Record; +const GO_AST_TYPES = go.astTypes as Record; +const RS_AST_TYPES = rust.astTypes as Record; +const JAVA_AST_TYPES = java.astTypes as Record; +const CS_AST_TYPES = csharp.astTypes as Record; +const RB_AST_TYPES = ruby.astTypes as Record; +const PHP_AST_TYPES = php.astTypes as Record; + +const C_AST_TYPES: Record = { + string_literal: 'string', +}; + +const CPP_AST_TYPES: Record = { + new_expression: 'new', + throw_statement: 'throw', + co_await_expression: 'await', + string_literal: 'string', + raw_string_literal: 'string', +}; + +const KOTLIN_AST_TYPES: Record = { + throw_expression: 'throw', + string_literal: 'string', +}; + +const SWIFT_AST_TYPES: Record = { + throw_statement: 'throw', + await_expression: 'await', + string_literal: 'string', +}; + +const SCALA_AST_TYPES: Record = { + object_creation_expression: 'new', + throw_expression: 'throw', + string_literal: 'string', +}; + +const BASH_AST_TYPES: Record = { + string: 'string', + expansion: 'string', +}; + +const ELIXIR_AST_TYPES: Record = { + string: 'string', + sigil: 'regex', +}; + +const LUA_AST_TYPES: Record = { + string: 'string', +}; + +const DART_AST_TYPES: Record = { + new_expression: 'new', + constructor_invocation: 'new', + throw_expression: 'throw', + await_expression: 'await', + string_literal: 'string', +}; + +const ZIG_AST_TYPES: Record = { + string_literal: 'string', +}; + +const HASKELL_AST_TYPES: Record = { + string: 'string', + char: 'string', +}; + +const OCAML_AST_TYPES: Record = { + string: 'string', +}; export const AST_TYPE_MAPS: Map> = new Map([ - ['javascript', javascript.astTypes as Record], - ['typescript', javascript.astTypes as Record], - ['tsx', javascript.astTypes as Record], + ['javascript', JS_AST_TYPES], + ['typescript', JS_AST_TYPES], + ['tsx', JS_AST_TYPES], + ['python', PY_AST_TYPES], + ['go', GO_AST_TYPES], + ['rust', RS_AST_TYPES], + ['java', JAVA_AST_TYPES], + 
['csharp', CS_AST_TYPES], + ['ruby', RB_AST_TYPES], + ['php', PHP_AST_TYPES], + ['c', C_AST_TYPES], + ['cpp', CPP_AST_TYPES], + ['kotlin', KOTLIN_AST_TYPES], + ['swift', SWIFT_AST_TYPES], + ['scala', SCALA_AST_TYPES], + ['bash', BASH_AST_TYPES], + ['elixir', ELIXIR_AST_TYPES], + ['lua', LUA_AST_TYPES], + ['dart', DART_AST_TYPES], + ['zig', ZIG_AST_TYPES], + ['haskell', HASKELL_AST_TYPES], + ['ocaml', OCAML_AST_TYPES], + ['ocaml-interface', OCAML_AST_TYPES], +]); + +// ─── Per-language string-extraction config ─────────────────────────────── +// +// Mirrors `quote_chars` + `string_prefixes` in the native `LangAstConfig`. +// Used by the AST-store visitor to strip quote characters and language- +// specific prefix sigils (Python `r"..."`, C# verbatim `@"..."`, Rust raw +// `r#"..."#`, etc.) when computing string content for the `name` column. + +export interface AstStringConfig { + quoteChars: string; + stringPrefixes: string; +} + +const JS_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"`', stringPrefixes: '' }; +const PY_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"', stringPrefixes: 'rbfuRBFU' }; +const GO_STRING_CONFIG: AstStringConfig = { quoteChars: '"`', stringPrefixes: '' }; +const RS_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' }; +const JAVA_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' }; +const CS_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' }; +const RB_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"', stringPrefixes: '' }; +const PHP_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"', stringPrefixes: '' }; +const C_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' }; +const CPP_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: 'LuUR' }; +const KOTLIN_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' }; +const SWIFT_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' }; +const 
SCALA_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' }; +const BASH_STRING_CONFIG: AstStringConfig = { quoteChars: '"\'', stringPrefixes: '' }; +const ELIXIR_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' }; +const LUA_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"', stringPrefixes: '' }; +const DART_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"', stringPrefixes: '' }; +const ZIG_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' }; +const HASKELL_STRING_CONFIG: AstStringConfig = { quoteChars: '"\'', stringPrefixes: '' }; +const OCAML_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' }; + +export const AST_STRING_CONFIGS: Map = new Map([ + ['javascript', JS_STRING_CONFIG], + ['typescript', JS_STRING_CONFIG], + ['tsx', JS_STRING_CONFIG], + ['python', PY_STRING_CONFIG], + ['go', GO_STRING_CONFIG], + ['rust', RS_STRING_CONFIG], + ['java', JAVA_STRING_CONFIG], + ['csharp', CS_STRING_CONFIG], + ['ruby', RB_STRING_CONFIG], + ['php', PHP_STRING_CONFIG], + ['c', C_STRING_CONFIG], + ['cpp', CPP_STRING_CONFIG], + ['kotlin', KOTLIN_STRING_CONFIG], + ['swift', SWIFT_STRING_CONFIG], + ['scala', SCALA_STRING_CONFIG], + ['bash', BASH_STRING_CONFIG], + ['elixir', ELIXIR_STRING_CONFIG], + ['lua', LUA_STRING_CONFIG], + ['dart', DART_STRING_CONFIG], + ['zig', ZIG_STRING_CONFIG], + ['haskell', HASKELL_STRING_CONFIG], + ['ocaml', OCAML_STRING_CONFIG], + ['ocaml-interface', OCAML_STRING_CONFIG], ]); + +// ─── Per-language "stop-after-collect" kinds ───────────────────────────── +// +// Mirrors the subtle difference between the native JS walker +// (`extractors/javascript.rs::walk_ast_nodes_depth`) — which *returns* after +// collecting `new_expression` and `throw_statement` to avoid double-counting +// the wrapped expression — and the generic walker (`helpers.rs::walk_ast_ +// nodes_with_config_depth`), which always recurses. 
For WASM/native parity +// the JS family must skip recursion on `new` and `throw`; every other +// language recurses normally. + +const JS_STOP_RECURSE: ReadonlySet = new Set(['new', 'throw']); +const EMPTY_STOP_RECURSE: ReadonlySet = new Set(); + +export function astStopRecurseKinds(langId: string): ReadonlySet { + if (langId === 'javascript' || langId === 'typescript' || langId === 'tsx') { + return JS_STOP_RECURSE; + } + return EMPTY_STOP_RECURSE; +} diff --git a/src/ast-analysis/rules/java.ts b/src/ast-analysis/rules/java.ts index 0b18d456..fa7ccbff 100644 --- a/src/ast-analysis/rules/java.ts +++ b/src/ast-analysis/rules/java.ts @@ -174,4 +174,8 @@ export const dataflow: DataflowRulesConfig = makeDataflowRules({ // ─── AST Node Types ─────────────────────────────────────────────────────── -export const astTypes: Record | null = null; +export const astTypes: Record | null = { + object_creation_expression: 'new', + throw_statement: 'throw', + string_literal: 'string', +}; diff --git a/src/ast-analysis/rules/php.ts b/src/ast-analysis/rules/php.ts index de689e08..113cf6c4 100644 --- a/src/ast-analysis/rules/php.ts +++ b/src/ast-analysis/rules/php.ts @@ -218,4 +218,9 @@ export const dataflow: DataflowRulesConfig = makeDataflowRules({ // ─── AST Node Types ─────────────────────────────────────────────────────── -export const astTypes: Record | null = null; +export const astTypes: Record | null = { + object_creation_expression: 'new', + throw_expression: 'throw', + string: 'string', + encapsed_string: 'string', +}; diff --git a/src/ast-analysis/rules/python.ts b/src/ast-analysis/rules/python.ts index 89d3f803..46621dd7 100644 --- a/src/ast-analysis/rules/python.ts +++ b/src/ast-analysis/rules/python.ts @@ -195,4 +195,8 @@ export const dataflow: DataflowRulesConfig = makeDataflowRules({ // ─── AST Node Types ─────────────────────────────────────────────────────── -export const astTypes: Record | null = null; +export const astTypes: Record | null = { + raise_statement: 
'throw', + await: 'await', + string: 'string', +}; diff --git a/src/ast-analysis/rules/ruby.ts b/src/ast-analysis/rules/ruby.ts index ea18c7ac..e9e71c10 100644 --- a/src/ast-analysis/rules/ruby.ts +++ b/src/ast-analysis/rules/ruby.ts @@ -203,4 +203,7 @@ export const dataflow: DataflowRulesConfig = makeDataflowRules({ // ─── AST Node Types ─────────────────────────────────────────────────────── -export const astTypes: Record | null = null; +export const astTypes: Record | null = { + string: 'string', + regex: 'regex', +}; diff --git a/src/ast-analysis/rules/rust.ts b/src/ast-analysis/rules/rust.ts index 4af834db..c21f9d6b 100644 --- a/src/ast-analysis/rules/rust.ts +++ b/src/ast-analysis/rules/rust.ts @@ -172,4 +172,8 @@ export const dataflow: DataflowRulesConfig = makeDataflowRules({ // ─── AST Node Types ─────────────────────────────────────────────────────── -export const astTypes: Record | null = null; +export const astTypes: Record | null = { + await_expression: 'await', + string_literal: 'string', + raw_string_literal: 'string', +}; diff --git a/src/ast-analysis/visitors/ast-store-visitor.ts b/src/ast-analysis/visitors/ast-store-visitor.ts index c21dd306..f825bc10 100644 --- a/src/ast-analysis/visitors/ast-store-visitor.ts +++ b/src/ast-analysis/visitors/ast-store-visitor.ts @@ -5,9 +5,42 @@ import type { Visitor, VisitorContext, } from '../../types.js'; +import type { AstStringConfig } from '../rules/index.js'; const TEXT_MAX = 200; +// ── Cross-language node-type constants (mirror Rust `helpers.rs`) ──────── +const IDENT_TYPES = new Set([ + 'identifier', + 'type_identifier', + 'name', + 'qualified_name', + 'scoped_identifier', + 'qualified_identifier', + 'member_expression', + 'member_access_expression', + 'field_expression', + 'attribute', + 'scoped_type_identifier', +]); + +const CALL_TYPES = new Set([ + 'call_expression', + 'call', + 'invocation_expression', + 'method_invocation', + 'function_call_expression', + 'member_call_expression', + 
'scoped_call_expression', +]); + +const DEFAULT_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"`', stringPrefixes: '' }; + +// Keyword tokens skipped when extracting the inner expression text of a +// throw/raise/await/new node. Module-level constant avoids reallocating on +// every call (can be hot in large files). +const CHILD_EXPR_SKIP_KEYWORDS = new Set(['throw', 'raise', 'await', 'new']); + interface AstStoreRow { file: string; line: number; @@ -20,69 +53,122 @@ interface AstStoreRow { function truncate(s: string | null | undefined, max: number = TEXT_MAX): string | null { if (!s) return null; - return s.length <= max ? s : `${s.slice(0, max - 1)}\u2026`; + return s.length <= max ? s : `${s.slice(0, max - 1)}…`; +} + +function trimLeadingChars(s: string, chars: string): string { + if (!chars) return s; + let i = 0; + while (i < s.length && chars.includes(s[i]!)) i++; + return i === 0 ? s : s.slice(i); } -function extractNewName(node: TreeSitterNode): string { +function trimTrailingChars(s: string, chars: string): string { + if (!chars) return s; + let i = s.length; + while (i > 0 && chars.includes(s[i - 1]!)) i--; + return i === s.length ? s : s.slice(0, i); +} + +/** Extract constructor name from a `new_expression` / `object_creation_expression`. */ +function extractConstructorName(node: TreeSitterNode): string { + for (const field of ['type', 'class', 'constructor']) { + const f = node.childForFieldName(field); + if (f?.text) return f.text; + } for (let i = 0; i < node.childCount; i++) { const child = node.child(i); if (!child) continue; - if (child.type === 'identifier') return child.text; - if (child.type === 'member_expression') return child.text; + if (IDENT_TYPES.has(child.type)) return child.text; + } + const raw = node.text || ''; + const beforeParen = raw.split('(')[0] || raw; + return beforeParen.replace(/^new\s+/, '').trim() || '?'; +} + +/** Extract function name from a call node. 
*/ +function extractCallName(node: TreeSitterNode): string { + for (const field of ['function', 'method', 'name']) { + const f = node.childForFieldName(field); + if (f?.text) return f.text; } - return node.text?.split('(')[0]?.replace('new ', '').trim() || '?'; + const text = node.text || ''; + return text.split('(')[0] || '?'; } -function extractExpressionText(node: TreeSitterNode): string | null { +/** Extract name from a throw/raise statement — matches native `extract_throw_target`. */ +function extractThrowName(node: TreeSitterNode, newTypes: Set): string { for (let i = 0; i < node.childCount; i++) { const child = node.child(i); if (!child) continue; - if (child.type !== 'throw' && child.type !== 'await') { - return truncate(child.text); - } + const ck = child.type; + if (newTypes.has(ck)) return extractConstructorName(child); + if (CALL_TYPES.has(ck)) return extractCallName(child); + if (IDENT_TYPES.has(ck)) return child.text; } - return truncate(node.text); + return truncate(node.text) ?? node.text ?? ''; } -/** Extract the name from a throw statement's child nodes. */ -function extractThrowName(node: TreeSitterNode): string | null { +/** Extract name from an await expression — matches native `extract_awaited_name`. */ +function extractAwaitName(node: TreeSitterNode): string { for (let i = 0; i < node.childCount; i++) { const child = node.child(i); if (!child) continue; - if (child.type === 'new_expression') return extractNewName(child); - if (child.type === 'call_expression') { - const fn = child.childForFieldName('function'); - return fn ? fn.text : child.text?.split('(')[0] || '?'; - } - if (child.type === 'identifier') return child.text; + const ck = child.type; + if (CALL_TYPES.has(ck)) return extractCallName(child); + if (IDENT_TYPES.has(ck)) return child.text; } - return truncate(node.text); + return truncate(node.text) ?? node.text ?? ''; } -/** Extract the name from an await expression's child nodes. 
*/ -function extractAwaitName(node: TreeSitterNode): string | null { +/** Extract text of the expression inside a throw/await, skipping the keyword. */ +function extractChildExpressionText(node: TreeSitterNode): string | null { for (let i = 0; i < node.childCount; i++) { const child = node.child(i); if (!child) continue; - if (child.type === 'call_expression') { - const fn = child.childForFieldName('function'); - return fn ? fn.text : child.text?.split('(')[0] || '?'; - } - if (child.type === 'identifier' || child.type === 'member_expression') { - return child.text; - } + if (!CHILD_EXPR_SKIP_KEYWORDS.has(child.type)) return truncate(child.text); } return truncate(node.text); } +/** + * Extract string content from a string-literal node, mirroring the native + * engine's `build_string_node` (`helpers.rs`). Returns `null` when the + * content is shorter than 2 Unicode code points. + */ +function extractStringContent(node: TreeSitterNode, cfg: AstStringConfig): string | null { + const raw = node.text ?? ''; + const isRawString = node.type.includes('raw_string'); + + let s = raw; + s = trimLeadingChars(s, '@'); + s = trimLeadingChars(s, cfg.stringPrefixes); + if (isRawString) s = trimLeadingChars(s, 'r#'); + s = trimLeadingChars(s, cfg.quoteChars); + if (isRawString) s = trimTrailingChars(s, '#'); + s = trimTrailingChars(s, cfg.quoteChars); + + // Count code points, not UTF-16 code units — matches Rust `chars().count()`. 
+ const codePointCount = [...s].length; + if (codePointCount < 2) return null; + return s; +} + export function createAstStoreVisitor( astTypeMap: Record, defs: Definition[], relPath: string, nodeIdMap: Map, + stringConfig: AstStringConfig = DEFAULT_STRING_CONFIG, + stopRecurseKinds: ReadonlySet = new Set(), ): Visitor { const rows: AstStoreRow[] = []; const matched = new Set(); + const newTypes = new Set( + Object.entries(astTypeMap) + .filter(([, kind]) => kind === 'new') + .map(([type]) => type), + ); function findParentDef(line: number): Definition | null { let best: Definition | null = null; @@ -106,12 +192,15 @@ export function createAstStoreVisitor( type KindHandler = (node: TreeSitterNode) => NameTextResult; const kindHandlers: Record = { - new: (node) => ({ name: extractNewName(node), text: truncate(node.text) }), - throw: (node) => ({ name: extractThrowName(node), text: extractExpressionText(node) }), - await: (node) => ({ name: extractAwaitName(node), text: extractExpressionText(node) }), + new: (node) => ({ name: extractConstructorName(node), text: truncate(node.text) }), + throw: (node) => ({ + name: extractThrowName(node, newTypes), + text: extractChildExpressionText(node), + }), + await: (node) => ({ name: extractAwaitName(node), text: extractChildExpressionText(node) }), string: (node) => { - const content = node.text?.replace(/^['"`]|['"`]$/g, '') || ''; - if (content.length < 2) return { name: null, text: null, skip: true }; + const content = extractStringContent(node, stringConfig); + if (content == null) return { name: null, text: null, skip: true }; return { name: truncate(content, 100), text: truncate(node.text) }; }, regex: (node) => ({ name: node.text || '?', text: truncate(node.text) }), @@ -156,7 +245,13 @@ export function createAstStoreVisitor( collectNode(node, kind); - if (kind !== 'string' && kind !== 'regex') { + // Mirror the native walker's recursion policy. 
In JS/TS, the native + // javascript.rs walker returns after collecting `new` or `throw` to + // avoid double-counting the wrapped expression (e.g. `throw new + // Error('x')` emits one `throw` row, not throw+new+string). Other + // languages go through helpers.rs::walk_ast_nodes_with_config_depth + // which always recurses — so `stopRecurseKinds` is empty for them. + if (stopRecurseKinds.has(kind)) { return { skipChildren: true }; } }, diff --git a/src/domain/wasm-worker-entry.ts b/src/domain/wasm-worker-entry.ts index d421ed9d..c594850b 100644 --- a/src/domain/wasm-worker-entry.ts +++ b/src/domain/wasm-worker-entry.ts @@ -28,7 +28,9 @@ import type { Tree } from 'web-tree-sitter'; import { Language, Parser, Query } from 'web-tree-sitter'; import { computeLOCMetrics, computeMaintainabilityIndex } from '../ast-analysis/metrics.js'; import { + AST_STRING_CONFIGS, AST_TYPE_MAPS, + astStopRecurseKinds, CFG_RULES, COMPLEXITY_RULES, DATAFLOW_RULES, @@ -584,7 +586,15 @@ function setupVisitorsLocal( if (opts.ast) { const astTypeMap = AST_TYPE_MAPS.get(langId); if (astTypeMap) { - astVisitor = createAstStoreVisitor(astTypeMap, defs, relPath, new Map()); + const stringConfig = AST_STRING_CONFIGS.get(langId); + astVisitor = createAstStoreVisitor( + astTypeMap, + defs, + relPath, + new Map(), + stringConfig, + astStopRecurseKinds(langId), + ); visitors.push(astVisitor); } } diff --git a/src/features/ast.ts b/src/features/ast.ts index c31b6690..da9aff19 100644 --- a/src/features/ast.ts +++ b/src/features/ast.ts @@ -1,5 +1,9 @@ import path from 'node:path'; -import { AST_TYPE_MAPS } from '../ast-analysis/rules/index.js'; +import { + AST_STRING_CONFIGS, + AST_TYPE_MAPS, + astStopRecurseKinds, +} from '../ast-analysis/rules/index.js'; import { buildExtensionSet } from '../ast-analysis/shared.js'; import { walkWithVisitors } from '../ast-analysis/visitor.js'; import { createAstStoreVisitor } from '../ast-analysis/visitors/ast-store-visitor.js'; @@ -22,8 +26,6 @@ const KIND_ICONS: 
Record = { await: '\u22B3', // ⊳ }; -const JS_TS_AST_TYPES = AST_TYPE_MAPS.get('javascript'); - const WALK_EXTENSIONS = buildExtensionSet(AST_TYPE_MAPS); // ─── Helpers ────────────────────────────────────────────────────────── @@ -171,9 +173,10 @@ function collectFileAstRows( // WASM fallback — walk tree if available const ext = path.extname(relPath).toLowerCase(); - if (WALK_EXTENSIONS.has(ext) && symbols._tree) { + const langId = symbols._langId || ''; + if ((WALK_EXTENSIONS.has(ext) || AST_TYPE_MAPS.has(langId)) && symbols._tree) { const rows: AstRow[] = []; - walkAst(symbols._tree.rootNode, defs, relPath, rows, nodeIdMap); + walkAst(symbols._tree.rootNode, defs, relPath, rows, nodeIdMap, langId); return rows; } @@ -226,13 +229,23 @@ function walkAst( relPath: string, rows: AstRow[], nodeIdMap: Map, + langId: string, ): void { - if (!JS_TS_AST_TYPES) { - debug('ast-store: JS_TS_AST_TYPES not available — skipping walk'); + const astTypeMap = AST_TYPE_MAPS.get(langId); + if (!astTypeMap) { + debug(`ast-store: no astTypes for langId=${langId} — skipping walk`); return; } - const visitor = createAstStoreVisitor(JS_TS_AST_TYPES, defs, relPath, nodeIdMap); - const results = walkWithVisitors(rootNode, [visitor], 'javascript'); + const stringConfig = AST_STRING_CONFIGS.get(langId); + const visitor = createAstStoreVisitor( + astTypeMap, + defs, + relPath, + nodeIdMap, + stringConfig, + astStopRecurseKinds(langId), + ); + const results = walkWithVisitors(rootNode, [visitor], langId); const collected = (results['ast-store'] || []) as AstRow[]; rows.push(...collected); } diff --git a/tests/engines/ast-parity.test.ts b/tests/engines/ast-parity.test.ts index e5ba990d..a73c5852 100644 --- a/tests/engines/ast-parity.test.ts +++ b/tests/engines/ast-parity.test.ts @@ -8,6 +8,14 @@ */ import { beforeAll, describe, expect, it } from 'vitest'; +import { + AST_STRING_CONFIGS, + AST_TYPE_MAPS, + astStopRecurseKinds, +} from '../../src/ast-analysis/rules/index.js'; +import { 
walkWithVisitors } from '../../src/ast-analysis/visitor.js'; +import { createAstStoreVisitor } from '../../src/ast-analysis/visitors/ast-store-visitor.js'; +import { createParsers, getParser } from '../../src/domain/parser.js'; import { getNative, isNativeAvailable } from '../../src/infrastructure/native.js'; import type { NativeAddon } from '../../src/types.js'; @@ -138,4 +146,300 @@ describeOrSkip('AST node parity (native vs WASM)', () => { // Should be an array (possibly empty), not undefined expect(Array.isArray(astNodes)).toBe(true); }); + + // ── Row-count parity per language (#1010) ──────────────────────────── + // Both engines must emit the same ast_nodes row count for a given source. + // Divergence means one engine is under- or over-extracting relative to the + // other. Tested across all languages WASM has grammars + AST_TYPE_MAPS for. + + interface Fixture { + langId: string; + ext: string; + code: string; + } + + const PARITY_FIXTURES: Fixture[] = [ + { + langId: 'javascript', + ext: '.js', + code: JS_SNIPPET, + }, + { + langId: 'typescript', + ext: '.ts', + code: TS_SNIPPET, + }, + { + langId: 'python', + ext: '.py', + code: ` +import os +async def fetch(url): + resp = await http.get(url) + if not resp: + raise ValueError("no data") + return resp +s = "hello world" +r = r"raw" +f = f"prefix {s}" +`, + }, + { + langId: 'rust', + ext: '.rs', + code: ` +async fn load() -> Result { + let s = "hello world".to_string(); + let r = r"raw string content"; + let got = fetch().await?; + Ok(s + &got) +} +`, + }, + { + langId: 'go', + ext: '.go', + code: + ` +package main +import "fmt" +func main() { + s := "hello world" + r := ` + + '`raw string content`' + + ` + fmt.Println(s, r) +} +`, + }, + { + langId: 'java', + ext: '.java', + code: ` +public class App { + public static void main(String[] args) { + String s = "hello world"; + if (args.length == 0) { + throw new RuntimeException("no args"); + } + Object o = new Object(); + } +} +`, + }, + // ── Minimal 
fixtures for languages added in PR #1016 ─────────────── + // Each exercises at least one string literal + one other ast_node kind + // from AST_TYPE_MAPS to catch silent WASM/native divergence. + { + langId: 'csharp', + ext: '.cs', + code: ` +using System; +public class App { + public static void Main() { + string s = "hello world"; + throw new InvalidOperationException("bad"); + } +} +`, + }, + { + langId: 'ruby', + ext: '.rb', + code: ` +class MyError < StandardError; end +def load_data + s = "hello world" + raise MyError, "no data" +end +`, + }, + { + langId: 'php', + ext: '.php', + code: ` +int main(void) { + const char *s = "hello world"; + printf("%s\\n", s); + return 0; +} +`, + }, + { + langId: 'cpp', + ext: '.cpp', + code: ` +#include +#include +int run() { + std::string s = "hello world"; + auto *p = new int(42); + throw std::runtime_error("bad"); + return *p; +} +`, + }, + { + langId: 'kotlin', + ext: '.kt', + code: ` +fun run() { + val s = "hello world" + throw RuntimeException("bad") +} +`, + }, + { + langId: 'swift', + ext: '.swift', + code: ` +enum MyError: Error { case bad } +func run() async throws -> String { + let s = "hello world" + let r = try await load() + throw MyError.bad +} +`, + }, + { + langId: 'scala', + ext: '.scala', + code: ` +object App { + def run(): Unit = { + val s = "hello world" + val o = new Exception("bad") + throw o + } +} +`, + }, + { + langId: 'bash', + ext: '.sh', + code: ` +#!/bin/bash +s="hello world" +echo "$s" +`, + }, + { + langId: 'elixir', + ext: '.ex', + code: ` +defmodule App do + def run do + s = "hello world" + r = ~r/^[a-z]+$/ + {s, r} + end +end +`, + }, + { + langId: 'lua', + ext: '.lua', + code: ` +local function run() + local s = "hello world" + return s +end +`, + }, + { + langId: 'dart', + ext: '.dart', + code: ` +Future run() async { + final s = "hello world"; + final r = await load(); + throw Exception("bad"); +} +`, + }, + { + langId: 'zig', + ext: '.zig', + code: ` +const std = @import("std"); +pub 
fn main() void { + const s = "hello world"; + std.debug.print("{s}\\n", .{s}); +} +`, + }, + { + langId: 'haskell', + ext: '.hs', + code: ` +module Main where +main :: IO () +main = do + let s = "hello world" + putStrLn s +`, + }, + { + langId: 'ocaml', + ext: '.ml', + code: ` +let run () = + let s = "hello world" in + print_endline s +`, + }, + ]; + + async function wasmExtractAstNodes(code: string, ext: string, langId: string): Promise { + const parsers = await createParsers(); + const parser = getParser(parsers, `/test/file${ext}`); + if (!parser) return -1; + const tree = parser.parse(code); + if (!tree) return -1; + const astTypeMap = AST_TYPE_MAPS.get(langId); + if (!astTypeMap) return 0; + const stringConfig = AST_STRING_CONFIGS.get(langId); + const visitor = createAstStoreVisitor( + astTypeMap, + [], + `/test/file${ext}`, + new Map(), + stringConfig, + astStopRecurseKinds(langId), + ); + const results = walkWithVisitors(tree.rootNode as any, [visitor], langId); + const rows = (results['ast-store'] || []) as unknown[]; + return rows.length; + } + + for (const fixture of PARITY_FIXTURES) { + it.skipIf(!isNativeAvailable())(`ast_nodes row-count parity: ${fixture.langId}`, async () => { + const wasmCount = await wasmExtractAstNodes(fixture.code, fixture.ext, fixture.langId); + if (wasmCount === -1) return; // Grammar unavailable locally — skip. + + const nativeResult = nativeExtract(fixture.code, `/test/file${fixture.ext}`); + const nativeCount = (nativeResult.astNodes || nativeResult.ast_nodes || []).length; + + // Allow ≤1 row tolerance — see issue #1010 acceptance criteria. + const diff = Math.abs(wasmCount - nativeCount); + expect( + diff, + `${fixture.langId}: WASM=${wasmCount}, Native=${nativeCount}`, + ).toBeLessThanOrEqual(1); + }); + } });