diff --git a/.gitattributes b/.gitattributes index eb61982..e141d46 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,8 +1,8 @@ # Generated artifacts (npm run gen) — committed for consumers, CI-gated for # staleness, collapsed in GitHub diffs. The grammar sources (*.ts at the repo # root) are the hand-written truth; everything below is derived from them. -# (*.cst-types.ts / *.cst-match.ts are generated too but NOT committed — see -# .gitignore; they regenerate locally and in CI before typecheck/gates.) +# (*.cst-match.ts is generated too but NOT committed — see .gitignore; +# it regenerates locally and in CI before typecheck/gates.) *.tmLanguage.json linguist-generated=true *.language-configuration.json linguist-generated=true *.monarch.json linguist-generated=true diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4816031..c710619 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,9 +29,9 @@ jobs: - run: npm ci - # Regenerate every grammar's artifacts FIRST: the uncommitted ones - # (*.cst-types.ts / *.cst-match.ts, gitignored) must exist before Typecheck - # and the gates, which import them. Then fail if any COMMITTED artifact + # Regenerate every grammar's artifacts FIRST: the uncommitted one + # (*.cst-match.ts, gitignored) must exist before Typecheck + # and the gates, which import it. Then fail if any COMMITTED artifact # drifts from the regenerated output (someone edited a grammar but forgot # to regenerate). Covers all grammars (sources at the repo root) + the # tree-sitter packages. 
diff --git a/.gitignore b/.gitignore index bb05bd4..72189b4 100644 --- a/.gitignore +++ b/.gitignore @@ -18,7 +18,6 @@ tree-sitter/*/src/node-types.json tree-sitter/*/src/tree_sitter/ tree-sitter/*/*.wasm -# Generated CST consumer artifacts (npm run gen) — derived from the grammar, not +# Generated CST consumer artifact (npm run gen) — derived from the grammar, not # committed: generate locally / in CI before typecheck and gates. -*.cst-types.ts *.cst-match.ts diff --git a/README.md b/README.md index ea56dd7..243879e 100644 --- a/README.md +++ b/README.md @@ -338,6 +338,19 @@ const Regex = token(seq( [`test/agnostic.ts`](test/agnostic.ts) proves it directly — the same engine parses a toy grammar whose identifier token is `Word`, with no templates or regex. The deeper proof is [`html.ts`](html.ts): markup shares *nothing* with TypeScript's token stream, yet the same engine handles it. +### The emitted parser need not be JS — Go, Rust, native + +The grammar also derives a **standalone parser in another language**. [`emitParser(grammar, target)`](src/emit.ts) runs one analysis into one language-agnostic IR, and each `Target` renders it — including its own regex-free lexer (`emitParser` reuses `emitLexer(grammar, target)`), so the output has no dependency on the JS runtime and compiles offline: + +```ts +import { emitParser, goTarget, rustTarget } from './src/emit.ts'; + +writeFileSync('parser.go', emitParser(grammar, goTarget)); // `go build`, no deps +writeFileSync('parser.rs', emitParser(grammar, rustTarget)); // `rustc`, no crates +``` + +The proof is the full languages: the real [`javascript.ts`](javascript.ts) and [`typescript.ts`](typescript.ts) grammars — including the `[Await]/[Yield]` fork, left recursion, the regex/division and template state machines, arrow functions, and the TS type grammar — emit to **TypeScript, Go, and Rust**, and every CST is byte-identical to the reference interpreter. 
[`test/portable-targets.ts`](test/portable-targets.ts) compiles and runs all three for sixteen grammars (the two real languages plus focused fixtures) on every CI run. The Rust output reaches [oxc](https://github.com/oxc-project/oxc) throughput and the Go output beats [tsgo](https://github.com/microsoft/typescript-go) on the same corpus (an arena keeps both near zero-allocation). Byte-based Go/Rust use UTF-8 offsets — identical to the JS interpreter's for ASCII; non-ASCII offset units differ inherently. + ## Adding a language A new language is **one grammar file** on the unchanged engine: @@ -375,8 +388,7 @@ typescript.ts one grammar (TypeScript combinator API) ├─ src/gen-tm.ts ───────────▶ typescript.tmLanguage.json (TextMate highlighter) ├─ src/gen-vscode-config.ts ▶ typescript.language-configuration.json (editor behavior) ├─ src/gen-treesitter.ts ───▶ tree-sitter/ (grammar.js + highlights.scm + scanner.c) - ├─ src/gen-monarch.ts ──────▶ typescript.monarch.json - └─ src/gen-ast-types.ts ────▶ typescript.cst-types.ts + └─ src/gen-monarch.ts ──────▶ typescript.monarch.json shared src/grammar-utils.ts structural helpers used across stages src/api.ts, types.ts the grammar's combinator + type surface diff --git a/src/cli.ts b/src/cli.ts index 9752e16..6a567df 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -4,7 +4,6 @@ import { generateTmLanguage, generateMarkupInjection, generateAliasGrammar, gene import { generateLanguageConfig } from './gen-vscode-config.ts'; import { generateTreeSitter } from './gen-treesitter.ts'; import { generateMonarch } from './gen-monarch.ts'; -import { generateAstTypes } from './gen-ast-types.ts'; import { generateCstMatch } from './gen-cst-match.ts'; import type { CstGrammar, RuleExpr } from './types.ts'; import { tokenPatternSource } from './token-pattern.ts'; @@ -115,11 +114,8 @@ emit(`tree-sitter/${langName}/package.json`, // Monaco Monarch tokenizer (markup-aware: emits a tag/text/raw-text state machine). 
emit(`${langName}.monarch.json`, JSON.stringify(generateMonarch(grammar), null, 2)); -// CST node types (TypeScript) — generic over rules, fine for markup too. -emit(`${langName}.cst-types.ts`, generateAstTypes(grammar)); - -// Per-arm CST destructurers (value-level sibling of the types above). -emit(`${langName}.cst-match.ts`, generateCstMatch(grammar, `./${langName}.cst-types.ts`)); +// Per-arm CST destructurers. +emit(`${langName}.cst-match.ts`, generateCstMatch(grammar)); function formatExpr(expr: RuleExpr): string { switch (expr.type) { diff --git a/src/emit-lexer.ts b/src/emit-lexer.ts index 13e254d..c9f2921 100644 --- a/src/emit-lexer.ts +++ b/src/emit-lexer.ts @@ -39,13 +39,13 @@ const resyncRetractLine = (indent: string): string => // loop, so `cc>127 && lxNonAsciiWs(cc)` is EXACTLY "the regex would match here" → byte- // identical, minus the wasted exec on the common non-whitespace case (#45 B4). const NON_ASCII_WS_FN = - `function lxNonAsciiWs(cc) { return cc === 0xa0 || cc === 0x1680 || (cc >= 0x2000 && cc <= 0x200a) || cc === 0x2028 || cc === 0x2029 || cc === 0x202f || cc === 0x205f || cc === 0x3000 || cc === 0xfeff; }`; + `function lxNonAsciiWs(cc: number) { return cc === 0xa0 || cc === 0x1680 || (cc >= 0x2000 && cc <= 0x200a) || cc === 0x2028 || cc === 0x2029 || cc === 0x202f || cc === 0x205f || cc === 0x3000 || cc === 0xfeff; }`; // The non-ASCII whitespace fallback, emitted at the two sites that need it (after an ASCII run, // and as the lead char). `cont` appends the `continue` the lead-char site needs. const nonAsciiWsConsume = (v: string, cont: boolean, indent: string): string => - `${indent}if (${v} > 127 && lxNonAsciiWs(${v})) { LX_WS.lastIndex = pos; const m = LX_WS.exec(source); if (m !== null) { if (m[0].includes('\\n')) pendingNl = true; pos += m[0].length;${cont ? 
' continue;' : ''} } }`; + `${indent}if (${v} > 127 && lxNonAsciiWs(${v})) { LX_WS.lastIndex = pos; const m = LX_WS.exec(source); if (m !== null) { if (/[\\n\\r\\u2028\\u2029]/.test(m[0])) pendingNl = true; pos += m[0].length;${cont ? ' continue;' : ''} } }`; -export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { +export function emitSoaLexer(grammar: CstGrammar, st: LexerSymtab): string | null { // Out of scope: the markup / indentation / newline state machines. if (grammar.markup || grammar.indent || grammar.newline) return null; if (grammar.tokens.some(t => tokenBlockPatternSource(t) || t.blockOnly)) return null; @@ -134,22 +134,22 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`// min paren depth recorded over the old suffix [j, altN) (pop-on-empty = -1),`); emit(`// built lazily once per edit (the caller nulls it when the alt stream changes).`); emit(`let lexResyncPd = 0;`); - emit(`let altSuffMin = null;`); - emit(`let altSuffMinBuf = null;`); + emit(`let altSuffMin: Int32Array | null = null;`); + emit(`let altSuffMinBuf: Int32Array | null = null;`); emit(`// ')' pops that found an empty stack, in THIS lexCore call's token indices`); - emit(`let lexEmptyPops = [];`); + emit(`let lexEmptyPops: number[] = [];`); emit(`// Min OLD-stream paren depth over the tokens inside the damage itself (set by the`); emit(`// caller before the window lex): the old-side trajectory min starts from here.`); emit(`let wndOldMin0 = 0x7fffffff;`); - emit(`function buildAltSuffMin(lo) {`); + emit(`function buildAltSuffMin(lo: number) {`); emit(` if (altSuffMinBuf === null || altSuffMinBuf.length < altN + 1) altSuffMinBuf = new Int32Array(altN + 1025);`); emit(` altSuffMin = altSuffMinBuf;`); - emit(` altSuffMin[altN] = 0x7fffffff;`); + emit(` altSuffMin![altN] = 0x7fffffff;`); emit(` for (let j = altN - 1; j >= lo; j--) {`); - emit(` let d = altPd[j];`); - emit(` if (d === 0 && altK[j] === K_PUNCT && altT[j] === 
${tOf(')')} && (j === 0 || altPd[j - 1] === 0)) d = -1;`); - emit(` const nx = altSuffMin[j + 1];`); - emit(` altSuffMin[j] = d < nx ? d : nx;`); + emit(` let d = altPd![j];`); + emit(` if (d === 0 && altK![j] === K_PUNCT && altT![j] === ${tOf(')')} && (j === 0 || altPd![j - 1] === 0)) d = -1;`); + emit(` const nx = altSuffMin![j + 1];`); + emit(` altSuffMin![j] = d < nx ? d : nx;`); emit(` }`); emit(`}`); emit(`const LX_UNI_IDENT = /[$_\\p{ID_Start}][$\\u200c\\u200d\\p{ID_Continue}]*/uy;`); @@ -175,7 +175,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { // Length window → first-charCode switch → per-keyword compare chains (shortest first); // returns exactly what LIT_KW.get(source.slice(a, b)) ?? 0 would — the keyword set is // enumerated completely and keywords are pure ASCII, so charCode compares are exact. - emit(`function lexKwT(source, a, b) {`); + emit(`function lexKwT(source: string, a: number, b: number) {`); const kwEntries = [...st.kwLitKind.entries()]; if (kwEntries.length === 0) { emit(` return 0;`); @@ -205,11 +205,11 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { } emit(`}`); // identTextValid, with the per-token prefix length baked at the call site. - emit(`function lexIdentValid(text, prefixLen) {`); + emit(`function lexIdentValid(text: string, prefixLen: number) {`); emit(` const body = prefixLen > 0 ? text.slice(prefixLen) : text;`); emit(` if (!body.includes('\\\\')) return true;`); emit(` let bad = false;`); - emit(` const decoded = body.replace(LX_DECODE_ESC, (_m, braced, fixed) => {`); + emit(` const decoded = body.replace(LX_DECODE_ESC, (_m: string, braced: string, fixed: string) => {`); emit(` const cp = parseInt(braced ?? 
fixed, 16);`); emit(` if (cp > 0x10FFFF) { bad = true; return ''; }`); emit(` return String.fromCodePoint(cp);`); @@ -219,7 +219,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` return m !== null && m[0].length === decoded.length;`); emit(`}`); if (templateToken) { - emit(`function lexTplSpan(source, pos, validateEscapes) {`); + emit(`function lexTplSpan(source: string, pos: number, validateEscapes: boolean) {`); emit(` const tplFrom = pos;`); emit(` while (pos < source.length) {`); emit(` if (${startsWithExpr('source', 'pos', tplInterpOpen)}) return { endsWithInterp: true, end: pos + ${tplInterpOpen.length} };`); @@ -256,7 +256,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { // — no per-token object, no text slice: text is materialized from the source span only // when a CST leaf is built. Flag bits: 1 = newlineBefore (the only stamp this emitted // lexer ever sets; comment/multilineFlow stamps belong to fallback-only grammars). 
- emit(`function tokenize(source) {`); + emit(`function tokenize(source: string) {`); emit(` docPieces = [source]; docPieceOff = [0]; docLen = source.length;`); emit(` docFlat = source; docCur = 0;`); emit(` tokN = 0;`); @@ -281,7 +281,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`// old token (same k/t, offsets shifted by wndDelta, both depth records 0) while`); emit(`// the window's own stacks are empty — returns that OLD index (the duplicate push`); emit(`// is retracted), or -1 when lexing ran to EOF.`); - emit(`function lexCore(source, startPos, pvK, pvT, wndPtr0, wndMinOff, wndDelta, wndCs, initParens, srcBase, hasMore) {`); + emit(`function lexCore(source: string, startPos: number, pvK: number, pvT: number, wndPtr0: number, wndMinOff: number, wndDelta: number, wndCs?: number, initParens?: boolean[] | null, srcBase?: number, hasMore?: boolean) {`); emit(` if (srcBase === undefined) srcBase = 0;`); emit(` lexWindowMore = hasMore === true;`); emit(` lexSrcBase = srcBase;`); @@ -291,7 +291,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` let extraFl = 0;`); emit(` let lastBangWasPostfix = false;`); emit(` let lastCloseWasParenHead = false;`); - emit(` const templateStack = [];`); + emit(` const templateStack: number[] = [];`); emit(` const parenHeadStack = initParens !== undefined && initParens !== null ? initParens : [];`); emit(` let wndPtr = wndPtr0;`); emit(` let wndHit = -1;`); @@ -301,8 +301,8 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` // tokens and stack ops). An entry at depth <= BOTH mins was open at the`); emit(` // divergence point in both lexes - i.e. 
it is the SAME entry.`); emit(` let dmgMinOld = wndOldMin0, dmgMinNew = -1;`); - emit(` function tkPush(k, t, off, end) {`); - emit(` off += srcBase; end += srcBase;`); + emit(` function tkPush(k: number, t: number, off: number, end: number) {`); + emit(` off += srcBase!; end += srcBase!;`); emit(` if (tokN === tkCap) growTok();`); emit(` tkK[tokN] = k; tkT[tokN] = t; tkOff[tokN] = off; tkEnd[tokN] = end;`); emit(` tkFl[tokN] = (pendingNl ? 1 : 0) | extraFl;`); @@ -331,20 +331,20 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` // adopted tkPd column by lexResyncPd to the new truth.`); emit(` if (wndPtr >= 0) {`); emit(` const pd = tkPd[tokN - 1];`); - emit(` if (dmgMinNew < 0) { if (off >= wndCs) dmgMinNew = pd; }`); + emit(` if (dmgMinNew < 0) { if (off >= wndCs!) dmgMinNew = pd; }`); emit(` else if (pd < dmgMinNew) dmgMinNew = pd;`); emit(` if (off >= wndMinOff) {`); - emit(` while (wndPtr < altN && (altOff[wndPtr] < 0 ? altOff[wndPtr] + srcLenP1 : altOff[wndPtr]) + wndDelta < off) { if (altPd[wndPtr] < dmgMinOld) dmgMinOld = altPd[wndPtr]; wndPtr++; }`); - emit(` if (wndPtr < altN && (altOff[wndPtr] < 0 ? altOff[wndPtr] + srcLenP1 : altOff[wndPtr]) + wndDelta === off && altK[wndPtr] === k && altT[wndPtr] === t`); - emit(` && (altEnd[wndPtr] < 0 ? altEnd[wndPtr] + srcLenP1 : altEnd[wndPtr]) + wndDelta === end`); + emit(` while (wndPtr < altN && (altOff![wndPtr] < 0 ? altOff![wndPtr] + srcLenP1 : altOff![wndPtr]) + wndDelta < off) { if (altPd![wndPtr] < dmgMinOld) dmgMinOld = altPd![wndPtr]; wndPtr++; }`); + emit(` if (wndPtr < altN && (altOff![wndPtr] < 0 ? altOff![wndPtr] + srcLenP1 : altOff![wndPtr]) + wndDelta === off && altK![wndPtr] === k && altT![wndPtr] === t`); + emit(` && (altEnd![wndPtr] < 0 ? 
altEnd![wndPtr] + srcLenP1 : altEnd![wndPtr]) + wndDelta === end`); emit(` // the candidate's LEADING-TRIVIA flags must match too: the gap before`); emit(` // it may sit inside the edit (newline removed/added without moving any`); emit(` // token bytes), and parsers read these flags (sameLine / commentBefore)`); - emit(` && altFl[wndPtr] === tkFl[tokN - 1]`); - emit(` && templateStack.length === 0 && altDp[wndPtr] === 0`); + emit(` && altFl![wndPtr] === tkFl[tokN - 1]`); + emit(` && templateStack.length === 0 && altDp![wndPtr] === 0`); emit(` && LX_PFXV[t] === 0 && LX_PARENKW[t] === 0`); emit(` && !(k === K_PUNCT && (t === ${tLParen} || t === ${tRParen}))) {`); - emit(` const q = altPd[wndPtr];`); + emit(` const q = altPd![wndPtr];`); emit(` if (q < dmgMinOld) dmgMinOld = q;`); emit(` if (q === pd && pd <= dmgMinOld && pd <= dmgMinNew) {`); emit(` wndHit = wndPtr;`); @@ -358,7 +358,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` okTail = docEmptyPops.length === 0 || docEmptyPops[docEmptyPops.length - 1] <= wndPtr;`); emit(` } else {`); emit(` if (altSuffMin === null) buildAltSuffMin(wndPtr0);`); - emit(` okTail = altSuffMin[wndPtr + 1] >= q;`); + emit(` okTail = altSuffMin![wndPtr + 1] >= q;`); emit(` }`); emit(` if (okTail) {`); emit(` wndHit = wndPtr;`); @@ -390,7 +390,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` if (cc === 32 || (cc >= 9 && cc <= 13)) {`); emit(` let wc = cc;`); emit(` do {`); - emit(` if (wc === 10) pendingNl = true;`); + emit(` if (wc === 10 || wc === 13) pendingNl = true;`); // JS line terminators LF/CR (LS/PS via the \\s regex below) emit(` pos++;`); emit(` wc = source.charCodeAt(pos);`); emit(` } while (wc === 32 || (wc >= 9 && wc <= 13));`); @@ -476,7 +476,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`${ind} }`); } if (m.skip) { - emit(`${ind} if (m[0].includes('\\n')) pendingNl = true;`); + 
emit(`${ind} if (/[\\n\\r\\u2028\\u2029]/.test(m[0])) pendingNl = true;`); emit(`${ind} pos += m[0].length;`); } else { emit(`${ind} const _e = pos + m[0].length;`); @@ -495,7 +495,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`${ind} extraFl = _ph ? 8 : 0; }`); } else if (lit === ')') { emit(`${ind}if (parenHeadStack.length === 0) { lastCloseWasParenHead = false; lexEmptyPops.push(tokN); }`); - emit(`${ind}else lastCloseWasParenHead = parenHeadStack.pop();`); + emit(`${ind}else lastCloseWasParenHead = parenHeadStack.pop()!;`); } if (regexCtx?.postfixAfterValueTexts?.includes(lit)) { emit(`${ind}lastBangWasPostfix = prevIsValue();`); @@ -635,7 +635,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`// lexer flag live (a control-head ')' or a postfix-ambiguous operator would`); emit(`// make the next token's regex-context depend on unrecoverable state). -1 = file`); emit(`// head (always sound, degrades to a full re-lex).`); - emit(`function findRestart(cs) {`); + emit(`function findRestart(cs: number) {`); emit(` let lo = 0, hi = tokN;`); // STRICTLY before the damage: a token ENDING exactly at cs can be EXTENDED by // the edit under maximal munch ('b' + inserted 'x' = 'bx'; '=' + '=' = '=='; @@ -658,9 +658,9 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`// openers at that depth are re-opened later, and the re-opener comes first`); emit(`// backward). The '(' records its depth INCLUDING itself, and carries its`); emit(`// control-head-ness as tkFl bit 8.`); - emit(`function reconstructParens(b) {`); + emit(`function reconstructParens(b: number) {`); emit(` let need = b >= 0 ? 
tkPd[b] : 0;`); - emit(` const out = new Array(need);`); + emit(` const out: boolean[] = new Array(need);`); emit(` for (let i = b; i >= 0 && need > 0; i--) {`); emit(` if (tkK[i] === 1 && tkT[i] === ${tOf('(')} && tkPd[i] === need) { out[need - 1] = (tkFl[i] & 8) !== 0; need--; }`); emit(` }`); @@ -673,9 +673,9 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`// are splice-stable (every splice begins past its own anchor), so the baseline`); emit(`// stays exact; a backward jump (b < cached) falls back to the full scan.`); emit(`let parenCachePos = -1;`); - emit(`let parenCacheStack = [];`); - emit(`function reconstructParensCached(b) {`); - emit(` let stack;`); + emit(`let parenCacheStack: boolean[] = [];`); + emit(`function reconstructParensCached(b: number) {`); + emit(` let stack: boolean[];`); emit(` if (b < 0) stack = [];`); emit(` else if (parenCachePos >= 0 && parenCachePos <= b) {`); emit(` stack = parenCacheStack;`); diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 68923f3..ebbc2f9 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -27,7 +27,8 @@ import type { CstGrammar, RuleExpr, RuleDecl } from './types.ts'; import { isKeywordLiteral, collectLiterals } from './grammar-utils.ts'; import { analyzeGrammar, findEntryRule, type Sec } from './grammar-analysis.ts'; -import { emitLexer } from './emit-lexer.ts'; +import { emitSoaLexer } from './emit-lexer.ts'; +import type { Target } from './emit.ts'; import { withAwaitYield } from './await-yield-fork.ts'; // ── Static analysis ── @@ -346,12 +347,19 @@ function analyze(grammar: CstGrammar) { typeKind, kwLitKind, puLitKind, classifyKey, }; + // Column element types: Uint8 when the kind/literal id spaces fit a byte (the SoA + // token columns and their spare-buffer mirrors). Single-sourced here so every emit + // function — emitRuntime's `let tk* = new …`, emitDriver's `let alt* …` — agrees. 
+ const tMaxT = Math.max(1, ...kwLitKind.values(), ...puLitKind.values()); + const kArr = KIND_NAMED_FALLBACK <= 255 ? 'Uint8Array' : 'Uint16Array'; + const tArr = tMaxT <= 255 ? 'Uint8Array' : 'Uint16Array'; + return { grammar, tokenNames, opTable, prefixOps, noUnaryLhsOps, postfixOpValues, requireTargetOps, binaryConnectors, prattRules, leftRecSet, ruleByName, prattClassified, leftRecClassified, maxBp, templateTokenName, templateTokenNames, firstTokenOf, altDeepFirst, altNullable, altSecond, ledMeta, contMeta, nudCap, nullableRules, firstSets, symtab, qualKeys, - exprFirst, exprNullable, + exprFirst, exprNullable, kArr, tArr, }; } @@ -649,7 +657,7 @@ class Emitter { // A suppress-carrying group stages the LED-connector exclusion for the next // parseRule, then matches its body (same as matchExpr 'group'). const pre = (expr.suppress && expr.suppress.length) - ? `suppressNext = new Set(${J(expr.suppress)});` + ? `suppressNext = new Set(${J(expr.suppress)});` : ``; return [pre, this.matchInto(expr.body, onFail)].filter(Boolean).join('\n'); } @@ -865,7 +873,7 @@ class Emitter { if (!nm) { nm = `_q${this.memberFns.size}`; this.memberFns.set(fnKey, nm); - this.helperDefs.push(`function ${nm}(i) { return i >= cap || (${kArr}[tkK[i]] | ${tArr}[tkT[i]]) !== 0; }`); + this.helperDefs.push(`function ${nm}(i: number) { return i >= cap || (${kArr}[tkK[i]] | ${tArr}[tkT[i]]) !== 0; }`); } return nm; } @@ -1052,7 +1060,7 @@ class Emitter { let nm = this.u8Consts.get(key); if (!nm) { if (!this.u8Emitted) { - this.helperDefs.push(`function u8(n, ones) { const a = new Uint8Array(n); for (let i = 0; i < ones.length; i++) a[ones[i]] = 1; return a; }`); + this.helperDefs.push(`function u8(n: number, ones: number[]) { const a = new Uint8Array(n); for (let i = 0; i < ones.length; i++) a[ones[i]] = 1; return a; }`); this.u8Emitted = true; } nm = `_qb${this.u8Consts.size}`; @@ -1085,7 +1093,28 @@ class Emitter { // ── Top-level emit ── -export function emitParser(grammar: CstGrammar): 
string { +// The `js` Target: the optimized SoA-int parser/lexer, wrapped behind the same two-method +// Target contract as the portable ts/go/rust targets (see emit.ts). `emitJsLexer` derives the +// standalone lexer; `emitJsParser` embeds whatever lexer source it is handed. Splitting the +// lexer COMPUTATION from its EMBEDDING leaves the emitted bytes identical (both re-derive the +// same deterministic symtab), so `emit-parser-verify` stays byte-for-byte. +export const jsTarget: Target = { + name: 'javascript', + ext: 'js', + emitLexer: emitJsLexer, + emitParser: emitJsParser, +}; + +export function emitJsLexer(grammar: CstGrammar): string | null { + grammar = withAwaitYield(grammar); + const st = analyze(grammar).symtab; + return emitSoaLexer(grammar, { + typeKind: st.typeKind, kwLitKind: st.kwLitKind, puLitKind: st.puLitKind, + KIND_PUNCT: st.KIND_PUNCT, KIND_NAMED_FALLBACK: st.KIND_NAMED_FALLBACK, + }); +} + +export function emitJsParser(grammar: CstGrammar, lexSrc: string | null): string { // [Await]/[Yield] context: name-fork the body-reachable rule closure into $A/$Y/$AY // families (see await-yield-fork.ts). No-op for a grammar with no ctx markers. Done // HERE (not at grammar export) so the forks exist ONLY in the parser's rule identity @@ -1120,11 +1149,8 @@ export function emitParser(grammar: CstGrammar): string { // The lexer: EMITTED (specialized, standalone — see emit-lexer.ts) when the grammar // is a plain token stream; the data-driven createLexer runtime otherwise // (markup/indent/newline state machines stay interpreter-only). + // `lexSrc` is handed in by the Target façade (emitParser reuses emitLexer) — see emit.ts. 
const st = a.symtab; - const lexSrc = emitLexer(grammar, { - typeKind: st.typeKind, kwLitKind: st.kwLitKind, puLitKind: st.puLitKind, - KIND_PUNCT: st.KIND_PUNCT, KIND_NAMED_FALLBACK: st.KIND_NAMED_FALLBACK, - }); e.soa = lexSrc !== null; if (!lexSrc) { e.emit(`import { createLexer } from ${J(resolveLexerImport())};`); @@ -1136,9 +1162,9 @@ export function emitParser(grammar: CstGrammar): string { // TYPE_KIND: tok.type → int. LIT_KW / LIT_PU: tok.text → keyword / punct literal int. // Every token is BORN with tok.k (type kind) + tok.t (literal kind) and the stamp // flags — one monomorphic shape, one allocation, no post-pass. - e.emit(`const TYPE_KIND = new Map(${J([...st.typeKind])});`); - e.emit(`const LIT_KW = new Map(${J([...st.kwLitKind])});`); - e.emit(`const LIT_PU = new Map(${J([...st.puLitKind])});`); + e.emit(`const TYPE_KIND = new Map(${J([...st.typeKind])});`); + e.emit(`const LIT_KW = new Map(${J([...st.kwLitKind])});`); + e.emit(`const LIT_PU = new Map(${J([...st.puLitKind])});`); e.emit(`const K_PUNCT = ${st.KIND_PUNCT};`); e.emit(`const K_TEMPLATE_HEAD = ${st.KIND_TEMPLATE_HEAD};`); e.emit(`const K_TEMPLATE_MIDDLE = ${st.KIND_TEMPLATE_HEAD + 1};`); @@ -1151,15 +1177,15 @@ export function emitParser(grammar: CstGrammar): string { if (lexSrc) { e.emit(lexSrc); } else { - e.emit(`const { tokenize } = createLexer(LEX_GRAMMAR, {`); + e.emit(`const { tokenize } = createLexer(LEX_GRAMMAR as any, {`); e.emit(` typeKind: TYPE_KIND, kwLit: LIT_KW, puLit: LIT_PU,`); e.emit(` punctKind: K_PUNCT, namedFallback: K_NAMED_FALLBACK,`); e.emit(`});`); } e.emit(``); // Baked maps. Emit as object literals → Map. 
- e.emit(`const opTable = new Map(${J([...a.opTable])});`); - e.emit(`const prefixOps = new Map(${J([...a.prefixOps])});`); + e.emit(`const opTable = new Map(${J([...a.opTable])});`); + e.emit(`const prefixOps = new Map(${J([...a.prefixOps])});`); // The same op tables re-keyed by the literal int (tok.t): the Pratt loops look an // operator up for EVERY token they reach, and tok.t is already interned — an array // load replaces the string-keyed Map.get. Equivalent because a token's text can equal @@ -1178,10 +1204,11 @@ export function emitParser(grammar: CstGrammar): string { } return arr; }; - e.emit(`const OP_BY_T = ${J(byT(a.opTable))};`); - e.emit(`const PREFIX_BY_T = ${J(byT(a.prefixOps))};`); + e.emit(`type OpInfo = { lbp: number; rbp: number; assoc: string; position: string; requireTarget?: boolean };`); + e.emit(`const OP_BY_T: (OpInfo | null)[] = ${J(byT(a.opTable))};`); + e.emit(`const PREFIX_BY_T: (OpInfo | null)[] = ${J(byT(a.prefixOps))};`); } - e.emit(`const noUnaryLhsOps = new Set(${J([...a.noUnaryLhsOps])});`); + e.emit(`const noUnaryLhsOps = new Set(${J([...a.noUnaryLhsOps])});`); { let tSize = 1; for (const v of st.kwLitKind.values()) tSize = Math.max(tSize, v + 1); @@ -1206,14 +1233,14 @@ export function emitParser(grammar: CstGrammar): string { } e.emit(`const REQTGT_T = Uint8Array.from([${rt.join(',')}]);`); } - e.emit(`const postfixOpValues = new Set(${J([...a.postfixOpValues])});`); - e.emit(`const binaryConnectors = new Set(${J([...a.binaryConnectors])});`); + e.emit(`const postfixOpValues = new Set(${J([...a.postfixOpValues])});`); + e.emit(`const binaryConnectors = new Set(${J([...a.binaryConnectors])});`); // Assignment-target shape test (ECMAScript AssignmentTargetType): a node id is NOT a // valid LHS target iff its outermost form is a prefix-op (prefix-unary OR prefix-update // `++x`) — head kid is an operator-tag leaf in prefixOps — or a postfix-update (`x++`) — // tail kid is an operator-tag leaf in postfixOpValues. 
A parenthesized cover / member / // element / call / non-null tail has no operator-tag leaf at head or tail, so it passes. - e.emit(`function _notTarget(lhs) {`); + e.emit(`function _notTarget(lhs: number) {`); e.emit(` const n = rowCount[lhs]; if (n === 0) return false;`); e.emit(` const cs = rowStart[lhs];`); e.emit(` const _h = kids[cs];`); @@ -1238,7 +1265,7 @@ export function emitParser(grammar: CstGrammar): string { // nodes). Drives the notLeftLeaf LED gate: a node whose head leaf text is in the arm's word set // (e.g. `void`/`null`/`this` for the type `.` qualification) is not a valid LEFT operand of the // arm. A childless ($missing recovery) node returns '' (matches no word → the arm is not blocked). - e.emit(`function _headLeafText(id) {`); + e.emit(`function _headLeafText(id: number) {`); e.emit(` while (rowCount[id] > 0) {`); e.emit(` const _hh = kids[rowStart[id]];`); e.emit(` if (_hh >= 0) { id = _hh; continue; }`); @@ -1247,8 +1274,8 @@ export function emitParser(grammar: CstGrammar): string { e.emit(` }`); e.emit(` return '';`); e.emit(`}`); - e.emit(`const tokenNames = new Set(${J([...a.tokenNames])});`); - e.emit(`const templateTokenNames = new Set(${J([...a.templateTokenNames])});`); + e.emit(`const tokenNames = new Set(${J([...a.tokenNames])});`); + e.emit(`const templateTokenNames = new Set(${J([...a.templateTokenNames])});`); e.emit(`const templateTokenName = ${J(a.templateTokenName ?? null)};`); e.emit(`const maxBp = ${a.maxBp};`); e.emit(`const ENTRY = ${J(entry)};`); @@ -1272,7 +1299,7 @@ export function emitParser(grammar: CstGrammar): string { } // (recovery sync closers are threaded per-loop from the enclosing seq — see // quantFollowT; a global closer table froze top-level recovery at any ']'.) 
- e.emit(`const prattRuleNames = new Set(${J([...a.prattRules])});`); + e.emit(`const prattRuleNames = new Set(${J([...a.prattRules])});`); // The expression rule the template-interpolation fallback (findExprRule) picks: // first pratt rule that isn't Type, in declaration order. Bake the resolved name. const exprRuleName = (() => { @@ -1311,13 +1338,10 @@ function resolveLexerImport(): string { return pathResolve(__dir, 'gen-lexer.ts' // ONLY change: where the interpreter called matchExpr(alt)/matchSeq(items) per arm, // these call the GENERATED per-arm matcher functions (installed via the rule fns). function emitRuntime(e: Emitter) { - // Column element type: Uint8 when the kind/literal id spaces fit a byte. - const st = e.a.symtab; - let tMax = 1; - for (const v of st.kwLitKind.values()) tMax = Math.max(tMax, v); - for (const v of st.puLitKind.values()) tMax = Math.max(tMax, v); - const K_ARR = st.KIND_NAMED_FALLBACK <= 255 ? 'Uint8Array' : 'Uint16Array'; - const T_ARR = tMax <= 255 ? 'Uint8Array' : 'Uint16Array'; + // Column element type: Uint8 when the kind/literal id spaces fit a byte (single- + // sourced in analyze() so emitDriver's spare-buffer mirrors pick the same width). + const K_ARR = e.a.kArr; + const T_ARR = e.a.tArr; e.emit(String.raw` // ── Token stream: struct-of-arrays (no per-token object, no eager text) ── // tkK = type kind, tkT = literal kind, tkOff/tkEnd = source span, tkFl = stamp bits @@ -1345,14 +1369,14 @@ let tokN = 0; // joined form for the cold paths that need one (errors, debug views); batch parses // set it directly. Reads route through docChar/docText: flat fast path, piece // lookup (cursor-cached) otherwise. 
-let docPieces = null; -let docPieceOff = null; +let docPieces: string[] | null = null; +let docPieceOff: number[] | null = null; let docLen = 0; -let docFlat = null; +let docFlat: string | null = null; let docCur = 0; -function docLocate(i) { +function docLocate(i: number) { let k = docCur; - const po = docPieceOff; + const po = docPieceOff!; const n = po.length; if (k >= n || po[k] > i || (k + 1 < n && po[k + 1] <= i)) { let lo = 0, hi = n; @@ -1362,57 +1386,57 @@ function docLocate(i) { } return k; } -function docChar(i) { +function docChar(i: number) { if (docFlat !== null) return docFlat.charCodeAt(i); const k = docLocate(i); - return docPieces[k].charCodeAt(i - docPieceOff[k]); + return docPieces![k].charCodeAt(i - docPieceOff![k]); } -function docText(a, b) { +function docText(a: number, b: number) { if (docFlat !== null) return docFlat.slice(a, b); if (b <= a) return ''; let k = docLocate(a); - const first = docPieces[k]; - const lo = a - docPieceOff[k]; - if (b - docPieceOff[k] <= first.length) return first.slice(lo, b - docPieceOff[k]); + const first = docPieces![k]; + const lo = a - docPieceOff![k]; + if (b - docPieceOff![k] <= first.length) return first.slice(lo, b - docPieceOff![k]); let out = first.slice(lo); k++; - while (k < docPieces.length && docPieceOff[k] < b) { - const piece = docPieces[k]; - const need = b - docPieceOff[k]; + while (k < docPieces!.length && docPieceOff![k] < b) { + const piece = docPieces![k]; + const need = b - docPieceOff![k]; out += need >= piece.length ? piece : piece.slice(0, need); k++; } return out; } function flattenDoc() { - if (docFlat === null) docFlat = docPieces.join(''); + if (docFlat === null) docFlat = docPieces!.join(''); return docFlat; } -function applyChange(start, end, text) { +function applyChange(start: number, end: number, text: string) { const ks = docLocate(start); const ke = docLocate(end > start ? 
end - 1 : start); - const head = docPieces[ks].slice(0, start - docPieceOff[ks]); - const tailPiece = end > start ? docPieces[ke] : docPieces[ks]; - const tailOff = end - docPieceOff[end > start ? ke : ks]; + const head = docPieces![ks].slice(0, start - docPieceOff![ks]); + const tailPiece = end > start ? docPieces![ke] : docPieces![ks]; + const tailOff = end - docPieceOff![end > start ? ke : ks]; const tail = tailPiece.slice(tailOff); const repl = []; if (head.length > 0) repl.push(head); if (text.length > 0) repl.push(text); if (tail.length > 0) repl.push(tail); - docPieces.splice(ks, (end > start ? ke : ks) - ks + 1, ...repl); + docPieces!.splice(ks, (end > start ? ke : ks) - ks + 1, ...repl); // consolidate when fragmenting (amortized: a join every ≥256 edits) - if (docPieces.length > 256) { - docPieces = [docPieces.join('')]; + if (docPieces!.length > 256) { + docPieces = [docPieces!.join('')]; } docLen += text.length - (end - start); // rebuild offsets from the splice point (suffix offsets shifted anyway) - if (docPieceOff.length !== docPieces.length) docPieceOff.length = docPieces.length; - let off = ks > 0 && ks - 1 < docPieces.length ? docPieceOff[ks - 1] + docPieces[ks - 1].length : 0; - for (let k2 = ks > 0 ? ks : 0; k2 < docPieces.length; k2++) { - docPieceOff[k2] = off; - off += docPieces[k2].length; + if (docPieceOff!.length !== docPieces!.length) docPieceOff!.length = docPieces!.length; + let off = ks > 0 && ks - 1 < docPieces!.length ? docPieceOff![ks - 1] + docPieces![ks - 1].length : 0; + for (let k2 = ks > 0 ? ks : 0; k2 < docPieces!.length; k2++) { + docPieceOff![k2] = off; + off += docPieces![k2].length; } - if (docPieces.length === 1) docPieceOff[0] = 0; + if (docPieces!.length === 1) docPieceOff![0] = 0; docCur = 0; docFlat = null; } @@ -1425,9 +1449,9 @@ function applyChange(start, end, text) { // parses are all-positive and the decode branch never fires. 
let srcLenP1 = 1; let negFrom = 0x7fffffff; -function toff(i) { const v = tkOff[i]; return v < 0 ? v + srcLenP1 : v; } -function tend(i) { const v = tkEnd[i]; return v < 0 ? v + srcLenP1 : v; } -${e.soa ? '' : 'let tkText = []; // fallback-lexer text column (synthetic tokens are not source spans)'} +function toff(i: number) { const v = tkOff[i]; return v < 0 ? v + srcLenP1 : v; } +function tend(i: number) { const v = tkEnd[i]; return v < 0 ? v + srcLenP1 : v; } +${e.soa ? '' : 'let tkText: string[] = []; // fallback-lexer text column (synthetic tokens are not source spans)'} function growTok() { tkCap *= 2; const k = new ${K_ARR}(tkCap); k.set(tkK); tkK = k; @@ -1483,8 +1507,8 @@ let rowNF = new Int32Array(8192).fill(0x7fffffff); // 'succeed' over broken text and wipe its diagnostics). Recovering passes adopt // these rows freely. let rowRM = new Uint8Array(8192); -function ktr(p, k) { const v = kidTokRel[k]; return v < 0 ? v + rowTokLen[p] + 1 : v; } -function kcr(p, k) { const v = kidRel[k]; return v < 0 ? v + rowLen[p] + 1 : v; } +function ktr(p: number, k: number) { const v = kidTokRel[k]; return v < 0 ? v + rowTokLen[p] + 1 : v; } +function kcr(p: number, k: number) { const v = kidRel[k]; return v < 0 ? v + rowLen[p] + 1 : v; } // transient BUILD coordinates (absolute), valid for rows completed in the current // parse and REFRESHED at memo-hit time for reused roots — parents read them at // finishNode to write the children's relative fields; never part of the green tree. 
@@ -1531,24 +1555,24 @@ function growRows() { const ac = new Int32Array(rowCap); ac.set(absChar); absChar = ac; const at = new Int32Array(rowCap); at.set(absTok); absTok = at; } -function growKids(n) { +function growKids(n: number) { while (kidN + n > kidCap) kidCap *= 2; const k = new Int32Array(kidCap); k.set(kids.subarray(0, kidN)); kids = k; const r = new Int32Array(kidCap); r.set(kidRel.subarray(0, kidN)); kidRel = r; const t = new Int32Array(kidCap); t.set(kidTokRel.subarray(0, kidN)); kidTokRel = t; } -function scPush(e) { +function scPush(e: number) { if (scn === scCap) { scCap *= 2; const s = new Int32Array(scCap); s.set(sc); sc = s; } sc[scn++] = e; } -function entryOff(e) { return e >= 0 ? absChar[e] : toff((~e) >>> 2); } -function entryEnd(e) { return e >= 0 ? absChar[e] + rowLen[e] : tend((~e) >>> 2); } -function entryTok(e) { return e >= 0 ? absTok[e] : (~e) >>> 2; } -function entryTokEnd(e) { return e >= 0 ? absTok[e] + rowTokLen[e] : ((~e) >>> 2) + 1; } +function entryOff(e: number) { return e >= 0 ? absChar[e] : toff((~e) >>> 2); } +function entryEnd(e: number) { return e >= 0 ? absChar[e] + rowLen[e] : tend((~e) >>> 2); } +function entryTok(e: number) { return e >= 0 ? absTok[e] : (~e) >>> 2; } +function entryTokEnd(e: number) { return e >= 0 ? absTok[e] + rowTokLen[e] : ((~e) >>> 2) + 1; } // Complete a node whose children are scratch[mark..scn): copy them into kids, write // the row, truncate scratch, return the id. Empty children = a zero-width node // at the current token (the old offset() rule). -function finishNode(rid, mark) { +function finishNode(rid: number, mark: number) { const n = scn - mark; if (nodeN === rowCap) growRows(); const id = nodeN++; @@ -1607,7 +1631,7 @@ function finishNode(rid, mark) { return id; } // Complete a LED/continuation wrap: children = [lhs, ...scratch[mark..scn)]. 
-function finishWrap(rid, lhsId, mark) { +function finishWrap(rid: number, lhsId: number, mark: number) { const n = scn - mark; if (nodeN === rowCap) growRows(); const id = nodeN++; @@ -1675,22 +1699,22 @@ let _prattCapped = false; // be identical between a fresh parse and an adoption re-run. frameMax <= maxPos // always, so the hot advance pays one extra compare only at frontier breaches. let frameMax = 0; -let memoNode = []; -let memoEnd = []; -let memoExt = []; // per-entry lookahead extent (see parseRuleEntry) +let memoNode: number[][] = []; +let memoEnd: number[][] = []; +let memoExt: number[][] = []; // per-entry lookahead extent (see parseRuleEntry) // GENERATION-STAMPED memo: the per-rule arrays persist across parses (allocating // fresh multi-million-slot arrays per edit cost ~30% of a large-file edit in GC // alone); an entry is live iff its stamp equals the current generation — bumping // memoGenCur IS the whole reset. -let memoGen = []; +let memoGen: Int32Array[] = []; let memoGenCur = 0; let parseLimit = -1; // cap = the exclusive lookahead bound: min(parseLimit-or-∞, tokN), maintained at the // parseLimit set/restore sites and the one token-stream mutation (the '>' splice). let cap = 0; -let currentPrattContext = null; -let suppressNext = null; -let suppressCur = null; +let currentPrattContext: string | null = null; +let suppressNext: Set | null = null; +let suppressCur: Set | null = null; function offset() { if (pos < cap) return toff(pos); @@ -1703,7 +1727,7 @@ function offset() { // Keyword literal: the interpreter required tok.type !== '' && tokenNames.has(tok.type) // && tok.text === value. With interned kinds that is tok.k >= K_NAMED_MIN (a declared // token name; '' is PUNCT, templates are below NAMED_MIN) && tok.t === KW(value). 
-function matchKwLit(kw, vs) { +function matchKwLit(kw: number, vs?: number) { // A kw-range t can only come from a named token (template spans never intern to a // keyword), so the old k >= K_NAMED_MIN guard was redundant — one int compare. // vs (optional) = the call site's viable-set id, threaded into the $missing row. @@ -1715,7 +1739,7 @@ function matchKwLit(kw, vs) { // Punct literal: tok.type === '' && tok.text === value, with the gt-splice fallback. // tok.t === PU(value) is the exact-text fast path; the splice handles a longer // gt-led token matching the gt key. value/pu are baked by the caller. -function matchPuLit(pu, vs) { +function matchPuLit(pu: number, vs?: number) { // A pu-range t can only come from a punct token, so the old k === K_PUNCT guard was // redundant — one int compare. The '>'-split lives only in matchPuLitGT ('>' sites). if (pos >= cap || tkT[pos] !== pu) return recovering ? missTok(pu, vs) : false; @@ -1723,7 +1747,7 @@ function matchPuLit(pu, vs) { if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } return true; } -function matchPuLitGT(pu, vs) { +function matchPuLitGT(pu: number, vs?: number) { if (pos >= cap) return false; const off = toff(pos); if (tkT[pos] === pu) { @@ -1738,7 +1762,7 @@ function matchPuLitGT(pu, vs) { const end0 = tend(pos); ${e.soa ? '' : 'const restText = tkText[pos].slice(1);'} if (tokN === tkCap) growTok(); - parenCachePos = -1; + ${e.soa ? 'parenCachePos = -1;' : ''} // invalidate the paren-stack cache (soa emitted lexer only) // token indices shift past this point: the OLD-TREE adoption mapping // (adoptDmg*/adoptDelta, frozen at edit start) is no longer valid — turn // adoption off for the remainder of this parse (the '>' split is rare; the @@ -1783,7 +1807,7 @@ function matchPuLitGT(pu, vs) { } // Generic matchLiteral kept for any unspecialized site: classify value via the baked // tables (no per-call isKeywordLiteral / string compares) and delegate. 
-function matchLiteral(value) { +function matchLiteral(value: string) { const kw = LIT_KW.get(value); if (kw !== undefined) return matchKwLit(kw); if (value === '>') return matchPuLitGT(LIT_PU.get(value) ?? 0); @@ -1793,7 +1817,7 @@ function matchLiteral(value) { // Match a token ref by its baked TYPE kind: tok.type === name ⟺ tok.k === nameKind. // (No named-token kind equals K_NAMED_FALLBACK, so an unforeseen type never matches.) // The materialized tokenType is type-derived (kind 0) — name needs no baking here. -function matchTokK(nameKind) { +function matchTokK(nameKind: number) { if (pos >= cap || tkK[pos] !== nameKind) return recovering ? missTok(-nameKind) : false; scPush(~(pos << 2)); if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } @@ -1858,7 +1882,7 @@ function emitRuleFns(e: Emitter, a: ReturnType) { else emitNonRecRule(e, a, rule, spine.has(rule.name) && !a.prattRules.has(rule.name) && !a.leftRecSet.has(rule.name)); } // Dispatch table (string rule name → fn), for parseTemplateExpr's dynamic interp rule. - e.emit(`const RULES = {`); + e.emit(`const RULES: Record boolean> = {`); for (const rule of a.grammar.rules) e.emit(` ${J(rule.name)}: ${ruleFn(rule.name)},`); e.emit(`};`); @@ -1954,7 +1978,7 @@ function emitNonRecRule(e: Emitter, a: ReturnType, rule: RuleDec // pratt/left-rec rules. if (memoized) { e.emit(`function ${ruleFn}() { return parseRuleEntry(${e.memoIndex(rule.name)}, ${rid}, ${J(rule.name)}, ${ruleFn}_core); }`); - e.emit(`function ${ruleFn}_core(_minBp) {`); + e.emit(`function ${ruleFn}_core(_minBp: number) {`); } else { e.emit(`function ${ruleFn}() {`); } @@ -1998,9 +2022,9 @@ function emitLeftRecRule(e: Emitter, a: ReturnType, rule: RuleDe e.emit(`function ${ruleFn}() { return parseRuleEntry(${e.memoIndex(rule.name)}, ${rid}, ${J(rule.name)}, ${ruleFn}_lr); }`); // notLeftLeaf head-leaf word sets (module-level, built once) for this rule's gated continuations. 
contNotLeftLeaf.forEach((words, i) => { - if (words) e.emit(`const _NLLC_${sn}_${i} = new Set(${J(words)});`); + if (words) e.emit(`const _NLLC_${sn}_${i} = new Set(${J(words)});`); }); - e.emit(`function ${ruleFn}_lr(_minBp) {`); + e.emit(`function ${ruleFn}_lr(_minBp: number) {`); e.emit(` const saved = pos; const mark = scn;`); e.emit(` let node = -1; let bestAtomPos = saved;`); const atomDispatch = e.altMaskDispatch(atoms, '_am'); @@ -2063,9 +2087,9 @@ function emitPrattRule(e: Emitter, a: ReturnType, rule: RuleDecl e.emit(`function ${ruleFn}() { return parseRuleEntry(${e.memoIndex(rule.name)}, ${rid}, ${J(rule.name)}, ${ruleFn}_pratt); }`); // notLeftLeaf head-leaf word sets (module-level, built once) for this rule's gated LED arms. meta.notLeftLeaf.forEach((words, i) => { - if (words) e.emit(`const _NLL_${sn}_${i} = new Set(${J(words)});`); + if (words) e.emit(`const _NLL_${sn}_${i} = new Set(${J(words)});`); }); - e.emit(`function ${ruleFn}_pratt(minBp) {`); + e.emit(`function ${ruleFn}_pratt(minBp: number) {`); e.emit(` const saved = pos; const mark = scn;`); e.emit(` let lhs = -1; let bestNudPos = saved;`); // `capped` becomes true iff the winning NUD is a capped (assignment-level) expression — @@ -2322,7 +2346,7 @@ function emitDriver(e: Emitter, a: ReturnType, entry: string) { // and SECOND-token reads past it. Left-to-right parsing keeps the watermark near the // current frontier, so the value is tight on the dominant flow and only OVER- // invalidates (soundly) near big-backtrack clusters. -function parseRuleEntry(idx, rid, name, core) { +function parseRuleEntry(idx: number, rid: number, name: string, core: (minBp: number) => number) { const mySup = suppressNext; suppressNext = null; const capped = parseLimit >= 0; @@ -2499,14 +2523,14 @@ function parseRuleEntry(idx, rid, name, core) { } // Token text at an arbitrary index (cold paths: errors, the tokenAt debug view). -function tokTextAt(i) { +function tokTextAt(i: number) { return ${e.soa ? 
'docText(toff(i), tend(i))' : 'tkText[i]'}; } // The k → type-name inverse, for reconstructing a token object (tokenAt). -const K_NAMES = []; +const K_NAMES: string[] = []; for (const [n, k] of TYPE_KIND) K_NAMES[k] = n; // A per-token object view over the columns (gates / debugging — the parser never builds these). -export function tokenAt(i) { +export function tokenAt(i: number) { return { type: K_NAMES[tkK[i]] ?? '', text: tokTextAt(i), @@ -2524,7 +2548,7 @@ export function tokenAt(i) { // The arena IS the tree: parse() returns the root node id and consumers traverse // via visit()/the accessors — nothing is materialized on the parse path. All views // are valid until the NEXT parse (the columns are reused). -function leafTokenType(entry, tokBase) { +function leafTokenType(entry: number, tokBase: number) { const tok = tokBase + ((~entry) >>> 2); const kind = (~entry) & 3; return kind === 1 ? '$keyword' @@ -2539,36 +2563,36 @@ function leafTokenType(entry, tokBase) { // — the node's own absolute start coordinates. Leaf spans come from the token // columns at tokBase + the entry's node-relative token index. export const tree = { - ruleNameOf: (id) => RULE_DISPLAY[rowRule[id]], - ruleIdOf: (id) => rowRule[id], - lenOf: (id) => rowLen[id], - tokLenOf: (id) => rowTokLen[id], + ruleNameOf: (id: number) => RULE_DISPLAY[rowRule[id]], + ruleIdOf: (id: number) => rowRule[id], + lenOf: (id: number) => rowLen[id], + tokLenOf: (id: number) => rowTokLen[id], // a node CHILD's relative coordinates live on the parent edge (kids-parallel) - childRelAt: (id, i) => kcr(id, rowStart[id] + i), - childTokRelAt: (id, i) => ktr(id, rowStart[id] + i), + childRelAt: (id: number, i: number) => kcr(id, rowStart[id] + i), + childTokRelAt: (id: number, i: number) => ktr(id, rowStart[id] + i), // base-threaded spans: nodes from their bases, leaves from the token columns - offsetOf: (entry, charBase, tokBase) => entry >= 0 ? 
charBase : toff(tokBase + ((~entry) >>> 2)), - endOf: (entry, charBase, tokBase) => entry >= 0 ? charBase + rowLen[entry] : tend(tokBase + ((~entry) >>> 2)), - childCount: (id) => rowCount[id], - childAt: (id, i) => kids[rowStart[id] + i], + offsetOf: (entry: number, charBase: number, tokBase: number) => entry >= 0 ? charBase : toff(tokBase + ((~entry) >>> 2)), + endOf: (entry: number, charBase: number, tokBase: number) => entry >= 0 ? charBase + rowLen[entry] : tend(tokBase + ((~entry) >>> 2)), + childCount: (id: number) => rowCount[id], + childAt: (id: number, i: number) => kids[rowStart[id] + i], // Bulk child load into a caller-owned array; returns the count. One call per node // instead of childCount+childAt-per-probe (the generated matchers' hot path). - childrenInto: (id, out2) => { + childrenInto: (id: number, out2: number[]) => { const n2 = rowCount[id]; const cs2 = rowStart[id]; for (let i2 = 0; i2 < n2; i2++) out2[i2] = kids[cs2 + i2]; return n2; }, - isLeaf: (entry) => entry < 0, - leafToken: (entry, tokBase) => tokBase + ((~entry) >>> 2), + isLeaf: (entry: number) => entry < 0, + leafToken: (entry: number, tokBase: number) => tokBase + ((~entry) >>> 2), leafTokenType, // Int-world leaf accessors (the match-path encoding): kind bits — 0 type-derived, // 1 '$keyword', 2 '$operator' — and the token's TYPE kind int (1 = punctuation). 
- leafKindOf: (entry) => (~entry) & 3, - leafTokKindOf: (entry, tokBase) => tkK[tokBase + ((~entry) >>> 2)], - leafOffsetOf: (entry, tokBase) => toff(tokBase + ((~entry) >>> 2)), - leafEndOf: (entry, tokBase) => tend(tokBase + ((~entry) >>> 2)), - textOf: (entry, source, charBase, tokBase) => entry >= 0 + leafKindOf: (entry: number) => (~entry) & 3, + leafTokKindOf: (entry: number, tokBase: number) => tkK[tokBase + ((~entry) >>> 2)], + leafOffsetOf: (entry: number, tokBase: number) => toff(tokBase + ((~entry) >>> 2)), + leafEndOf: (entry: number, tokBase: number) => tend(tokBase + ((~entry) >>> 2)), + textOf: (entry: number, source: string, charBase: number, tokBase: number) => entry >= 0 ? source.slice(charBase, charBase + rowLen[entry]) : source.slice(toff(tokBase + ((~entry) >>> 2)), tend(tokBase + ((~entry) >>> 2))), }; @@ -2579,22 +2603,23 @@ export const tree = { // Depth-first traversal threading the RED coordinates: enter/leave receive the // node's absolute (charBase, tokBase); leaf receives its absolute token index. // Call with the root only — the bases default from the root's rel fields. -function visitCore(entry, fns, charBase, tokBase) { +type _VisitFns = { enter?: (id: number, charBase: number, tokBase: number) => boolean | void; leave?: (id: number, charBase: number, tokBase: number) => void; leaf?: (entry: number, tok: number) => void }; +function visitCore(entry: number, fns: _VisitFns, charBase?: number, tokBase?: number) { if (charBase === undefined) { charBase = rootCharBase; tokBase = rootTokBase; } - if (entry < 0) { if (fns.leaf) fns.leaf(entry, tokBase + ((~entry) >>> 2)); return; } - if (fns.enter && fns.enter(entry, charBase, tokBase) === false) return; + if (entry < 0) { if (fns.leaf) fns.leaf(entry, tokBase! + ((~entry) >>> 2)); return; } + if (fns.enter && fns.enter(entry, charBase, tokBase!) 
=== false) return; const n = rowCount[entry]; const cs = rowStart[entry]; for (let i = 0; i < n; i++) { const e = kids[cs + i]; - if (e < 0) { if (fns.leaf) fns.leaf(e, tokBase + ((~e) >>> 2)); } - else visitCore(e, fns, charBase + kcr(entry, cs + i), tokBase + ktr(entry, cs + i)); + if (e < 0) { if (fns.leaf) fns.leaf(e, tokBase! + ((~e) >>> 2)); } + else visitCore(e, fns, charBase + kcr(entry, cs + i), tokBase! + ktr(entry, cs + i)); } - if (fns.leave) fns.leave(entry, charBase, tokBase); + if (fns.leave) fns.leave(entry, charBase, tokBase!); } // Parse to the ARENA: returns the root node id. -function lexInto(source) { +function lexInto(source: string) { ${e.soa ? ` tokenize(source); docEmptyPops = lexEmptyPops.slice();` : String.raw` docPieces = [source]; docPieceOff = [0]; docLen = source.length; docFlat = source; docCur = 0; const _toks = tokenize(source); @@ -2611,14 +2636,14 @@ ${e.soa ? ` tokenize(source); tokN = _n;`} } -function farthest(errPos) { +function farthest(errPos: number) { if (maxPos <= errPos || maxPos >= tokN) return ''; return ' [farthest: offset ' + toff(maxPos) + " near '" + tokTextAt(maxPos).slice(0, 20) + "']"; } // Run the entry rule over the CURRENT token stream (shared by parse / parseEdited — // everything per-parse EXCEPT the memo and the arena cursor, which parseEdited carries). 
-function runParse(entryRule) { +function runParse(entryRule?: string) { pos = 0; maxPos = 0; frameMax = 0; @@ -2691,15 +2716,15 @@ let adoptDmgStart = 0; // damage window in OLD token coords: [adoptDmgStar let adoptDmgOldEnd = 0; let adoptDelta = 0; // new-minus-old token delta past the damage // cached descent path (top-down): ids + their absolute old token bases -let adoptPath = []; -let adoptBase = []; +let adoptPath: number[] = []; +let adoptBase: number[] = []; // run-extension state: where the last single adoption sat in the old tree (its // parent row / kid index / parent token base), published by adoptSeek, plus the // (pos, rid, generation) signature a repetition must present to consume it. let adoptHitP = -1, adoptHitKid = 0, adoptHitBase = 0; let adoptRunPos = -1, adoptRunRid = -1, adoptRunGen = -1; let adoptRunP = -1, adoptRunKid = 0, adoptRunOq = 0, adoptRunBase = 0; -function adoptSeek(q, rid) { +function adoptSeek(q: number, rid: number) { // reuse the cached path while it still CONTAINS q (strictly inside, not at start) let depth = 0; while (depth < adoptPath.length) { @@ -2710,7 +2735,7 @@ function adoptSeek(q, rid) { } adoptPath.length = depth; adoptBase.length = depth; - let id, base; + let id: number, base: number; if (depth === 0) { if (q < adoptRootTok || q >= adoptRootTok + rowTokLen[adoptRoot]) return -1; id = adoptRoot; base = adoptRootTok; @@ -2779,11 +2804,11 @@ let recovering = false; // adoption reused this pass (a recovering pass adopts error regions wholesale, // so per-pass collection alone would silently drop their diagnostics). docPar // keeps the formatted result for the paths that do not re-parse (surgery). 
-let docDiags = []; -let docLex = []; -let docPar = []; +let docDiags: Diag[] = []; +let docLex: LexDiag[] = []; +let docPar: Diag[] = []; -function lexMsg(g) { +function lexMsg(g: LexDiag) { if (g.kind === 0) return "Unexpected character at offset " + g.offset + ": '" + g.ch + "'"; if (g.kind === 1) return 'Invalid escape sequence in template at offset ' + g.offset; if (g.kind === 2) return 'Unterminated template literal at offset ' + g.offset; @@ -2801,7 +2826,7 @@ function lexMsg(g) { // past the last bar aborts the attempt, appends the new farthest-fail bar, and the // pass re-runs (adoption keeps re-runs cheap). Bars are text-determined, so fresh // and incremental recovering parses are byte-identical by construction. -let recoverBars = []; +let recoverBars: number[] = []; // (rule, pos) frames currently ON THE STACK during a recovering run, keyed to // their entry SERIAL. Token synthesis makes zero-width matches possible, so a rule // can re-enter itself at the SAME position through a synthesized leading token — @@ -2828,7 +2853,7 @@ let cycleMinSerial = 0x7fffffff; // non-consuming probes, so the frame behaved strictly: a pure function of the // window text, stable under any bar list that stays out of the window. let memoRecFloor = 0x7fffffff; -function barFreeWin(s, m) { +function barFreeWin(s: number, m: number) { const hi = m + 2; for (let i = 0; i < recoverBars.length; i++) { const b = recoverBars[i]; @@ -2855,7 +2880,7 @@ let probing = 0; // group is allowed only once the group consumed past this (committed) — failures // of an uncommitted probe are ordinary "the optional thing isn't there". 
let probeBase = -1; -function missAt(p2) { +function missAt(p2: number) { for (let i = 0; i < recoverBars.length; i++) { const b = recoverBars[i]; if (b > p2 + 2) break; @@ -2863,7 +2888,7 @@ function missAt(p2) { } return false; } -function missTok(t, vs) { +function missTok(t: number, vs?: number) { if (probing !== 0 || pos <= probeBase || recoverFree || !missAt(pos)) return false; const id = finishNode(RID_MISSING, scn); rowStart[id] = vs ? t | (vs << 21) : t; @@ -2881,7 +2906,7 @@ function missTok(t, vs) { // row carrying the rule identity. Same purity rules as missTok. Returns the node // id (not pushed — call sites differ) or -1. const RULE_MISS_BASE = 1 << 20; -function missRule(rid) { +function missRule(rid: number) { if (probing !== 0 || pos <= probeBase || recoverFree || !missAt(pos)) return -1; const id = finishNode(RID_MISSING, scn); rowStart[id] = RULE_MISS_BASE + rid; @@ -2897,11 +2922,11 @@ function missRule(rid) { // Decode a $missing row's packed expected identity (see missTok): bits 21+ carry // the call site's viable-set id; bit 20 marks a missing nonterminal; else a plain // literal int (>0) or a named token kind (<0). -function missLit(v) { +function missLit(v: number) { if (v >= 1 << 21) return v & 0xFFFFF; return v > 0 && v < RULE_MISS_BASE ? v : 0; } -function missEntry(v, kb) { +function missEntry(v: number, kb: number): Diag { let message; if (v >= 1 << 21) message = 'expected ' + VSETS[v >>> 21]; else if (v >= RULE_MISS_BASE) message = 'expected ' + RULE_DISPLAY[v - RULE_MISS_BASE]; @@ -2909,7 +2934,7 @@ function missEntry(v, kb) { else message = "expected '" + (K_NAMES[-v] ?? '?') + "'"; return { offset: kb, end: kb, message }; } -function collectErrRows(id, charBase, tokBase) { +function collectErrRows(id: number, charBase: number, tokBase: number) { if (rowRule[id] === RID_MISSING) { docPar.push(missEntry(rowStart[id], charBase)); return; @@ -2990,16 +3015,16 @@ function rebuildDiagView() { // stray closer beyond balance. 
The shifted lexer resync's dominant q=0 case needs // exactly one fact about the whole old suffix ("no pop-on-empty beyond the // candidate"), which this list answers O(1) instead of an O(suffix) min-build. -let docEmptyPops = []; +let docEmptyPops: number[] = []; // Bar list that built lastRoot (that run's token coords); null = free-fire built // (free-fire decisions are not bar-pure — such a tree is never adoptable while // recovering). Strict trees carry []. -let lastBars = []; +let lastBars: number[] | null = []; // A row replays identically in a recovering run iff its window sees the SAME bars // (shifted) the build run saw there — every recovery decision (hook arming, // missTok/missRule, the cycle sentinel) is position-pure, so window text + window // bars determine the frame's behavior completely. -function barsWindowEq(s, q, ext) { +function barsWindowEq(s: number, q: number, ext: number) { if (lastBars === null) return false; const hiN = s + ext + 2, hiO = q + ext + 2; let i = 0, j = 0; @@ -3013,7 +3038,7 @@ function barsWindowEq(s, q, ext) { i++; j++; } } -function recoverArmed(from, reach) { +function recoverArmed(from: number, reach: number) { // armed iff THE FAILING ELEMENT is stuck at a bar: it starts at/before the bar // and its OWN farthest probe sits ON it (+2 read slack). 
The reach is the // element's frame-local watermark, NOT the global maxPos — a global frontier @@ -3028,7 +3053,7 @@ function recoverArmed(from, reach) { } return false; } -function recoverSkip(canStart, closerT, from0, reach) { +function recoverSkip(canStart: ((p: number) => boolean) | null, closerT: number, from0: number, reach: number) { if (!recoverArmed(from0, reach)) return false; if (pos >= cap) return false; if (closerT >= 0 && tkK[pos] === K_PUNCT && tkT[pos] === closerT) return false; @@ -3055,7 +3080,7 @@ function recoverSkip(canStart, closerT, from0, reach) { // proves the loop's FIRST-set guard true at its position (its first token starts // the rule), and the loop's own continuation checks run again after the run // breaks. Members get no memo entries — a backtracking re-probe just re-adopts. -function runExtend(rid) { +function runExtend(rid: number) { if (rid !== adoptRunRid || memoGenCur !== adoptRunGen) { adoptRunPos = -1; return; } adoptRunPos = -1; const P = adoptRunP; @@ -3100,10 +3125,10 @@ function runExtend(rid) { // re-parse. Prefix kids are kept under the same watermark rule single adoption // uses, made transitive by rowKC: each kid's probe watermark stays at/below the // next kid's start, so checking the LAST kept kid bounds them all. 
-let surgX = [], surgBase = [], surgA = [], surgB = []; +let surgX: number[] = [], surgBase: number[] = [], surgA: number[] = [], surgB: number[] = []; // composed change envelope handed from the text-application step to the window relex let editDmgS = 0, editDmgE = 0; -function rowKCof(id) { +function rowKCof(id: number) { const c = rowKC[id]; if (c !== 0) return c; const cs = rowStart[id], n = rowCount[id]; @@ -3117,7 +3142,7 @@ function rowKCof(id) { rowKC[id] = ok; return ok; } -function trySurgery(dmgA, dmgB, tokD, chrD) { +function trySurgery(dmgA: number, dmgB: number, tokD: number, chrD: number) { if (adoptRoot < 0) return -1; if (rowRule[adoptRoot] >= RID_ERROR) return -1; // A recovery-made tree (rowRM root) CAN take a strict splice when the edit @@ -3240,8 +3265,8 @@ function trySurgery(dmgA, dmgB, tokD, chrD) { if (recTree) { // the strict re-parse stands for the fresh recovering parse of this span only // if no bar window touches anything it read (probes included) - for (let i = 0; i < lastBars.length; i++) { - const b = lastBars[i]; + for (let i = 0; i < lastBars!.length; i++) { + const b = lastBars![i]; const bn = b < dmgA ? b : b + tokD; if (bn + 2 >= s0 && bn <= maxPos + 2) return -1; } @@ -3458,7 +3483,7 @@ function trySurgery(dmgA, dmgB, tokD, chrD) { // The spare token-column buffer set (parseEdited ping-pongs between the live set and // this one, so steady-state edits never allocate columns). 
-let altK = null, altT = null, altOff = null, altEnd = null, altFl = null, altDp = null, altPd = null; +let altK: typeof tkK | null = null, altT: typeof tkT | null = null, altOff: typeof tkOff | null = null, altEnd: typeof tkEnd | null = null, altFl: typeof tkFl | null = null, altDp: typeof tkDp | null = null, altPd: typeof tkPd | null = null; let altCap = 0; let altN = 0; // old-stream token count while a window lex runs (lexCore's resync bound) @@ -3469,9 +3494,28 @@ let altN = 0; // old-stream token count while a window lex runs (lexCore's res // variables are the truth, and is written back only when another doc activates. // Per-PARSE transients (pos/maxPos/scratch/adopt*/surg*) reset on every entry and // are shared safely. -function makeDoc() { +type Diag = { offset: number; end: number; message: string; related?: { offset: number; end: number; message: string } }; +type LexDiag = { offset: number; end: number; kind: number; ch: string }; +type Edit = { start: number; end: number; text: string }; +type Doc = { + tkK: typeof tkK; tkT: typeof tkT; tkOff: typeof tkOff; tkEnd: typeof tkEnd; tkFl: typeof tkFl; tkDp: typeof tkDp; tkPd: typeof tkPd; + tkCap: number; tokN: number; srcLenP1: number; negFrom: number; + rowRule: typeof rowRule; rowLen: typeof rowLen; rowTokLen: typeof rowTokLen; rowStart: typeof rowStart; rowCount: typeof rowCount; rowExt: typeof rowExt; + rowOK: typeof rowOK; rowKC: typeof rowKC; rowNF: typeof rowNF; rowRM: typeof rowRM; absChar: typeof absChar; absTok: typeof absTok; + rowCap: number; nodeN: number; + kids: typeof kids; kidRel: typeof kidRel; kidTokRel: typeof kidTokRel; kidCap: number; kidN: number; + memoNode: number[][]; memoEnd: number[][]; memoExt: number[][]; memoGen: Int32Array[]; memoGenCur: number; + docDiags: Diag[]; docLex: LexDiag[]; docPar: Diag[]; + docPieces: string[] | null; docPieceOff: number[] | null; docLen: number; docFlat: string | null; docCur: number; + rootCharBase: number; rootTokBase: number; lastRoot: 
number; lastRootTok: number; lastBars: number[] | null; docEmptyPops: number[]; +${e.soa ? ' parenCachePos: number; parenCacheStack: boolean[];' : ''} + altK: typeof tkK | null; altT: typeof tkT | null; altOff: typeof tkOff | null; altEnd: typeof tkEnd | null; altFl: typeof tkFl | null; altDp: typeof tkDp | null; altPd: typeof tkPd | null; + altCap: number; altN: number; +}; +type Handle = { d: Doc; gen: number; root: number; errors: Diag[] }; +function makeDoc(): Doc { return { - tkK: new tkK.constructor(4096), tkT: new tkT.constructor(4096), + tkK: new (tkK.constructor as any)(4096), tkT: new (tkT.constructor as any)(4096), tkOff: new Int32Array(4096), tkEnd: new Int32Array(4096), tkFl: new Uint8Array(4096), tkDp: new Uint8Array(4096), tkPd: new Uint16Array(4096), tkCap: 4096, tokN: 0, srcLenP1: 1, negFrom: 0x7fffffff, @@ -3487,13 +3531,13 @@ function makeDoc() { memoNode: [], memoEnd: [], memoExt: [], memoGen: [], memoGenCur: 0, docDiags: [], docLex: [], docPar: [], docPieces: null, docPieceOff: null, docLen: 0, docFlat: null, docCur: 0, - rootCharBase: 0, rootTokBase: 0, lastRoot: -1, lastRootTok: 0, docEmptyPops: [], + rootCharBase: 0, rootTokBase: 0, lastRoot: -1, lastRootTok: 0, lastBars: null, docEmptyPops: [], ${e.soa ? ' parenCachePos: -1, parenCacheStack: [],' : ''} altK: null, altT: null, altOff: null, altEnd: null, altFl: null, altDp: null, altPd: null, altCap: 0, altN: 0, }; } -function saveDoc(d) { +function saveDoc(d: Doc) { d.tkK = tkK; d.tkT = tkT; d.tkOff = tkOff; d.tkEnd = tkEnd; d.tkFl = tkFl; d.tkDp = tkDp; d.tkPd = tkPd; d.tkCap = tkCap; d.tokN = tokN; d.srcLenP1 = srcLenP1; d.negFrom = negFrom; @@ -3511,7 +3555,7 @@ ${e.soa ? 
' d.parenCachePos = parenCachePos; d.parenCacheStack = parenCacheStac d.altK = altK; d.altT = altT; d.altOff = altOff; d.altEnd = altEnd; d.altFl = altFl; d.altDp = altDp; d.altPd = altPd; d.altCap = altCap; d.altN = altN; } -function loadDoc(d) { +function loadDoc(d: Doc) { tkK = d.tkK; tkT = d.tkT; tkOff = d.tkOff; tkEnd = d.tkEnd; tkFl = d.tkFl; tkDp = d.tkDp; tkPd = d.tkPd; tkCap = d.tkCap; tokN = d.tokN; srcLenP1 = d.srcLenP1; negFrom = d.negFrom; @@ -3532,26 +3576,26 @@ ${e.soa ? ' parenCachePos = d.parenCachePos; parenCacheStack = d.parenCacheStac const docDefault = makeDoc(); let curDoc = docDefault; loadDoc(docDefault); -function activate(d) { +function activate(d: Doc) { if (d === curDoc) return; saveDoc(curDoc); loadDoc(d); curDoc = d; } function swapBuffers() { - let x; - x = tkK; tkK = altK; altK = x; - x = tkT; tkT = altT; altT = x; - x = tkOff; tkOff = altOff; altOff = x; - x = tkEnd; tkEnd = altEnd; altEnd = x; - x = tkFl; tkFl = altFl; altFl = x; - x = tkDp; tkDp = altDp; altDp = x; - x = tkPd; tkPd = altPd; altPd = x; + let x: any; + x = tkK; tkK = altK!; altK = x; + x = tkT; tkT = altT!; altT = x; + x = tkOff; tkOff = altOff!; altOff = x; + x = tkEnd; tkEnd = altEnd!; altEnd = x; + x = tkFl; tkFl = altFl!; altFl = x; + x = tkDp; tkDp = altDp!; altDp = x; + x = tkPd; tkPd = altPd!; altPd = x; x = tkCap; tkCap = altCap; altCap = x; } -${e.soa ? '' : 'let altText = [];'} +${e.soa ? '' : 'let altText: string[] = [];'} -function parseCore(source, entryRule) { +function parseCore(source: string, entryRule?: string) { adoptRoot = -1; adoptRunPos = -1; lexInto(source); @@ -3578,7 +3622,7 @@ function parseCore(source, entryRule) { // Parser-diag shift for the LOCALLY-strict paths (surgery / strict success): the // LEXER list is maintained by the window block (which already dropped the re-lexed // range and shifted the suffix — shifting here would double-apply the delta). 
-function shiftDiags(a, b, delta) { +function shiftDiags(a: number, b: number, delta: number) { let w = 0; for (let i = 0; i < docPar.length; i++) { const g = docPar[i]; @@ -3617,7 +3661,7 @@ function shiftDiags(a, b, delta) { // Last-resort totality net: a layer without recovery support threw — the handle // API still never crashes. Zero-width $error root + the thrown message as the // diagnostic; the next successful parse/edit resumes normal service. -function totalNet(e) { +function totalNet(e: any) { // the message lives in the SOURCE layer (docLex kind 4) — a later settle rebuilds // the view from the sources, and a view-only push would be wiped by it docLex.length = 0; @@ -3633,12 +3677,12 @@ function totalNet(e) { rootTokBase = 0; return root; } -function apiMisuse(msg) { - const e = new Error(msg); +function apiMisuse(msg: string) { + const e: any = new Error(msg); e.apiMisuse = true; return e; } -function editCore(entryRule, edits) { +function editCore(entryRule: string | undefined, edits?: Edit[]) { if (edits === undefined || edits.length === 0) { throw apiMisuse('edit() requires the changes: [{ start, end, text }] (LSP-style - each edit in the coordinates of the document AFTER the preceding edits in the array)'); } @@ -3671,14 +3715,16 @@ function editCore(entryRule, edits) { editDmgE = dE; } -${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── // Damage envelope from the composed changes: prefix coordinates are shared, the - // old end comes back through the total delta. + // old end comes back through the total delta. The shared post-fork settle + // (shiftDiags) and the soa window both read these, so they live OUTSIDE the + // lex fork — the non-soa branch reads cs/ceOld/charDelta too. const newLen = docLen; const cs = editDmgS < newLen ? editDmgS : newLen; const ceNew = editDmgE < cs ? cs : editDmgE; const ceOld = ceNew - (newLen - oldLen); const charDelta = newLen - oldLen; +${e.soa ? 
String.raw` // ── M1: WINDOWED re-lex ── // Restart anchor: the last token B ending at/before the damage whose recorded // depths are zero and whose shape carries no cross-token lexer flag (')' control- // head, postfix-ambiguous op). B = -1 restarts at the file head — always sound. @@ -3711,7 +3757,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── } // Lex the window into the spare buffers (the old stream stays live for resync). if (altK === null || altCap < tkCap) { - altK = new tkK.constructor(tkCap); altT = new tkT.constructor(tkCap); + altK = new (tkK.constructor as any)(tkCap); altT = new (tkT.constructor as any)(tkCap); altOff = new Int32Array(tkCap); altEnd = new Int32Array(tkCap); altFl = new Uint8Array(tkCap); altDp = new Uint8Array(tkCap); altPd = new Uint16Array(tkCap); altCap = tkCap; @@ -3720,7 +3766,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── altSuffMin = null; // the old-suffix min-depth cache follows the alt stream swapBuffers(); // live = scratch, alt = OLD stream tokN = 0; - const startOff = B >= 0 ? (altEnd[B] < 0 ? altEnd[B] + srcLenP1 : altEnd[B]) : 0; + const startOff = B >= 0 ? (altEnd![B] < 0 ? altEnd![B] + srcLenP1 : altEnd![B]) : 0; // Window-materialized relex: lexCore reads a SMALL flat slice of the pieces with // an absolute bias; -2 = ran off the window end before resyncing — re-materialize // a larger window and retry (the common case fits the first one). @@ -3736,7 +3782,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── docLex.length = preLexN; // an aborted attempt re-lexes: drop its pushes tokN = 0; try { - R0 = lexCore(windowStr, 0, B >= 0 ? altK[B] : -1, B >= 0 ? altT[B] : 0, r0, ceNew, charDelta, cs, initParens.slice(), startOff, wHi < docLen); + R0 = lexCore(windowStr, 0, B >= 0 ? altK![B] : -1, B >= 0 ? 
altT![B] : 0, r0, ceNew, charDelta, cs, initParens.slice(), startOff, wHi < docLen); } catch (e2) { if (e2 !== LEX_RETRY) { if (recovering) throw e2; // a recovering lexer never throws — a bug @@ -3796,8 +3842,8 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── // p is real damage (compared BEFORE the splice clobbers the old slots). let p = B + 1; { let i = 0; - while (i < W && p < R && altK[i] === tkK[p] && altT[i] === tkT[p] && altOff[i] === tkOff[p] - && altEnd[i] === tkEnd[p] && altFl[i] === tkFl[p]) { i++; p++; } + while (i < W && p < R && altK![i] === tkK[p] && altT![i] === tkT[p] && altOff![i] === tkOff[p] + && altEnd![i] === tkEnd[p] && altFl![i] === tkFl[p]) { i++; p++; } } const dOldEnd = R; const tokenDelta = (B + 1 + W) - R; @@ -3810,9 +3856,9 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── tkFl.copyWithin(B + 1 + W, R, oN); tkDp.copyWithin(B + 1 + W, R, oN); tkPd.copyWithin(B + 1 + W, R, oN); } if (W > 0) { - tkK.set(altK.subarray(0, W), B + 1); tkT.set(altT.subarray(0, W), B + 1); - tkOff.set(altOff.subarray(0, W), B + 1); tkEnd.set(altEnd.subarray(0, W), B + 1); - tkFl.set(altFl.subarray(0, W), B + 1); tkDp.set(altDp.subarray(0, W), B + 1); tkPd.set(altPd.subarray(0, W), B + 1); + tkK.set(altK!.subarray(0, W), B + 1); tkT.set(altT!.subarray(0, W), B + 1); + tkOff.set(altOff!.subarray(0, W), B + 1); tkEnd.set(altEnd!.subarray(0, W), B + 1); + tkFl.set(altFl!.subarray(0, W), B + 1); tkDp.set(altDp!.subarray(0, W), B + 1); tkPd.set(altPd!.subarray(0, W), B + 1); } negFrom = B + 1 + W; srcLenP1 = newLen + 1; @@ -3837,12 +3883,12 @@ ${e.soa ? 
String.raw` // ── M1: WINDOWED re-lex ── const oK = tkK, oT = tkT, oOff = tkOff, oEnd = tkEnd, oFl = tkFl, oN = tokN; const oText = tkText; if (altK === null || altK.length !== tkCap) { - altK = new tkK.constructor(tkCap); altT = new tkT.constructor(tkCap); + altK = new (tkK.constructor as any)(tkCap); altT = new (tkT.constructor as any)(tkCap); altOff = new Int32Array(tkCap); altEnd = new Int32Array(tkCap); altFl = new Uint8Array(tkCap); altDp = new Uint8Array(tkCap); altPd = new Uint16Array(tkCap); } - tkK = altK; tkT = altT; tkOff = altOff; tkEnd = altEnd; tkFl = altFl; - { const _d = tkDp; tkDp = altDp; altDp = _d; const _q = tkPd; tkPd = altPd; altPd = _q; } + tkK = altK!; tkT = altT!; tkOff = altOff!; tkEnd = altEnd!; tkFl = altFl!; + { const _d = tkDp; tkDp = altDp!; altDp = _d; const _q = tkPd; tkPd = altPd!; altPd = _q; } tkText = altText; tkText.length = 0; altK = oK; altT = oT; altOff = oOff; altEnd = oEnd; altFl = oFl; altText = oText; @@ -3851,7 +3897,6 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── // from an earlier totality-net edit would go stale lexInto(flattenDoc()); const nN = tokN; - const charDelta = docLen - oldLen; const minN = oN < nN ? oN : nN; let p = 0; while (p < minN && oK[p] === tkK[p] && oT[p] === tkT[p] && oFl[p] === tkFl[p] @@ -3906,7 +3951,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── shiftDiags(cs, ceOld, charDelta); return sroot; } - let root; + let root!: number; { // recovering may already be true here (the window relex recovered a lex error // and pushed its diagnostics): the first attempt then runs with EMPTY bars — @@ -4008,14 +4053,14 @@ ${e.soa ? 
String.raw` // ── M1: WINDOWED re-lex ── export { tokenize }; // ── Module-level API: the DEFAULT document (one shared session; tokenize and the // raw tree/tokenAt views read the ACTIVE doc — they are gate/debug surfaces) ── -export function parse(source, entryRule) { activate(docDefault); return parseCore(source, entryRule); } -export function parseEdited(entryRule, edits) { activate(docDefault); return editCore(entryRule, edits); } +export function parse(source: string, entryRule?: string) { activate(docDefault); return parseCore(source, entryRule); } +export function parseEdited(entryRule?: string, edits?: Edit[]) { activate(docDefault); return editCore(entryRule, edits); } // Arena reclamation introspection + budget override — TEST HOOKS (issue #45 C1). __arenaStats // reports the live arena, the compacted-size baseline, and how many edits re-parsed to reclaim; // __setArenaBudget lowers the factor/min so a gate can force compaction deterministically. export function __arenaStats() { return { nodeN, kidN, baseline: arenaLiveBaseline, compactions: arenaCompactions, inPlaceShrink: arenaInPlaceShrink }; } -export function __setArenaBudget(factor, min) { arenaCompactFactor = factor; arenaCompactMin = min; } -export function visit(entry, fns, charBase, tokBase) { activate(docDefault); return visitCore(entry, fns, charBase, tokBase); } +export function __setArenaBudget(factor: number, min: number) { arenaCompactFactor = factor; arenaCompactMin = min; } +export function visit(entry: number, fns: _VisitFns, charBase?: number, tokBase?: number) { activate(docDefault); return visitCore(entry, fns, charBase, tokBase); } // ── Handle API: explicit trees over per-instance documents ── // const p = createParser(); const cst = p.parse(text); p.edit(cst, next[, edits]); // The handle is the STABLE IDENTITY of this document's tree: edit() mutates it in @@ -4026,25 +4071,25 @@ export function visit(entry, fns, charBase, tokBase) { activate(docDefault); ret export function 
createParser() { const d = makeDoc(); let gen = 0; - let entryUsed; - const chk = (cst) => { + let entryUsed: string | undefined; + const chk = (cst: Handle | null | undefined) => { if (cst === null || cst === undefined || cst.d !== d) throw new Error('foreign tree handle: it belongs to another parser instance'); if (cst.gen !== gen) throw new Error('stale tree handle: parse() re-opened this document - use the handle from the latest parse()'); }; - const view = {}; + const view: Record any> = {}; for (const k of Object.keys(tree)) { - const f = tree[k]; - view[k] = (a, b) => { activate(d); return f(a, b); }; + const f = (tree as any)[k]; + view[k] = (a: number, b: number) => { activate(d); return f(a, b); }; } return { - parse(source, entryRule) { + parse(source: string, entryRule?: string) { activate(d); entryUsed = entryRule; gen++; // re-opening resets the arena: old handles die regardless of outcome docDiags.length = 0; docLex.length = 0; docPar.length = 0; - let root; + let root!: number; try { root = parseCore(source, entryRule); lastBars = []; @@ -4095,17 +4140,17 @@ export function createParser() { } return { d, gen, root, errors: docDiags }; }, - edit(cst, edits) { + edit(cst: Handle, edits?: Edit[]) { chk(cst); activate(d); try { cst.root = editCore(entryUsed, edits); } catch (e) { - if (e instanceof RangeError || (e && e.apiMisuse)) throw e; + if (e instanceof RangeError || (e && (e as any).apiMisuse)) throw e; cst.root = totalNet(e); } }, - visit(cst, fns) { chk(cst); activate(d); return visitCore(cst.root, fns); }, + visit(cst: Handle, fns: _VisitFns) { chk(cst); activate(d); return visitCore(cst.root, fns); }, tree: view, }; } diff --git a/src/emit-portable.ts b/src/emit-portable.ts new file mode 100644 index 0000000..4bbf760 --- /dev/null +++ b/src/emit-portable.ts @@ -0,0 +1,389 @@ +// ── emit-portable ── +// +// The target-agnostic emitter (issue #6). 
`emitParser(grammar, target)` (see emit.ts) derives +// a COMPLETE, self-contained parser in the target's language from the same CstGrammar the +// TS engine uses. It is the agnosticism proof: ONE analysis → ONE intermediate form (IR) +// → N language renderings, all producing the byte-identical CST the interpreter does. +// +// SHARED + target-agnostic (here): the grammar ANALYSIS (reused from grammar-analysis.ts), +// the LEXER specs (derived from token-pattern.ts's structural recognizers — char runs, +// quote-delimited strings, line/block comments — so NO regex engine is needed and the +// emitted Go/Rust compile offline), and `buildIR` — the parse plan as plain data +// (recursive-descent rules as alternative step-lists; the Pratt rule as NUD atoms/brackets/ +// prefix + binary tables + mixfix LEDs). PER-TARGET (a Target): `render(ir)` — the +// language's lexer + CST runtime + the rendering of each IR node. Adding a language is +// implementing one Target. +// +// SCOPE: char-run / quote-string / line+block-comment tokens; recursive descent with +// backtracking alternation, `*`/`?` quantifiers, `sep`, and inline literal-alternation; +// and a Pratt expression engine with operator precedence/associativity, prefix unary, +// bracket NUDs (grouping, array), and mixfix LEDs (call / member / index) tried before +// operators. buildIR THROWS on a construct outside this set rather than emit a wrong +// parser. This is enough to derive a real JavaScript-subset parser (test/fixtures/minijs.ts). 
import type { CstGrammar, RuleExpr, TokenDecl, TokenPattern } from './types.ts';
import { withAwaitYield } from './await-yield-fork.ts';
import { analyzeGrammar, findEntryRule } from './grammar-analysis.ts';
import { collectLiterals, isKeywordLiteral } from './grammar-utils.ts';
import {
  tokenPatternCharLoop, tokenPatternQuoteDelimAndEscape,
  tokenPatternBlockDelimiters, tokenPatternLiteralPrefix,
} from './token-pattern.ts';

// ── Intermediate representation (plain data; every Target renders THIS) ──

export type CharRange = [number, number]; // inclusive char-code range

/** One lexer token spec: a fast-path shape, or the general `pattern` fallback. */
export type LexTok =
  | { kind: 'run'; name: string; first: CharRange[]; cont: CharRange[]; skip: boolean } // ident/number char run
  | { kind: 'string'; name: string; delim: string; skip: boolean } // delim..delim, `\` escapes next
  | { kind: 'line'; name: string; prefix: string; skip: boolean } // prefix..end-of-line
  | { kind: 'block'; name: string; open: string; close: string; skip: boolean } // open..close
  // The general case: the raw token-pattern AST, compiled to a backtracking-free matcher
  // by the target (no regex engine). Subsumes the fast paths above; used for the token
  // shapes they don't cleanly recognise (escaped identifiers, the number family, …).
  | { kind: 'pattern'; name: string; pattern: TokenPattern; skip: boolean };

export type Lit = { value: string; ttype: '$keyword' | '$punct' };

/** One step of a parse plan; a rule alternative is a list of these. */
export type Step =
  | { t: 'lit'; value: string; ttype: '$keyword' | '$punct' } // match a literal by text
  | { t: 'tok'; name: string } // match a token kind
  | { t: 'rule'; name: string } // call a rule, append its node
  | { t: 'ruleBp'; name: string; bp: number } // call a Pratt rule at a given binding power (chain-rhs led trailing operand)
  | { t: 'star'; step: Step } // repeat inner 0+
  | { t: 'opt'; steps: Step[] } // optional sub-sequence
  | { t: 'sep'; elem: Step; delim: string } // elem (delim elem)*
  | { t: 'altlit'; opts: Lit[] } // inline alternation of literals (fast path)
  | { t: 'alt'; branches: Step[][] } // inline alternation of sub-sequences (backtracking)
  | { t: 'not'; steps: Step[] } // zero-width negative lookahead (consumes nothing)
  | { t: 'seq'; steps: Step[] } // a grouped sub-sequence (e.g. a star body `(',' Expr)`)
  | { t: 'sameLine' } // zero-width: the next token is on the same line (no preceding newline)
  | { t: 'suppress'; connectors: string[]; steps: Step[] }; // parse the body with these LED connectors disabled (no-`in` context)
export type Alt = Step[];

export type RdRule = { kind: 'rd'; name: string; cstName: string; alts: Alt[] };
export type Bracket = { first: string; steps: Step[] }; // a literal-led sequence (grouping/array; LED call/index)

/**
 * The plan for a rule parsed as atom-then-continuation. The `led*` arrays are PARALLEL
 * to `leds`: index i of each describes `leds[i]`.
 */
export type PrattRule = {
  kind: 'pratt';
  name: string; // the (possibly $A/$Y-forked) rule name — used for the parse fn names
  cstName: string; // the CANON name — the CST node label (a fork collapses to its base)
  nudToks: string[]; // NUD: a bare token wrapped in a node
  nudBrackets: Bracket[]; // NUD: '(' … ')' / '[' … ']'
  nudSeqs: Step[][]; // NUD: a general sequence (guarded ident, class expr), tried with backtracking
  nudCapped: Array<{ steps: Step[]; capBp: number }>; // NUD: an assignment-level capped sequence (arrow function) — parsed only when minBp < capBp, admits no led
  prefix: Array<{ op: string; rbp: number }>; // NUD: prefix op then operand at rbp
  binary: Array<{ op: string; lbp: number; rbp: number }>; // LED: infix op, bind iff lbp > minBp, rhs at rbp
  leds: Bracket[]; // LED: mixfix continuation (call/member/index), tried before operators
  ledAccessTail: boolean[]; // parallel to leds: a "closed punct-connector" tail (member/call/index) — disabled once a postfix binds
  ledLbp: Array<number | null>; // parallel to leds: precedence gate (ternary/in/instanceof) — bind only when lbp > minBp; null = bind maximally tight
  ledSameLine: boolean[]; // parallel to leds: a leading `sameLine` guard (TS type tails) — the connector must be on the operand's line
  ledNotLeftLeaf: Array<string[] | null>; // parallel to leds: skip this led when the left node's head-leaf text is in this set (`void.x` etc.)
  postfixToks: string[]; // LED: a postfix token `$ X` (e.g. a tagged template), tried like a mixfix led (also an access tail)
  postfix: Array<{ op: string; lbp: number }>; // LED: a postfix operator `$ ++` — binds iff lbp > minBp + !tailClosed, no rhs, closes the tail
};
export type RuleIR = RdRule | PrattRule;

// Stateful regex-vs-division disambiguation (the JS `/` problem): a `/` starts a regex
// literal in expression context but is division after a value. The lexer threads the
// previous token + a control-head paren stack to decide; the predicate sets are baked
// from the grammar's `regexContext`. Mirrors gen-lexer.ts's prevIsValue exactly.
export type RegexCtx = {
  regexToken: string; // the token flagged `regex`, gated on expression context
  identToken: string; // identifier token kind (for the keyword-vs-value test)
  divisionTypes: string[]; // prev TOKEN KINDS after which `/` is division
  divisionTexts: string[]; // prev TEXTS after which `/` is division
  regexTexts: string[]; // expression-start keywords (a `/` after them is a regex)
  parenHeadKw: string[]; // keywords whose `(` is a control head (regex after its `)`)
  memberAccess: string[]; // accessors that make a following keyword a member name, not a head
  postfixAfterValue: string[]; // ambiguous postfix/prefix ops (e.g. `!`): value only in postfix
};

// Template literals with `${…}` interpolation: a STATEFUL lexer split. A `` ` `` opens a
// span scanned to the next `${` (→ $templateHead) or closing `` ` `` (→ the whole token,
// no substitution); a `}` that closes a hole resumes the span (→ $templateMiddle / Tail).
// A `templateStack` of brace-depths tracks which `}` closes the hole vs. a nested `{…}`.
// The parser assembles head·expr·(middle·expr)*·tail into a synthetic `$template` node.
export type TplCfg = {
  token: string; // the token flagged `template`; its NoSubstitution form is a plain leaf
  open: string; // `` ` ``
  interpOpen: string; // `${`
  interpClose: string; // `}`
  braceOpen: string; // `{` — a nested one deepens the hole, so its `}` is not the closer
  interpRule: string; // the rule that parses each `${…}` hole (the Pratt expression rule)
};

/** The complete target-agnostic parse plan that every portable Target renders. */
export type ParserIR = {
  grammarName: string;
  entry: string;
  tokens: LexTok[]; // for the char scanner, tried in declaration order
  puncts: string[]; // punctuation literals, longest-first (maximal munch)
  rules: RuleIR[];
  regexCtx: RegexCtx | null; // null unless the grammar has a regex token with context
  tpl: TplCfg | null; // null unless the grammar has a template token
};

// The target-agnostic parse plan for a grammar.
// Applies the [Await]/[Yield] context fork
// exactly as createParser does (so `await`/`yield` are keywords inside async/generator bodies
// and identifiers outside — name-forked into $A/$Y/$AY rule families), then builds the IR each
// portable Target (ts/go/rust) renders. The `Target` contract itself lives in emit.ts.
export function portableIR(grammar: CstGrammar): ParserIR {
  return buildIR(withAwaitYield(grammar));
}

// ── buildIR: grammar + analysis → the target-agnostic parse plan ──

/**
 * Turn a (forked) grammar into the plain-data parse plan.
 *
 * @param grammar the grammar to plan; `portableIR` passes the await/yield-forked one
 * @returns lexer specs, punctuation table, per-rule plans and the optional regex/template configs
 * @throws Error when a rule uses a construct outside the portable scope, or when a template
 *   token is declared but no rule was built as a Pratt rule
 */
function buildIR(grammar: CstGrammar): ParserIR {
  const a = analyzeGrammar(grammar);
  const tokenNames = a.tokenNames;

  const tokens: LexTok[] = grammar.tokens.map((t) => lexTok(t));

  // Every literal the grammar mentions (rule bodies + operator tables); the non-keyword
  // ones become the punctuation table, longest first so the scanner munches maximally.
  const lits = new Set<string>();
  for (const r of grammar.rules) for (const l of collectLiterals(r.body)) lits.add(l);
  for (const lv of grammar.precs) for (const o of lv.operators) lits.add(o.value);
  const puncts = [...lits].filter((l) => !isKeywordLiteral(l)).sort((x, y) => y.length - x.length);

  const litTtype = (v: string): '$keyword' | '$punct' => (isKeywordLiteral(v) ? '$keyword' : '$punct');

  // RuleExpr → Step. A `ref` to a token name matches that token kind; any other `ref`
  // (including a self-reference) is a fresh rule call.
  function stepOf(e: RuleExpr): Step {
    switch (e.type) {
      case 'literal': return { t: 'lit', value: e.value, ttype: litTtype(e.value) };
      case 'ref': return tokenNames.has(e.name) ? { t: 'tok', name: e.name } : { t: 'rule', name: e.name };
      case 'group': { // transparent (ctxMode is invisible to the portable parser)
        const ss = altSteps(e.body);
        if (e.suppress && e.suppress.length) return { t: 'suppress', connectors: e.suppress, steps: ss }; // no-`in` context
        return ss.length === 1 ? ss[0] : { t: 'seq', steps: ss };
      }
      case 'not': return { t: 'not', steps: altSteps(e.body) }; // zero-width negative lookahead
      case 'sameLine': return { t: 'sameLine' }; // zero-width no-newline assertion
      case 'seq': return { t: 'seq', steps: e.items.map(stepOf) }; // grouped sub-sequence (star/sep body)
      case 'sep': return { t: 'sep', elem: stepOf(e.element), delim: e.delimiter };
      case 'quantifier':
        if (e.kind === '*') return { t: 'star', step: stepOf(e.body) };
        if (e.kind === '?') return { t: 'opt', steps: altSteps(e.body) };
        if (e.kind === '+') return { t: 'seq', steps: [stepOf(e.body), { t: 'star', step: stepOf(e.body) }] }; // x+ = x x*
        break; // any other quantifier kind falls through to the out-of-scope throw
      case 'alt': {
        if (e.items.every((it) => it.type === 'literal')) { // fast path: all-literal alternation
          return {
            t: 'altlit',
            opts: e.items.map((it) => {
              const value = (it as { value: string }).value;
              return { value, ttype: litTtype(value) };
            }),
          };
        }
        return { t: 'alt', branches: e.items.map(altSteps) }; // general: backtracking over sub-sequences
      }
    }
    throw new Error(`portable: rd construct '${e.type}' not in scope`);
  }
  // A `seq` flattens into its items' steps; anything else is a one-step list.
  function altSteps(e: RuleExpr): Step[] {
    if (e.type === 'seq') return e.items.map(stepOf);
    return [stepOf(e)];
  }

  const rules: RuleIR[] = grammar.rules.map((r) => {
    const cstName = (r as { canon?: string }).canon ?? r.name; // a forked $A/$Y rule labels its CST node with the base name
    // Pratt rules AND left-recursive non-Pratt rules (e.g. NewTarget, TS Type) both parse as
    // atom-then-continuation: buildPratt detects `startsSelf` and splits accordingly, so routing
    // left-recursive rules through it avoids the infinite left-recursion a plain rd rule would hit.
    if (a.prattRules.has(r.name) || a.leftRecSet.has(r.name)) return buildPratt(r.name, cstName, r.body, a, stepOf, altSteps, litTtype);
    return { kind: 'rd', name: r.name, cstName, alts: r.body.type === 'alt' ? r.body.items.map(altSteps) : [altSteps(r.body)] };
  });

  // Regex-vs-division context (only if the grammar declares a regex token + config).
  let regexCtx: RegexCtx | null = null;
  const rxTok = grammar.tokens.find((t) => t.flags.includes('regex'));
  const rxCfg = grammar.tokens.find((t) => t.regexContext)?.regexContext;
  if (rxTok && rxCfg) {
    regexCtx = {
      regexToken: rxTok.name,
      identToken: grammar.tokens.find((t) => t.identifier)?.name ?? '',
      divisionTypes: [...(rxCfg.divisionAfterTypes ?? [])],
      divisionTexts: [...(rxCfg.divisionAfterTexts ?? [])],
      regexTexts: [...(rxCfg.regexAfterTexts ?? [])],
      parenHeadKw: [...(rxCfg.regexAfterParenKeywords ?? [])],
      memberAccess: [...(rxCfg.memberAccessTexts ?? [])],
      postfixAfterValue: [...(rxCfg.postfixAfterValueTexts ?? [])],
    };
  }

  // Template literals (only if the grammar declares a template token). The interpolation
  // holes are parsed by the Pratt expression rule — the rule that carries operator leds.
  let tpl: TplCfg | null = null;
  const tplTok = grammar.tokens.find((t) => t.template);
  if (tplTok && tplTok.template) {
    // NOTE(review): this takes the FIRST rule built with kind 'pratt' in declaration order.
    // Left-recursive non-Pratt rules are also built as kind 'pratt' (see the routing above),
    // so this presumably relies on the expression rule being declared first — confirm.
    const prattName = rules.find((r) => r.kind === 'pratt')?.name;
    if (!prattName) throw new Error('portable: a template token needs a Pratt expression rule to parse its interpolations');
    tpl = {
      token: tplTok.name,
      open: tplTok.template.open,
      interpOpen: tplTok.template.interpOpen,
      interpClose: tplTok.template.interpClose,
      braceOpen: tplTok.template.interpOpen.slice(-1),
      interpRule: prattName,
    };
  }

  // The [Await]/[Yield] fork names rules `Expr$A`/`Expr$Y` — `$` is a valid TS identifier but
  // NOT a Go/Rust one. Sanitize every rule-IDENTIFIER use (`$`→`_`) for the emitted parse-fn
  // names; the CST node label (cstName) keeps the canon base name, so the tree is unchanged.
  const san = (n: string) => n.replace(/\$/g, '_');
  const sanStep = (s: Step): void => {
    if (s.t === 'rule' || s.t === 'ruleBp') s.name = san(s.name);
    else if (s.t === 'star') sanStep(s.step);
    else if (s.t === 'opt' || s.t === 'not' || s.t === 'seq' || s.t === 'suppress') s.steps.forEach(sanStep);
    else if (s.t === 'sep') sanStep(s.elem);
    else if (s.t === 'alt') s.branches.forEach((b) => b.forEach(sanStep));
  };
  for (const r of rules) {
    r.name = san(r.name);
    if (r.kind === 'rd') r.alts.forEach((alt) => alt.forEach(sanStep));
    else {
      r.nudBrackets.forEach((b) => b.steps.forEach(sanStep));
      r.nudSeqs.forEach((seq) => seq.forEach(sanStep));
      r.nudCapped.forEach((c) => c.steps.forEach(sanStep));
      r.leds.forEach((b) => b.steps.forEach(sanStep));
    }
  }
  if (tpl) tpl.interpRule = san(tpl.interpRule);

  return { grammarName: grammar.name ?? 'grammar', entry: san(findEntryRule(grammar)), tokens, puncts, rules, regexCtx, tpl };
}

// Classify a token: a fast-path shape (run/string/line/block) when one cleanly matches,
// otherwise the general `pattern` matcher. The fast paths keep the common simple tokens
// (and the calc/minijs grammars) on tight, readable scan code in every target.
function lexTok(t: TokenDecl): LexTok {
  const name = t.name;
  const skip = t.flags.includes('skip');

  // Recognizers are consulted in a fixed order; the first one that matches wins.
  const quoted = tokenPatternQuoteDelimAndEscape(t);
  if (quoted) {
    return { kind: 'string', name, delim: quoted.delim, skip };
  }

  const delims = tokenPatternBlockDelimiters(t);
  if (delims) {
    return { kind: 'block', name, open: delims[0], close: delims[1], skip };
  }

  // A char loop only qualifies as a plain run when it has no bail set at all.
  const run = tokenPatternCharLoop(t);
  if (run && run.bail.length === 0 && !run.bailNonAscii) {
    const first = codesToRanges(run.first);
    const cont = codesToRanges(run.cont);
    return { kind: 'run', name, first, cont, skip };
  }

  const prefix = lineCommentShape(t.pattern); // PRECISE: prefix-literal then star(non-newline)
  if (prefix) {
    return { kind: 'line', name, prefix, skip };
  }

  // Nothing matched cleanly: hand the raw pattern to the target's general matcher.
  return { kind: 'pattern', name, pattern: t.pattern, skip };
}

/**
 * Recognize the line-comment shape: `seq(<prefix literal>, star(charClass excluding \n))`.
 *
 * @param p the token's pattern
 * @returns the prefix literal when the pattern has exactly that shape, otherwise `null`
 */
function lineCommentShape(p: TokenPattern): string | null {
  if (typeof p === 'string') return null;
  if (p.type !== 'seq') return null;
  if (p.items.length !== 2) return null;

  const prefix = p.items[0];
  const rest = p.items[1];
  if (typeof prefix !== 'string') return null;
  if (typeof rest === 'string') return null;
  if (rest.type !== 'repeat' || rest.min !== 0) return null;

  const cls = rest.body;
  if (typeof cls === 'string') return null;
  if (cls.type !== 'charClass' || !cls.negate) return null;

  // The negated class must list the newline character itself.
  const listsNewline = cls.items.some((item): boolean => item.type === 'char' && item.value === '\n');
  if (!listsNewline) return null;
  return prefix;
}

/**
 * Collapse a bag of char codes into sorted, inclusive ranges of consecutive codes.
 *
 * @param codes char codes, in any order, duplicates allowed
 * @returns ascending ranges; adjacent codes share one range
 */
function codesToRanges(codes: number[]): CharRange[] {
  const ascending = Array.from(new Set(codes)).sort((lo, hi) => lo - hi);
  const ranges: CharRange[] = [];
  let open: CharRange | undefined;
  for (const code of ascending) {
    if (open !== undefined && code === open[1] + 1) {
      open[1] = code; // extends the range currently being built
    } else {
      open = [code, code];
      ranges.push(open);
    }
  }
  return ranges;
}

// A Pratt rule's alternatives → NUD atoms/brackets/prefix + binary + mixfix LEDs.
// Binding powers come from the analysis (opTable/prefixOps), single-sourced with the interpreter.
/**
 * Split a rule's alternatives into the Pratt plan: alternatives that do NOT start with a
 * self-reference become NUDs, those that do become LEDs.
 *
 * @param name the rule's name (a self-reference is a `ref` with this name)
 * @param cstName the CST node label to record on the plan
 * @param body the rule body; an `alt` contributes one alternative per item
 * @param a the grammar analysis (token names, operator tables, led precedences)
 * @param stepOf the plain RuleExpr → Step conversion, used for shapes not handled here
 * @param altSteps unused here; kept so the call signature stays unchanged
 * @param litTtype classifies a literal as `$keyword` or `$punct`
 * @returns the assembled Pratt plan
 * @throws Error for an empty alternative, a `capBelow` connector without a binding power,
 *   or a LED shape outside the portable scope
 */
function buildPratt(
  name: string, cstName: string, body: RuleExpr, a: ReturnType<typeof analyzeGrammar>,
  stepOf: (e: RuleExpr) => Step, altSteps: (e: RuleExpr) => Step[],
  litTtype: (v: string) => '$keyword' | '$punct',
): PrattRule {
  const alts = body.type === 'alt' ? body.items : [body];
  const nudToks: string[] = [];
  const nudBrackets: Bracket[] = [];
  const nudSeqs: Step[][] = [];
  const nudCapped: Array<{ steps: Step[]; capBp: number }> = [];
  let sawPrefix = false, sawBinary = false, sawPostfix = false;
  const leds: Bracket[] = [];
  const ledAccessTail: boolean[] = [];
  const ledLbp: Array<number | null> = [];
  const ledSameLine: boolean[] = [];
  const ledNotLeftLeaf: Array<string[] | null> = [];
  const postfixToks: string[] = [];
  const startsWithLetter = /^[A-Za-z]/; // hoisted: loop-invariant word-connector test
  for (const alt of alts) {
    let items = alt.type === 'seq' ? alt.items : [alt];
    // An empty alternative has no head to classify — report it instead of crashing on items[0].
    if (items.length === 0) throw new Error(`portable: empty alternative not in scope (rule ${name})`);
    // A left-recursive continuation may carry a leading `notLeftLeaf(words)` head-leaf guard
    // before the self `$` — strip it and attach the word set to the led it produces.
    let nllWords: string[] | null = null;
    if (items[0].type === 'notLeftLeaf' && items[1]?.type === 'ref' && items[1].name === name) {
      nllWords = items[0].words; items = items.slice(1);
    }
    const startsSelf = items[0].type === 'ref' && items[0].name === name;
    if (!startsSelf) {
      // NUD
      if (items.length === 1 && items[0].type === 'ref' && a.tokenNames.has(items[0].name)) { nudToks.push(items[0].name); continue; }
      if (items[0].type === 'prefix') { sawPrefix = true; continue; }
      // A capExpr (arrow function): an assignment-level group{capBelow}. ctxMode in its body
      // is treated as transparent (the await/yield fork is not modelled in the portable parser).
      if (items.length === 1 && items[0].type === 'group' && items[0].capBelow !== undefined) {
        const capBp = a.nudCapOf(items[0]);
        if (capBp === null) throw new Error(`portable: capBelow connector '${items[0].capBelow}' has no binding power (rule ${name})`);
        const b = items[0].body;
        nudCapped.push({ steps: (b.type === 'seq' ? b.items : [b]).map((it) => stepOfPratt(it)), capBp });
        continue;
      }
      if (items[0].type === 'literal') { nudBrackets.push({ first: items[0].value, steps: items.map((it) => stepOfPratt(it)) }); continue; }
      // A single transparent (non-suppress) group unwraps to its body (an explicit grouping).
      let nudItems = items;
      if (items.length === 1 && items[0].type === 'group' && !items[0].suppress) {
        nudItems = items[0].body.type === 'seq' ? items[0].body.items : [items[0].body];
      }
      nudSeqs.push(nudItems.map((it) => stepOfPratt(it))); // general NUD sequence (guarded ident, class expr)
      continue;
    }
    // LED (starts with self): `$ op $` (binary, op slot + trailing self) or `$ …` (mixfix)
    const restAll = items.slice(1);
    const hasSameLine = restAll[0]?.type === 'sameLine'; // a TS type tail: `$ sameLine '<' …`
    const rest = hasSameLine ? restAll.slice(1) : restAll;
    // `rest` is empty for a bare self-reference (or `$ sameLine`): the optional chaining lets
    // that case reach the descriptive throw at the end of the loop body instead of a TypeError.
    if (!hasSameLine && rest[0]?.type === 'op') { sawBinary = true; continue; }
    if (!hasSameLine && rest[0]?.type === 'postfix') { sawPostfix = true; continue; } // postfix operator (`x++`)
    if (rest[0]?.type === 'literal') {
      const conn = rest[0].value;
      const prec = a.ledPrecByConnector.get(conn); // { lbp, rhsBp } for ternary/in/instanceof
      const steps = rest.map((it) => stepOfPratt(it));
      const last = steps[steps.length - 1];
      const lastIsOperand = last !== undefined && last.t === 'rule' && last.name === name; // open binary/ternary operand
      // chain-rhs (`in`/`instanceof`): the trailing self-operand parses at the level's bp (left-chain).
      if (prec && prec.rhsBp !== null && lastIsOperand) steps[steps.length - 1] = { t: 'ruleBp', name, bp: prec.rhsBp };
      const wordConnector = startsWithLetter.test(conn); // `in`/`instanceof`/`as` — not a tail
      // The five pushes below stay in lock-step: the led* arrays are parallel to `leds`.
      leds.push({ first: conn, steps });
      ledAccessTail.push(!lastIsOperand && !wordConnector);
      ledLbp.push(prec ? prec.lbp : null);
      ledSameLine.push(hasSameLine);
      ledNotLeftLeaf.push(nllWords);
      continue;
    }
    if (rest.length === 1 && rest[0].type === 'ref' && a.tokenNames.has(rest[0].name)) { postfixToks.push(rest[0].name); continue; } // postfix token (tagged template)
    throw new Error(`portable: Pratt LED shape not in scope (rule ${name})`);
  }
  // a self-ref inside a NUD/LED sub-sequence is a fresh parse of this rule
  function stepOfPratt(e: RuleExpr): Step {
    if (e.type === 'ref' && e.name === name) return { t: 'rule', name };
    if (e.type === 'seq') return { t: 'seq', steps: e.items.map(stepOfPratt) };
    if (e.type === 'sameLine') return { t: 'sameLine' };
    if (e.type === 'not') return { t: 'not', steps: (e.body.type === 'seq' ? e.body.items : [e.body]).map(stepOfPratt) };
    if (e.type === 'group' && e.suppress && e.suppress.length) return { t: 'suppress', connectors: e.suppress, steps: (e.body.type === 'seq' ? e.body.items : [e.body]).map(stepOfPratt) };
    // ctxMode (await/yield) is transparent to the portable parser (no fork); unwrap the group.
    if (e.type === 'group' && !e.capBelow) {
      return e.body.type === 'seq' ? { t: 'seq', steps: e.body.items.map(stepOfPratt) } : stepOfPratt(e.body);
    }
    if (e.type === 'sep') return { t: 'sep', elem: stepOfPratt(e.element), delim: e.delimiter };
    if (e.type === 'quantifier' && e.kind === '?') return { t: 'opt', steps: (e.body.type === 'seq' ? e.body.items : [e.body]).map(stepOfPratt) };
    if (e.type === 'quantifier' && e.kind === '*') return { t: 'star', step: stepOfPratt(e.body) };
    if (e.type === 'quantifier' && e.kind === '+') return { t: 'seq', steps: [stepOfPratt(e.body), { t: 'star', step: stepOfPratt(e.body) }] };
    if (e.type === 'literal') return { t: 'lit', value: e.value, ttype: litTtype(e.value) };
    return stepOf(e);
  }
  // The operator tables are only emitted when the rule actually has the matching slot.
  const prefix = sawPrefix ? [...a.prefixOps.entries()].map(([op, info]) => ({ op, rbp: info.rbp })) : [];
  const binary = sawBinary
    ? [...a.opTable.entries()].filter(([, info]) => info.position === 'infix').map(([op, info]) => ({ op, lbp: info.lbp, rbp: info.rbp }))
    : [];
  const postfix = sawPostfix
    ? [...a.opTable.entries()].filter(([, info]) => info.position === 'postfix').map(([op, info]) => ({ op, lbp: info.lbp }))
    : [];
  return { kind: 'pratt', name, cstName, nudToks, nudBrackets, nudSeqs, nudCapped, prefix, binary, leds, ledAccessTail, ledLbp, ledSameLine, ledNotLeftLeaf, postfixToks, postfix };
}
diff --git a/src/emit.ts b/src/emit.ts
new file mode 100644
index 0000000..1513fb5
--- /dev/null
+++ b/src/emit.ts
@@ -0,0 +1,33 @@
// The emit layer's public surface: exactly two APIs, both parameterized by a `Target`.
//
//   emitLexer(grammar, target)  → the lexer source for that target
//   emitParser(grammar, target) → the parser source for that target, REUSING emitLexer
//
// A `Target` owns BOTH halves, so emitParser(grammar, target) reuses the SAME target's lexer —
// jsTarget's parser embeds jsTarget's SoA-int lexer, goTarget's parser embeds goTarget's
// Tok-list lexer. No cross-target lexer format is shared, so the optimized JS path keeps its
// integer-bitmask token dispatch while the portable targets keep their clean byte scanner.
//
// Targets: `jsTarget` (the optimized SoA parser, emit-parser.ts) and the portable
// `tsTarget`/`goTarget`/`rustTarget` (emit-portable.ts + target-*.ts).
+import type { CstGrammar } from './types.ts'; + +export interface Target { + name: string; + ext: string; // emitted file extension (no dot) + emitLexer(grammar: CstGrammar): string | null; // null ⇒ runtime-lexer fallback (jsTarget markup/indent grammars) + emitParser(grammar: CstGrammar, lexerSrc: string | null): string; // the parser, embedding `lexerSrc` +} + +export function emitLexer(grammar: CstGrammar, target: Target): string | null { + return target.emitLexer(grammar); +} + +export function emitParser(grammar: CstGrammar, target: Target): string { + return target.emitParser(grammar, emitLexer(grammar, target)); // ← parser reuses lexer +} + +export { jsTarget } from './emit-parser.ts'; +export { tsTarget } from './target-ts.ts'; +export { goTarget } from './target-go.ts'; +export { rustTarget } from './target-rust.ts'; diff --git a/src/gen-ast-types.ts b/src/gen-ast-types.ts deleted file mode 100644 index d76d124..0000000 --- a/src/gen-ast-types.ts +++ /dev/null @@ -1,277 +0,0 @@ -// Generate a TypeScript `.d.ts`-style source describing the *typed* CST that -// `createParser(grammar).parse()` (gen-parser.ts) produces for THIS grammar. -// -// The runtime CST is generic — `CstNode.rule` and `CstLeaf.tokenType` are both -// `string`. This generator specialises those `string`s into the actual set of -// rule names / token types the grammar can yield, so a consumer gets: -// * a discriminated union `Node` keyed on the `rule` literal (exhaustive switch) -// * a `TokenType` union for `CstLeaf.tokenType` -// * per-rule structural typing of `children` (which child kinds can appear) -// -// Everything is DERIVED from the grammar (CstGrammar) — nothing TypeScript- or -// language-specific is hardcoded, matching the engine's language-agnostic rule. -// Field NAMES are deliberately absent: the grammar has no labels on elements -// (e.g. `[$, '.', Ident]`), so children are typed positionally-by-kind, not as -// named accessors. 
See the note emitted into the output + the report. - -import type { CstGrammar, RuleExpr } from './types.ts'; -import { isKeywordLiteral } from './grammar-utils.ts'; - -// The synthetic leaf/node `tokenType`s the lexer + parser emit in addition to -// the grammar's declared token names. Kept in sync with gen-lexer.ts / gen-parser.ts -// (grep the literal `$...` strings there). `$template` is a *node* rule the parser -// builds for interpolated templates, but it surfaces in `CstChild` positions and -// as a `tokenType` is harmless to include; we also emit a `$template` Node below. -const SYNTHETIC_LEAF_TYPES = [ - '$keyword', // matchLiteral: keyword-shaped literal - '$punct', // matchLiteral: punctuation literal - '$operator', // Pratt: infix/prefix/postfix operator leaf - '$templateHead', // lexer: template up to first `${` - '$templateMiddle', // lexer: `}` … `${` - '$templateTail', // lexer: `}` … closing backtick -] as const; - -// `$template` is the synthetic *node* the parser emits for an interpolated -// template literal (gen-parser.ts parseTemplateExpr). -const SYNTHETIC_TEMPLATE_NODE = '$template'; - -/** A child element a node can contain: either a Node (by rule) or a Leaf (by token type). */ -type ChildKind = - | { kind: 'node'; rule: string } - | { kind: 'leaf'; tokenType: string }; - -/** - * Generate the typed-CST TypeScript source for `grammar`. - * Returns a self-contained module string (no imports) suitable for writing to a - * `.ts`/`.d.ts` file and `import`ing or type-checking. - */ -export function generateAstTypes(grammar: CstGrammar): string { - // The grammar's template token (if any): a ref to it can yield either a plain - // leaf of that token type OR a `$template` node (parseTemplateExpr) / a - // `$templateHead` leaf — mirror gen-parser's matchExpr 'ref' branch. - const templateTokenNames = new Set(grammar.tokens.filter(t => t.template).map(t => t.name)); - - // ── 1. 
Token-type union ── - // Declared token names + the synthetic leaf types the engine injects. - const leafTokenTypes = [ - ...grammar.tokens.map(t => t.name).sort(), - ...SYNTHETIC_LEAF_TYPES, - ]; - - // ── 2/3. Per-rule child kinds ── - // For each rule, the set of child element kinds a node of that rule can hold, - // collapsed across the rule body's structure. The parser flattens quantifiers, - // `sep`, `alt`, and `group` straight into the parent's `children` array, so a - // node's children are a *sequence drawn from* this set (not a fixed tuple). - const childKindsByRule = new Map(); - for (const rule of grammar.rules) { - childKindsByRule.set(rule.name, deriveChildKinds(rule.name, grammar)); - } - - // ── Emit ── - const lines: string[] = []; - lines.push('// AUTO-GENERATED by src/gen-ast-types.ts — do not edit by hand.'); - lines.push('// Typed CST for the parser output of this grammar.'); - lines.push('//'); - lines.push('// LIMITATION — no named fields: the grammar carries no labels on rule'); - lines.push('// elements (e.g. `[$, \'.\', Ident]` has no field names), so children are'); - lines.push('// typed by *kind* (which Node rules / leaf token types can appear), not as'); - lines.push('// named accessors like `node.name`/`node.value`. Named-field accessors would'); - lines.push('// require adding field labels to the grammar DSL (a future enrichment).'); - lines.push(''); - - // Position info (mirrors CstNode/CstLeaf in gen-parser.ts). - lines.push('export interface CstPos {'); - lines.push(' offset: number;'); - lines.push(' end: number;'); - lines.push('}'); - lines.push(''); - - // Token-type union. - lines.push('/** Every `tokenType` a CstLeaf in this grammar can carry. */'); - lines.push(`export type TokenType =\n${unionBody(leafTokenTypes.map(quote))};`); - lines.push(''); - - // Generic leaf — narrowable on `tokenType`. - lines.push('/** A terminal: one lexer token (or synthetic keyword/punct/operator leaf). 
*/'); - lines.push('export interface CstLeaf extends CstPos {'); - lines.push(' tokenType: TokenType;'); - lines.push('}'); - lines.push(''); - - // The `$template` synthetic node, if the grammar has a template token. - const hasTemplate = templateTokenNames.size > 0; - if (hasTemplate) { - lines.push('/** Synthetic node the parser builds for an interpolated template literal. */'); - lines.push(`export interface ${nodeIfaceName(SYNTHETIC_TEMPLATE_NODE)} extends CstPos {`); - lines.push(` rule: ${quote(SYNTHETIC_TEMPLATE_NODE)};`); - // A $template node holds template leaves plus interpolated expression nodes. - lines.push(' children: CstChild[];'); - lines.push('}'); - lines.push(''); - } - - // Per-rule interfaces. - for (const rule of grammar.rules) { - const kinds = childKindsByRule.get(rule.name)!; - lines.push(`/** \`${rule.name}\` node. Children (flattened, in source order) are drawn from: */`); - lines.push(`export interface ${nodeIfaceName(rule.name)} extends CstPos {`); - lines.push(` rule: ${quote(rule.name)};`); - lines.push(` children: ${childArrayType(kinds)};`); - lines.push('}'); - lines.push(''); - } - - // Discriminated union of all node interfaces (keyed on `rule`). - const nodeMembers = [ - ...(hasTemplate ? [nodeIfaceName(SYNTHETIC_TEMPLATE_NODE)] : []), - ...grammar.rules.map(r => nodeIfaceName(r.name)), - ]; - lines.push('/** Discriminated union of every node kind. Switch on `node.rule` for exhaustiveness. */'); - lines.push(`export type CstNode =\n${unionBody(nodeMembers)};`); - lines.push(''); - - // The `rule` discriminant as a standalone union (handy for callers). - const ruleLiterals = [ - ...(hasTemplate ? [SYNTHETIC_TEMPLATE_NODE] : []), - ...grammar.rules.map(r => r.name), - ]; - lines.push('/** Every `rule` discriminant value (the keys of the CstNode union). */'); - lines.push(`export type RuleName =\n${unionBody(ruleLiterals.map(quote))};`); - lines.push(''); - - // CstChild. - lines.push('/** Any CST element: a node or a leaf. 
*/'); - lines.push('export type CstChild = CstNode | CstLeaf;'); - lines.push(''); - - // A by-rule lookup type, so callers can write `NodeOf<\'Expr\'>`. - lines.push('/** Narrow the CstNode union to the node for a given rule name. */'); - lines.push('export type NodeOf = Extract;'); - lines.push(''); - - return lines.join('\n'); -} - -// ── Child-kind derivation ── - -/** - * The set of child element kinds a node of `ruleName` can directly contain. - * - * The parser (gen-parser.ts) flattens `seq`/`alt`/`group`/`quantifier`/`sep` - * straight into the parent node's `children` array, so we walk the whole body - * and union every terminal/ref it can reach as a direct child: - * - literal → `$keyword` (keyword-shaped) or `$punct` (punctuation) leaf - * - ref → token → a leaf of that token name (+ `$template`/`$templateHead` - * if it is the template token) - * - ref → rule → that rule's Node - * - op/prefix/postfix → handled below for Pratt rules (operator leaves + self-ref) - * - * Pratt / left-recursive rules additionally build children the body doesn't show - * literally — `[lhs, opLeaf, rhs]`, `[opLeaf, rhs]`, `[lhs, opLeaf]` — where `lhs` - * and `rhs` are nodes of the SAME rule and `opLeaf` is a `$operator` leaf. We add - * a self Node-ref and `$operator` whenever the body contains an op/prefix/postfix - * marker, so the type matches what the parser actually emits. - */ -function deriveChildKinds(ruleName: string, grammar: CstGrammar): ChildKind[] { - const tokenNames = new Set(grammar.tokens.map(t => t.name)); - const ruleNames = new Set(grammar.rules.map(r => r.name)); - const templateTokenNames = new Set(grammar.tokens.filter(t => t.template).map(t => t.name)); - const rule = grammar.rules.find(r => r.name === ruleName)!; - - // De-dup by a stable key. - const seen = new Map(); - const add = (c: ChildKind) => { - const key = c.kind === 'node' ? 
`n:${c.rule}` : `l:${c.tokenType}`; - if (!seen.has(key)) seen.set(key, c); - }; - - let sawMarker = false; - - function walk(expr: RuleExpr): void { - switch (expr.type) { - case 'literal': - add({ kind: 'leaf', tokenType: isKeywordLiteral(expr.value) ? '$keyword' : '$punct' }); - return; - case 'ref': - if (tokenNames.has(expr.name)) { - add({ kind: 'leaf', tokenType: expr.name }); - // A ref to the template token can instead yield a `$template` node or a - // `$templateHead` leaf (parser's matchExpr 'ref' → parseTemplateExpr). - if (templateTokenNames.has(expr.name)) { - add({ kind: 'node', rule: SYNTHETIC_TEMPLATE_NODE }); - add({ kind: 'leaf', tokenType: '$templateHead' }); - } - } else if (ruleNames.has(expr.name)) { - add({ kind: 'node', rule: expr.name }); - } - return; - case 'seq': - case 'alt': - for (const item of expr.items) walk(item); - return; - case 'quantifier': - case 'group': - walk(expr.body); - return; - case 'sep': - // `sep(el, ',')` → repeated `el` interleaved with the `,` delimiter leaf. - walk(expr.element); - add({ kind: 'leaf', tokenType: isKeywordLiteral(expr.delimiter) ? '$keyword' : '$punct' }); - return; - case 'op': - case 'prefix': - case 'postfix': - sawMarker = true; - return; - } - } - - walk(rule.body); - - // Pratt synthesis: operator leaves + self node-refs the parser injects. - if (sawMarker) { - add({ kind: 'leaf', tokenType: '$operator' }); - add({ kind: 'node', rule: ruleName }); - } - - return [...seen.values()]; -} - -// ── Emit helpers ── - -function nodeIfaceName(ruleName: string): string { - // `$template` → `$templateNode`; `Expr` → `ExprNode`. The `$` is a legal TS - // identifier char, so `$templateNode` is a valid interface name. - return `${ruleName}Node`; -} - -/** A single-quoted string literal type, with internal quotes/backslashes escaped. 
*/ -function quote(s: string): string { - return `'${s.replace(/\\/g, '\\\\').replace(/'/g, "\\'")}'`; -} - -/** Render a list of member type strings as an indented `| a | b | c` union body. */ -function unionBody(members: string[]): string { - if (members.length === 0) return ' never'; - return members.map(m => ` | ${m}`).join('\n'); -} - -/** The `children` array type for a set of child kinds. */ -function childArrayType(kinds: ChildKind[]): string { - if (kinds.length === 0) { - // No derivable children (e.g. an empty/marker-only body) — still an array. - return 'CstChild[]'; - } - const members = kinds.map(k => - k.kind === 'node' ? nodeIfaceName(k.rule) : leafOf(k.tokenType), - ); - // Sort for stable output; nodes and leaves intermixed is fine. - members.sort(); - return `Array<\n${members.map(m => ` | ${m}`).join('\n')}\n >`; -} - -/** A `CstLeaf` narrowed to a specific tokenType. */ -function leafOf(tokenType: string): string { - return `(CstLeaf & { tokenType: ${quote(tokenType)} })`; -} diff --git a/src/gen-cst-match.ts b/src/gen-cst-match.ts index a2dca89..c0b3148 100644 --- a/src/gen-cst-match.ts +++ b/src/gen-cst-match.ts @@ -1,5 +1,4 @@ -// Generate per-rule, per-ARM destructurers for a grammar's CST — the VALUE-level -// sibling of gen-ast-types.ts. For every rule it emits +// Generate per-rule, per-ARM destructurers for a grammar's CST. For every rule it emits // // export type Match = { arm: 'if', expr: NodeEntry<'Expr'>, … } | … // export function match(t: TreeAccess, n: NodeEntry<'Rule'>, src: string): Match @@ -74,7 +73,7 @@ function sanitizeIdent(s: string): string { const J = (v: unknown) => JSON.stringify(v); -export function generateCstMatch(grammar: CstGrammar, importFrom: string): string { +export function generateCstMatch(grammar: CstGrammar): string { // Same [Await]/[Yield] fork the parsers apply, so the rule-id space (ruleIdOf) // agrees with the tree. 
Matchers/types are emitted for BASE rules only (a fork // collapses to its base via RULE_CANON); no-op without ctx markers. diff --git a/src/gen-lexer.ts b/src/gen-lexer.ts index 7fc06ea..a4ed10c 100644 --- a/src/gen-lexer.ts +++ b/src/gen-lexer.ts @@ -842,21 +842,23 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) { let wc = source.charCodeAt(pos); if (wc === 32 || (wc >= 9 && wc <= 13)) { do { - if (wc === 10) pendingNl = true; + // JS line terminators: LF, CR, LS, PS (the ECMAScript set, driving ASI / "no + // LineTerminator here"). LF/CR are ASCII (here); LS/PS arrive via the \s regex below. + if (wc === 10 || wc === 13) pendingNl = true; pos++; wc = source.charCodeAt(pos); } while (wc === 32 || (wc >= 9 && wc <= 13)); if (wc > 127) { // a Unicode space may continue the run — absorb it like the old regex did wsReY.lastIndex = pos; const wsMatch = wsReY.exec(source); - if (wsMatch) { if (wsMatch[0].includes('\n')) pendingNl = true; pos += wsMatch[0].length; } + if (wsMatch) { if (/[\n\r\u2028\u2029]/.test(wsMatch[0])) pendingNl = true; pos += wsMatch[0].length; } } continue; } if (wc > 127) { wsReY.lastIndex = pos; const wsMatch = wsReY.exec(source); - if (wsMatch) { if (wsMatch[0].includes('\n')) pendingNl = true; pos += wsMatch[0].length; continue; } + if (wsMatch) { if (/[\n\r\u2028\u2029]/.test(wsMatch[0])) pendingNl = true; pos += wsMatch[0].length; continue; } } } @@ -1178,7 +1180,7 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) { if (!tm.skip) { push(mkNamed(tm.name, m[0], pos, tm.k)); } else { - if (m[0].includes('\n')) pendingNl = true; // a skipped comment spanning a newline still terminates the previous line + if (/[\n\r\u2028\u2029]/.test(m[0])) pendingNl = true; // a skipped comment spanning a line terminator still terminates the previous line // An inline comment (indentation grammars) ENDS a plain scalar — flag the next token so a // multi-line fold won't reabsorb a post-comment line 
(yaml-test-suite 8XDJ / BF9H). if (indent?.comment && m[0].startsWith(indent.comment)) pendingComment = true; diff --git a/src/target-go.ts b/src/target-go.ts new file mode 100644 index 0000000..5d1e9b9 --- /dev/null +++ b/src/target-go.ts @@ -0,0 +1,460 @@ +// The Go Target for emit-portable. Renders the same language-agnostic ParserIR as tsTarget +// into a self-contained Go program (Go stdlib only — the lexer is regex-free, so it compiles +// with no module dependencies). Its CST JSON is checked byte-for-byte against the interpreter, +// so `emitParser(grammar, goTarget)` is a real, verified Go parser derived from the +// same grammar definition. +// +// ARENA allocation (to minimise GC pressure, as tsgo does): nodes live in a flat `nodes []Node`, +// their children in a flat `kids []int32`, and in-progress children accumulate on a `scratch` +// stack. A node is an int32 index, never a heap pointer. Backtracking truncates the three +// slices to saved lengths; the slices keep their capacity across parses (reset to len 0), so a +// warmed parser allocates ~nothing per parse. +import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, TplCfg } from './emit-portable.ts'; +import { portableIR } from './emit-portable.ts'; +import type { Target } from './emit.ts'; +import type { TokenPattern, CstGrammar } from './types.ts'; + +const J = (v: unknown) => JSON.stringify(v); +const rangeCond = (v: string, rs: CharRange[]) => + '(' + rs.map(([lo, hi]) => (lo === hi ? `${v} == ${lo}` : `${v} >= ${lo} && ${v} <= ${hi}`)).join(' || ') + ')'; + +// Compile a token-pattern AST to backtracking-free package-level matcher funcs +// `_mN(p int) int` (new position, or -1) over the module-level source `_s`. +function ccCondGo(p: Extract): string { + const parts = p.items.map((it) => + it.type === 'char' ? 
`cc == ${it.value.charCodeAt(0)}` : `cc >= ${it.from.charCodeAt(0)} && cc <= ${it.to.charCodeAt(0)}`); + const inSet = '(' + parts.join(' || ') + ')'; + return p.negate ? `!${inSet}` : inSet; +} +function compilePat(p: TokenPattern, defs: string[]): string { + const name = `_m${defs.length}`; + defs.push(''); + let body: string; + if (typeof p === 'string') { + body = `{ if p <= len(_s) && strings.HasPrefix(_s[p:], ${J(p)}) { return p + ${p.length} }; return -1 }`; + } else switch (p.type) { + case 'anyChar': body = `{ if p < len(_s) { return p + 1 }; return -1 }`; break; + case 'charClass': body = `{ if p >= len(_s) { return -1 }; cc := int(_s[p]); if ${ccCondGo(p)} { return p + 1 }; return -1 }`; break; + case 'seq': { const ms = p.items.map((x) => compilePat(x, defs)); body = `{ ${ms.map((m) => `p = ${m}(p); if p < 0 { return -1 }`).join('; ')}; return p }`; break; } + case 'alt': { const ms = p.items.map((x) => compilePat(x, defs)); body = `{ ${ms.map((m) => `if r := ${m}(p); r >= 0 { return r }`).join('; ')}; return -1 }`; break; } + case 'repeat': { const m = compilePat(p.body, defs); const mx = p.max !== undefined ? `; if c >= ${p.max} { break }` : ''; body = `{ q, c := p, 0; for { r := ${m}(q); if r < 0 || r == q { break }; q = r; c++${mx} }; if c >= ${p.min} { return q }; return -1 }`; break; } + case 'lookahead': { const m = compilePat(p.body, defs); body = `{ r := ${m}(p); if ${p.negate ? 'r < 0' : 'r >= 0'} { return p }; return -1 }`; break; } + case 'anchor': body = p.kind === 'start' ? 
`{ if p == 0 { return p }; return -1 }` : `{ if p == len(_s) { return p }; return -1 }`; break; + default: throw new Error(`portable Go lexer: pattern '${(p as { type: string }).type}' unsupported`); + } + defs[Number(name.slice(2))] = `func ${name}(p int) int ${body}`; + return name; +} + +function scanTok(t: LexTok, defs: string[], rxTok?: string, tplTok?: string): string { + const name = (t as { name: string }).name; + const stateful = rxTok !== undefined || tplTok !== undefined; + if (tplTok !== undefined && name === tplTok) return ''; // template token scanned by the state machine + const push = (endE: string) => (t.skip ? `if strings.ContainsAny(src[pos:${endE}], "\\n\\r") { pendingNl = true }; ` : stateful ? `emit(${J(name)}, src[pos:${endE}], pos, ${endE}); ` : `pushTok(${J(name)}, src[pos:${endE}], pos, ${endE}); `); + const gate = rxTok !== undefined && name === rxTok ? '!prevIsValue() && ' : ''; + if (t.kind === 'run') return `\t\tif ${gate}${rangeCond('c', t.first)} { +\t\t\te := pos + 1 +\t\t\tfor e < n { cc := int(src[e]); if !${rangeCond('cc', t.cont)} { break }; e++ } +\t\t\t${push('e')}pos = e; continue +\t\t}`; + if (t.kind === 'string') return `\t\tif ${gate}c == ${t.delim.charCodeAt(0)} { +\t\t\te := pos + 1 +\t\t\tfor e < n { ch := int(src[e]); if ch == 92 { e += 2; continue }; if ch == ${t.delim.charCodeAt(0)} { e++; break }; e++ } +\t\t\t${push('e')}pos = e; continue +\t\t}`; + if (t.kind === 'line') return `\t\tif ${gate}strings.HasPrefix(src[pos:], ${J(t.prefix)}) { +\t\t\te := pos + ${t.prefix.length} +\t\t\tfor e < n && src[e] != 10 { e++ } +\t\t\t${push('e')}pos = e; continue +\t\t}`; + if (t.kind === 'block') return `\t\tif ${gate}strings.HasPrefix(src[pos:], ${J(t.open)}) { +\t\t\te := pos + ${t.open.length} +\t\t\tfor e < n && !strings.HasPrefix(src[e:], ${J(t.close)}) { e++ } +\t\t\tif e < n { e += ${t.close.length} } +\t\t\t${push('e')}pos = e; continue +\t\t}`; + const m = compilePat(t.pattern, defs); + return `\t\tif ${gate ? 
gate + 'true' : 'true'} { if e := ${m}(pos); e > pos { ${push('e')}pos = e; continue } }`; +} + +function lexer(ir: ParserIR): string { + const defs: string[] = []; + const rx = ir.regexCtx; + const tpl = ir.tpl; + const stateful = !!(rx || tpl); + const toks = ir.tokens.map((t) => scanTok(t, defs, rx?.regexToken, tpl?.token)).join('\n'); + const pushPunct = stateful ? (p: string) => `emit("", ${J(p)}, pos, pos + ${p.length})` : (p: string) => `pushTok("", ${J(p)}, pos, pos + ${p.length})`; + const puncts = ir.puncts.map((p) => + `\t\tif strings.HasPrefix(src[pos:], ${J(p)}) { ${pushPunct(p)}; pos += ${p.length}; continue }`).join('\n'); + const goMap = (a: string[]) => `map[string]bool{${a.map((x) => `${J(x)}: true`).join(', ')}}`; + const rxState = rx ? `\tprevText, prevKind, bpText := "", "", "" +\thasPrev, hasPrev2 := false, false +\tparenHead := []bool{} +\tlastClose, lastBang := false, false +\t_divT := ${goMap(rx.divisionTexts)} +\t_divK := ${goMap(rx.divisionTypes)} +\t_rxT := ${goMap(rx.regexTexts)} +\t_phK := ${goMap(rx.parenHeadKw)} +\t_mem := ${goMap(rx.memberAccess)} +\t_pav := ${goMap(rx.postfixAfterValue)} +\tconst IDENT = ${J(rx.identToken)} +\tprevIsValue := func() bool { +\t\tif !hasPrev { return false } +\t\tif _pav[prevText] { return lastBang } +\t\tisExprKw := prevKind == IDENT && _rxT[prevText] +\t\tisParenHead := prevText == ")" && lastClose +\t\treturn !isExprKw && !isParenHead && (_divK[prevKind] || _divT[prevText]) +\t} +` : ''; + const tplState = tpl ? `\ttemplateStack := []int{} +\tscanTplSpan := func(p int) (bool, int) { +\t\tfor p < n { +\t\t\tif strings.HasPrefix(src[p:], ${J(tpl.interpOpen)}) { return true, p + ${tpl.interpOpen.length} } +\t\t\tif src[p] == 92 { p += 2; continue } +\t\t\tif strings.HasPrefix(src[p:], ${J(tpl.open)}) { return false, p + ${tpl.open.length} } +\t\t\tp++ +\t\t} +\t\treturn false, p +\t} +\t_ = scanTplSpan +` : ''; + const emitHooks = [ + rx ? 
`\t\tif text == "(" { +\t\t\tisMember := hasPrev2 && _mem[bpText] +\t\t\tparenHead = append(parenHead, !isMember && prevKind == IDENT && _phK[prevText]) +\t\t} else if text == ")" { +\t\t\tif len(parenHead) > 0 { lastClose = parenHead[len(parenHead)-1]; parenHead = parenHead[:len(parenHead)-1] } else { lastClose = false } +\t\t} +\t\tif _pav[text] { lastBang = prevIsValue() }` : '', + tpl ? `\t\tif len(templateStack) > 0 { if text == ${J(tpl.braceOpen)} { templateStack[len(templateStack)-1]++ } else if text == ${J(tpl.interpClose)} { templateStack[len(templateStack)-1]-- } }` : '', + ].filter(Boolean).join('\n'); + const emitTail = rx ? `\n\t\tbpText = prevText; hasPrev2 = hasPrev; prevKind = kind; prevText = text; hasPrev = true` : ''; + const emitFn = stateful ? `\temit := func(kind, text string, off, end int) { +${emitHooks} +\t\ttoks = append(toks, Tok{kind, text, off, end, pendingNl}); pendingNl = false${emitTail} +\t} +\t_ = emit +` : ''; + const tplDispatch = tpl ? `\t\tif len(templateStack) > 0 && strings.HasPrefix(src[pos:], ${J(tpl.interpClose)}) && templateStack[len(templateStack)-1] == 0 { +\t\t\ttemplateStack = templateStack[:len(templateStack)-1] +\t\t\tinterp, e := scanTplSpan(pos + ${tpl.interpClose.length}) +\t\t\tif interp { emit("$templateMiddle", src[pos:e], pos, e); templateStack = append(templateStack, 0) } else { emit("$templateTail", src[pos:e], pos, e) } +\t\t\tpos = e; continue +\t\t} +\t\tif strings.HasPrefix(src[pos:], ${J(tpl.open)}) { +\t\t\tinterp, e := scanTplSpan(pos + ${tpl.open.length}) +\t\t\tif interp { emit("$templateHead", src[pos:e], pos, e); templateStack = append(templateStack, 0) } else { emit(${J(tpl.token)}, src[pos:e], pos, e) } +\t\t\tpos = e; continue +\t\t} +` : ''; + const pushTokFn = stateful ? '' : `\tpushTok := func(kind, text string, off, end int) { toks = append(toks, Tok{kind, text, off, end, pendingNl}); pendingNl = false }\n\t_ = pushTok\n`; + return `${defs.length ? 
'var _s string\n' + defs.join('\n') + '\n' : ''}func lex(src string) []Tok { +\ttoks := toks[:0] +\tn := len(src) +\tpos := 0 +\tpendingNl := false +\t_ = pendingNl +${rxState}${tplState}${emitFn}${pushTokFn}${defs.length ? '\t_s = src\n' : ''}\tfor pos < n { +\t\tc := int(src[pos]) +\t\tif c == 10 || c == 13 { pendingNl = true; pos++; continue } // JS line terminators LF/CR (matches the interpreter; LS/PS are multi-byte: non-ASCII boundary) +\t\tif c == 32 || c == 9 || c == 11 || c == 12 || c == 160 || c == 5760 || (c >= 8192 && c <= 8202) || c == 8239 || c == 8287 || c == 12288 || c == 65279 { pos++; continue } +${tplDispatch}${toks} +${puncts} +\t\tpanic(fmt.Sprintf("lex error at %d", pos)) +\t} +\treturn toks +}`; +} + +function stepCond(s: Step): string { + switch (s.t) { + case 'lit': return `matchLit(${J(s.value)}, ${J(s.ttype)})`; + case 'tok': return `matchTok(${J(s.name)})`; + case 'rule': return `callRule(parse${s.name})`; + case 'ruleBp': return `callRule(func() int32 { return ${s.name}bp(${s.bp}) })`; + case 'star': return `star(func() bool { return ${stepCond(s.step)} })`; + case 'opt': return `opt(func() bool { return ${s.steps.map(stepCond).join(' && ')} })`; + case 'sep': return `sepBy(func() bool { return ${stepCond(s.elem)} }, ${J(s.delim)})`; + case 'altlit': return `altLit([][2]string{${s.opts.map((o) => `{${J(o.value)}, ${J(o.ttype)}}`).join(', ')}})`; + case 'alt': return `func() bool { ${s.branches.map((br) => `{ save := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if ${br.length ? br.map(stepCond).join(' && ') : 'true'} { return true }; pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] }`).join('; ')}; return false }()`; + case 'not': return `func() bool { save := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); m := ${s.steps.length ? 
s.steps.map(stepCond).join(' && ') : 'true'}; pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return !m }()`; + case 'seq': return `(${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'})`; + case 'sameLine': return `func() bool { t := peek(); return t != nil && !t.Nl }()`; + case 'suppress': return `func() bool { _suppressNext = map[string]bool{${s.connectors.map((c) => `${J(c)}: true`).join(', ')}}; _r := (${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'}); _suppressNext = nil; return _r }()`; + } +} + +function rdRule(r: RdRule): string { + const alt = (steps: Step[]) => + `\tif ${steps.map(stepCond).join(' && ')} { return finish(${J(r.cstName)}, sb, offAt(save)) } +\tpos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]`; + return `func parse${r.name}() int32 { +\tsave := pos; sb := len(scratch); nb := len(nodes); kb := len(kids) +${r.alts.map(alt).join('\n')} +\treturn -1 +}`; +} + +function prattRule(r: PrattRule, tpl: TplCfg | null): string { + const tplNud = tpl && r.nudToks.includes(tpl.token) + ? `\tif t.Kind == "$templateHead" { +\t\tnode := matchTemplate() +\t\tif node < 0 { return -1 } +\t\tsb := len(scratch); scratch = append(scratch, node) +\t\treturn finish(${J(r.cstName)}, sb, nodes[node].Offset) +\t}\n` + : ''; + const bin = r.binary.map((b) => `${J(b.op)}: {${b.lbp}, ${b.rbp}}`).join(', '); + const pre = r.prefix.map((p) => `${J(p.op)}: ${p.rbp}`).join(', '); + const atoms = r.nudToks.map((k) => `${J(k)}: true`).join(', '); + const bracketNud = (b: Bracket) => `\tif t.Text == ${J(b.first)} { +\t\tsave := pos; sb := len(scratch); nb := len(nodes); kb := len(kids) +\t\tif ${b.steps.map(stepCond).join(' && ')} { return finish(${J(r.cstName)}, sb, t.Off) } +\t\tpos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] +\t}`; + const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null, sameLine: boolean, nll: string[] | null) => `\t\tif ${accessTail ? 
'!tailClosed && ' : ''}${lbp !== null ? `${lbp} > minBp && ` : ''}${sameLine ? '!t.Nl && ' : ''}${nll ? `!_inW([]string{${nll.map(J).join(', ')}}, headLeafText(left)) && ` : ''}!_mySup[${J(b.first)}] && t.Text == ${J(b.first)} { +\t\t\tledSave := pos; sb := len(scratch); nb := len(nodes); kb := len(kids) +\t\t\tscratch = append(scratch, left) +\t\t\tif ${b.steps.map(stepCond).join(' && ')} { left = finish(${J(r.cstName)}, sb, nodes[left].Offset); continue } +\t\t\tpos = ledSave; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; break +\t\t}`; + const postfixArm = (tok: string) => { + const tplPart = tpl && tok === tpl.token ? ` +\t\tif !tailClosed && t.Kind == "$templateHead" { +\t\t\tnode := matchTemplate() +\t\t\tif node >= 0 { sb := len(scratch); scratch = append(scratch, left, node); left = finish(${J(r.cstName)}, sb, nodes[left].Offset); continue } +\t\t}` : ''; + return `\t\tif !tailClosed && t.Kind == ${J(tok)} { +\t\t\tsb := len(scratch); scratch = append(scratch, left, mkLeaf(t.Kind, t.Off, t.End)); pos++ +\t\t\tleft = finish(${J(r.cstName)}, sb, nodes[left].Offset); continue +\t\t}${tplPart}`; + }; + const post = r.postfix.map((p) => `${J(p.op)}: ${p.lbp}`).join(', '); + return `var ${r.name}BIN = map[string]bp{${bin}} +var ${r.name}PRE = map[string]int{${pre}} +var ${r.name}POST = map[string]int{${post}} +var ${r.name}ATOM = map[string]bool{${atoms}} +func parse${r.name}() int32 { return ${r.name}bp(0) } +func ${r.name}bp(minBp int) int32 { +\t_mySup := _suppressNext; _suppressNext = nil; _ = _mySup +\tleft := ${r.name}nud(minBp) +\tif left < 0 { return -1 } +\tif _capped { return left } +\ttailClosed := false +\tfor { +\t\tt := peek() +\t\tif t == nil { break } +${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i], r.ledLbp[i], r.ledSameLine[i], r.ledNotLeftLeaf[i])).join('\n')} +${r.postfixToks.map(postfixArm).join('\n')} +\t\tif post, ok := ${r.name}POST[t.Text]; ok && !tailClosed && post > minBp { +\t\t\tsb := len(scratch); scratch = 
append(scratch, left, mkLeaf("$operator", t.Off, t.End)); pos++; tailClosed = true +\t\t\tleft = finish(${J(r.cstName)}, sb, nodes[left].Offset); continue +\t\t} +\t\tinfo, ok := ${r.name}BIN[t.Text] +\t\tif !ok || info.lbp <= minBp { break } +\t\tledSave := pos; sb := len(scratch) +\t\tscratch = append(scratch, left, mkLeaf("$operator", t.Off, t.End)) +\t\tpos++ +\t\trhs := ${r.name}bp(info.rbp) +\t\tif rhs < 0 { pos = ledSave; scratch = scratch[:sb]; break } +\t\tscratch = append(scratch, rhs) +\t\tleft = finish(${J(r.cstName)}, sb, nodes[left].Offset) +\t} +\treturn left +} +func ${r.name}nud(minBp int) int32 { +\t_capped = false +\tt := peek() +\tif t == nil { return -1 } +${r.nudCapped.map((c) => `\tif minBp < ${c.capBp} { save := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if ${c.steps.length ? c.steps.map(stepCond).join(' && ') : 'true'} { _capped = true; return finish(${J(r.cstName)}, sb, offAt(save)) }; pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] }`).join('\n')} +\t_r := func() int32 { // non-capped: a sub-parse may leave _capped set; force it false after +${tplNud}\tif ${r.name}ATOM[t.Kind] { +\t\tsb := len(scratch); scratch = append(scratch, mkLeaf(t.Kind, t.Off, t.End)); pos++ +\t\treturn finish(${J(r.cstName)}, sb, t.Off) +\t} +${r.nudBrackets.map(bracketNud).join('\n')} +\tif pbp, ok := ${r.name}PRE[t.Text]; ok { +\t\tsave := pos; sb := len(scratch); nb := len(nodes); kb := len(kids) +\t\tscratch = append(scratch, mkLeaf("$operator", t.Off, t.End)); pos++ +\t\toperand := ${r.name}bp(pbp) +\t\tif operand < 0 { pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return -1 } +\t\tscratch = append(scratch, operand) +\t\treturn finish(${J(r.cstName)}, sb, t.Off) +\t} +${r.nudSeqs.map((seq) => `\t{ save := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if ${seq.length ? 
seq.map(stepCond).join(' && ') : 'true'} { return finish(${J(r.cstName)}, sb, offAt(save)) }; pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] }`).join('\n')} +\treturn -1 +\t}() +\t_capped = false +\treturn _r +}`; +} + +export const goTarget: Target = { + name: 'go', + ext: 'go', + emitLexer(grammar: CstGrammar): string { + return lexer(portableIR(grammar)); + }, + emitParser(grammar: CstGrammar, lexerSrc: string | null): string { + const ir = portableIR(grammar); + const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? prattRule(r, ir.tpl) : rdRule(r))).join('\n\n'); + const matchTemplate = ir.tpl ? `func matchTemplate() int32 { +\tt := peek() +\tif t == nil || t.Kind != "$templateHead" { return -1 } +\tsb := len(scratch); nb := len(nodes); kb := len(kids); save := pos +\tscratch = append(scratch, mkLeaf("$templateHead", t.Off, t.End)); pos++ +\tfor { +\t\texpr := parse${ir.tpl.interpRule}() +\t\tif expr < 0 { pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return -1 } +\t\tscratch = append(scratch, expr) +\t\tnext := peek() +\t\tif next == nil { pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return -1 } +\t\tif next.Kind == "$templateMiddle" { scratch = append(scratch, mkLeaf("$templateMiddle", next.Off, next.End)); pos++; continue } +\t\tif next.Kind == "$templateTail" { scratch = append(scratch, mkLeaf("$templateTail", next.Off, next.End)); pos++; break } +\t\tpos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return -1 +\t} +\treturn finish("$template", sb, t.Off) +} +` : ''; + return `// GENERATED by emit-portable.ts (goTarget) — parser for grammar "${ir.grammarName}". +package main + +import ( +\t"fmt" +\t"io" +\t"os" +\t"strconv" +\t"strings" +\t"time" +) + +type Tok struct { +\tKind, Text string +\tOff, End int +\tNl bool +} +// Arena node: an int32 index into nodes; children are a flat range in kids. 
+type Node struct { +\tRule, TokenType string +\tIsLeaf bool +\tKidStart, KidCount, Offset, End int +} +type bp struct{ lbp, rbp int } + +var toks []Tok +var pos int +var _capped bool +var _src string +var _suppressNext map[string]bool +var nodes []Node +var kids []int32 +var scratch []int32 + +${lexerSrc ?? ''} + +func peek() *Tok { +\tif pos < len(toks) { return &toks[pos] } +\treturn nil +} +func offAt(i int) int { if i < len(toks) { return toks[i].Off }; return 0 } +func mkLeaf(ttype string, off, end int) int32 { +\tnodes = append(nodes, Node{TokenType: ttype, IsLeaf: true, Offset: off, End: end}) +\treturn int32(len(nodes) - 1) +} +// Wrap the scratch entries [sb:] as one node's children (flattened into kids); truncate scratch. +func finish(rule string, sb, fallbackOff int) int32 { +\tnn := len(scratch) +\tkidStart := len(kids) +\toff, end := fallbackOff, fallbackOff +\tif nn > sb { off = nodes[scratch[sb]].Offset; end = nodes[scratch[nn-1]].End } +\tkids = append(kids, scratch[sb:nn]...) 
+\tscratch = scratch[:sb] +\tnodes = append(nodes, Node{Rule: rule, KidStart: kidStart, KidCount: nn - sb, Offset: off, End: end}) +\treturn int32(len(nodes) - 1) +} +func matchLit(value, ttype string) bool { +\tif pos < len(toks) && toks[pos].Text == value { scratch = append(scratch, mkLeaf(ttype, toks[pos].Off, toks[pos].End)); pos++; return true } +\treturn false +} +func matchTok(name string) bool { +\tif pos < len(toks) && toks[pos].Kind == name { scratch = append(scratch, mkLeaf(name, toks[pos].Off, toks[pos].End)); pos++; return true } +\treturn false +} +func callRule(fn func() int32) bool { +\tid := fn() +\tif id < 0 { return false } +\tscratch = append(scratch, id); return true +} +func star(once func() bool) bool { +\tfor { sp := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if !once() { pos = sp; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; break } } +\treturn true +} +func opt(body func() bool) bool { +\tsp := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if !body() { pos = sp; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] }; return true +} +func sepBy(elem func() bool, delim string) bool { +\tif !elem() { return true } // the whole separated list is optional — zero elements is valid +\tfor { +\t\tsp := pos; sb := len(scratch); nb := len(nodes); kb := len(kids) +\t\tif !matchLit(delim, "$punct") { pos = sp; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; break } +\t\tif !elem() { break } // a trailing delimiter is allowed — keep the pushed delim and stop +\t} +\treturn true +} +func altLit(opts [][2]string) bool { +\tfor _, o := range opts { if matchLit(o[0], o[1]) { return true } } +\treturn false +} + +${matchTemplate}${ruleFns} + +func writeJSON(id int32, b *strings.Builder) { +\tnd := &nodes[id] +\tif nd.IsLeaf { +\t\tfmt.Fprintf(b, "{\\"tokenType\\":%q,\\"offset\\":%d,\\"end\\":%d}", nd.TokenType, nd.Offset, nd.End) +\t\treturn +\t} +\tfmt.Fprintf(b, 
"{\\"rule\\":%q,\\"children\\":[", nd.Rule) +\tfor i := 0; i < nd.KidCount; i++ { if i > 0 { b.WriteByte(',') }; writeJSON(kids[nd.KidStart+i], b) } +\tfmt.Fprintf(b, "],\\"offset\\":%d,\\"end\\":%d}", nd.Offset, nd.End) +} + +func headLeafText(id int32) string { +\tfor !nodes[id].IsLeaf && nodes[id].KidCount > 0 { id = kids[nodes[id].KidStart] } +\treturn _src[nodes[id].Offset:nodes[id].End] +} +func _inW(ws []string, s string) bool { for _, w := range ws { if w == s { return true } }; return false } + +func parseOnce(src string) int32 { +\t_src = src +\ttoks = lex(src) +\tpos = 0 +\tnodes = nodes[:0]; kids = kids[:0]; scratch = scratch[:0] +\treturn parse${ir.entry}() +} + +func main() { +\tdata, _ := io.ReadAll(os.Stdin) +\tsrc := string(data) +\t// Self-bench: a numeric arg N times the lex+parse loop and prints ms/iteration. +\tif len(os.Args) > 1 { +\t\tif iters, err := strconv.Atoi(os.Args[1]); err == nil && iters > 0 { +\t\t\tfor i := 0; i < 3; i++ { parseOnce(src) } +\t\t\tt0 := time.Now() +\t\t\tfor i := 0; i < iters; i++ { parseOnce(src) } +\t\t\tfmt.Printf("%.4f\\n", float64(time.Since(t0).Nanoseconds())/1e6/float64(iters)) +\t\t\treturn +\t\t} +\t} +\troot := parseOnce(src) +\tif root < 0 || pos != len(toks) { +\t\tfmt.Fprintf(os.Stderr, "parse error (pos %d/%d)\\n", pos, len(toks)) +\t\tos.Exit(1) +\t} +\tvar b strings.Builder +\twriteJSON(root, &b) +\tos.Stdout.WriteString(b.String()) +} +`; + }, +}; diff --git a/src/target-rust.ts b/src/target-rust.ts new file mode 100644 index 0000000..52051a0 --- /dev/null +++ b/src/target-rust.ts @@ -0,0 +1,440 @@ +// The Rust Target for emit-portable. Renders the same language-agnostic ParserIR as +// tsTarget/goTarget into a self-contained Rust program (no external crates — the lexer is +// regex-free, so it compiles with rustc alone, no Cargo/network). 
Its CST JSON is checked +// byte-for-byte against the interpreter, so `emitParser(grammar, rustTarget)` is a +// real, verified Rust parser derived from the same grammar definition. +// +// Rust ownership note: a CST node is OWNED (moved), unlike the TS/Go pointer trees. In the +// Pratt LED loop `left` can only be moved into a child vec once the continuation is known to +// match — so a mixfix LED matches its steps into a SEPARATE kids vec first, then (on success) +// moves `left` to the front and reassigns; on failure `left` is untouched and the loop +// returns it. Sub-sequence combinators (star/opt/sep) take non-capturing fn pointers +// `fn(&mut Parser, &mut Vec) -> bool`, threading the parser + kids as params (so nothing +// is captured, sidestepping the borrow checker). +import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, TplCfg } from './emit-portable.ts'; +import { portableIR } from './emit-portable.ts'; +import type { Target } from './emit.ts'; +import type { TokenPattern, CstGrammar } from './types.ts'; + +const J = (v: unknown) => JSON.stringify(v); +const rangeCond = (v: string, rs: CharRange[]) => + '(' + rs.map(([lo, hi]) => (lo === hi ? `${v} == ${lo}` : `(${lo}..=${hi}).contains(&${v})`)).join(' || ') + ')'; + +// Compile a token-pattern AST to backtracking-free matcher fns `_mN(s, p) -> i64` +// (new position, or -1). Named functions (Rust closures can't recurse); the source is +// threaded as a param (Rust has no convenient module-level mutable string). +function ccCondRs(p: Extract): string { + const parts = p.items.map((it) => + it.type === 'char' ? `cc == ${it.value.charCodeAt(0)}` : `(${it.from.charCodeAt(0)}..=${it.to.charCodeAt(0)}).contains(&cc)`); + const inSet = '(' + parts.join(' || ') + ')'; + return p.negate ? 
`!${inSet}` : inSet; +} +function compilePat(p: TokenPattern, defs: string[]): string { + const name = `_m${defs.length}`; + defs.push(''); + let body: string; + if (typeof p === 'string') { + body = `if (p as usize) <= s.len() && s[p as usize..].starts_with(${J(p)}) { p + ${p.length} } else { -1 }`; + } else switch (p.type) { + case 'anyChar': body = `if (p as usize) < s.len() { p + 1 } else { -1 }`; break; + case 'charClass': body = `let u = p as usize; if u >= s.len() { return -1; } let cc = s.as_bytes()[u] as u32; if ${ccCondRs(p)} { p + 1 } else { -1 }`; break; + case 'seq': { const ms = p.items.map((x) => compilePat(x, defs)); body = `let mut p = p; ${ms.map((m) => `p = ${m}(s, p); if p < 0 { return -1; }`).join(' ')} p`; break; } + case 'alt': { const ms = p.items.map((x) => compilePat(x, defs)); body = `${ms.map((m) => `{ let r = ${m}(s, p); if r >= 0 { return r; } }`).join(' ')} -1`; break; } + case 'repeat': { const m = compilePat(p.body, defs); const mx = p.max !== undefined ? ` if c >= ${p.max} { break; }` : ''; body = `let mut q = p; let mut c = 0i64; loop { let r = ${m}(s, q); if r < 0 || r == q { break; } q = r; c += 1;${mx} } if c >= ${p.min} { q } else { -1 }`; break; } + case 'lookahead': { const m = compilePat(p.body, defs); body = `let r = ${m}(s, p); if ${p.negate ? 'r < 0' : 'r >= 0'} { p } else { -1 }`; break; } + case 'anchor': body = p.kind === 'start' ? 
`if p == 0 { p } else { -1 }` : `if p as usize == s.len() { p } else { -1 }`; break; + default: throw new Error(`portable Rust lexer: pattern '${(p as { type: string }).type}' unsupported`); + } + defs[Number(name.slice(2))] = `fn ${name}(s: &str, p: i64) -> i64 { ${body} }`; + return name; +} + +function scanTok(t: LexTok, defs: string[], rxTok?: string, tplTok?: string): string { + const name = (t as { name: string }).name; + const stateful = rxTok !== undefined || tplTok !== undefined; + if (tplTok !== undefined && name === tplTok) return ''; // template token scanned by the state machine + const nlVar = stateful ? 'st.pending_nl' : 'pending_nl'; + const push = (endE: string) => (t.skip ? `if src[pos..${endE}].bytes().any(|c| c == 10 || c == 13) { ${nlVar} = true; } ` : stateful ? `st.emit(${J(name)}, &src[pos..${endE}], pos, ${endE}); ` : `toks.push(Tok { kind: ${J(name)}, text: &src[pos..${endE}], off: pos, end: ${endE}, nl: pending_nl }); pending_nl = false; `); + const gate = rxTok !== undefined && name === rxTok ? 
'!st.prev_is_value() && ' : ''; + if (t.kind === 'run') return ` if ${gate}${rangeCond('c', t.first)} { + let mut e = pos + 1; + while e < n { let cc = b[e] as u32; if !${rangeCond('cc', t.cont)} { break } e += 1; } + ${push('e')}pos = e; continue; + }`; + if (t.kind === 'string') return ` if ${gate}c == ${t.delim.charCodeAt(0)} { + let mut e = pos + 1; + while e < n { let ch = b[e] as u32; if ch == 92 { e += 2; continue } if ch == ${t.delim.charCodeAt(0)} { e += 1; break } e += 1; } + ${push('e')}pos = e; continue; + }`; + if (t.kind === 'line') return ` if ${gate}src[pos..].starts_with(${J(t.prefix)}) { + let mut e = pos + ${t.prefix.length}; + while e < n && b[e] != 10 { e += 1; } + ${push('e')}pos = e; continue; + }`; + if (t.kind === 'block') return ` if ${gate}src[pos..].starts_with(${J(t.open)}) { + let mut e = pos + ${t.open.length}; + while e < n && !src[e..].starts_with(${J(t.close)}) { e += 1; } + if e < n { e += ${t.close.length}; } + ${push('e')}pos = e; continue; + }`; + const m = compilePat(t.pattern, defs); + return ` if ${gate}true { let e = ${m}(src, pos as i64); if e > pos as i64 { let e = e as usize; ${push('e')}pos = e; continue; } }`; +} + +function lexer(ir: ParserIR): string { + const defs: string[] = []; + const rx = ir.regexCtx; + const tpl = ir.tpl; + const stateful = !!(rx || tpl); + const toks = ir.tokens.map((t) => scanTok(t, defs, rx?.regexToken, tpl?.token)).join('\n'); + const puncts = ir.puncts.map((p) => + ` if src[pos..].starts_with(${J(p)}) { ${stateful ? `st.emit("", &src[pos..pos + ${p.length}], pos, pos + ${p.length});` : `toks.push(Tok { kind: "", text: &src[pos..pos + ${p.length}], off: pos, end: pos + ${p.length}, nl: pending_nl }); pending_nl = false;`} pos += ${p.length}; continue; }`).join('\n'); + const rsArr = (a: string[]) => `&[${a.map(J).join(', ')}]`; + // Struct fields / emit hooks / init are assembled per-feature so a grammar can have regex, + // templates, or both share one LexState. + const rxConsts = rx ? 
`const _DIVT: &[&str] = ${rsArr(rx.divisionTexts)}; +const _DIVK: &[&str] = ${rsArr(rx.divisionTypes)}; +const _RXT: &[&str] = ${rsArr(rx.regexTexts)}; +const _PHK: &[&str] = ${rsArr(rx.parenHeadKw)}; +const _MEM: &[&str] = ${rsArr(rx.memberAccess)}; +const _PAV: &[&str] = ${rsArr(rx.postfixAfterValue)}; +const _IDENT: &str = ${J(rx.identToken)}; +fn _in(set: &[&str], x: &str) -> bool { set.iter().any(|s| *s == x) } +` : ''; + const tplFn = tpl ? `fn _scan_tpl_span(s: &str, mut p: usize) -> (bool, usize) { + let n = s.len(); + while p < n { + if s[p..].starts_with(${J(tpl.interpOpen)}) { return (true, p + ${tpl.interpOpen.length}); } + if s.as_bytes()[p] == 92 { p += 2; continue; } + if s[p..].starts_with(${J(tpl.open)}) { return (false, p + ${tpl.open.length}); } + p += 1; + } + (false, p) +} +` : ''; + const fields = ['toks: Vec>', 'pending_nl: bool', + rx ? 'prev_text: &\'a str, prev_kind: &\'static str, bp_text: &\'a str, has_prev: bool, has_prev2: bool, paren_head: Vec, last_close: bool, last_bang: bool' : '', + tpl ? 'template_stack: Vec' : ''].filter(Boolean).join(', '); + const prevIsValue = rx ? ` fn prev_is_value(&self) -> bool { + if !self.has_prev { return false; } + if _in(_PAV, self.prev_text) { return self.last_bang; } + let is_expr_kw = self.prev_kind == _IDENT && _in(_RXT, self.prev_text); + let is_paren_head = self.prev_text == ")" && self.last_close; + !is_expr_kw && !is_paren_head && (_in(_DIVK, self.prev_kind) || _in(_DIVT, self.prev_text)) + } +` : ''; + const emitHooks = [ + rx ? ` if text == "(" { let is_member = self.has_prev2 && _in(_MEM, self.bp_text); self.paren_head.push(!is_member && self.prev_kind == _IDENT && _in(_PHK, self.prev_text)); } + else if text == ")" { self.last_close = self.paren_head.pop().unwrap_or(false); } + if _in(_PAV, text) { self.last_bang = self.prev_is_value(); }` : '', + tpl ? 
` if !self.template_stack.is_empty() { if text == ${J(tpl.braceOpen)} { *self.template_stack.last_mut().unwrap() += 1; } else if text == ${J(tpl.interpClose)} { *self.template_stack.last_mut().unwrap() -= 1; } }` : '', + ].filter(Boolean).join('\n'); + const emitTail = rx ? ` + self.bp_text = self.prev_text; self.has_prev2 = self.has_prev; self.prev_kind = kind; self.prev_text = text; self.has_prev = true;` : ''; + const stateImpl = stateful ? `struct LexState<'a> { ${fields} } +impl<'a> LexState<'a> { +${prevIsValue} fn emit(&mut self, kind: &'static str, text: &'a str, off: usize, end: usize) { +${emitHooks} + self.toks.push(Tok { kind, text, off, end, nl: self.pending_nl }); self.pending_nl = false;${emitTail} + } +} +` : ''; + const initFields = ['toks: Vec::new()', 'pending_nl: false', + rx ? 'prev_text: "", prev_kind: "", bp_text: "", has_prev: false, has_prev2: false, paren_head: Vec::new(), last_close: false, last_bang: false' : '', + tpl ? 'template_stack: Vec::new()' : ''].filter(Boolean).join(', '); + const open = stateful ? ` let mut st = LexState { ${initFields} };` : ` let mut toks: Vec = Vec::new();\n let mut pending_nl = false;`; + const nlVar = stateful ? 'st.pending_nl' : 'pending_nl'; + const tplDispatch = tpl ? ` if !st.template_stack.is_empty() && src[pos..].starts_with(${J(tpl.interpClose)}) && *st.template_stack.last().unwrap() == 0 { + st.template_stack.pop(); + let (interp, e) = _scan_tpl_span(src, pos + ${tpl.interpClose.length}); + if interp { st.emit("$templateMiddle", &src[pos..e], pos, e); st.template_stack.push(0); } else { st.emit("$templateTail", &src[pos..e], pos, e); } + pos = e; continue; + } + if src[pos..].starts_with(${J(tpl.open)}) { + let (interp, e) = _scan_tpl_span(src, pos + ${tpl.open.length}); + if interp { st.emit("$templateHead", &src[pos..e], pos, e); st.template_stack.push(0); } else { st.emit(${J(tpl.token)}, &src[pos..e], pos, e); } + pos = e; continue; + } +` : ''; + return `${defs.length ? 
defs.join('\n') + '\n' : ''}${rxConsts}${tplFn}${stateImpl}fn lex<'a>(src: &'a str) -> Vec> { + let b = src.as_bytes(); + let n = b.len(); +${open} + let mut pos = 0usize; + while pos < n { + let c = b[pos] as u32; + if c == 32 || c == 9 { pos += 1; continue; } + if c == 10 || c == 13 { ${nlVar} = true; pos += 1; continue; } // JS line terminators LF/CR (matches the interpreter; LS/PS multi-byte: non-ASCII boundary) +${tplDispatch}${toks} +${puncts} + panic!("lex error at {}", pos); + } + ${stateful ? 'st.toks' : 'toks'} +}`; +} + +// Top-level step: uses `self` and `&mut kids`. +function stepCond(s: Step): string { + switch (s.t) { + case 'lit': return `self.match_lit(${J(s.value)}, ${J(s.ttype)}, &mut kids)`; + case 'tok': return `self.match_tok(${J(s.name)}, &mut kids)`; + case 'rule': return `self.call_rule(Parser::parse_${s.name}, &mut kids)`; + case 'ruleBp': return `self.call_rule(|p| p.${s.name}_bp(${s.bp}), &mut kids)`; + case 'star': return `self.star(|p, k| ${stepCondP(s.step)}, &mut kids)`; + case 'opt': return `self.opt(|p, k| ${s.steps.map(stepCondP).join(' && ')}, &mut kids)`; + case 'sep': return `self.sep_by(|p, k| ${stepCondP(s.elem)}, ${J(s.delim)}, &mut kids)`; + case 'altlit': return `self.alt_lit(&[${s.opts.map((o) => `(${J(o.value)}, ${J(o.ttype)})`).join(', ')}], &mut kids)`; + case 'alt': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${altBody(s.branches)} })(self, &mut kids)`; + case 'not': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${notBody(s.steps)} })(self, &mut kids)`; + case 'seq': return `(${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'})`; + case 'sameLine': return `matches!(self.peek(), Some(t) if !t.nl)`; + case 'suppress': return `{ self.suppress_next = vec![${s.connectors.map(J).join(', ')}]; let _r = (${s.steps.length ? 
s.steps.map(stepCond).join(' && ') : 'true'}); self.suppress_next = Vec::new(); _r }`; + } +} +// A backtracking inline alternation rendered as an immediately-applied closure over (p, k), +// so it composes identically whether it sits at top level or already inside a closure. +function altBody(branches: Step[][]): string { + return `${branches.map((br) => `{ let sp = p.pos; let bk = k.len(); if ${br.length ? br.map(stepCondP).join(' && ') : 'true'} { return true; } p.pos = sp; k.truncate(bk); }`).join(' ')} false`; +} +// Zero-width negative lookahead: try the steps, restore, succeed iff they did NOT all match. +function notBody(steps: Step[]): string { + return `let sp = p.pos; let bk = k.len(); let m = ${steps.length ? steps.map(stepCondP).join(' && ') : 'true'}; p.pos = sp; k.truncate(bk); !m`; +} +// Inside a closure: uses `p` and `k`. +function stepCondP(s: Step): string { + switch (s.t) { + case 'lit': return `p.match_lit(${J(s.value)}, ${J(s.ttype)}, k)`; + case 'tok': return `p.match_tok(${J(s.name)}, k)`; + case 'rule': return `p.call_rule(Parser::parse_${s.name}, k)`; + case 'ruleBp': return `p.call_rule(|p| p.${s.name}_bp(${s.bp}), k)`; + case 'star': return `p.star(|p, k| ${stepCondP(s.step)}, k)`; + case 'opt': return `p.opt(|p, k| ${s.steps.map(stepCondP).join(' && ')}, k)`; + case 'sep': return `p.sep_by(|p, k| ${stepCondP(s.elem)}, ${J(s.delim)}, k)`; + case 'altlit': return `p.alt_lit(&[${s.opts.map((o) => `(${J(o.value)}, ${J(o.ttype)})`).join(', ')}], k)`; + case 'alt': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${altBody(s.branches)} })(p, k)`; + case 'not': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${notBody(s.steps)} })(p, k)`; + case 'seq': return `(${s.steps.length ? s.steps.map(stepCondP).join(' && ') : 'true'})`; + case 'sameLine': return `matches!(p.peek(), Some(t) if !t.nl)`; + case 'suppress': return `{ p.suppress_next = vec![${s.connectors.map(J).join(', ')}]; let _r = (${s.steps.length ? 
s.steps.map(stepCondP).join(' && ') : 'true'}); p.suppress_next = Vec::new(); _r }`; + } +} + +function rdRule(r: RdRule): string { + const alt = (steps: Step[]) => + ` { let mut kids: Vec = Vec::new(); if ${steps.map(stepCond).join(' && ')} { return Some(self.branch(${J(r.cstName)}, kids, save)); } self.pos = save; }`; + return ` fn parse_${r.name}(&mut self) -> Option { + let save = self.pos; +${r.alts.map(alt).join('\n')} + None + }`; +} + +function prattRule(r: PrattRule, tpl: TplCfg | null): string { + const tplNud = tpl && r.nudToks.includes(tpl.token) + ? ` if t.kind == "$templateHead" { + return self.match_template().map(|n| { let (o, e) = (n.offset, n.end); Cst::node(${J(r.cstName)}, vec![n], o, e) }); + }\n` + : ''; + const binArms = r.binary.map((b) => `${J(b.op)} => Some((${b.lbp}, ${b.rbp}))`).join(', '); + const preArms = r.prefix.map((p) => `${J(p.op)} => Some(${p.rbp})`).join(', '); + const atomArm = r.nudToks.map(J).join(' | '); + const bracketNud = (b: Bracket) => ` if t.text == ${J(b.first)} { + let save = self.pos; let mut kids: Vec = Vec::new(); + if ${b.steps.map(stepCond).join(' && ')} { return Some(node(${J(r.cstName)}, kids)); } + self.pos = save; // fall through to the next NUD alternative + }`; + const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null, sameLine: boolean, nll: string[] | null) => ` if ${accessTail ? '!tail_closed && ' : ''}${lbp !== null ? `${lbp} > min_bp && ` : ''}${sameLine ? '!t.nl && ' : ''}${nll ? `!self.nll_blocked(&[${nll.map(J).join(', ')}], &left) && ` : ''}!my_sup.iter().any(|c| *c == ${J(b.first)}) && t.text == ${J(b.first)} { + let led_save = self.pos; let mut kids: Vec = Vec::new(); + if ${b.steps.map(stepCond).join(' && ')} { + let mut full = vec![left]; full.append(&mut kids); + left = node(${J(r.cstName)}, full); continue; + } + self.pos = led_save; break; + }`; + const postfixArm = (tok: string) => { + const tplPart = tpl && tok === tpl.token ? 
` + if !tail_closed && t.kind == "$templateHead" { if let Some(n) = self.match_template() { left = node(${J(r.cstName)}, vec![left, n]); continue; } }` : ''; + return ` if !tail_closed && t.kind == ${J(tok)} { self.pos += 1; let leaf = Cst::leaf(t.kind, t.off, t.end); left = node(${J(r.cstName)}, vec![left, leaf]); continue; }${tplPart}`; + }; + const postArms = r.postfix.map((p) => `${J(p.op)} => Some(${p.lbp})`).join(', '); + return ` fn parse_${r.name}(&mut self) -> Option { self.${r.name}_bp(0) } + fn ${r.name}_bin(op: &str) -> Option<(i64, i64)> { match op { ${binArms}${binArms ? ', ' : ''}_ => None } } + fn ${r.name}_pre(op: &str) -> Option { match op { ${preArms}${preArms ? ', ' : ''}_ => None } } + fn ${r.name}_post(op: &str) -> Option { match op { ${postArms}${postArms ? ', ' : ''}_ => None } } + fn ${r.name}_atom(kind: &str) -> bool { matches!(kind, ${atomArm || '""'}) } + fn ${r.name}_bp(&mut self, min_bp: i64) -> Option { + let my_sup = std::mem::take(&mut self.suppress_next); + let _ = &my_sup; + let mut left = self.${r.name}_nud(min_bp)?; + if self.capped { return Some(left); } + let mut tail_closed = false; + loop { + let t = match self.peek() { Some(t) => t, None => break }; +${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i], r.ledLbp[i], r.ledSameLine[i], r.ledNotLeftLeaf[i])).join('\n')} +${r.postfixToks.map(postfixArm).join('\n')} + if let Some(plbp) = Parser::${r.name}_post(t.text) { if !tail_closed && plbp > min_bp { self.pos += 1; let op_leaf = Cst::leaf("$operator", t.off, t.end); left = node(${J(r.cstName)}, vec![left, op_leaf]); tail_closed = true; continue; } } + let (lbp, rbp) = match Parser::${r.name}_bin(t.text) { Some(x) => x, None => break }; + if lbp <= min_bp { break; } + let led_save = self.pos; + self.pos += 1; + let op_leaf = Cst::leaf("$operator", t.off, t.end); + let rhs = match self.${r.name}_bp(rbp) { Some(r) => r, None => { self.pos = led_save; break; } }; + left = node(${J(r.cstName)}, vec![left, op_leaf, rhs]); + } + 
Some(left) + } + fn ${r.name}_nud(&mut self, min_bp: i64) -> Option { + self.capped = false; + let t = self.peek()?; +${r.nudCapped.map((c) => ` if min_bp < ${c.capBp} { let save = self.pos; let mut kids: Vec = Vec::new(); if ${c.steps.length ? c.steps.map(stepCond).join(' && ') : 'true'} { self.capped = true; return Some(self.branch(${J(r.cstName)}, kids, save)); } self.pos = save; }`).join('\n')} + // non-capped: a sub-parse may leave capped set (grouping a capped arrow); force it false after + let r = self.${r.name}_nud_rest(t); + self.capped = false; + r + } + fn ${r.name}_nud_rest(&mut self, t: Tok<'a>) -> Option { +${tplNud} if Parser::${r.name}_atom(t.kind) { + self.pos += 1; + return Some(Cst::node(${J(r.cstName)}, vec![Cst::leaf(t.kind, t.off, t.end)], t.off, t.end)); + } +${r.nudBrackets.map(bracketNud).join('\n')} + if let Some(pbp) = Parser::${r.name}_pre(t.text) { + let save = self.pos; self.pos += 1; + let op_leaf = Cst::leaf("$operator", t.off, t.end); + match self.${r.name}_bp(pbp) { + Some(operand) => { let (o, e) = (t.off, operand.end); return Some(Cst::node(${J(r.cstName)}, vec![op_leaf, operand], o, e)); } + None => { self.pos = save; return None; } + } + } +${r.nudSeqs.map((seq) => ` { let save = self.pos; let mut kids: Vec = Vec::new(); if ${seq.length ? seq.map(stepCond).join(' && ') : 'true'} { return Some(self.branch(${J(r.cstName)}, kids, save)); } self.pos = save; }`).join('\n')} + None + }`; +} + +export const rustTarget: Target = { + name: 'rust', + ext: 'rs', + emitLexer(grammar: CstGrammar): string { + return lexer(portableIR(grammar)); + }, + emitParser(grammar: CstGrammar, lexerSrc: string | null): string { + const ir = portableIR(grammar); + const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? prattRule(r, ir.tpl) : rdRule(r))).join('\n\n'); + const matchTemplate = ir.tpl ? 
` fn match_template(&mut self) -> Option { + let t = self.peek()?; + if t.kind != "$templateHead" { return None; } + let save = self.pos; self.pos += 1; + let mut children: Vec = vec![Cst::leaf("$templateHead", t.off, t.end)]; + loop { + let expr = match self.parse_${ir.tpl.interpRule}() { Some(e) => e, None => { self.pos = save; return None; } }; + children.push(expr); + let next = match self.peek() { Some(x) => x, None => { self.pos = save; return None; } }; + if next.kind == "$templateMiddle" { children.push(Cst::leaf("$templateMiddle", next.off, next.end)); self.pos += 1; continue; } + if next.kind == "$templateTail" { children.push(Cst::leaf("$templateTail", next.off, next.end)); self.pos += 1; break; } + self.pos = save; return None; + } + let o = children[0].offset; let e = children[children.len() - 1].end; + Some(Cst::node("$template", children, o, e)) + } +` : ''; + return `// GENERATED by emit-portable.ts (rustTarget) — parser for grammar "${ir.grammarName}". +#![allow(non_snake_case)] +use std::io::Read; + +// Zero-alloc tokens: kind is a known grammar name (&'static str), text is a slice of the +// source. Tok is Copy, so peek() copies pointers — no per-peek heap work. +#[derive(Clone, Copy)] +struct Tok<'a> { kind: &'static str, text: &'a str, off: usize, end: usize, nl: bool } + +// CST nodes hold only &'static str labels (rule names / token-type tags are all literals) +// + usize spans — no per-node String allocation. +struct Cst { rule: &'static str, children: Vec, is_leaf: bool, token_type: &'static str, offset: usize, end: usize } +impl Cst { + fn leaf(tt: &'static str, off: usize, end: usize) -> Cst { Cst { rule: "", children: Vec::new(), is_leaf: true, token_type: tt, offset: off, end } } + fn node(rule: &'static str, children: Vec, offset: usize, end: usize) -> Cst { Cst { rule, children, is_leaf: false, token_type: "", offset, end } } +} +// offset/end inferred from first/last child (children non-empty). 
+fn node(rule: &'static str, kids: Vec) -> Cst { let o = kids[0].offset; let e = kids[kids.len() - 1].end; Cst::node(rule, kids, o, e) } + +${lexerSrc ?? ''} + +struct Parser<'a> { toks: Vec>, pos: usize, capped: bool, suppress_next: Vec<&'static str>, src: &'a str } +impl<'a> Parser<'a> { + fn peek(&self) -> Option> { if self.pos < self.toks.len() { Some(self.toks[self.pos]) } else { None } } + fn head_leaf_text(&self, node: &Cst) -> &'a str { + let mut n = node; + while !n.children.is_empty() { n = &n.children[0]; } + &self.src[n.offset..n.end] + } + fn nll_blocked(&self, words: &[&str], node: &Cst) -> bool { let h = self.head_leaf_text(node); words.iter().any(|w| *w == h) } + fn branch(&self, rule: &'static str, kids: Vec, save: usize) -> Cst { + let offset = if !kids.is_empty() { kids[0].offset } else if save < self.toks.len() { self.toks[save].off } else { 0 }; + let end = if !kids.is_empty() { kids[kids.len() - 1].end } else { offset }; + Cst::node(rule, kids, offset, end) + } + fn match_lit(&mut self, value: &str, ttype: &'static str, kids: &mut Vec) -> bool { + match self.peek() { Some(t) if t.text == value => { kids.push(Cst::leaf(ttype, t.off, t.end)); self.pos += 1; true } _ => false } + } + fn match_tok(&mut self, name: &'static str, kids: &mut Vec) -> bool { + match self.peek() { Some(t) if t.kind == name => { kids.push(Cst::leaf(name, t.off, t.end)); self.pos += 1; true } _ => false } + } + fn call_rule(&mut self, f: fn(&mut Parser<'a>) -> Option, kids: &mut Vec) -> bool { + match f(self) { Some(n) => { kids.push(n); true } None => false } + } + fn star(&mut self, once: fn(&mut Parser<'a>, &mut Vec) -> bool, kids: &mut Vec) -> bool { + loop { let sp = self.pos; let before = kids.len(); if !once(self, kids) { self.pos = sp; kids.truncate(before); break; } } + true + } + fn opt(&mut self, body: fn(&mut Parser<'a>, &mut Vec) -> bool, kids: &mut Vec) -> bool { + let sp = self.pos; let before = kids.len(); if !body(self, kids) { self.pos = sp; 
kids.truncate(before); } true + } + fn sep_by(&mut self, elem: fn(&mut Parser<'a>, &mut Vec) -> bool, delim: &str, kids: &mut Vec) -> bool { + if !elem(self, kids) { return true; } // the whole separated list is optional — zero elements is valid + loop { + let sp = self.pos; let before = kids.len(); + if !self.match_lit(delim, "$punct", kids) { self.pos = sp; kids.truncate(before); break; } + if !elem(self, kids) { break; } // a trailing delimiter is allowed — keep the pushed delim and stop + } + true + } + fn alt_lit(&mut self, opts: &[(&str, &'static str)], kids: &mut Vec) -> bool { + for (v, tt) in opts { if self.match_lit(v, tt, kids) { return true; } } + false + } + +${matchTemplate}${ruleFns} +} + +fn write_json(c: &Cst, out: &mut String) { + if c.is_leaf { + out.push_str(&format!("{{\\"tokenType\\":\\"{}\\",\\"offset\\":{},\\"end\\":{}}}", c.token_type, c.offset, c.end)); + return; + } + out.push_str(&format!("{{\\"rule\\":\\"{}\\",\\"children\\":[", c.rule)); + for (i, k) in c.children.iter().enumerate() { if i > 0 { out.push(','); } write_json(k, out); } + out.push_str(&format!("],\\"offset\\":{},\\"end\\":{}}}", c.offset, c.end)); +} + +fn main() { + let mut src = String::new(); + std::io::stdin().read_to_string(&mut src).unwrap(); + // Self-bench: a numeric arg N times the lex+parse loop and prints ms/iteration. + if let Some(iters) = std::env::args().nth(1).and_then(|a| a.parse::().ok()) { + // black_box on the input + result so the optimizer can't elide the lex/parse. 
+ for _ in 0..3 { let toks = lex(std::hint::black_box(&src)); let mut p = Parser { toks, pos: 0, capped: false, suppress_next: Vec::new(), src: &src }; std::hint::black_box(p.parse_${ir.entry}()); } + let t = std::time::Instant::now(); + for _ in 0..iters { let toks = lex(std::hint::black_box(&src)); let mut p = Parser { toks, pos: 0, capped: false, suppress_next: Vec::new(), src: &src }; std::hint::black_box(p.parse_${ir.entry}()); } + println!("{:.4}", t.elapsed().as_secs_f64() * 1000.0 / iters as f64); + return; + } + let toks = lex(&src); + let n = toks.len(); + let mut p = Parser { toks, pos: 0, capped: false, suppress_next: Vec::new(), src: &src }; + match p.parse_${ir.entry}() { + Some(root) if p.pos == n => { let mut out = String::new(); write_json(&root, &mut out); print!("{}", out); } + _ => { eprintln!("parse error (pos {}/{})", p.pos, n); std::process::exit(1); } + } +} +`; + }, +}; diff --git a/src/target-ts.ts b/src/target-ts.ts new file mode 100644 index 0000000..dc45015 --- /dev/null +++ b/src/target-ts.ts @@ -0,0 +1,378 @@ +// The TypeScript Target for emit-portable. Renders the language-agnostic ParserIR into a +// self-contained TS parser: a char-class/string/comment lexer, a backtracking recursive- +// descent core, a Pratt expression engine (prefix + binary precedence + mixfix call/member/ +// index LEDs), and a CST→JSON printer over stdin. It is the reference rendering — its CST +// is checked byte-for-byte against the interpreter (createParser), so a divergence in the +// portable logic surfaces here before Go/Rust are compiled. +import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, TplCfg } from './emit-portable.ts'; +import { portableIR } from './emit-portable.ts'; +import type { Target } from './emit.ts'; +import type { CstGrammar } from './types.ts'; + +const J = (v: unknown) => JSON.stringify(v); +const rangeCond = (v: string, rs: CharRange[]) => + '(' + rs.map(([lo, hi]) => (lo === hi ? 
`${v} === ${lo}` : `${v} >= ${lo} && ${v} <= ${hi}`)).join(' || ') + ')'; + +import type { TokenPattern } from './types.ts'; + +// Compile a token-pattern AST to backtracking-free matcher functions `_mN(p): number` +// (returns the new position, or -1 on no match). Greedy `repeat`, ordered `alt`, +// zero-width `lookahead`/`anchor` — the regex-free token-matcher tier. +function ccCond(p: Extract): string { + const parts = p.items.map((it) => + it.type === 'char' ? `cc === ${it.value.charCodeAt(0)}` : `cc >= ${it.from.charCodeAt(0)} && cc <= ${it.to.charCodeAt(0)}`); + const inSet = '(' + parts.join(' || ') + ')'; + return p.negate ? `!${inSet}` : inSet; +} +function compilePat(p: TokenPattern, defs: string[]): string { + const name = `_m${defs.length}`; + defs.push(''); // reserve the slot (keeps numbering stable across recursion) + let body: string; + if (typeof p === 'string') { + body = `=> _s.startsWith(${J(p)}, p) ? p + ${p.length} : -1`; + } else switch (p.type) { + case 'anyChar': body = `=> p < _s.length ? p + 1 : -1`; break; + case 'charClass': body = `=> { if (p >= _s.length) return -1; const cc = _s.charCodeAt(p); return ${ccCond(p)} ? p + 1 : -1; }`; break; + case 'seq': { const ms = p.items.map((x) => compilePat(x, defs)); body = `=> { ${ms.map((m) => `p = ${m}(p); if (p < 0) return -1;`).join(' ')} return p; }`; break; } + case 'alt': { const ms = p.items.map((x) => compilePat(x, defs)); body = `=> { ${ms.map((m) => `{ const r = ${m}(p); if (r >= 0) return r; }`).join(' ')} return -1; }`; break; } + case 'repeat': { const m = compilePat(p.body, defs); const mx = p.max !== undefined ? `if (c >= ${p.max}) break;` : ''; body = `=> { let q = p, c = 0; for (;;) { const r = ${m}(q); if (r < 0 || r === q) break; q = r; c++; ${mx} } return c >= ${p.min} ? q : -1; }`; break; } + case 'lookahead': { const m = compilePat(p.body, defs); body = `=> { const r = ${m}(p); return ${p.negate ? 'r < 0' : 'r >= 0'} ? 
p : -1; }`; break; } + case 'anchor': body = p.kind === 'start' ? `=> p === 0 ? p : -1` : `=> p === _s.length ? p : -1`; break; + default: throw new Error(`portable TS lexer: pattern '${(p as { type: string }).type}' unsupported`); + } + defs[Number(name.slice(2))] = `const ${name} = (p: number): number ${body};`; + return name; +} + +function scanTok(t: LexTok, defs: string[], rxTok?: string, tplTok?: string): string { + const name = (t as { name: string }).name; + const stateful = rxTok !== undefined || tplTok !== undefined; + if (tplTok !== undefined && name === tplTok) return ''; // template token is scanned by the state machine + // `emit(...)` threads the lexer state in stateful mode; a plain push otherwise. A skipped + // token (comment) still records a newline it spans, so `sameLine` sees it. + const push = (endExpr: string) => (t.skip ? `if (/[\\n\\r\\u2028\\u2029]/.test(src.slice(pos, ${endExpr}))) pendingNl = true; ` : `${stateful ? 'emit' : 'push'}(${J(name)}, src.slice(pos, ${endExpr}), pos, ${endExpr}); `); + const gate = rxTok !== undefined && name === rxTok ? 
'!prevIsValue() && ' : ''; + if (t.kind === 'run') return ` if (${gate}${rangeCond('c', t.first)}) { + let e = pos + 1; + while (e < n) { const cc = src.charCodeAt(e); if (!${rangeCond('cc', t.cont)}) break; e++; } + ${push('e')}pos = e; continue; + }`; + if (t.kind === 'string') return ` if (${gate}c === ${t.delim.charCodeAt(0)}) { + let e = pos + 1; + while (e < n) { const ch = src.charCodeAt(e); if (ch === 92) { e += 2; continue; } if (ch === ${t.delim.charCodeAt(0)}) { e++; break; } e++; } + ${push('e')}pos = e; continue; + }`; + if (t.kind === 'line') return ` if (${gate}src.startsWith(${J(t.prefix)}, pos)) { + let e = pos + ${t.prefix.length}; + while (e < n && src.charCodeAt(e) !== 10) e++; + ${push('e')}pos = e; continue; + }`; + if (t.kind === 'block') return ` if (${gate}src.startsWith(${J(t.open)}, pos)) { + let e = pos + ${t.open.length}; + while (e < n && !src.startsWith(${J(t.close)}, e)) e++; + if (e < n) e += ${t.close.length}; + ${push('e')}pos = e; continue; + }`; + const m = compilePat(t.pattern, defs); + return ` if (${gate}true) { const e = ${m}(pos); if (e > pos) { ${push('e')}pos = e; continue; } }`; +} + +function lexer(ir: ParserIR): string { + const defs: string[] = []; + const rx = ir.regexCtx; + const tpl = ir.tpl; + const stateful = !!(rx || tpl); + const toks = ir.tokens.map((t) => scanTok(t, defs, rx?.regexToken, tpl?.token)).join('\n'); + const pushFn = stateful ? 'emit' : 'push'; + const puncts = ir.puncts.map((p) => + ` if (src.startsWith(${J(p)}, pos)) { ${pushFn}('', ${J(p)}, pos, pos + ${p.length}); pos += ${p.length}; continue; }`).join('\n'); + const set = (a: string[]) => `new Set([${a.map(J).join(', ')}])`; + // Per-feature pieces of the shared `emit`, so a grammar can have regex, templates, or both. + const rxState = rx ? 
` let prevText = '', prevKind = '', bpText = '', hasPrev = false, hasPrev2 = false; + const parenHead: boolean[] = []; + let lastClose = false, lastBang = false; + const _divT = ${set(rx.divisionTexts)}, _divK = ${set(rx.divisionTypes)}, _rxT = ${set(rx.regexTexts)}; + const _phK = ${set(rx.parenHeadKw)}, _mem = ${set(rx.memberAccess)}, _pav = ${set(rx.postfixAfterValue)}; + const IDENT = ${J(rx.identToken)}; + function prevIsValue(): boolean { + if (!hasPrev) return false; + if (_pav.has(prevText)) return lastBang; + const isExprKw = prevKind === IDENT && _rxT.has(prevText); + const isParenHead = prevText === ')' && lastClose; + return !isExprKw && !isParenHead && (_divK.has(prevKind) || _divT.has(prevText)); + } +` : ''; + const tplState = tpl ? ` const templateStack: number[] = []; + function scanTplSpan(p: number): { interp: boolean; end: number } { + while (p < n) { + if (src.startsWith(${J(tpl.interpOpen)}, p)) return { interp: true, end: p + ${tpl.interpOpen.length} }; + if (src.charCodeAt(p) === 92) { p += 2; continue; } + if (src.startsWith(${J(tpl.open)}, p)) return { interp: false, end: p + ${tpl.open.length} }; + p++; + } + return { interp: false, end: p }; + } +` : ''; + const emitHooks = [ + rx ? ` if (text === '(') { const isMember = hasPrev2 && _mem.has(bpText); parenHead.push(!isMember && prevKind === IDENT && _phK.has(prevText)); } + else if (text === ')') { lastClose = parenHead.pop() ?? false; } + if (_pav.has(text)) lastBang = prevIsValue();` : '', + tpl ? ` if (templateStack.length > 0) { if (text === ${J(tpl.braceOpen)}) templateStack[templateStack.length - 1]++; else if (text === ${J(tpl.interpClose)}) templateStack[templateStack.length - 1]--; }` : '', + ].filter(Boolean).join('\n'); + const emitTail = rx ? `\n bpText = prevText; hasPrev2 = hasPrev; prevKind = kind; prevText = text; hasPrev = true;` : ''; + const emitFn = stateful ? 
` function emit(kind: string, text: string, off: number, end: number): void { +${emitHooks} + toks.push({ kind, text, off, end, nl: pendingNl }); pendingNl = false;${emitTail} + } +` : ''; + // Template dispatch runs at the top of the loop, before token/punct scanning. + const tplDispatch = tpl ? ` if (templateStack.length > 0 && src.startsWith(${J(tpl.interpClose)}, pos) && templateStack[templateStack.length - 1] === 0) { + templateStack.pop(); + const sp = scanTplSpan(pos + ${tpl.interpClose.length}); + if (sp.interp) { emit('$templateMiddle', src.slice(pos, sp.end), pos, sp.end); templateStack.push(0); } + else emit('$templateTail', src.slice(pos, sp.end), pos, sp.end); + pos = sp.end; continue; + } + if (src.startsWith(${J(tpl.open)}, pos)) { + const sp = scanTplSpan(pos + ${tpl.open.length}); + if (sp.interp) { emit('$templateHead', src.slice(pos, sp.end), pos, sp.end); templateStack.push(0); } + else emit(${J(tpl.token)}, src.slice(pos, sp.end), pos, sp.end); + pos = sp.end; continue; + } +` : ''; + return `${defs.length ? 'let _s = "";\n' + defs.join('\n') + '\n' : ''}function lex(src: string): Tok[] { + const toks: Tok[] = []; + const n = src.length; + let pos = 0; + let pendingNl = false; +${defs.length ? ' _s = src;\n' : ''}${rxState}${tplState}${stateful ? emitFn : ' const push = (kind: string, text: string, off: number, end: number) => { toks.push({ kind, text, off, end, nl: pendingNl }); pendingNl = false; };\n'} while (pos < n) { + const c = src.charCodeAt(pos); + // JS line terminators LF/CR/LS/PS set newline-before, matching the interpreter (gen-lexer.ts). 
+ if (c === 10 || c === 13 || c === 8232 || c === 8233) { pendingNl = true; pos++; continue; } + if (c === 32 || c === 9 || c === 11 || c === 12 || c === 160 || c === 5760 || (c >= 8192 && c <= 8202) || c === 8239 || c === 8287 || c === 12288 || c === 65279) { pos++; continue; } +${tplDispatch}${toks} +${puncts} + throw new Error('lex error at ' + pos + ': ' + JSON.stringify(src[pos])); + } + return toks; +}`; +} + +// A Step as a boolean expression (appends to the in-scope `kids`). +function stepCond(s: Step): string { + switch (s.t) { + case 'lit': return `matchLit(${J(s.value)}, ${J(s.ttype)}, kids)`; + case 'tok': return `matchTok(${J(s.name)}, kids)`; + case 'rule': return `callRule(parse${s.name}, kids)`; + case 'ruleBp': return `callRule(() => ${s.name}_bp(${s.bp}), kids)`; + case 'star': return `star(() => ${stepCond(s.step)}, kids)`; + case 'opt': return `opt(() => ${s.steps.map(stepCond).join(' && ')}, kids)`; + case 'sep': return `sepBy(() => ${stepCond(s.elem)}, ${J(s.delim)}, kids)`; + case 'altlit': return `altLit([${s.opts.map((o) => `[${J(o.value)}, ${J(o.ttype)}]`).join(', ')}], kids)`; + case 'alt': return `(() => { ${s.branches.map((br) => `{ const sp = pos; const bk = kids.length; if (${br.length ? br.map(stepCond).join(' && ') : 'true'}) return true; pos = sp; kids.length = bk; }`).join(' ')} return false; })()`; + case 'not': return `(() => { const sp = pos; const bk = kids.length; const m = ${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'}; pos = sp; kids.length = bk; return !m; })()`; + case 'seq': return `(${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'})`; + case 'sameLine': return `(() => { const t = peek(); return t !== null && !t.nl; })()`; + case 'suppress': return `(() => { _suppressNext = new Set([${s.connectors.map(J).join(', ')}]); const _r = (${s.steps.length ? 
s.steps.map(stepCond).join(' && ') : 'true'}); _suppressNext = null; return _r; })()`; + } +} + +function rdRule(r: RdRule): string { + const alt = (steps: Step[]) => + ` { const kids: Cst[] = []; if (${steps.map(stepCond).join(' && ')}) return branch(${J(r.cstName)}, kids, save); pos = save; }`; + return `function parse${r.name}(): Node | null { + const save = pos; +${r.alts.map(alt).join('\n')} + return null; +}`; +} + +function prattRule(r: PrattRule, tpl: TplCfg | null): string { + const tplNud = tpl && r.nudToks.includes(tpl.token) + ? ` if (t.kind === '$templateHead') { const node = matchTemplate(); return node === null ? null : { rule: ${J(r.cstName)}, children: [node], offset: node.offset, end: node.end }; }\n` + : ''; + const BIN = `{ ${r.binary.map((b) => `${J(b.op)}: { lbp: ${b.lbp}, rbp: ${b.rbp} }`).join(', ')} }`; + const PRE = `{ ${r.prefix.map((p) => `${J(p.op)}: ${p.rbp}`).join(', ')} }`; + const atom = `new Set([${r.nudToks.map(J).join(', ')}])`; + const bracketNud = (b: Bracket) => ` if (t.text === ${J(b.first)}) { + const save = pos; const kids: Cst[] = []; + if (${b.steps.map(stepCond).join(' && ')}) return node(${J(r.cstName)}, kids); + pos = save; // fall through to the next NUD alternative (e.g. another '${b.first}'-led form) + }`; + // Access-tail leds (member/call/index) are disabled once a postfix has closed the operand; + // a precedence-gated led (ternary/in/instanceof) binds only when its lbp > minBp. + const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null, sameLine: boolean, nll: string[] | null) => ` if (${accessTail ? '!tailClosed && ' : ''}${lbp !== null ? `${lbp} > minBp && ` : ''}${sameLine ? '!t.nl && ' : ''}${nll ? 
`!${J(nll)}.includes(headLeafText(left)) && ` : ''}(_mySup === null || !_mySup.has(${J(b.first)})) && t.text === ${J(b.first)}) { + const ledSave = pos; const kids: Cst[] = [left]; + if (${b.steps.map(stepCond).join(' && ')}) { left = node(${J(r.cstName)}, kids); continue; } + pos = ledSave; break; + }`; + // A postfix token (e.g. a tagged template) binds like a mixfix led: `left X` → node(left, X). Also an access tail. + const postfixArm = (tok: string) => { + const tplPart = tpl && tok === tpl.token ? ` + if (!tailClosed && t.kind === '$templateHead') { const node = matchTemplate(); if (node !== null) { left = { rule: ${J(r.cstName)}, children: [left, node], offset: left.offset, end: node.end }; continue; } }` : ''; + return ` if (!tailClosed && t.kind === ${J(tok)}) { const leaf: Leaf = { tokenType: t.kind, offset: t.off, end: t.end }; pos++; left = { rule: ${J(r.cstName)}, children: [left, leaf], offset: left.offset, end: leaf.end }; continue; }${tplPart}`; + }; + const POST = `{ ${r.postfix.map((p) => `${J(p.op)}: ${p.lbp}`).join(', ')} }`; + return `const ${r.name}_BIN: Record = ${BIN}; +const ${r.name}_PRE: Record = ${PRE}; +const ${r.name}_POST: Record = ${POST}; +const ${r.name}_ATOM = ${atom}; +function parse${r.name}(): Node | null { return ${r.name}_bp(0); } +function ${r.name}_bp(minBp: number): Node | null { + const _mySup = _suppressNext; _suppressNext = null; // no-in: consume the suppressed-connector set for this led loop + let left = ${r.name}_nud(minBp); + if (left === null) return null; + if (_capped) return left; // an assignment-level arrow admits no led + let tailClosed = false; + for (;;) { + const t = peek(); + if (t === null) break; +${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i], r.ledLbp[i], r.ledSameLine[i], r.ledNotLeftLeaf[i])).join('\n')} +${r.postfixToks.map(postfixArm).join('\n')} + const post = ${r.name}_POST[t.text]; + if (!tailClosed && post !== undefined && post > minBp) { pos++; const opLeaf: Leaf = { tokenType: 
'$operator', offset: t.off, end: t.end }; left = { rule: ${J(r.cstName)}, children: [left, opLeaf], offset: left.offset, end: t.end }; tailClosed = true; continue; } + const info = ${r.name}_BIN[t.text]; + if (info === undefined || info.lbp <= minBp) break; + const ledSave = pos; + pos++; + const opLeaf: Leaf = { tokenType: '$operator', offset: t.off, end: t.end }; + const rhs = ${r.name}_bp(info.rbp); + if (rhs === null) { pos = ledSave; break; } + left = { rule: ${J(r.cstName)}, children: [left, opLeaf, rhs], offset: left.offset, end: rhs.end }; + } + return left; +} +function ${r.name}_nud(minBp: number): Node | null { + _capped = false; + const t = peek(); + if (t === null) return null; +${r.nudCapped.map((c) => ` if (minBp < ${c.capBp}) { const save = pos; const kids: Cst[] = []; if (${c.steps.length ? c.steps.map(stepCond).join(' && ') : 'true'}) { _capped = true; return branch(${J(r.cstName)}, kids, save); } pos = save; }`).join('\n')} + // Below is non-capped: a sub-parse may leave _capped set (e.g. grouping a capped arrow), + // so force it false after — only the capped arms above produce a capped node. + const _r = ((): Node | null => { +${tplNud} if (${r.name}_ATOM.has(t.kind)) { pos++; return { rule: ${J(r.cstName)}, children: [{ tokenType: t.kind, offset: t.off, end: t.end }], offset: t.off, end: t.end }; } +${r.nudBrackets.map(bracketNud).join('\n')} + const pbp = ${r.name}_PRE[t.text]; + if (pbp !== undefined) { + const save = pos; pos++; + const opLeaf: Leaf = { tokenType: '$operator', offset: t.off, end: t.end }; + const operand = ${r.name}_bp(pbp); + if (operand === null) { pos = save; return null; } + return { rule: ${J(r.cstName)}, children: [opLeaf, operand], offset: t.off, end: operand.end }; + } +${r.nudSeqs.map((seq) => ` { const save = pos; const kids: Cst[] = []; if (${seq.length ? 
seq.map(stepCond).join(' && ') : 'true'}) return branch(${J(r.cstName)}, kids, save); pos = save; }`).join('\n')} + return null; + })(); + _capped = false; + return _r; +}`; +} + +export const tsTarget: Target = { + name: 'typescript', + ext: 'ts', + emitLexer(grammar: CstGrammar): string { + return lexer(portableIR(grammar)); + }, + emitParser(grammar: CstGrammar, lexerSrc: string | null): string { + const ir = portableIR(grammar); + const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? prattRule(r, ir.tpl) : rdRule(r))).join('\n\n'); + const matchTemplate = ir.tpl ? `function matchTemplate(): Cst | null { + const t = peek(); + if (t === null || t.kind !== '$templateHead') return null; + const children: Cst[] = []; + const save = pos; pos++; + children.push({ tokenType: '$templateHead', offset: t.off, end: t.end }); + for (;;) { + const expr = parse${ir.tpl.interpRule}(); + if (expr === null) { pos = save; return null; } + children.push(expr); + const next = peek(); + if (next === null) { pos = save; return null; } + if (next.kind === '$templateMiddle') { pos++; children.push({ tokenType: '$templateMiddle', offset: next.off, end: next.end }); continue; } + if (next.kind === '$templateTail') { pos++; children.push({ tokenType: '$templateTail', offset: next.off, end: next.end }); break; } + pos = save; return null; + } + return { rule: '$template', children, offset: children[0].offset, end: children[children.length - 1].end }; +} +` : ''; + return `// GENERATED by emit-portable.ts (tsTarget) — parser for grammar "${ir.grammarName}". +import { readFileSync } from 'node:fs'; + +type Tok = { kind: string; text: string; off: number; end: number; nl: boolean }; +type Leaf = { tokenType: string; offset: number; end: number }; +type Node = { rule: string; children: Cst[]; offset: number; end: number }; +type Cst = Node | Leaf; + +${lexerSrc ?? 
''} + +let toks: Tok[] = []; +let pos = 0; +let _capped = false; +let _suppressNext: Set | null = null; +let _src = ''; +function peek(): Tok | null { return pos < toks.length ? toks[pos] : null; } +function headLeafText(node: Cst): string { + let n: Cst = node; + while ('children' in n && n.children.length > 0) n = n.children[0]; + return _src.slice(n.offset, n.end); +} +function branch(rule: string, kids: Cst[], save: number): Node { + const offset = kids.length > 0 ? kids[0].offset : (save < toks.length ? toks[save].off : 0); + const end = kids.length > 0 ? kids[kids.length - 1].end : offset; + return { rule, children: kids, offset, end }; +} +function node(rule: string, kids: Cst[]): Node { + return { rule, children: kids, offset: kids[0].offset, end: kids[kids.length - 1].end }; +} +function matchLit(value: string, ttype: string, kids: Cst[]): boolean { + const t = peek(); + if (t === null || t.text !== value) return false; + kids.push({ tokenType: ttype, offset: t.off, end: t.end }); pos++; return true; +} +function matchTok(name: string, kids: Cst[]): boolean { + const t = peek(); + if (t === null || t.kind !== name) return false; + kids.push({ tokenType: name, offset: t.off, end: t.end }); pos++; return true; +} +function callRule(fn: () => Node | null, kids: Cst[]): boolean { + const n = fn(); + if (n === null) return false; + kids.push(n); return true; +} +function star(once: () => boolean, kids: Cst[]): boolean { + for (;;) { const sp = pos; const before = kids.length; if (!once()) { pos = sp; kids.length = before; break; } } + return true; +} +function opt(body: () => boolean, kids: Cst[]): boolean { + const sp = pos; const before = kids.length; if (!body()) { pos = sp; kids.length = before; } return true; +} +function sepBy(elem: () => boolean, delim: string, kids: Cst[]): boolean { + if (!elem()) return true; // the whole separated list is optional — zero elements is valid + for (;;) { + const sp = pos; const before = kids.length; + if 
(!matchLit(delim, '$punct', kids)) { pos = sp; kids.length = before; break; } + if (!elem()) break; // a trailing delimiter is allowed — keep the pushed delim and stop + } + return true; +} +function altLit(opts: [string, string][], kids: Cst[]): boolean { + for (const [v, tt] of opts) if (matchLit(v, tt, kids)) return true; + return false; +} + +${matchTemplate}${ruleFns} + +const src = readFileSync(0, 'utf8'); +_src = src; +toks = lex(src); +pos = 0; +const root = parse${ir.entry}(); +if (root === null || pos !== toks.length) { + process.stderr.write('parse error (pos ' + pos + '/' + toks.length + ')\\n'); + process.exit(1); +} +process.stdout.write(JSON.stringify(root)); +`; + }, +}; diff --git a/test/ast-types-smoke.ts b/test/ast-types-smoke.ts deleted file mode 100644 index 1b5714c..0000000 --- a/test/ast-types-smoke.ts +++ /dev/null @@ -1,184 +0,0 @@ -// Smoke test for src/gen-ast-types.ts. -// -// 1. Generate the typed-CST source from the real TypeScript grammar. -// 2. Write it to a temp `.ts` file. -// 3. Write a consumer module that (a) imports the generated types, (b) does an -// exhaustive `switch (node.rule)` proving the discriminated union narrows -// and is complete (a `never` assertion in `default`), and (c) narrows a -// leaf on `tokenType`. -// 4. Type-check BOTH with `tsc --noEmit --strict`. A non-empty diagnostic = -// the generated types are wrong (or not exhaustive) → fail. -// 5. Also assert a few structural facts about the generated string directly. -// -// Run: `node test/ast-types-smoke.ts`. (This file lives under test/, which the -// project tsconfig excludes, so it does not affect `npx tsc --noEmit` for src.) 
- -import { generateAstTypes } from '../src/gen-ast-types.ts'; -import { execFileSync } from 'node:child_process'; -import { mkdtempSync, writeFileSync, rmSync, existsSync } from 'node:fs'; -import { tmpdir } from 'node:os'; -import { dirname, join, parse as parsePath } from 'node:path'; - -const grammar = (await import('../typescript.ts')).default; - -// Resolve the workspace `tsc` so the temp dir uses the same compiler. Walk up -// from the cwd — under a git worktree, node_modules lives in the parent repo. -function resolveTsc(): string { - let dir = process.cwd(); - while (true) { - const candidate = join(dir, 'node_modules', '.bin', 'tsc'); - if (existsSync(candidate)) return candidate; - const parent = dirname(dir); - if (parent === dir || dir === parsePath(dir).root) break; - dir = parent; - } - return 'tsc'; // fall back to PATH -} -const tscBin = resolveTsc(); - -let fail = 0; -const check = (label: string, cond: boolean) => { - if (cond) console.log(' ok ', label); - else { fail++; console.log(' FAIL', label); } -}; - -// ── 1. Generate ── -const src = generateAstTypes(grammar); - -// ── 5. Direct structural assertions on the generated text ── -check('emits a CstNode discriminated union', /export type CstNode =/.test(src)); -check('emits a TokenType union', /export type TokenType =/.test(src)); -check('emits a RuleName union', /export type RuleName =/.test(src)); -check('emits NodeOf helper', /export type NodeOf/.test(src)); - -// Every declared rule gets an interface with a literal `rule` discriminant. -const missingRule = grammar.rules.find( - r => !src.includes(`export interface ${r.name}Node `) || !src.includes(`rule: '${r.name}'`), -); -check('every grammar rule has a Node interface + literal rule', missingRule === undefined); - -// Synthetic leaf token types are present in the TokenType union. 
-for (const t of ['$keyword', '$punct', '$operator', '$templateHead', '$templateMiddle', '$templateTail']) { - check(`TokenType includes ${t}`, src.includes(`'${t}'`)); -} -// Declared token names are present too. -check('TokenType includes a declared token (Ident)', src.includes("'Ident'")); - -// The grammar has a template token → a `$template` node interface should exist. -check('emits $templateNode (grammar has a template token)', src.includes("rule: '$template'")); - -// ── 2/3/4. Type-check the generated types + a consumer ── -const dir = mkdtempSync(join(tmpdir(), 'monogram-ast-types-')); -const typesPath = join(dir, 'cst-types.ts'); -const consumerPath = join(dir, 'consumer.ts'); -const tsconfigPath = join(dir, 'tsconfig.json'); - -writeFileSync(typesPath, src); - -// Pick a few real rule names from the grammar to exercise narrowing. -const ruleSample = grammar.rules.slice(0, 3).map(r => r.name); - -// Consumer: exhaustive switch over EVERY rule (built from the grammar so it -// stays complete as the grammar grows), plus explicit narrowing on a couple of -// sampled rules and a leaf. If the union is missing a member, the per-case -// access fails; if it has an EXTRA member we don't handle, the `default` -// `never` assignment fails — both prove the union is exactly right. -const allRuleNames = [ - '$template', - ...grammar.rules.map(r => r.name), -]; -const cases = allRuleNames.map(name => - ` case '${name}': { const _c: CstNode = node; void _c; return node.children.length; }`, -).join('\n'); - -const consumer = `import type { CstNode, CstLeaf, NodeOf, RuleName, TokenType } from './cst-types.ts'; - -// (a) Exhaustive switch on the \`rule\` discriminant: narrows, and \`default\` -// proves completeness via a \`never\` assignment. -export function childCount(node: CstNode): number { - switch (node.rule) { -${cases} - default: { - const _exhaustive: never = node; - return _exhaustive; - } - } -} - -// (b) NodeOf narrows the union to one rule's node. 
-function sampleNarrowing(n: CstNode) { - ${ruleSample.map((r, i) => `if (n.rule === '${r}') { const x${i}: NodeOf<'${r}'> = n; void x${i}; }`).join('\n ')} -} -void sampleNarrowing; - -// (c) A RuleName value is assignable from a literal in the union. -const someRule: RuleName = '${ruleSample[0]}'; -void someRule; - -// (d) Leaf narrowing on tokenType. -function leafText(leaf: CstLeaf): string { - if (leaf.tokenType === '$keyword') return leaf.text; - const t: TokenType = leaf.tokenType; - void t; - return leaf.text; -} -void leafText; -`; -writeFileSync(consumerPath, consumer); - -writeFileSync(tsconfigPath, JSON.stringify({ - compilerOptions: { - target: 'ES2022', - module: 'Node16', - moduleResolution: 'Node16', - allowImportingTsExtensions: true, - noEmit: true, - strict: true, - skipLibCheck: true, - }, - include: ['cst-types.ts', 'consumer.ts'], -}, null, 2)); - -let tscOut = ''; -let tscOk = true; -try { - execFileSync(tscBin, ['--noEmit', '-p', tsconfigPath], { stdio: 'pipe' }); -} catch (e: any) { - tscOk = false; - tscOut = `${e.stdout?.toString() ?? ''}${e.stderr?.toString() ?? ''}`; -} -check('generated types + exhaustive-switch consumer type-check under tsc --strict', tscOk); -if (!tscOk) { - console.log('\n--- tsc diagnostics ---\n' + tscOut + '\n--- generated source ---\n' + src + '\n--- consumer ---\n' + consumer); -} - -// Negative control: a bogus rule literal must NOT be assignable to RuleName, -// confirming RuleName is a closed union (not widened to `string`). 
-const badConsumerPath = join(dir, 'bad.ts'); -writeFileSync(badConsumerPath, `import type { RuleName } from './cst-types.ts'; -const bad: RuleName = '___definitely_not_a_rule___'; -void bad; -`); -writeFileSync(join(dir, 'tsconfig.bad.json'), JSON.stringify({ - compilerOptions: { - target: 'ES2022', module: 'Node16', moduleResolution: 'Node16', - allowImportingTsExtensions: true, noEmit: true, strict: true, skipLibCheck: true, - }, - include: ['cst-types.ts', 'bad.ts'], -}, null, 2)); -let bogusRejected = false; -try { - execFileSync(tscBin, ['--noEmit', '-p', join(dir, 'tsconfig.bad.json')], { stdio: 'pipe' }); -} catch { - bogusRejected = true; // tsc errored → the bogus literal was correctly rejected -} -check('RuleName is a closed union (rejects an unknown rule literal)', bogusRejected); - -rmSync(dir, { recursive: true, force: true }); - -console.log( - fail === 0 - ? `\n${grammar.rules.length} rules typed; all AST-type smoke checks pass` - : `\n${fail} FAILED`, -); -process.exit(fail === 0 ? 
0 : 1); diff --git a/test/check.ts b/test/check.ts index 1658343..97b5a27 100644 --- a/test/check.ts +++ b/test/check.ts @@ -26,6 +26,8 @@ const GATES: Gate[] = [ { group: 'emit-parity', name: 'emit-parser-verify', args: ['test/emit-parser-verify.ts'] }, { group: 'emit-parity', name: 'emit-reject-messages', args: ['test/emit-reject-messages.ts'] }, { group: 'emit-parity', name: 'emit-lexer-verify', args: ['test/emit-lexer-verify.ts'] }, + { group: 'emit-parity', name: 'emit-tsc-gate', args: ['test/emit-tsc-gate.ts'] }, + { group: 'emit-parity', name: 'portable-targets', args: ['test/portable-targets.ts'] }, { group: 'core', name: 'multi-doc', args: ['test/multi-doc.ts'] }, { group: 'core', name: 'recovery', args: ['test/recovery.ts'] }, { group: 'core', name: 'incremental-grammars', args: ['test/incremental-grammars.ts'] }, diff --git a/test/cst-match-totality.ts b/test/cst-match-totality.ts index d6e382c..2aab35f 100644 --- a/test/cst-match-totality.ts +++ b/test/cst-match-totality.ts @@ -13,7 +13,7 @@ // node test/cst-match-totality.ts import { existsSync, readFileSync, readdirSync, statSync, writeFileSync } from 'node:fs'; import { join } from 'node:path'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { generateInputs } from './grammar-gen.ts'; const GRAMMARS = ['typescript', 'javascript', 'typescriptreact', 'javascriptreact', 'yaml', 'html']; @@ -51,8 +51,8 @@ function checkTree(em: Emitted, root: number, src: string, matchers: Record f.endsWith('.cst-match.ts') || f.endsWith('.cst-types.ts') || f.endsWith('.d.ts'); +const isGenerated = (f: string) => f.endsWith('.cst-match.ts') || f.endsWith('.d.ts'); export function repoTsFiles(): string[] { const out: string[] = []; diff --git a/test/emit-lexer-verify.ts b/test/emit-lexer-verify.ts index 44fef62..900de91 100644 --- a/test/emit-lexer-verify.ts +++ b/test/emit-lexer-verify.ts @@ -9,14 +9,14 @@ // node test/emit-lexer-verify.ts # in-repo 
corpus (+ /tmp/ts-repo if present) import { readFileSync, writeFileSync } from 'node:fs'; import { createLexer } from '../src/gen-lexer.ts'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { inRepoCorpus, externalTsFiles } from './emit-corpus.ts'; const grammar = (await import('../typescript.ts')).default; // The reference: createLexer with the SAME intern config the emitted parser bakes. -const EMITTED = '/tmp/emit-lexer-verify-parser.mjs'; -writeFileSync(EMITTED, emitParser(grammar)); +const EMITTED = '/tmp/emit-lexer-verify-parser.mts'; +writeFileSync(EMITTED, emitParser(grammar, jsTarget)); const emitted = await import(EMITTED + '?v=' + Date.now()); const src = readFileSync(EMITTED, 'utf-8'); if (src.includes('createLexer(')) { @@ -26,9 +26,9 @@ if (src.includes('createLexer(')) { // Rebuild the intern config from the emitted tables' source of truth: re-emit via the // analyzer is private, so read the reference lexer through a tiny probe grammar parse — // simplest faithful route: intern maps are exactly the emitted TYPE_KIND/LIT_KW/LIT_PU. 
-const tk = new Map(JSON.parse(src.match(/const TYPE_KIND = new Map\((.*)\);/)![1])); -const kw = new Map(JSON.parse(src.match(/const LIT_KW = new Map\((.*)\);/)![1])); -const pu = new Map(JSON.parse(src.match(/const LIT_PU = new Map\((.*)\);/)![1])); +const tk = new Map(JSON.parse(src.match(/const TYPE_KIND = new Map(?:<[^>]*>)?\((.*)\);/)![1])); +const kw = new Map(JSON.parse(src.match(/const LIT_KW = new Map(?:<[^>]*>)?\((.*)\);/)![1])); +const pu = new Map(JSON.parse(src.match(/const LIT_PU = new Map(?:<[^>]*>)?\((.*)\);/)![1])); const kPunct = Number(src.match(/const K_PUNCT = (\d+);/)![1]); const kFallback = Number(src.match(/const K_NAMED_FALLBACK = (\d+);/)![1]); const ref = createLexer(grammar, { typeKind: tk, kwLit: kw, puLit: pu, punctKind: kPunct, namedFallback: kFallback }); diff --git a/test/emit-parser-bench.ts b/test/emit-parser-bench.ts index 1680386..5af58a2 100644 --- a/test/emit-parser-bench.ts +++ b/test/emit-parser-bench.ts @@ -9,14 +9,14 @@ // node test/emit-parser-bench.ts # the 4 bench files, N=20 // node test/emit-parser-bench.ts # custom timed-run count import { createParser } from '../src/gen-parser.ts'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { readFileSync, writeFileSync } from 'fs'; const grammar = (await import('../typescript.ts')).default; const oracle = createParser(grammar); -const EMITTED = '/tmp/emitted-parser.mjs'; -writeFileSync(EMITTED, emitParser(grammar)); +const EMITTED = '/tmp/emitted-parser.mts'; +writeFileSync(EMITTED, emitParser(grammar, jsTarget)); const emitted = await import(EMITTED + '?v=' + Date.now()); const N = Number(process.argv[2]) || 20; diff --git a/test/emit-parser-verify.ts b/test/emit-parser-verify.ts index 2f39fe4..b3020bd 100644 --- a/test/emit-parser-verify.ts +++ b/test/emit-parser-verify.ts @@ -13,7 +13,7 @@ // node test/emit-parser-verify.ts # external sweep stride N (default ~400 files) import { objectify } from 
'./emitted-obj.ts'; import { createParser } from '../src/gen-parser.ts'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { inRepoCorpus, externalTsFiles } from './emit-corpus.ts'; import { readFileSync, writeFileSync } from 'fs'; @@ -21,8 +21,8 @@ const grammar = (await import('../typescript.ts')).default; const oracle = createParser(grammar); // Emit, write to /tmp, import the standalone module. -const EMITTED = '/tmp/emitted-parser.mjs'; -writeFileSync(EMITTED, emitParser(grammar)); +const EMITTED = '/tmp/emitted-parser.mts'; +writeFileSync(EMITTED, emitParser(grammar, jsTarget)); const emitted = await import(EMITTED + '?v=' + Date.now()); type Outcome = { ok: true; cst: string } | { ok: false; err: string }; diff --git a/test/emit-reject-messages.ts b/test/emit-reject-messages.ts index dd5c0a1..28891e7 100644 --- a/test/emit-reject-messages.ts +++ b/test/emit-reject-messages.ts @@ -16,15 +16,15 @@ // // node test/emit-reject-messages.ts import { createParser } from '../src/gen-parser.ts'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { inRepoCorpus, externalTsFiles } from './emit-corpus.ts'; import { readFileSync, writeFileSync } from 'fs'; const grammar = (await import('../typescript.ts')).default; const oracle = createParser(grammar); -const EMITTED = '/tmp/emitted-parser-msg.mjs'; -writeFileSync(EMITTED, emitParser(grammar)); +const EMITTED = '/tmp/emitted-parser-msg.mts'; +writeFileSync(EMITTED, emitParser(grammar, jsTarget)); const emitted = await import(EMITTED + '?v=' + Date.now()); function errOf(parse: (s: string) => unknown, code: string): string | null { diff --git a/test/emit-tsc-gate.ts b/test/emit-tsc-gate.ts new file mode 100644 index 0000000..a923934 --- /dev/null +++ b/test/emit-tsc-gate.ts @@ -0,0 +1,72 @@ +// Gate: the EMITTED parser (emit-parser.ts) is type-checked TypeScript. 
+// +// emitParser produces a standalone TS module — explicit types on every declaration +// (the monomorphic Doc state struct, the matcher/runtime signatures, the baked op / +// rule tables). This gate compiles that module under `tsc --strict --noEmit` and +// fails on ANY diagnostic. Two properties it guards by construction: +// - the type CONTRACT is real and consistent (no implicit any, no arity looseness, +// no shape drift between the swapped buffers and the doc struct) — the part that +// ports to a Go/Rust target; +// - the emitted source stays ERASABLE TypeScript (annotations only): Node runs the +// emitted parser by stripping types, and the CST-identity gate (emit-parser-verify) +// proves the stripped runtime is byte-for-byte the interpreter. +// +// Both emit paths are covered: the self-contained path (soa columns + an emitted +// lexer — the ts/js family) and the fallback path (yaml/html: emitLexer returns null +// so the parser imports createLexer, plus the non-soa piece-text layer). Checking +// every grammar is what forces grammar-specific emission (token width, soa vs piece +// layer, empty vocab sets, the fallback createLexer contract) to stay type-sound — +// and it already paid off: the fallback editCore branch referenced cs/ceOld/ +// parenCachePos declared only in the soa branch (unreached at runtime, invisible +// until this gate), now hoisted/gated correctly. 
+import { emitParser, jsTarget } from '../src/emit.ts'; +import { writeFileSync } from 'node:fs'; +import { execFileSync } from 'node:child_process'; +import type { CstGrammar } from '../src/types.ts'; + +const GRAMMARS: Array<[string, string]> = [ + ['typescript', '../typescript.ts'], + ['javascript', '../javascript.ts'], + ['typescriptreact', '../typescriptreact.ts'], + ['javascriptreact', '../javascriptreact.ts'], + ['yaml', '../yaml.ts'], + ['html', '../html.ts'], +]; + +// --allowImportingTsExtensions: the fallback-lexer grammars import createLexer from +// '…/src/gen-lexer.ts' (an absolute path baked at emit time); harmless for the +// self-contained grammars, which import nothing. +const TSC_FLAGS = [ + '--strict', '--noEmit', '--target', 'ES2022', '--module', 'ES2022', + '--moduleResolution', 'Bundler', '--skipLibCheck', '--allowImportingTsExtensions', +]; + +let failures = 0; +for (const [name, path] of GRAMMARS) { + let grammar: CstGrammar; + try { + grammar = (await import(path)).default; + } catch { + console.log(` ${name}: (grammar not present — skipped)`); + continue; + } + const out = `/tmp/emit-tsc-gate-${name}.ts`; + writeFileSync(out, emitParser(grammar, jsTarget)); + try { + execFileSync('npx', ['tsc', ...TSC_FLAGS, out], { stdio: 'pipe' }); + console.log(` ${name}: ✓ emitted parser type-checks (tsc --strict)`); + } catch (e: any) { + failures++; + const log = (e.stdout?.toString() ?? '') + (e.stderr?.toString() ?? 
''); + const errs = log.split('\n').filter((l: string) => l.includes('error TS')); + console.log(` ${name}: ✗ ${errs.length} tsc error(s):`); + for (const l of errs.slice(0, 30)) console.log(` ${l.replace(out, `emit(${name})`)}`); + if (errs.length > 30) console.log(` … and ${errs.length - 30} more`); + } +} + +if (failures > 0) { + console.error(`\n✗ emitted parser fails strict type-check for ${failures} grammar(s)`); + process.exit(1); +} +console.log('\n✓ emitted parser type-checks under tsc --strict for every grammar'); diff --git a/test/exhaustive-edits.ts b/test/exhaustive-edits.ts index 1485a4f..9aa404f 100644 --- a/test/exhaustive-edits.ts +++ b/test/exhaustive-edits.ts @@ -9,7 +9,7 @@ // node --max-old-space-size=4096 test/exhaustive-edits.ts import { writeFileSync } from 'node:fs'; import { token, rule, defineGrammar, many, opt, sep, plus, oneOf, range, seq, star, noneOf } from '../src/api.ts'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { objectify } from './emitted-obj.ts'; // A deliberately bracket-and-list-shaped grammar: parens force synthesis and @@ -31,8 +31,8 @@ const g = defineGrammar({ rules: { Expr, Stmt, Program }, entry: Program, }); -const emPath = '/tmp/emitted-exhaustive.mjs'; -writeFileSync(emPath, emitParser(g)); +const emPath = '/tmp/emitted-exhaustive.mts'; +writeFileSync(emPath, emitParser(g, jsTarget)); type Cst = { root: number; errors: object[] }; type Parser = { parse(s: string): Cst; edit(c: Cst, e: object[]): void; visit(c: Cst, fns: object): void; tree: import('./emitted-obj.ts').TreeView }; const em = (await import(emPath + '?v=' + process.pid)) as { createParser(): Parser; __arenaStats(): { inPlaceShrink: number } }; diff --git a/test/fixtures/altjs.ts b/test/fixtures/altjs.ts new file mode 100644 index 0000000..fe409a7 --- /dev/null +++ b/test/fixtures/altjs.ts @@ -0,0 +1,37 @@ +// Exercises the portable parser's general inline `alt(...)` of NON-literals 
(the first +// parser-algebra construct javascript.ts needs that buildIR previously rejected). Object +// keys are `alt(Ident, Str, Number)` — a backtracking alternation of token references +// inside a rule sequence, not the all-literal fast path. +import { + token, rule, defineGrammar, left, op, + seq, oneOf, range, star, sep, opt, many, alt, noneOf, +} from '../../src/api.ts'; + +const digit = range('0', '9'); +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); + +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(digit, star(digit)), { scope: 'constant.numeric' }); +const Str = token(seq('"', star(noneOf('"', '\n')), '"'), { scope: 'string.quoted.double' }); + +const jsPrec = [left('+', '-'), left('*', '/')]; + +// key = a NON-literal inline alternation (Ident | Str | Number). +const KeyVal = rule(($) => [[alt(Ident, Str, Number_), ':', Expr]]); +const Expr = rule(($) => [ + Number_, Str, Ident, + ['(', $, ')'], + ['{', opt(sep(KeyVal, ',')), '}'], // object literal + [$, op, $], +]); +const Stmt = rule(($) => [[Expr, ';']]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'altjs', + scopeName: 'source.altjs', + tokens: { Ident, Number: Number_, Str }, + prec: jsPrec, + rules: { KeyVal, Expr, Stmt, Program }, +}); diff --git a/test/fixtures/arrowjs.ts b/test/fixtures/arrowjs.ts new file mode 100644 index 0000000..b4967c9 --- /dev/null +++ b/test/fixtures/arrowjs.ts @@ -0,0 +1,36 @@ +// Exercises the capBelow (assignment-level) Pratt construct — arrow functions. A `capExpr` +// NUD is parsed only when the enclosing minBp is LOOSER than its connector's binding power +// (so `1 + (() => x)` needs the parens) and, once parsed, admits NO led (it is "capped"). 
+// The `=>` body's ctxMode (await/yield) is treated as transparent here — the context fork +// is NOT modelled, so this covers basic arrows, not async/await bodies. +import { + token, rule, defineGrammar, left, right, op, capExpr, alt, + seq, oneOf, range, star, sep, opt, many, +} from '../../src/api.ts'; + +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(range('0', '9'), star(range('0', '9'))), { scope: 'constant.numeric' }); + +const jsPrec = [right('='), left('||'), left('+', '-'), left('*', '/')]; + +const Block = rule(($) => [['{', many(Stmt), '}']]); +const Expr = rule(($) => [ + Number_, Ident, + ['(', $, ')'], + capExpr('=', '(', opt(sep(Ident, ',')), ')', '=>', alt(Block, $)), // (params) => body + capExpr('=', Ident, '=>', alt(Block, $)), // x => body + [$, op, $], + [$, '(', opt(sep($, ',')), ')'], // call +]); +const Stmt = rule(($) => [Block, [Expr, ';']]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'arrowjs', + scopeName: 'source.arrowjs', + tokens: { Ident, Number: Number_ }, + prec: jsPrec, + rules: { Expr, Block, Stmt, Program }, +}); diff --git a/test/fixtures/calc.ts b/test/fixtures/calc.ts new file mode 100644 index 0000000..1d04fd1 --- /dev/null +++ b/test/fixtures/calc.ts @@ -0,0 +1,57 @@ +// A small Pratt grammar — the cross-language target proof for issue #6. 
+// +// Deliberately minimal but it exercises the constructs that make parsing-as- +// derivation non-trivial: token kinds (Ident/Number), literal keywords, sequences, +// backtracking alternation, quantifiers (opt/many/sep), recursion (grouping), and — +// the crux — a Pratt expression engine with operator PRECEDENCE and associativity +// (`1 + 2 * 3` must group as `1 + (2 * 3)`), prefix unary, and a left-associative +// call/postfix continuation. emitParser(grammar, target) derives a TS, Go, and Rust parser +// from THIS one definition; the cross-language gate proves all three produce the +// byte-identical CST the interpreter (createParser) does. +// +// No lexer lookahead (the full TS grammar's number tokens use `(?!…)`, which Go's +// RE2 and Rust's regex crate reject) — the portable lexer is a dependency-free +// char-class scanner, so the emitted Go/Rust compile offline with no regex engine. +import { + token, rule, defineGrammar, left, right, op, prefix, + seq, oneOf, range, star, many, +} from '../../src/api.ts'; + +const digit = range('0', '9'); +const identStart = oneOf(range('a', 'z'), range('A', 'Z'), '_'); +const identPart = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_'); + +const Ident = token(seq(identStart, star(identPart)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(digit, star(digit)), { scope: 'constant.numeric' }); + +// Precedence ladder (earlier = looser): `+` `-` loosest, then `*` `/`, then prefix +// `-` tightest — so `1 + 2 * 3` is `1 + (2 * 3)` and `-a * b` is `(-a) * b`. 
+const calcPrec = [ + left('+', '-'), + left('*', '/'), + right(prefix('-')), +]; + +const Expr = rule(($) => [ + Number_, + Ident, + ['(', $, ')'], // grouping (recursion) + [prefix, $], // prefix unary minus (operators from the ladder) + [$, op, $], // binary infix, precedence from the ladder +]); + +const Stmt = rule(($) => [ + ['let', Ident, '=', Expr, ';'], + [Expr, ';'], +]); + +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'calc', + scopeName: 'source.calc', + tokens: { Ident, Number: Number_ }, + prec: calcPrec, + // findEntryRule = the LAST rule, so Program is the entry point. + rules: { Expr, Stmt, Program }, +}); diff --git a/test/fixtures/ledjs.ts b/test/fixtures/ledjs.ts new file mode 100644 index 0000000..7148851 --- /dev/null +++ b/test/fixtures/ledjs.ts @@ -0,0 +1,46 @@ +// Exercises precedence-gated mixfix LEDs: the ternary `? :` (a led that binds LOOSER than the +// operators, so `a == b ? c : d` groups as `(a == b) ? c : d`) and `in`/`instanceof` (chain-rhs +// leds at the relational level — `a in b in c` left-chains as `(a in b) in c`). Both need the +// led-precedence gate the portable parser previously lacked (its mixfix leds bound maximally tight). 
+import { + token, rule, defineGrammar, left, right, op, + seq, oneOf, range, star, many, +} from '../../src/api.ts'; + +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(range('0', '9'), star(range('0', '9'))), { scope: 'constant.numeric' }); + +const jsPrec = [ + right('='), + left('||'), + left('==', '!='), + left('<', '>'), + left('+', '-'), + left('*', '/'), +]; + +const Expr = rule(($) => [ + Number_, Ident, + ['(', $, ')'], + [$, op, $], + [$, '?', $, ':', $], // ternary (binds below `||`) + [$, 'in', $], // relational chain-rhs + [$, 'instanceof', $], +]); +const Stmt = rule(($) => [[Expr, ';']]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'ledjs', + scopeName: 'source.ledjs', + tokens: { Ident, Number: Number_ }, + prec: jsPrec, + ledPrec: [ + { connector: '?', below: '||' }, + { connector: 'in', sameAs: '<', chainRhs: true }, + { connector: 'instanceof', sameAs: '<', chainRhs: true }, + ], + rules: { Expr, Stmt, Program }, +}); diff --git a/test/fixtures/minijs.ts b/test/fixtures/minijs.ts new file mode 100644 index 0000000..21f9bfb --- /dev/null +++ b/test/fixtures/minijs.ts @@ -0,0 +1,77 @@ +// A real JavaScript SUBSET — the grammar that makes the portable Go/Rust targets +// "comparable with oxc": rich enough that parsing a corpus is realistic work +// (strings, comments, the full operator-precedence ladder, call/member/index +// chains, arrays, and the common statement forms), so the emitted Rust parser can +// be benchmarked against oxc on the same bytes. +// +// Derived from ONE definition by emitParser(grammar, target) into TypeScript, Go, and Rust; +// the cross-language gate proves all three produce the byte-identical CST that the +// interpreter (createParser) does. 
The portable lexer is regex-free (char scanner +// driven by token-pattern.ts's structural recognizers), so the Go/Rust output +// compiles offline. +// +// Deliberately omitted (ambiguity / scope, not capability): object literals (the +// `{`-block-vs-object split), ternary, template literals, regex literals, keyword +// operators (typeof/void/...), and `for`. The subset stays unambiguous and real. +import { + token, rule, defineGrammar, left, right, op, prefix, alt, + seq, oneOf, range, star, sep, opt, many, altPattern, noneOf, anyChar, +} from '../../src/api.ts'; + +const digit = range('0', '9'); +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); + +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(digit, star(digit)), { scope: 'constant.numeric' }); +const Str = token(seq('"', star(altPattern(noneOf('"', '\\'), seq('\\', anyChar()))), '"'), { scope: 'string.quoted.double' }); +const LineComment = token(seq('//', star(noneOf('\n'))), { skip: true, scope: 'comment.line' }); +const BlockComment = token(seq('/*', star(altPattern(noneOf('*'), seq('*', noneOf('/')))), '*/'), { skip: true, scope: 'comment.block' }); + +// Operator-precedence ladder (earlier = looser), mirroring JavaScript. 
+const jsPrec = [ + right('='), + left('||'), left('&&'), + left('|'), left('^'), left('&'), + left('==', '!=', '===', '!=='), + left('<', '>', '<=', '>='), + left('<<', '>>'), + left('+', '-'), + left('*', '/', '%'), + right(prefix('!', '-', '+', '~')), +]; + +const Expr = rule(($) => [ + Number_, + Str, + Ident, + ['(', $, ')'], // grouping + ['[', opt(sep($, ',')), ']'], // array literal + [prefix, $], // prefix unary + [$, op, $], // binary infix (precedence from the ladder) + [$, '(', opt(sep($, ',')), ')'], // call + [$, '.', Ident], // member access + [$, '[', $, ']'], // computed index +]); + +const Block = rule(($) => [['{', many(Stmt), '}']]); + +const Stmt = rule(($) => [ + Block, + [alt('var', 'let', 'const'), Ident, opt('=', Expr), ';'], + ['if', '(', Expr, ')', Stmt, opt('else', Stmt)], + ['while', '(', Expr, ')', Stmt], + ['return', opt(Expr), ';'], + ['function', Ident, '(', opt(sep(Ident, ',')), ')', Block], + [Expr, ';'], +]); + +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'minijs', + scopeName: 'source.minijs', + tokens: { Ident, Number: Number_, Str, LineComment, BlockComment }, + prec: jsPrec, + rules: { Expr, Block, Stmt, Program }, +}); diff --git a/test/fixtures/noinjs.ts b/test/fixtures/noinjs.ts new file mode 100644 index 0000000..54d5395 --- /dev/null +++ b/test/fixtures/noinjs.ts @@ -0,0 +1,35 @@ +// Exercises the no-`in` (suppress) context. In a `for (binding in iterable)` head, the +// binding is parsed with the `in` LED DISABLED — `exclude('in', Expr)` — so the `in` belongs +// to the for-head, not to a relational expression inside the binding. Outside a for-head, `in` +// binds normally. The portable parser threads a suppressed-connector set into the led loop. 
+import { + token, rule, defineGrammar, left, op, exclude, + seq, oneOf, range, star, many, +} from '../../src/api.ts'; + +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(range('0', '9'), star(range('0', '9'))), { scope: 'constant.numeric' }); + +const jsPrec = [left('||'), left('<', '>'), left('+', '-')]; + +const Expr = rule(($) => [ + Number_, Ident, + ['(', $, ')'], + [$, op, $], + [$, 'in', $], + [$, '.', Ident], +]); +const ForHead = rule(($) => [['for', '(', exclude('in', Expr), 'in', Expr, ')', Stmt]]); +const Stmt = rule(($) => [ForHead, [Expr, ';']]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'noinjs', + scopeName: 'source.noinjs', + tokens: { Ident, Number: Number_ }, + prec: jsPrec, + ledPrec: [{ connector: 'in', sameAs: '<', chainRhs: true }], + rules: { Expr, ForHead, Stmt, Program }, +}); diff --git a/test/fixtures/nudjs.ts b/test/fixtures/nudjs.ts new file mode 100644 index 0000000..d9b54c6 --- /dev/null +++ b/test/fixtures/nudjs.ts @@ -0,0 +1,41 @@ +// Exercises two general Pratt NUD shapes javascript.ts needs (beyond bare-token / prefix / +// bracket): a reserved-word-GUARDED identifier `[not(kw)… Ident]` (zero-width negative +// lookahead before a token) and a quantifier-first NUD `[Decorator? "class" Ident? …]` (a +// class expression). Both compile to a general backtracking NUD sequence; the `not` step +// consumes nothing. (Arrow functions — group{capBelow,ctxMode} — are deferred.) 
+import { + token, rule, defineGrammar, left, op, + seq, oneOf, range, star, sep, opt, many, alt, not, noneOf, +} from '../../src/api.ts'; + +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(range('0', '9'), star(range('0', '9'))), { scope: 'constant.numeric' }); +const Decorator = token(seq('@', idStart, star(idCont)), { scope: 'meta.decorator' }); + +const reserved = alt('if', 'else', 'while', 'return', 'class', 'new', 'extends'); + +const Expr = rule(($) => [ + Number_, + [not(reserved), Ident], // reserved-word-guarded identifier + [opt(Decorator), 'class', opt(Ident), opt('extends', $), '{', many(ClassMember), '}'], // class expr (quantifier-first NUD) + ['new', $], // literal-led NUD (bracket) + ['(', $, ')'], + [$, op, $], + [$, '.', Ident], + [$, '(', opt(sep($, ',')), ')'], +]); +const ClassMember = rule(($) => [[opt(Decorator), Ident, '(', ')', '{', '}']]); + +const jsPrec = [left('+', '-'), left('*', '/')]; +const Stmt = rule(($) => [[Expr, ';']]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'nudjs', + scopeName: 'source.nudjs', + tokens: { Decorator, Ident, Number: Number_ }, + prec: jsPrec, + rules: { Expr, ClassMember, Stmt, Program }, +}); diff --git a/test/fixtures/postjs.ts b/test/fixtures/postjs.ts new file mode 100644 index 0000000..1ac9340 --- /dev/null +++ b/test/fixtures/postjs.ts @@ -0,0 +1,38 @@ +// Exercises the postfix-operator Pratt LED `[$, postfix]` (e.g. `x++`, `x--`) — a LED that +// consumes the operator and no right operand, binding tight. `++`/`--` are BOTH prefix (NUD, +// `++x`) and postfix (LED, `x++`); the engine resolves them by position. 
+import { + token, rule, defineGrammar, left, right, op, prefix, postfix, + seq, oneOf, range, star, many, +} from '../../src/api.ts'; + +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(range('0', '9'), star(range('0', '9'))), { scope: 'constant.numeric' }); + +const jsPrec = [ + left('+', '-'), + left('*', '/'), + right(prefix('-', '!', '++', '--')), + left(postfix('++', '--')), +]; + +const Expr = rule(($) => [ + Number_, Ident, + ['(', $, ')'], + [prefix, $], + [$, op, $], + [$, '.', Ident], + [$, postfix], // postfix operator LED +]); +const Stmt = rule(($) => [[Expr, ';']]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'postjs', + scopeName: 'source.postjs', + tokens: { Ident, Number: Number_ }, + prec: jsPrec, + rules: { Expr, Stmt, Program }, +}); diff --git a/test/fixtures/regexjs.ts b/test/fixtures/regexjs.ts new file mode 100644 index 0000000..0f966f9 --- /dev/null +++ b/test/fixtures/regexjs.ts @@ -0,0 +1,77 @@ +// minijs + REGEX literals — exercises the portable lexer's STATEFUL regex-vs-division +// disambiguation (stage 3). A `/` is a regex in expression context but division after a +// value; `if (x) /re/` is a regex (control-head paren), `obj.for(x) / y` is division +// (member name, not a head). The regexContext config + paren-head/bang state are ported +// from createLexer; the gate checks the emitted CST is byte-identical on inputs that mix +// regex literals and division. 
+import { + token, rule, defineGrammar, left, right, op, prefix, alt, + seq, oneOf, range, star, sep, opt, many, altPattern, noneOf, anyChar, +} from '../../src/api.ts'; + +const digit = range('0', '9'); +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); + +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(digit, star(digit)), { scope: 'constant.numeric' }); +const Str = token(seq('"', star(altPattern(noneOf('"', '\\'), seq('\\', anyChar()))), '"'), { scope: 'string.quoted.double' }); +const LineComment = token(seq('//', star(noneOf('\n'))), { skip: true, scope: 'comment.line' }); +const BlockComment = token(seq('/*', star(altPattern(noneOf('*'), seq('*', noneOf('/')))), '*/'), { skip: true, scope: 'comment.block' }); + +// Regex literal: `/ body / flags`, body is non-(/\[)newline chars, escapes, or `[...]` classes. +const rxClass = seq('[', star(altPattern(noneOf(']', '\\', '\n'), seq('\\', noneOf('\n')))), ']'); +const rxChar = altPattern(noneOf('/', '\\', '[', '\n'), seq('\\', noneOf('\n')), rxClass); +const rxFirst = altPattern(noneOf('/', '\\', '[', '*', '\n'), seq('\\', noneOf('\n')), rxClass); +const Regex = token(seq('/', rxFirst, star(rxChar), '/', star(idCont)), { + regex: true, scope: 'string.regexp', + regexContext: { + divisionAfterTypes: ['Ident', 'Number', 'Str'], + divisionAfterTexts: [')', ']', 'this', 'true', 'false', 'null'], + regexAfterTexts: ['return', 'typeof', 'delete', 'void', 'in', 'instanceof', 'new', 'do', 'else'], + regexAfterParenKeywords: ['if', 'while', 'for'], + memberAccessTexts: ['.'], + postfixAfterValueTexts: [], + }, +}); + +const jsPrec = [ + right('='), + left('||'), left('&&'), + left('==', '!=', '===', '!=='), + left('<', '>', '<=', '>='), + left('+', '-'), + left('*', '/', '%'), + right(prefix('!', '-', '+', '~')), +]; + +const Expr = rule(($) => [ + 
Number_, Str, Ident, Regex, + ['(', $, ')'], + ['[', opt(sep($, ',')), ']'], + [prefix, $], + [$, op, $], + [$, '(', opt(sep($, ',')), ')'], + [$, '.', Ident], + [$, '[', $, ']'], +]); + +const Block = rule(($) => [['{', many(Stmt), '}']]); +const Stmt = rule(($) => [ + Block, + [alt('var', 'let', 'const'), Ident, opt('=', Expr), ';'], + ['if', '(', Expr, ')', Stmt, opt('else', Stmt)], + ['while', '(', Expr, ')', Stmt], + ['return', opt(Expr), ';'], + ['function', Ident, '(', opt(sep(Ident, ',')), ')', Block], + [Expr, ';'], +]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'regexjs', + scopeName: 'source.regexjs', + tokens: { LineComment, BlockComment, Number: Number_, Str, Regex, Ident }, + prec: jsPrec, + rules: { Expr, Block, Stmt, Program }, +}); diff --git a/test/fixtures/richtokens.ts b/test/fixtures/richtokens.ts new file mode 100644 index 0000000..3f7bc2d --- /dev/null +++ b/test/fixtures/richtokens.ts @@ -0,0 +1,40 @@ +// A token-stress grammar for the portable lexer's GENERAL matcher (stage 1 of real-grammar +// support). It uses the STATELESS real-JS token shapes the 4-shape fast paths can't handle — +// `\u`-escaped identifiers, the decimal/hex number family with a `(?!IdentChar)` boundary, +// both-quote strings with escapes, and comments — so the portable lexer must compile the raw +// token-pattern AST to a backtracking-free matcher. A trivial parser (a stream of value +// tokens) makes the emitted CST essentially the token stream, so checking it against +// createParser verifies the LEXER. (Stateful tokens — regex, templates — are NOT here; they +// need cross-token lexer state, the next stage.) 
+import { + token, rule, defineGrammar, + seq, oneOf, range, star, plus, repeat, optPattern, altPattern, noneOf, anyChar, notFollowedBy, many, +} from '../../src/api.ts'; + +const digit = range('0', '9'); +const hexDigit = oneOf(digit, range('a', 'f'), range('A', 'F')); +const idChar = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); +const uEsc = altPattern(seq('\\u', repeat(hexDigit, 4, 4)), seq('\\u{', plus(hexDigit), '}')); +const boundary = notFollowedBy(idChar); // a number can't be glued to an identifier char + +const Hex = token(seq('0', oneOf('x', 'X'), plus(hexDigit), boundary), { scope: 'constant.numeric.hex' }); +const Number_ = token(seq(plus(digit), star(seq('_', plus(digit))), optPattern(seq('.', plus(digit))), boundary), { scope: 'constant.numeric' }); +const Ident = token(seq(altPattern(oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'), uEsc), star(altPattern(idChar, uEsc))), { identifier: true }); +const Str = token(altPattern( + seq('"', star(altPattern(noneOf('"', '\\'), seq('\\', anyChar()))), '"'), + seq("'", star(altPattern(noneOf("'", '\\'), seq('\\', anyChar()))), "'"), +), { scope: 'string.quoted' }); +const LineComment = token(seq('//', star(noneOf('\n'))), { skip: true, scope: 'comment.line' }); +const BlockComment = token(seq('/*', star(altPattern(noneOf('*'), seq('*', noneOf('/')))), '*/'), { skip: true, scope: 'comment.block' }); + +// Value = one value token; Program = a stream of them. (Lexer-level disambiguation — Hex vs +// Number — comes from token DECLARATION ORDER, which both engines follow.) 
+const Value = rule(($) => [Hex, Number_, Ident, Str]); +const Program = rule(($) => [many(Value)]); + +export default defineGrammar({ + name: 'richtokens', + scopeName: 'source.richtokens', + tokens: { Hex, Number: Number_, Ident, Str, LineComment, BlockComment }, + rules: { Value, Program }, +}); diff --git a/test/fixtures/seqjs.ts b/test/fixtures/seqjs.ts new file mode 100644 index 0000000..b1facfd --- /dev/null +++ b/test/fixtures/seqjs.ts @@ -0,0 +1,33 @@ +// Exercises a grouped sub-sequence `seq` step: comma lists written as `star([',', $])` (a +// star whose body is the two-element sequence `, Expr`) rather than `sep(...)`, the shape +// javascript.ts uses for argument/array/sequence lists. +import { + token, rule, defineGrammar, left, op, + seq, oneOf, range, star, opt, many, +} from '../../src/api.ts'; +// `many(',', $)` is the rule-level `(',' Expr)*` — a star whose body is the sequence +// `, Expr`, exactly the shape javascript.ts uses for comma lists. + +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(range('0', '9'), star(range('0', '9'))), { scope: 'constant.numeric' }); + +const jsPrec = [left('+', '-'), left('*', '/')]; +const Expr = rule(($) => [ + Number_, Ident, + ['(', $, ')'], + ['[', opt($, many(',', $)), ']'], // array literal via star(seq) + [$, op, $], + [$, '(', opt($, many(',', $)), ')'], // call args via star(seq) +]); +const Stmt = rule(($) => [[Expr, ';']]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'seqjs', + scopeName: 'source.seqjs', + tokens: { Ident, Number: Number_ }, + prec: jsPrec, + rules: { Expr, Stmt, Program }, +}); diff --git a/test/fixtures/sljs.ts b/test/fixtures/sljs.ts new file mode 100644 index 0000000..5c57d9e --- /dev/null +++ b/test/fixtures/sljs.ts @@ 
-0,0 +1,29 @@ +// Exercises the `sameLine` zero-width assertion (no line terminator before the next token). +// A `return` takes a value only when it is on the SAME line (ASI-style restricted production): +// `return 1;` keeps the value, `return\n1;` does not. Verifies the lexer's newline-before +// tracking — including a block comment that spans a newline. +import { + token, rule, defineGrammar, left, op, + seq, oneOf, range, star, opt, many, altPattern, noneOf, sameLine, +} from '../../src/api.ts'; + +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(range('0', '9'), star(range('0', '9'))), { scope: 'constant.numeric' }); +const LineComment = token(seq('//', star(noneOf('\n'))), { skip: true, scope: 'comment.line' }); +const BlockComment = token(seq('/*', star(altPattern(noneOf('*'), seq('*', noneOf('/')))), '*/'), { skip: true, scope: 'comment.block' }); + +const jsPrec = [left('+', '-'), left('*', '/')]; +const Expr = rule(($) => [Number_, Ident, ['(', $, ')'], [$, op, $]]); +const Ret = rule(($) => [['return', opt(sameLine, Expr), ';']]); // `return` + a SAME-LINE value +const Stmt = rule(($) => [Ret, [Expr, ';']]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'sljs', + scopeName: 'source.sljs', + tokens: { Ident, Number: Number_, LineComment, BlockComment }, + prec: jsPrec, + rules: { Expr, Ret, Stmt, Program }, +}); diff --git a/test/fixtures/templatejs.ts b/test/fixtures/templatejs.ts new file mode 100644 index 0000000..cf6f523 --- /dev/null +++ b/test/fixtures/templatejs.ts @@ -0,0 +1,62 @@ +// minijs + TEMPLATE LITERALS — exercises the portable lexer's second STATEFUL feature +// (stage 4): `${…}` interpolation. 
The lexer splits `` `a${x}b${y}c` `` into +// $templateHead·$templateMiddle·$templateTail around the holes, tracking a brace-depth +// stack so a nested `{…}` (or a nested template) inside a hole doesn't close it; the +// parser assembles the pieces and interpolated expressions into a `$template` node. +import { + token, rule, defineGrammar, left, right, op, prefix, alt, + seq, oneOf, range, star, sep, opt, many, altPattern, noneOf, notFollowedBy, +} from '../../src/api.ts'; + +const digit = range('0', '9'); +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); + +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(digit, star(digit)), { scope: 'constant.numeric' }); +const Str = token(seq('"', star(altPattern(noneOf('"', '\\'), seq('\\', noneOf('\n')))), '"'), { scope: 'string.quoted.double' }); +const LineComment = token(seq('//', star(noneOf('\n'))), { skip: true, scope: 'comment.line' }); + +// NoSubstitution template: backtick body excludes a real `${` (a `$` not followed by `{` +// stays literal); the `template` config drives the interpolated split in the lexer. 
+const Template = token( + seq('`', star(altPattern(noneOf('`', '\\', '$'), seq('\\', noneOf('\n')), seq('$', notFollowedBy('{')))), '`'), + { scope: 'string.template', template: { open: '`', interpOpen: '${', interpClose: '}' } }, +); + +const jsPrec = [ + right('='), + left('||'), left('&&'), + left('+', '-'), + left('*', '/', '%'), + right(prefix('!', '-', '+')), +]; + +const Expr = rule(($) => [ + Number_, Str, Template, Ident, + ['(', $, ')'], + ['{', opt(sep(Ident, ',')), '}'], // shorthand object — gives a hole a nested `{ … }` + [prefix, $], + [$, op, $], + [$, '(', opt(sep($, ',')), ')'], + [$, '.', Ident], + [$, Template], // tagged template — a postfix-token LED +]); + +const Block = rule(($) => [['{', many(Stmt), '}']]); +const Stmt = rule(($) => [ + Block, + [alt('var', 'let', 'const'), Ident, opt('=', Expr), ';'], + ['if', '(', Expr, ')', Stmt, opt('else', Stmt)], + ['return', opt(Expr), ';'], + [Expr, ';'], +]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'templatejs', + scopeName: 'source.templatejs', + tokens: { Ident, Number: Number_, Str, Template, LineComment }, + prec: jsPrec, + rules: { Expr, Block, Stmt, Program }, +}); diff --git a/test/head-to-head.ts b/test/head-to-head.ts index 4613e67..1d84a5a 100644 --- a/test/head-to-head.ts +++ b/test/head-to-head.ts @@ -15,7 +15,7 @@ // so it reads through a 16KB chunk callback (its documented large-input path). 
import { readFileSync } from 'node:fs'; import { createRequire } from 'node:module'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { writeFileSync } from 'node:fs'; import ts from 'typescript'; @@ -26,8 +26,8 @@ const TreeSitter = require(TS_BENCH + '/node_modules/tree-sitter'); const TSLang = require(TS_BENCH + '/node_modules/tree-sitter-typescript').typescript; const grammar = (await import('../typescript.ts')).default; -const emPath = '/tmp/emitted-h2h.mjs'; -writeFileSync(emPath, emitParser(grammar)); +const emPath = '/tmp/emitted-h2h.mts'; +writeFileSync(emPath, emitParser(grammar, jsTarget)); const { createParser } = await import(emPath + '?v=' + process.pid); const unit = readFileSync(CORPUS, 'utf-8'); diff --git a/test/incremental-grammars.ts b/test/incremental-grammars.ts index 6c2bbd0..9c4a780 100644 --- a/test/incremental-grammars.ts +++ b/test/incremental-grammars.ts @@ -13,7 +13,7 @@ // // node test/incremental-grammars.ts import { writeFileSync } from 'node:fs'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { generateInputs } from './grammar-gen.ts'; import { objectify } from './emitted-obj.ts'; @@ -84,8 +84,8 @@ let fails = 0; const failures: string[] = []; for (const name of GRAMMARS) { const grammar = (await import(`../${name}.ts`)).default; - const emPath = `/tmp/emitted-incr-${name}.mjs`; - writeFileSync(emPath, emitParser(grammar)); + const emPath = `/tmp/emitted-incr-${name}.mts`; + writeFileSync(emPath, emitParser(grammar, jsTarget)); const em = (await import(emPath + '?v=' + process.pid)) as Em; const session = em.createParser(); const fresh = em.createParser(); @@ -183,7 +183,7 @@ function replaceOnce(text: string, find: string, repl: string): { next: string; return { next: text.slice(0, at) + repl + text.slice(at + find.length), edit: { start: at, end: at + find.length, text: repl } }; } for (const name 
of ['javascript', 'typescript']) { - const em = (await import(`/tmp/emitted-incr-${name}.mjs?v=` + process.pid)) as Em; + const em = (await import(`/tmp/emitted-incr-${name}.mts?v=` + process.pid)) as Em; const session = em.createParser(); const fresh = em.createParser(); for (const doc of FORK_DOCS) { diff --git a/test/incremental-verify.ts b/test/incremental-verify.ts index 04fdf3b..85814e1 100644 --- a/test/incremental-verify.ts +++ b/test/incremental-verify.ts @@ -8,11 +8,11 @@ // node test/incremental-verify.ts import { objectify } from './emitted-obj.ts'; import { existsSync, readFileSync, writeFileSync } from 'node:fs'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; const grammar = (await import('../typescript.ts')).default; -const emPath = '/tmp/emitted-incremental.mjs'; -writeFileSync(emPath, emitParser(grammar)); +const emPath = '/tmp/emitted-incremental.mts'; +writeFileSync(emPath, emitParser(grammar, jsTarget)); type Edit = { start: number; end: number; text: string }; type Cst = { root: number; errors: { offset: number; end: number; message: string }[] }; type Parser = { diff --git a/test/multi-doc.ts b/test/multi-doc.ts index f5af760..25af324 100644 --- a/test/multi-doc.ts +++ b/test/multi-doc.ts @@ -9,12 +9,12 @@ // // node test/multi-doc.ts import { writeFileSync } from 'node:fs'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { objectify } from './emitted-obj.ts'; const grammar = (await import('../typescript.ts')).default; -const emPath = '/tmp/emitted-multidoc.mjs'; -writeFileSync(emPath, emitParser(grammar)); +const emPath = '/tmp/emitted-multidoc.mts'; +writeFileSync(emPath, emitParser(grammar, jsTarget)); type Edit = { start: number; end: number; text: string }; type Cst = { root: number; errors: { offset: number; end: number; message: string }[] }; type Parser = { parse(s: string): Cst; edit(cst: Cst, edits: 
Edit[]): void; visit(cst: Cst, fns: object): void; tree: import('./emitted-obj.ts').TreeView }; diff --git a/test/portable-targets.ts b/test/portable-targets.ts new file mode 100644 index 0000000..58de024 --- /dev/null +++ b/test/portable-targets.ts @@ -0,0 +1,287 @@ +// Gate: the TARGET-AGNOSTIC emitter (issue #6) — `emitParser(grammar, target)` +// derives a parser in EACH target language that produces the byte-identical CST the +// interpreter (createParser) does. The agnosticism proof by EXECUTION: every grammar is +// rendered to TypeScript, Go, and Rust; the Go/Rust sources are COMPILED and RUN, and each +// parser's CST output is compared, node-for-node, against the createParser oracle over an +// adversarial corpus, plus reject-parity on malformed input. +// +// - calc: operator precedence/associativity, prefix unary, nested grouping. +// - minijs: a real JavaScript SUBSET — a string/comment lexer, the full operator ladder, +// call/member/index chains, arrays, and statement forms (the grammar the Go/Rust +// output is benchmarked against oxc with). +// +// Go/Rust toolchains are optional: a missing `go`/`rustc` is logged and skipped (the TS +// rendering, which needs only node, always runs). 
+import { execFileSync } from 'node:child_process'; +import { mkdirSync, rmSync, writeFileSync } from 'node:fs'; +import { createParser } from '../src/gen-parser.ts'; +import { emitParser, tsTarget, goTarget, rustTarget } from '../src/emit.ts'; +import type { CstGrammar } from '../src/types.ts'; + +type Case = { grammar: string; path: string; accept: string[]; reject: string[]; tsOnly?: boolean }; +const CASES: Case[] = [ + { + grammar: 'calc', path: './fixtures/calc.ts', + accept: [ + '1;', 'a;', '', '1 + 2 * 3;', '1 * 2 + 3;', '1 - 2 - 3;', 'a / b / c;', '1 + 2 + 3 + 4;', + '-a;', '-(-a);', '- - a;', '-a * b;', '-a + b * c;', '-(a + b) * c;', + '(1);', '((a));', '(1 + 2) * (3 - 4);', 'a * b + c * d - e / f;', + 'let x = 1; let y = x + 2 * x; (y);', 'let z = -(a * b) / (c - -d);', 'foo; bar; baz;', + ], + reject: ['1 +;', '(1;', '1 2;', 'let = 1;', ') ;', '* a;', 'let x 1;'], + }, + { + grammar: 'minijs', path: './fixtures/minijs.ts', + accept: [ + '1;', 'a;', '', 'x = 1 + 2 * 3;', '-a * b + 1;', '(1 + 2) * 3;', + 'foo(a, b);', 'a.b.c;', 'a[0][1];', 'f()()();', 'a.b(c).d[e];', + 'let x = 1; let y = x + 2;', '[1, 2, 3];', '[];', '[a, [b, c]];', + 'if (x < 10) { x = x + 1; } else { y(); }', 'while (i) { i = i - 1; }', + 'function f(a, b) { return a + b; }', 'var s = "hi\\"x"; // c\n s.length;', + '/* block */ a;', 'a === b !== c;', 'a && b || c;', '!a && -b;', + 'return;', 'return a + b;', 'const PI = 3;', '{ a; b; }', + 'f(g(h(x)), [1, 2], y.z);', 'while (a < b) { if (c) { d(); } e = e + 1; }', + ], + // (note: `let = 1;` is VALID minijs — no reserved-word guard, so `let` is an + // identifier and it's an assignment expression; the oracle accepts it too.) 
+ reject: ['1 +;', '(1;', 'if x {}', 'foo(a,;', 'a.;', '[1,', 'function (){}'], + }, + { + // The general token-pattern matcher (stateless real-JS token tier): \u-escaped + // identifiers, the decimal/hex number family with a boundary, both-quote strings — + // compiled to a backtracking-free matcher in all three targets. + grammar: 'richtokens', path: './fixtures/richtokens.ts', + accept: [ + '123', '0xFF', '1_000_000', '3.14', 'foo', 'bar_$x9', '"hi"', "'single'", + '"esc\\"q\\n"', '123 0xa foo "s" 3.14', '0xDEADbeef 42 _id $x cafe // line\n 7', + '/* block */ 99 x', 'caf\\u00e9 \\u0041bc', '1_2_3 0X1F 10.5 a1 b2', + ], + reject: ['12abc', '0x', '"unterminated', '3.', '#'], // ($ is a valid identifier start, not a reject) + }, + { + // The STATEFUL regex-vs-division lexer: `/` is a regex in expression context, division + // after a value. Exercises every branch of prevIsValue — after `=`/keyword/`(`-head + // (regex) vs after value/`)`/`]`/member/call (division), plus regex escapes & classes. + grammar: 'regexjs', path: './fixtures/regexjs.ts', + accept: [ + 'a / b;', 'var r = /abc/g;', 'return /re/;', 'if (x) /re/;', '(a + b) / c;', + 'a.b / c;', 'foo(x) / y;', '[1, 2] / 3;', 'var x = a / b / c;', + 'var re = /[a-z]+/i; x / y;', 'f(/re/, a / b);', 'var z = /a\\/b/;', + 'var d = /\\d+\\w/g;', 'var k = /[\\]]/;', 'if (a) /x/; else b / c;', + ], + // (`var ;` is VALID — `var` is an identifier, so it's the expression statement `var;`.) + reject: ['a / ;', 'if (x /re/;', '/re/', '* a;', 'a = = b;'], + }, + { + // STATEFUL template literals: the `${…}` interpolation split (head/middle/tail) with a + // brace-depth stack — adjacent/multiple holes, exprs in holes, nested templates, and a + // nested `{…}` object inside a hole (which must NOT close the hole). 
+ grammar: 'templatejs', path: './fixtures/templatejs.ts', + accept: [ + 'var a = `hello`;', 'var b = `hi ${name}!`;', 'var c = `${x}${y}`;', + 'var d = `a${ x + 1 }b${ y * 2 }c`;', 'var e = `outer ${ `inner ${z}` } end`;', + 'var f = `${ {a} }`;', 'var f2 = `${ {a, b} } and ${ c }`;', 'var g = `no holes $ here`;', + 'f(`${a}`, `${b}`);', 'var h = `${a}${b}${c}`;', 'return `${ {x, y} }`;', + 'tag`hello`;', 'tag`${a}${b}`;', 'String.raw`a${b}c`.length;', 'x.tag`${y}`;', // tagged (postfix-token LED) + ], + reject: ['var x = `${ }`;', 'var y = `${a`;', '`${a} ${}`;'], + }, + { + // General (non-literal) inline alt: object keys are alt(Ident | Str | Number) — a + // backtracking alternation of token refs inside a rule sequence. + grammar: 'altjs', path: './fixtures/altjs.ts', + accept: [ + '{a: 1};', '{"k": 2};', '{1: x};', '{a: 1, "b": 2, 3: c};', '{x: 1 + 2 * 3};', + '({nested: {inner: 1}});', '{};', 'a + b;', '{k: (1 + 2)};', + ], + reject: ['{a:};', '{: 1};', '{a 1};', '{a: 1,, b: 2};'], + }, + { + // General Pratt NUD sequences: a reserved-word-guarded identifier (`not(kw)… Ident`, + // a zero-width negative lookahead) and a quantifier-first class expression. + grammar: 'nudjs', path: './fixtures/nudjs.ts', + accept: [ + 'x;', 'foo + bar;', 'class C {};', 'class {};', 'class C extends B {};', + '@dec class C { m(){} };', 'new Foo;', 'new C();', 'a.b.c;', + 'class C { @x m(){} n(){} };', 'x + class {} + y;', + ], + reject: ['if;', 'class;', 'new;', 'return + 1;'], // reserved words can't be bare identifiers + }, + { + // Postfix-operator LED (`x++`/`x--`) + the access-tail closure: once a postfix binds, the + // operand is an update expression, so a further postfix or an access tail (`.`/`[`/`(`) + // can't attach (`a++--`, `a++.b` are ill-formed; `(a++).b` is fine). 
+ grammar: 'postjs', path: './fixtures/postjs.ts', + accept: [ + 'x++;', 'x--;', 'a + b++;', '++x;', 'x++ + y;', 'a.b++;', '(x)++;', '--a.b;', + 'x++ * 2;', '(a++).b;', 'x.y.z++;', + ], + reject: ['a++--;', 'a++.b;', 'a++ ++;', '++;'], + }, + { + // A grouped sub-sequence `seq` step: comma lists as `star([',', $])` (e.g. `many(',', $)`), + // the array/argument-list shape javascript.ts uses. + grammar: 'seqjs', path: './fixtures/seqjs.ts', + accept: [ + '[1, 2, 3];', '[];', '[1];', 'f(1, 2);', 'f();', '[a + b, c];', + 'f(g(1, 2), 3);', '(x);', 'f(a)(b, c);', '[[1,2],[3,4]];', + ], + reject: ['[1 2];', 'f(1,);', '[, 1];', 'f(1 2);'], + }, + { + // The `sameLine` zero-width assertion (no line terminator before the next token): + // `return` takes a value only on the same line. Also verifies the lexer's newline-before + // tracking across a block comment that spans a newline. + grammar: 'sljs', path: './fixtures/sljs.ts', + accept: [ + 'return 1;', 'return;', 'return 1 + 2;', '1 + 2;', 'return /* c */ 1;', + '(a);', 'return (1);', 'return\t1;', + ], + // `\r`, LS, PS are JS line terminators just like `\n` (ASI / "no LineTerminator here"), so a + // `return` followed by any of them takes no operand — across all four lexers (interpreter, + // emitted JS, portable ts/go/rust). A `\t` (tab) is whitespace but NOT a terminator → accepted above. + reject: ['return\n1;', 'return\nx;', 'return /*\n*/ 1;', 'return // c\n 1;', 'return\r1;', 'return\r\n1;', 'return /*\r*/ 1;'], + }, + { + // capBelow (assignment-level) arrow functions: a NUD parsed only when minBp < the + // connector's bp, admitting NO led once parsed; the `(x) => y` vs `(x)` ambiguity is + // resolved by longest-match ordering (the arrow is tried first, falls back to grouping). 
+ grammar: 'arrowjs', path: './fixtures/arrowjs.ts', + accept: [ + 'x => x;', '(a, b) => a + b;', '() => {};', 'x = (() => 1);', 'f(() => 1, 2);', + '(x);', 'a + b;', 'x => y => x;', '(() => 2);', '(a) => a;', 'x = y => y;', 'foo();', + '(a,) => b;', '(a, b,) => a;', // trailing comma in params (sep allows a trailing delimiter) + ], + reject: ['=> x;', 'x => ;', '1 + () => 2;', '(,) => b;'], + }, + { + // Precedence-gated mixfix LEDs: ternary `? :` (binds below the operators) and the + // chain-rhs relational leds `in`/`instanceof` (`a in b in c` left-chains). + grammar: 'ledjs', path: './fixtures/ledjs.ts', + accept: [ + 'a == b ? c : d;', 'a ? b : c ? d : e;', 'a + b ? c : d - e;', 'a in b;', + 'a in b in c;', 'x instanceof Y;', 'a < b in c;', '1 + 2 * 3 ? 4 : 5;', + '(a ? b : c) + d;', 'a in b ? c : d;', 'a = b ? c : d;', + ], + reject: ['a ? b;', 'a ? : c;', 'in b;', 'a instanceof;'], + }, + { + // The no-`in` (suppress) context: a `for (binding in iterable)` head parses its binding + // with the `in` led disabled, so `in` belongs to the for-head, not the binding. + grammar: 'noinjs', path: './fixtures/noinjs.ts', + accept: [ + 'for (x in y) z;', 'x in y;', 'for (a.b in c) d;', 'a in b in c;', + 'for ((x) in y) z;', 'for (x in y) a in b;', 'for (x in a in b) z;', + '(a in b);', 'for (a in b) for (c in d) e;', + ], + reject: ['for (x y) z;', 'for x in y;', 'for (in y) z;', 'for (x in) z;'], + }, + { + // The REAL javascript.ts grammar (89 rules after the [Await]/[Yield] fork) — the proof + // that the target-agnostic emitter handles a full language end-to-end in ts/go/rust. + // ASCII corpus only (byte-based go/rust use UTF-8 offsets, identical to the JS oracle's + // UTF-16 offsets for ASCII; non-ASCII offset units differ inherently). + grammar: 'javascript', path: '../javascript.ts', + accept: [ + 'var x = 1, y = 2;', 'function f(a, b) { return a + b; }', 'const g = (x) => x * 2;', + 'x => x + 1;', 'a ? 
b : c;', 'a.b.c();', 'f(g(1, 2), 3);', '[1, 2, 3].map(f);', + 'for (let i = 0; i < n; i++) x();', 'for (const k in obj) { y(); }', 'while (x) { z(); }', + 'if (a) b(); else c();', 'class C extends B { m() {} get p() { return 1; } }', 'a++; b--;', + 'typeof x; void 0;', 'new Foo(1, 2); new.target;', 'a ?? b; a?.b?.c;', + 'try { f(); } catch (e) { g(); } finally { h(); }', 'switch (x) { case 1: f(); break; default: g(); }', + 'a instanceof B; a in obj;', '(function () {})(); (() => {})();', 'x = a && b || c;', + 'do { x(); } while (y);', 'function* gen() { yield* o(); }', 'const { a, b: c, ...r } = o;', + 'const [p, , q, ...z] = arr;', 'label: for (;;) { break label; }', 'async function h() { await x; }', + ], + reject: ['function (', 'a +;', 'if x {}', '{ a: }', 'for (;;', 'a ? b ;'], + }, + { + // The real typescript.ts grammar — the second, most complex full language proving the + // agnostic emitter (types, generics, interfaces, enums, assertions, variance). ASCII. + grammar: 'typescript', path: '../typescript.ts', + accept: [ + 'const a: number = 1;', 'let s: string;', 'type Alias = { a: number; b?: string };', + 'type U = "a" | "b" | "c";', 'function gen2(x: T, y: U): T { return x; }', + 'interface I extends A { m(x: T): T; }', 'const c = x as const;', + 'function isStr(x: unknown): x is string { return true; }', 'enum E { A, B, C }', + 'const n = maybe!;', 'let arr: number[];', 'type Fn = (x: number) => string;', + 'class C { value!: T; }', + ], + reject: ['interface {}', 'const x: = 1;', 'enum {}', 'a + ;'], + }, +]; + +const sortKeys = (o: unknown): unknown => + Array.isArray(o) ? o.map(sortKeys) + : (o && typeof o === 'object') ? 
Object.fromEntries(Object.keys(o as object).sort().map((k) => [k, sortKeys((o as Record)[k])])) + : o; +const canon = (o: unknown) => JSON.stringify(sortKeys(o)); + +const TMP = '/tmp/portable-targets'; +rmSync(TMP, { recursive: true, force: true }); +mkdirSync(TMP, { recursive: true }); +const have = (cmd: string, args: string[]) => { try { execFileSync(cmd, args, { stdio: 'pipe' }); return true; } catch { return false; } }; +const HAS_GO = have('go', ['version']); +const HAS_RUST = have('rustc', ['--version']); +if (!HAS_GO) console.log(' go: (toolchain absent — skipped)'); +if (!HAS_RUST) console.log(' rust: (toolchain absent — skipped)'); + +type Outcome = { ok: true; cst: string } | { ok: false }; +function runProc(cmd: string, args: string[], src: string): Outcome { + try { return { ok: true, cst: canon(JSON.parse(execFileSync(cmd, args, { input: src, stdio: ['pipe', 'pipe', 'pipe'] }).toString())) }; } + catch { return { ok: false }; } +} + +let failures = 0; +for (const c of CASES) { + const grammar: CstGrammar = (await import(c.path)).default; + const oracle = createParser(grammar); + const oracleOut = (src: string): Outcome => { try { return { ok: true, cst: canon(oracle.parse(src)) }; } catch { return { ok: false }; } }; + + const dir = `${TMP}/${c.grammar}`; + mkdirSync(dir, { recursive: true }); + const runners: Array<{ label: string; run: (src: string) => Outcome }> = []; + + const tsFile = `${dir}/p.ts`; + writeFileSync(tsFile, emitParser(grammar, tsTarget)); + runners.push({ label: 'typescript', run: (src) => runProc('node', [tsFile], src) }); + + if (HAS_GO && !c.tsOnly) { + const gdir = `${dir}/go`; mkdirSync(gdir, { recursive: true }); + writeFileSync(`${gdir}/main.go`, emitParser(grammar, goTarget)); + writeFileSync(`${gdir}/go.mod`, 'module p\n\ngo 1.21\n'); + execFileSync('go', ['build', '-o', `${gdir}/p`, '.'], { cwd: gdir, stdio: 'pipe' }); + runners.push({ label: 'go', run: (src) => runProc(`${gdir}/p`, [], src) }); + } + if (HAS_RUST && 
!c.tsOnly) { + const rfile = `${dir}/main.rs`; + writeFileSync(rfile, emitParser(grammar, rustTarget)); + execFileSync('rustc', ['-O', '-A', 'warnings', rfile, '-o', `${dir}/pr`], { stdio: 'pipe' }); + runners.push({ label: 'rust', run: (src) => runProc(`${dir}/pr`, [], src) }); + } + + for (const r of runners) { + let acc = 0, rej = 0; + for (const src of c.accept) { + const want = oracleOut(src), got = r.run(src); + if (want.ok && got.ok && want.cst === got.cst) { acc++; continue; } + failures++; + console.log(` ${c.grammar}/${r.label}: ACCEPT mismatch on ${JSON.stringify(src)}`); + if (want.ok && got.ok) { console.log(` want ${want.cst.slice(0, 140)}`); console.log(` got ${got.cst.slice(0, 140)}`); } + else console.log(` want.ok=${want.ok} got.ok=${got.ok}`); + } + for (const src of c.reject) { + const want = oracleOut(src), got = r.run(src); + if (!want.ok && !got.ok) { rej++; continue; } + failures++; + console.log(` ${c.grammar}/${r.label}: REJECT mismatch on ${JSON.stringify(src)} (oracle ok=${want.ok}, ${r.label} ok=${got.ok})`); + } + console.log(` ${c.grammar}/${r.label}: ${acc}/${c.accept.length} accept ≡ oracle · ${rej}/${c.reject.length} reject ≡ oracle`); + } +} + +if (failures > 0) { + console.error(`\n✗ portable targets diverge from the interpreter (${failures} case(s))`); + process.exit(1); +} +console.log('\n✓ portable parsers (ts/go/rust) derived from each grammar ≡ interpreter CST (compiled & run)'); diff --git a/test/profile-vs-peers.mjs b/test/profile-vs-peers.mjs index 421bc6a..801ecde 100644 --- a/test/profile-vs-peers.mjs +++ b/test/profile-vs-peers.mjs @@ -17,12 +17,12 @@ import { fileURLToPath } from 'node:url'; const REPO = resolve(dirname(fileURLToPath(import.meta.url)), '..'); const acorn = await import(REPO + '/node_modules/acorn/dist/acorn.mjs'); const parse5 = await import(REPO + '/node_modules/parse5/dist/index.js'); -const { emitParser } = await import(REPO + '/src/emit-parser.ts'); +const { emitParser, jsTarget } = await 
import(REPO + '/src/emit.ts'); -writeFileSync('/tmp/emitted-peers-js.mjs', emitParser((await import(REPO + '/javascript.ts')).default)); -writeFileSync('/tmp/emitted-peers-html.mjs', emitParser((await import(REPO + '/html.ts')).default)); -const monoJs = await import('/tmp/emitted-peers-js.mjs?v=' + Date.now()); -const monoHtml = await import('/tmp/emitted-peers-html.mjs?v=' + Date.now()); +writeFileSync('/tmp/emitted-peers-js.mts', emitParser((await import(REPO + '/javascript.ts')).default, jsTarget)); +writeFileSync('/tmp/emitted-peers-html.mts', emitParser((await import(REPO + '/html.ts')).default, jsTarget)); +const monoJs = await import('/tmp/emitted-peers-js.mts?v=' + Date.now()); +const monoHtml = await import('/tmp/emitted-peers-html.mts?v=' + Date.now()); function time(fn, code, name, n) { const s = process.hrtime.bigint(); diff --git a/test/profile-vs-tsc.mjs b/test/profile-vs-tsc.mjs index b668fdd..61d7382 100644 --- a/test/profile-vs-tsc.mjs +++ b/test/profile-vs-tsc.mjs @@ -11,11 +11,11 @@ import { fileURLToPath } from 'node:url'; const REPO = resolve(dirname(fileURLToPath(import.meta.url)), '..'); const ts = (await import(REPO + '/node_modules/typescript/lib/typescript.js')).default; -const { emitParser } = await import(REPO + '/src/emit-parser.ts'); +const { emitParser, jsTarget } = await import(REPO + '/src/emit.ts'); const grammar = (await import(REPO + '/typescript.ts')).default; -writeFileSync('/tmp/emitted-current.mjs', emitParser(grammar)); -const emitted = await import('/tmp/emitted-current.mjs?v=' + Date.now()); +writeFileSync('/tmp/emitted-current.mts', emitParser(grammar, jsTarget)); +const emitted = await import('/tmp/emitted-current.mts?v=' + Date.now()); const paths = [ '/tmp/ts-repo/tests/cases/conformance/parser/ecmascript5/RealWorld/parserharness.ts', diff --git a/test/recovery-conformance.ts b/test/recovery-conformance.ts index 8f1f28c..7c7a7f1 100644 --- a/test/recovery-conformance.ts +++ b/test/recovery-conformance.ts @@ -13,12 
+13,12 @@ import { writeFileSync, readFileSync } from 'node:fs'; import { readdir } from 'fs/promises'; import { join } from 'path'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import ts from 'typescript'; const grammar = (await import('../typescript.ts')).default; -const emPath = '/tmp/emitted-recovery-conf.mjs'; -writeFileSync(emPath, emitParser(grammar)); +const emPath = '/tmp/emitted-recovery-conf.mts'; +writeFileSync(emPath, emitParser(grammar, jsTarget)); type Cst = { root: number; errors: { offset: number; end: number; message: string }[] }; const em = (await import(emPath + '?v=' + process.pid)) as { createParser(): { parse(s: string): Cst } }; const p = em.createParser(); diff --git a/test/recovery.ts b/test/recovery.ts index 5e1d721..193cae8 100644 --- a/test/recovery.ts +++ b/test/recovery.ts @@ -13,12 +13,12 @@ // // node test/recovery.ts import { existsSync, readFileSync, writeFileSync } from 'node:fs'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { objectify } from './emitted-obj.ts'; const grammar = (await import('../typescript.ts')).default; -const emPath = '/tmp/emitted-recovery.mjs'; -writeFileSync(emPath, emitParser(grammar)); +const emPath = '/tmp/emitted-recovery.mts'; +writeFileSync(emPath, emitParser(grammar, jsTarget)); type Edit = { start: number; end: number; text: string }; type Diag = { offset: number; end: number; message: string; related?: { offset: number; end: number; message: string } }; type Cst = { root: number; errors: Diag[] };