From ce7c8bb4e82372ca4b23946ed4816aa2db41f22f Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Sun, 21 Jun 2026 11:34:01 +0800 Subject: [PATCH 01/27] emit: emit type-checked TypeScript (tsTarget, issue #6 first step) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit emitParser now emits a standalone TypeScript module that passes `tsc --strict --noEmit`, replacing the previously untyped JS output. This makes the emitted parser's type contract explicit and gated by construction — the monomorphic parse-state struct (Doc), the matcher/runtime signatures, the spare-buffer mirrors, and the baked op/rule tables all carry types tsc verifies for consistency. That contract is the part a future Go/Rust target must reproduce, so surfacing it now (rather than deferring it to the first non-JS target) is the de-risking first step of issue #6. The additions are erasable TypeScript only (annotations, optional params, `!` assertions) — Node runs the emitted parser by stripping types, so the runtime is unchanged. The arity-looseness the JS output relied on (calling matchers with omitted trailing diagnostic args) is replaced by explicit optional params, the one JS-ism that would not survive a typed/Go/Rust target. Gates: - new emit-tsc-gate: the emitted parser type-checks under `tsc --strict` for the soa + emitted-lexer family (typescript, javascript, typescriptreact, javascriptreact). The fallback-lexer / non-soa path (yaml, html) is logged as deferred — it carries additional untyped surface and a pre-existing latent scope reference (the non-soa editCore branch names cs/ceOld/parenCachePos that exist only in the soa branch; unreached at runtime, hence invisible until now). - emit-parser-verify unchanged: emitted CST stays byte-identical to the interpreter (109/109 in-repo + 401/401 external, 0 mismatches). - bench unchanged (~14x): type-stripping happens once at import, not per parse. 
Test harnesses that import the emitted module now write `.mts` so Node strips types on import. K_ARR/T_ARR column widths are single-sourced in analyze() so emitRuntime and emitDriver's spare buffers pick the same width. --- src/emit-lexer.ts | 68 +++---- src/emit-parser.ts | 373 +++++++++++++++++++---------------- test/check.ts | 1 + test/cst-match-totality.ts | 2 +- test/emit-lexer-verify.ts | 2 +- test/emit-parser-bench.ts | 2 +- test/emit-parser-verify.ts | 2 +- test/emit-reject-messages.ts | 2 +- test/emit-tsc-gate.ts | 72 +++++++ test/exhaustive-edits.ts | 2 +- test/head-to-head.ts | 2 +- test/incremental-grammars.ts | 4 +- test/incremental-verify.ts | 2 +- test/multi-doc.ts | 2 +- test/recovery-conformance.ts | 2 +- test/recovery.ts | 2 +- 16 files changed, 319 insertions(+), 221 deletions(-) create mode 100644 test/emit-tsc-gate.ts diff --git a/src/emit-lexer.ts b/src/emit-lexer.ts index 13e254d..ba09347 100644 --- a/src/emit-lexer.ts +++ b/src/emit-lexer.ts @@ -39,7 +39,7 @@ const resyncRetractLine = (indent: string): string => // loop, so `cc>127 && lxNonAsciiWs(cc)` is EXACTLY "the regex would match here" → byte- // identical, minus the wasted exec on the common non-whitespace case (#45 B4). const NON_ASCII_WS_FN = - `function lxNonAsciiWs(cc) { return cc === 0xa0 || cc === 0x1680 || (cc >= 0x2000 && cc <= 0x200a) || cc === 0x2028 || cc === 0x2029 || cc === 0x202f || cc === 0x205f || cc === 0x3000 || cc === 0xfeff; }`; + `function lxNonAsciiWs(cc: number) { return cc === 0xa0 || cc === 0x1680 || (cc >= 0x2000 && cc <= 0x200a) || cc === 0x2028 || cc === 0x2029 || cc === 0x202f || cc === 0x205f || cc === 0x3000 || cc === 0xfeff; }`; // The non-ASCII whitespace fallback, emitted at the two sites that need it (after an ASCII run, // and as the lead char). `cont` appends the `continue` the lead-char site needs. 
const nonAsciiWsConsume = (v: string, cont: boolean, indent: string): string => @@ -134,22 +134,22 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`// min paren depth recorded over the old suffix [j, altN) (pop-on-empty = -1),`); emit(`// built lazily once per edit (the caller nulls it when the alt stream changes).`); emit(`let lexResyncPd = 0;`); - emit(`let altSuffMin = null;`); - emit(`let altSuffMinBuf = null;`); + emit(`let altSuffMin: Int32Array | null = null;`); + emit(`let altSuffMinBuf: Int32Array | null = null;`); emit(`// ')' pops that found an empty stack, in THIS lexCore call's token indices`); - emit(`let lexEmptyPops = [];`); + emit(`let lexEmptyPops: number[] = [];`); emit(`// Min OLD-stream paren depth over the tokens inside the damage itself (set by the`); emit(`// caller before the window lex): the old-side trajectory min starts from here.`); emit(`let wndOldMin0 = 0x7fffffff;`); - emit(`function buildAltSuffMin(lo) {`); + emit(`function buildAltSuffMin(lo: number) {`); emit(` if (altSuffMinBuf === null || altSuffMinBuf.length < altN + 1) altSuffMinBuf = new Int32Array(altN + 1025);`); emit(` altSuffMin = altSuffMinBuf;`); - emit(` altSuffMin[altN] = 0x7fffffff;`); + emit(` altSuffMin![altN] = 0x7fffffff;`); emit(` for (let j = altN - 1; j >= lo; j--) {`); - emit(` let d = altPd[j];`); - emit(` if (d === 0 && altK[j] === K_PUNCT && altT[j] === ${tOf(')')} && (j === 0 || altPd[j - 1] === 0)) d = -1;`); - emit(` const nx = altSuffMin[j + 1];`); - emit(` altSuffMin[j] = d < nx ? d : nx;`); + emit(` let d = altPd![j];`); + emit(` if (d === 0 && altK![j] === K_PUNCT && altT![j] === ${tOf(')')} && (j === 0 || altPd![j - 1] === 0)) d = -1;`); + emit(` const nx = altSuffMin![j + 1];`); + emit(` altSuffMin![j] = d < nx ? 
d : nx;`); emit(` }`); emit(`}`); emit(`const LX_UNI_IDENT = /[$_\\p{ID_Start}][$\\u200c\\u200d\\p{ID_Continue}]*/uy;`); @@ -175,7 +175,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { // Length window → first-charCode switch → per-keyword compare chains (shortest first); // returns exactly what LIT_KW.get(source.slice(a, b)) ?? 0 would — the keyword set is // enumerated completely and keywords are pure ASCII, so charCode compares are exact. - emit(`function lexKwT(source, a, b) {`); + emit(`function lexKwT(source: string, a: number, b: number) {`); const kwEntries = [...st.kwLitKind.entries()]; if (kwEntries.length === 0) { emit(` return 0;`); @@ -205,11 +205,11 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { } emit(`}`); // identTextValid, with the per-token prefix length baked at the call site. - emit(`function lexIdentValid(text, prefixLen) {`); + emit(`function lexIdentValid(text: string, prefixLen: number) {`); emit(` const body = prefixLen > 0 ? text.slice(prefixLen) : text;`); emit(` if (!body.includes('\\\\')) return true;`); emit(` let bad = false;`); - emit(` const decoded = body.replace(LX_DECODE_ESC, (_m, braced, fixed) => {`); + emit(` const decoded = body.replace(LX_DECODE_ESC, (_m: string, braced: string, fixed: string) => {`); emit(` const cp = parseInt(braced ?? 
fixed, 16);`); emit(` if (cp > 0x10FFFF) { bad = true; return ''; }`); emit(` return String.fromCodePoint(cp);`); @@ -219,7 +219,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` return m !== null && m[0].length === decoded.length;`); emit(`}`); if (templateToken) { - emit(`function lexTplSpan(source, pos, validateEscapes) {`); + emit(`function lexTplSpan(source: string, pos: number, validateEscapes: boolean) {`); emit(` const tplFrom = pos;`); emit(` while (pos < source.length) {`); emit(` if (${startsWithExpr('source', 'pos', tplInterpOpen)}) return { endsWithInterp: true, end: pos + ${tplInterpOpen.length} };`); @@ -256,7 +256,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { // — no per-token object, no text slice: text is materialized from the source span only // when a CST leaf is built. Flag bits: 1 = newlineBefore (the only stamp this emitted // lexer ever sets; comment/multilineFlow stamps belong to fallback-only grammars). 
- emit(`function tokenize(source) {`); + emit(`function tokenize(source: string) {`); emit(` docPieces = [source]; docPieceOff = [0]; docLen = source.length;`); emit(` docFlat = source; docCur = 0;`); emit(` tokN = 0;`); @@ -281,7 +281,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`// old token (same k/t, offsets shifted by wndDelta, both depth records 0) while`); emit(`// the window's own stacks are empty — returns that OLD index (the duplicate push`); emit(`// is retracted), or -1 when lexing ran to EOF.`); - emit(`function lexCore(source, startPos, pvK, pvT, wndPtr0, wndMinOff, wndDelta, wndCs, initParens, srcBase, hasMore) {`); + emit(`function lexCore(source: string, startPos: number, pvK: number, pvT: number, wndPtr0: number, wndMinOff: number, wndDelta: number, wndCs?: number, initParens?: boolean[] | null, srcBase?: number, hasMore?: boolean) {`); emit(` if (srcBase === undefined) srcBase = 0;`); emit(` lexWindowMore = hasMore === true;`); emit(` lexSrcBase = srcBase;`); @@ -291,7 +291,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` let extraFl = 0;`); emit(` let lastBangWasPostfix = false;`); emit(` let lastCloseWasParenHead = false;`); - emit(` const templateStack = [];`); + emit(` const templateStack: number[] = [];`); emit(` const parenHeadStack = initParens !== undefined && initParens !== null ? initParens : [];`); emit(` let wndPtr = wndPtr0;`); emit(` let wndHit = -1;`); @@ -301,8 +301,8 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` // tokens and stack ops). An entry at depth <= BOTH mins was open at the`); emit(` // divergence point in both lexes - i.e. 
it is the SAME entry.`); emit(` let dmgMinOld = wndOldMin0, dmgMinNew = -1;`); - emit(` function tkPush(k, t, off, end) {`); - emit(` off += srcBase; end += srcBase;`); + emit(` function tkPush(k: number, t: number, off: number, end: number) {`); + emit(` off += srcBase!; end += srcBase!;`); emit(` if (tokN === tkCap) growTok();`); emit(` tkK[tokN] = k; tkT[tokN] = t; tkOff[tokN] = off; tkEnd[tokN] = end;`); emit(` tkFl[tokN] = (pendingNl ? 1 : 0) | extraFl;`); @@ -331,20 +331,20 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` // adopted tkPd column by lexResyncPd to the new truth.`); emit(` if (wndPtr >= 0) {`); emit(` const pd = tkPd[tokN - 1];`); - emit(` if (dmgMinNew < 0) { if (off >= wndCs) dmgMinNew = pd; }`); + emit(` if (dmgMinNew < 0) { if (off >= wndCs!) dmgMinNew = pd; }`); emit(` else if (pd < dmgMinNew) dmgMinNew = pd;`); emit(` if (off >= wndMinOff) {`); - emit(` while (wndPtr < altN && (altOff[wndPtr] < 0 ? altOff[wndPtr] + srcLenP1 : altOff[wndPtr]) + wndDelta < off) { if (altPd[wndPtr] < dmgMinOld) dmgMinOld = altPd[wndPtr]; wndPtr++; }`); - emit(` if (wndPtr < altN && (altOff[wndPtr] < 0 ? altOff[wndPtr] + srcLenP1 : altOff[wndPtr]) + wndDelta === off && altK[wndPtr] === k && altT[wndPtr] === t`); - emit(` && (altEnd[wndPtr] < 0 ? altEnd[wndPtr] + srcLenP1 : altEnd[wndPtr]) + wndDelta === end`); + emit(` while (wndPtr < altN && (altOff![wndPtr] < 0 ? altOff![wndPtr] + srcLenP1 : altOff![wndPtr]) + wndDelta < off) { if (altPd![wndPtr] < dmgMinOld) dmgMinOld = altPd![wndPtr]; wndPtr++; }`); + emit(` if (wndPtr < altN && (altOff![wndPtr] < 0 ? altOff![wndPtr] + srcLenP1 : altOff![wndPtr]) + wndDelta === off && altK![wndPtr] === k && altT![wndPtr] === t`); + emit(` && (altEnd![wndPtr] < 0 ? 
altEnd![wndPtr] + srcLenP1 : altEnd![wndPtr]) + wndDelta === end`); emit(` // the candidate's LEADING-TRIVIA flags must match too: the gap before`); emit(` // it may sit inside the edit (newline removed/added without moving any`); emit(` // token bytes), and parsers read these flags (sameLine / commentBefore)`); - emit(` && altFl[wndPtr] === tkFl[tokN - 1]`); - emit(` && templateStack.length === 0 && altDp[wndPtr] === 0`); + emit(` && altFl![wndPtr] === tkFl[tokN - 1]`); + emit(` && templateStack.length === 0 && altDp![wndPtr] === 0`); emit(` && LX_PFXV[t] === 0 && LX_PARENKW[t] === 0`); emit(` && !(k === K_PUNCT && (t === ${tLParen} || t === ${tRParen}))) {`); - emit(` const q = altPd[wndPtr];`); + emit(` const q = altPd![wndPtr];`); emit(` if (q < dmgMinOld) dmgMinOld = q;`); emit(` if (q === pd && pd <= dmgMinOld && pd <= dmgMinNew) {`); emit(` wndHit = wndPtr;`); @@ -358,7 +358,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(` okTail = docEmptyPops.length === 0 || docEmptyPops[docEmptyPops.length - 1] <= wndPtr;`); emit(` } else {`); emit(` if (altSuffMin === null) buildAltSuffMin(wndPtr0);`); - emit(` okTail = altSuffMin[wndPtr + 1] >= q;`); + emit(` okTail = altSuffMin![wndPtr + 1] >= q;`); emit(` }`); emit(` if (okTail) {`); emit(` wndHit = wndPtr;`); @@ -495,7 +495,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`${ind} extraFl = _ph ? 
8 : 0; }`); } else if (lit === ')') { emit(`${ind}if (parenHeadStack.length === 0) { lastCloseWasParenHead = false; lexEmptyPops.push(tokN); }`); - emit(`${ind}else lastCloseWasParenHead = parenHeadStack.pop();`); + emit(`${ind}else lastCloseWasParenHead = parenHeadStack.pop()!;`); } if (regexCtx?.postfixAfterValueTexts?.includes(lit)) { emit(`${ind}lastBangWasPostfix = prevIsValue();`); @@ -635,7 +635,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`// lexer flag live (a control-head ')' or a postfix-ambiguous operator would`); emit(`// make the next token's regex-context depend on unrecoverable state). -1 = file`); emit(`// head (always sound, degrades to a full re-lex).`); - emit(`function findRestart(cs) {`); + emit(`function findRestart(cs: number) {`); emit(` let lo = 0, hi = tokN;`); // STRICTLY before the damage: a token ENDING exactly at cs can be EXTENDED by // the edit under maximal munch ('b' + inserted 'x' = 'bx'; '=' + '=' = '=='; @@ -658,9 +658,9 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`// openers at that depth are re-opened later, and the re-opener comes first`); emit(`// backward). The '(' records its depth INCLUDING itself, and carries its`); emit(`// control-head-ness as tkFl bit 8.`); - emit(`function reconstructParens(b) {`); + emit(`function reconstructParens(b: number) {`); emit(` let need = b >= 0 ? 
tkPd[b] : 0;`); - emit(` const out = new Array(need);`); + emit(` const out: boolean[] = new Array(need);`); emit(` for (let i = b; i >= 0 && need > 0; i--) {`); emit(` if (tkK[i] === 1 && tkT[i] === ${tOf('(')} && tkPd[i] === need) { out[need - 1] = (tkFl[i] & 8) !== 0; need--; }`); emit(` }`); @@ -673,9 +673,9 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { emit(`// are splice-stable (every splice begins past its own anchor), so the baseline`); emit(`// stays exact; a backward jump (b < cached) falls back to the full scan.`); emit(`let parenCachePos = -1;`); - emit(`let parenCacheStack = [];`); - emit(`function reconstructParensCached(b) {`); - emit(` let stack;`); + emit(`let parenCacheStack: boolean[] = [];`); + emit(`function reconstructParensCached(b: number) {`); + emit(` let stack: boolean[];`); emit(` if (b < 0) stack = [];`); emit(` else if (parenCachePos >= 0 && parenCachePos <= b) {`); emit(` stack = parenCacheStack;`); diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 68923f3..7bd889b 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -346,12 +346,19 @@ function analyze(grammar: CstGrammar) { typeKind, kwLitKind, puLitKind, classifyKey, }; + // Column element types: Uint8 when the kind/literal id spaces fit a byte (the SoA + // token columns and their spare-buffer mirrors). Single-sourced here so every emit + // function — emitRuntime's `let tk* = new …`, emitDriver's `let alt* …` — agrees. + const tMaxT = Math.max(1, ...kwLitKind.values(), ...puLitKind.values()); + const kArr = KIND_NAMED_FALLBACK <= 255 ? 'Uint8Array' : 'Uint16Array'; + const tArr = tMaxT <= 255 ? 
'Uint8Array' : 'Uint16Array'; + return { grammar, tokenNames, opTable, prefixOps, noUnaryLhsOps, postfixOpValues, requireTargetOps, binaryConnectors, prattRules, leftRecSet, ruleByName, prattClassified, leftRecClassified, maxBp, templateTokenName, templateTokenNames, firstTokenOf, altDeepFirst, altNullable, altSecond, ledMeta, contMeta, nudCap, nullableRules, firstSets, symtab, qualKeys, - exprFirst, exprNullable, + exprFirst, exprNullable, kArr, tArr, }; } @@ -865,7 +872,7 @@ class Emitter { if (!nm) { nm = `_q${this.memberFns.size}`; this.memberFns.set(fnKey, nm); - this.helperDefs.push(`function ${nm}(i) { return i >= cap || (${kArr}[tkK[i]] | ${tArr}[tkT[i]]) !== 0; }`); + this.helperDefs.push(`function ${nm}(i: number) { return i >= cap || (${kArr}[tkK[i]] | ${tArr}[tkT[i]]) !== 0; }`); } return nm; } @@ -1052,7 +1059,7 @@ class Emitter { let nm = this.u8Consts.get(key); if (!nm) { if (!this.u8Emitted) { - this.helperDefs.push(`function u8(n, ones) { const a = new Uint8Array(n); for (let i = 0; i < ones.length; i++) a[ones[i]] = 1; return a; }`); + this.helperDefs.push(`function u8(n: number, ones: number[]) { const a = new Uint8Array(n); for (let i = 0; i < ones.length; i++) a[ones[i]] = 1; return a; }`); this.u8Emitted = true; } nm = `_qb${this.u8Consts.size}`; @@ -1178,8 +1185,9 @@ export function emitParser(grammar: CstGrammar): string { } return arr; }; - e.emit(`const OP_BY_T = ${J(byT(a.opTable))};`); - e.emit(`const PREFIX_BY_T = ${J(byT(a.prefixOps))};`); + e.emit(`type OpInfo = { lbp: number; rbp: number; assoc: string; position: string; requireTarget?: boolean };`); + e.emit(`const OP_BY_T: (OpInfo | null)[] = ${J(byT(a.opTable))};`); + e.emit(`const PREFIX_BY_T: (OpInfo | null)[] = ${J(byT(a.prefixOps))};`); } e.emit(`const noUnaryLhsOps = new Set(${J([...a.noUnaryLhsOps])});`); { @@ -1213,7 +1221,7 @@ export function emitParser(grammar: CstGrammar): string { // `++x`) — head kid is an operator-tag leaf in prefixOps — or a postfix-update (`x++`) — 
// tail kid is an operator-tag leaf in postfixOpValues. A parenthesized cover / member / // element / call / non-null tail has no operator-tag leaf at head or tail, so it passes. - e.emit(`function _notTarget(lhs) {`); + e.emit(`function _notTarget(lhs: number) {`); e.emit(` const n = rowCount[lhs]; if (n === 0) return false;`); e.emit(` const cs = rowStart[lhs];`); e.emit(` const _h = kids[cs];`); @@ -1238,7 +1246,7 @@ export function emitParser(grammar: CstGrammar): string { // nodes). Drives the notLeftLeaf LED gate: a node whose head leaf text is in the arm's word set // (e.g. `void`/`null`/`this` for the type `.` qualification) is not a valid LEFT operand of the // arm. A childless ($missing recovery) node returns '' (matches no word → the arm is not blocked). - e.emit(`function _headLeafText(id) {`); + e.emit(`function _headLeafText(id: number) {`); e.emit(` while (rowCount[id] > 0) {`); e.emit(` const _hh = kids[rowStart[id]];`); e.emit(` if (_hh >= 0) { id = _hh; continue; }`); @@ -1311,13 +1319,10 @@ function resolveLexerImport(): string { return pathResolve(__dir, 'gen-lexer.ts' // ONLY change: where the interpreter called matchExpr(alt)/matchSeq(items) per arm, // these call the GENERATED per-arm matcher functions (installed via the rule fns). function emitRuntime(e: Emitter) { - // Column element type: Uint8 when the kind/literal id spaces fit a byte. - const st = e.a.symtab; - let tMax = 1; - for (const v of st.kwLitKind.values()) tMax = Math.max(tMax, v); - for (const v of st.puLitKind.values()) tMax = Math.max(tMax, v); - const K_ARR = st.KIND_NAMED_FALLBACK <= 255 ? 'Uint8Array' : 'Uint16Array'; - const T_ARR = tMax <= 255 ? 'Uint8Array' : 'Uint16Array'; + // Column element type: Uint8 when the kind/literal id spaces fit a byte (single- + // sourced in analyze() so emitDriver's spare-buffer mirrors pick the same width). 
+ const K_ARR = e.a.kArr; + const T_ARR = e.a.tArr; e.emit(String.raw` // ── Token stream: struct-of-arrays (no per-token object, no eager text) ── // tkK = type kind, tkT = literal kind, tkOff/tkEnd = source span, tkFl = stamp bits @@ -1345,14 +1350,14 @@ let tokN = 0; // joined form for the cold paths that need one (errors, debug views); batch parses // set it directly. Reads route through docChar/docText: flat fast path, piece // lookup (cursor-cached) otherwise. -let docPieces = null; -let docPieceOff = null; +let docPieces: string[] | null = null; +let docPieceOff: number[] | null = null; let docLen = 0; -let docFlat = null; +let docFlat: string | null = null; let docCur = 0; -function docLocate(i) { +function docLocate(i: number) { let k = docCur; - const po = docPieceOff; + const po = docPieceOff!; const n = po.length; if (k >= n || po[k] > i || (k + 1 < n && po[k + 1] <= i)) { let lo = 0, hi = n; @@ -1362,57 +1367,57 @@ function docLocate(i) { } return k; } -function docChar(i) { +function docChar(i: number) { if (docFlat !== null) return docFlat.charCodeAt(i); const k = docLocate(i); - return docPieces[k].charCodeAt(i - docPieceOff[k]); + return docPieces![k].charCodeAt(i - docPieceOff![k]); } -function docText(a, b) { +function docText(a: number, b: number) { if (docFlat !== null) return docFlat.slice(a, b); if (b <= a) return ''; let k = docLocate(a); - const first = docPieces[k]; - const lo = a - docPieceOff[k]; - if (b - docPieceOff[k] <= first.length) return first.slice(lo, b - docPieceOff[k]); + const first = docPieces![k]; + const lo = a - docPieceOff![k]; + if (b - docPieceOff![k] <= first.length) return first.slice(lo, b - docPieceOff![k]); let out = first.slice(lo); k++; - while (k < docPieces.length && docPieceOff[k] < b) { - const piece = docPieces[k]; - const need = b - docPieceOff[k]; + while (k < docPieces!.length && docPieceOff![k] < b) { + const piece = docPieces![k]; + const need = b - docPieceOff![k]; out += need >= piece.length ? 
piece : piece.slice(0, need); k++; } return out; } function flattenDoc() { - if (docFlat === null) docFlat = docPieces.join(''); + if (docFlat === null) docFlat = docPieces!.join(''); return docFlat; } -function applyChange(start, end, text) { +function applyChange(start: number, end: number, text: string) { const ks = docLocate(start); const ke = docLocate(end > start ? end - 1 : start); - const head = docPieces[ks].slice(0, start - docPieceOff[ks]); - const tailPiece = end > start ? docPieces[ke] : docPieces[ks]; - const tailOff = end - docPieceOff[end > start ? ke : ks]; + const head = docPieces![ks].slice(0, start - docPieceOff![ks]); + const tailPiece = end > start ? docPieces![ke] : docPieces![ks]; + const tailOff = end - docPieceOff![end > start ? ke : ks]; const tail = tailPiece.slice(tailOff); const repl = []; if (head.length > 0) repl.push(head); if (text.length > 0) repl.push(text); if (tail.length > 0) repl.push(tail); - docPieces.splice(ks, (end > start ? ke : ks) - ks + 1, ...repl); + docPieces!.splice(ks, (end > start ? ke : ks) - ks + 1, ...repl); // consolidate when fragmenting (amortized: a join every ≥256 edits) - if (docPieces.length > 256) { - docPieces = [docPieces.join('')]; + if (docPieces!.length > 256) { + docPieces = [docPieces!.join('')]; } docLen += text.length - (end - start); // rebuild offsets from the splice point (suffix offsets shifted anyway) - if (docPieceOff.length !== docPieces.length) docPieceOff.length = docPieces.length; - let off = ks > 0 && ks - 1 < docPieces.length ? docPieceOff[ks - 1] + docPieces[ks - 1].length : 0; - for (let k2 = ks > 0 ? ks : 0; k2 < docPieces.length; k2++) { - docPieceOff[k2] = off; - off += docPieces[k2].length; + if (docPieceOff!.length !== docPieces!.length) docPieceOff!.length = docPieces!.length; + let off = ks > 0 && ks - 1 < docPieces!.length ? docPieceOff![ks - 1] + docPieces![ks - 1].length : 0; + for (let k2 = ks > 0 ? 
ks : 0; k2 < docPieces!.length; k2++) { + docPieceOff![k2] = off; + off += docPieces![k2].length; } - if (docPieces.length === 1) docPieceOff[0] = 0; + if (docPieces!.length === 1) docPieceOff![0] = 0; docCur = 0; docFlat = null; } @@ -1425,8 +1430,8 @@ function applyChange(start, end, text) { // parses are all-positive and the decode branch never fires. let srcLenP1 = 1; let negFrom = 0x7fffffff; -function toff(i) { const v = tkOff[i]; return v < 0 ? v + srcLenP1 : v; } -function tend(i) { const v = tkEnd[i]; return v < 0 ? v + srcLenP1 : v; } +function toff(i: number) { const v = tkOff[i]; return v < 0 ? v + srcLenP1 : v; } +function tend(i: number) { const v = tkEnd[i]; return v < 0 ? v + srcLenP1 : v; } ${e.soa ? '' : 'let tkText = []; // fallback-lexer text column (synthetic tokens are not source spans)'} function growTok() { tkCap *= 2; @@ -1483,8 +1488,8 @@ let rowNF = new Int32Array(8192).fill(0x7fffffff); // 'succeed' over broken text and wipe its diagnostics). Recovering passes adopt // these rows freely. let rowRM = new Uint8Array(8192); -function ktr(p, k) { const v = kidTokRel[k]; return v < 0 ? v + rowTokLen[p] + 1 : v; } -function kcr(p, k) { const v = kidRel[k]; return v < 0 ? v + rowLen[p] + 1 : v; } +function ktr(p: number, k: number) { const v = kidTokRel[k]; return v < 0 ? v + rowTokLen[p] + 1 : v; } +function kcr(p: number, k: number) { const v = kidRel[k]; return v < 0 ? v + rowLen[p] + 1 : v; } // transient BUILD coordinates (absolute), valid for rows completed in the current // parse and REFRESHED at memo-hit time for reused roots — parents read them at // finishNode to write the children's relative fields; never part of the green tree. 
@@ -1531,24 +1536,24 @@ function growRows() { const ac = new Int32Array(rowCap); ac.set(absChar); absChar = ac; const at = new Int32Array(rowCap); at.set(absTok); absTok = at; } -function growKids(n) { +function growKids(n: number) { while (kidN + n > kidCap) kidCap *= 2; const k = new Int32Array(kidCap); k.set(kids.subarray(0, kidN)); kids = k; const r = new Int32Array(kidCap); r.set(kidRel.subarray(0, kidN)); kidRel = r; const t = new Int32Array(kidCap); t.set(kidTokRel.subarray(0, kidN)); kidTokRel = t; } -function scPush(e) { +function scPush(e: number) { if (scn === scCap) { scCap *= 2; const s = new Int32Array(scCap); s.set(sc); sc = s; } sc[scn++] = e; } -function entryOff(e) { return e >= 0 ? absChar[e] : toff((~e) >>> 2); } -function entryEnd(e) { return e >= 0 ? absChar[e] + rowLen[e] : tend((~e) >>> 2); } -function entryTok(e) { return e >= 0 ? absTok[e] : (~e) >>> 2; } -function entryTokEnd(e) { return e >= 0 ? absTok[e] + rowTokLen[e] : ((~e) >>> 2) + 1; } +function entryOff(e: number) { return e >= 0 ? absChar[e] : toff((~e) >>> 2); } +function entryEnd(e: number) { return e >= 0 ? absChar[e] + rowLen[e] : tend((~e) >>> 2); } +function entryTok(e: number) { return e >= 0 ? absTok[e] : (~e) >>> 2; } +function entryTokEnd(e: number) { return e >= 0 ? absTok[e] + rowTokLen[e] : ((~e) >>> 2) + 1; } // Complete a node whose children are scratch[mark..scn): copy them into kids, write // the row, truncate scratch, return the id. Empty children = a zero-width node // at the current token (the old offset() rule). -function finishNode(rid, mark) { +function finishNode(rid: number, mark: number) { const n = scn - mark; if (nodeN === rowCap) growRows(); const id = nodeN++; @@ -1607,7 +1612,7 @@ function finishNode(rid, mark) { return id; } // Complete a LED/continuation wrap: children = [lhs, ...scratch[mark..scn)]. 
-function finishWrap(rid, lhsId, mark) { +function finishWrap(rid: number, lhsId: number, mark: number) { const n = scn - mark; if (nodeN === rowCap) growRows(); const id = nodeN++; @@ -1675,22 +1680,22 @@ let _prattCapped = false; // be identical between a fresh parse and an adoption re-run. frameMax <= maxPos // always, so the hot advance pays one extra compare only at frontier breaches. let frameMax = 0; -let memoNode = []; -let memoEnd = []; -let memoExt = []; // per-entry lookahead extent (see parseRuleEntry) +let memoNode: number[][] = []; +let memoEnd: number[][] = []; +let memoExt: number[][] = []; // per-entry lookahead extent (see parseRuleEntry) // GENERATION-STAMPED memo: the per-rule arrays persist across parses (allocating // fresh multi-million-slot arrays per edit cost ~30% of a large-file edit in GC // alone); an entry is live iff its stamp equals the current generation — bumping // memoGenCur IS the whole reset. -let memoGen = []; +let memoGen: Int32Array[] = []; let memoGenCur = 0; let parseLimit = -1; // cap = the exclusive lookahead bound: min(parseLimit-or-∞, tokN), maintained at the // parseLimit set/restore sites and the one token-stream mutation (the '>' splice). let cap = 0; -let currentPrattContext = null; -let suppressNext = null; -let suppressCur = null; +let currentPrattContext: string | null = null; +let suppressNext: Set | null = null; +let suppressCur: Set | null = null; function offset() { if (pos < cap) return toff(pos); @@ -1703,7 +1708,7 @@ function offset() { // Keyword literal: the interpreter required tok.type !== '' && tokenNames.has(tok.type) // && tok.text === value. With interned kinds that is tok.k >= K_NAMED_MIN (a declared // token name; '' is PUNCT, templates are below NAMED_MIN) && tok.t === KW(value). 
-function matchKwLit(kw, vs) { +function matchKwLit(kw: number, vs?: number) { // A kw-range t can only come from a named token (template spans never intern to a // keyword), so the old k >= K_NAMED_MIN guard was redundant — one int compare. // vs (optional) = the call site's viable-set id, threaded into the $missing row. @@ -1715,7 +1720,7 @@ function matchKwLit(kw, vs) { // Punct literal: tok.type === '' && tok.text === value, with the gt-splice fallback. // tok.t === PU(value) is the exact-text fast path; the splice handles a longer // gt-led token matching the gt key. value/pu are baked by the caller. -function matchPuLit(pu, vs) { +function matchPuLit(pu: number, vs?: number) { // A pu-range t can only come from a punct token, so the old k === K_PUNCT guard was // redundant — one int compare. The '>'-split lives only in matchPuLitGT ('>' sites). if (pos >= cap || tkT[pos] !== pu) return recovering ? missTok(pu, vs) : false; @@ -1723,7 +1728,7 @@ function matchPuLit(pu, vs) { if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } return true; } -function matchPuLitGT(pu, vs) { +function matchPuLitGT(pu: number, vs?: number) { if (pos >= cap) return false; const off = toff(pos); if (tkT[pos] === pu) { @@ -1783,7 +1788,7 @@ function matchPuLitGT(pu, vs) { } // Generic matchLiteral kept for any unspecialized site: classify value via the baked // tables (no per-call isKeywordLiteral / string compares) and delegate. -function matchLiteral(value) { +function matchLiteral(value: string) { const kw = LIT_KW.get(value); if (kw !== undefined) return matchKwLit(kw); if (value === '>') return matchPuLitGT(LIT_PU.get(value) ?? 0); @@ -1793,7 +1798,7 @@ function matchLiteral(value) { // Match a token ref by its baked TYPE kind: tok.type === name ⟺ tok.k === nameKind. // (No named-token kind equals K_NAMED_FALLBACK, so an unforeseen type never matches.) // The materialized tokenType is type-derived (kind 0) — name needs no baking here. 
-function matchTokK(nameKind) { +function matchTokK(nameKind: number) { if (pos >= cap || tkK[pos] !== nameKind) return recovering ? missTok(-nameKind) : false; scPush(~(pos << 2)); if (++pos > frameMax) { frameMax = pos; if (pos > maxPos) maxPos = pos; } @@ -1858,7 +1863,7 @@ function emitRuleFns(e: Emitter, a: ReturnType) { else emitNonRecRule(e, a, rule, spine.has(rule.name) && !a.prattRules.has(rule.name) && !a.leftRecSet.has(rule.name)); } // Dispatch table (string rule name → fn), for parseTemplateExpr's dynamic interp rule. - e.emit(`const RULES = {`); + e.emit(`const RULES: Record boolean> = {`); for (const rule of a.grammar.rules) e.emit(` ${J(rule.name)}: ${ruleFn(rule.name)},`); e.emit(`};`); @@ -1954,7 +1959,7 @@ function emitNonRecRule(e: Emitter, a: ReturnType, rule: RuleDec // pratt/left-rec rules. if (memoized) { e.emit(`function ${ruleFn}() { return parseRuleEntry(${e.memoIndex(rule.name)}, ${rid}, ${J(rule.name)}, ${ruleFn}_core); }`); - e.emit(`function ${ruleFn}_core(_minBp) {`); + e.emit(`function ${ruleFn}_core(_minBp: number) {`); } else { e.emit(`function ${ruleFn}() {`); } @@ -2000,7 +2005,7 @@ function emitLeftRecRule(e: Emitter, a: ReturnType, rule: RuleDe contNotLeftLeaf.forEach((words, i) => { if (words) e.emit(`const _NLLC_${sn}_${i} = new Set(${J(words)});`); }); - e.emit(`function ${ruleFn}_lr(_minBp) {`); + e.emit(`function ${ruleFn}_lr(_minBp: number) {`); e.emit(` const saved = pos; const mark = scn;`); e.emit(` let node = -1; let bestAtomPos = saved;`); const atomDispatch = e.altMaskDispatch(atoms, '_am'); @@ -2065,7 +2070,7 @@ function emitPrattRule(e: Emitter, a: ReturnType, rule: RuleDecl meta.notLeftLeaf.forEach((words, i) => { if (words) e.emit(`const _NLL_${sn}_${i} = new Set(${J(words)});`); }); - e.emit(`function ${ruleFn}_pratt(minBp) {`); + e.emit(`function ${ruleFn}_pratt(minBp: number) {`); e.emit(` const saved = pos; const mark = scn;`); e.emit(` let lhs = -1; let bestNudPos = saved;`); // `capped` becomes true iff 
the winning NUD is a capped (assignment-level) expression — @@ -2322,7 +2327,7 @@ function emitDriver(e: Emitter, a: ReturnType, entry: string) { // and SECOND-token reads past it. Left-to-right parsing keeps the watermark near the // current frontier, so the value is tight on the dominant flow and only OVER- // invalidates (soundly) near big-backtrack clusters. -function parseRuleEntry(idx, rid, name, core) { +function parseRuleEntry(idx: number, rid: number, name: string, core: (minBp: number) => number) { const mySup = suppressNext; suppressNext = null; const capped = parseLimit >= 0; @@ -2499,14 +2504,14 @@ function parseRuleEntry(idx, rid, name, core) { } // Token text at an arbitrary index (cold paths: errors, the tokenAt debug view). -function tokTextAt(i) { +function tokTextAt(i: number) { return ${e.soa ? 'docText(toff(i), tend(i))' : 'tkText[i]'}; } // The k → type-name inverse, for reconstructing a token object (tokenAt). -const K_NAMES = []; +const K_NAMES: string[] = []; for (const [n, k] of TYPE_KIND) K_NAMES[k] = n; // A per-token object view over the columns (gates / debugging — the parser never builds these). -export function tokenAt(i) { +export function tokenAt(i: number) { return { type: K_NAMES[tkK[i]] ?? '', text: tokTextAt(i), @@ -2524,7 +2529,7 @@ export function tokenAt(i) { // The arena IS the tree: parse() returns the root node id and consumers traverse // via visit()/the accessors — nothing is materialized on the parse path. All views // are valid until the NEXT parse (the columns are reused). -function leafTokenType(entry, tokBase) { +function leafTokenType(entry: number, tokBase: number) { const tok = tokBase + ((~entry) >>> 2); const kind = (~entry) & 3; return kind === 1 ? '$keyword' @@ -2539,36 +2544,36 @@ function leafTokenType(entry, tokBase) { // — the node's own absolute start coordinates. Leaf spans come from the token // columns at tokBase + the entry's node-relative token index. 
export const tree = { - ruleNameOf: (id) => RULE_DISPLAY[rowRule[id]], - ruleIdOf: (id) => rowRule[id], - lenOf: (id) => rowLen[id], - tokLenOf: (id) => rowTokLen[id], + ruleNameOf: (id: number) => RULE_DISPLAY[rowRule[id]], + ruleIdOf: (id: number) => rowRule[id], + lenOf: (id: number) => rowLen[id], + tokLenOf: (id: number) => rowTokLen[id], // a node CHILD's relative coordinates live on the parent edge (kids-parallel) - childRelAt: (id, i) => kcr(id, rowStart[id] + i), - childTokRelAt: (id, i) => ktr(id, rowStart[id] + i), + childRelAt: (id: number, i: number) => kcr(id, rowStart[id] + i), + childTokRelAt: (id: number, i: number) => ktr(id, rowStart[id] + i), // base-threaded spans: nodes from their bases, leaves from the token columns - offsetOf: (entry, charBase, tokBase) => entry >= 0 ? charBase : toff(tokBase + ((~entry) >>> 2)), - endOf: (entry, charBase, tokBase) => entry >= 0 ? charBase + rowLen[entry] : tend(tokBase + ((~entry) >>> 2)), - childCount: (id) => rowCount[id], - childAt: (id, i) => kids[rowStart[id] + i], + offsetOf: (entry: number, charBase: number, tokBase: number) => entry >= 0 ? charBase : toff(tokBase + ((~entry) >>> 2)), + endOf: (entry: number, charBase: number, tokBase: number) => entry >= 0 ? charBase + rowLen[entry] : tend(tokBase + ((~entry) >>> 2)), + childCount: (id: number) => rowCount[id], + childAt: (id: number, i: number) => kids[rowStart[id] + i], // Bulk child load into a caller-owned array; returns the count. One call per node // instead of childCount+childAt-per-probe (the generated matchers' hot path). 
- childrenInto: (id, out2) => { + childrenInto: (id: number, out2: number[]) => { const n2 = rowCount[id]; const cs2 = rowStart[id]; for (let i2 = 0; i2 < n2; i2++) out2[i2] = kids[cs2 + i2]; return n2; }, - isLeaf: (entry) => entry < 0, - leafToken: (entry, tokBase) => tokBase + ((~entry) >>> 2), + isLeaf: (entry: number) => entry < 0, + leafToken: (entry: number, tokBase: number) => tokBase + ((~entry) >>> 2), leafTokenType, // Int-world leaf accessors (the match-path encoding): kind bits — 0 type-derived, // 1 '$keyword', 2 '$operator' — and the token's TYPE kind int (1 = punctuation). - leafKindOf: (entry) => (~entry) & 3, - leafTokKindOf: (entry, tokBase) => tkK[tokBase + ((~entry) >>> 2)], - leafOffsetOf: (entry, tokBase) => toff(tokBase + ((~entry) >>> 2)), - leafEndOf: (entry, tokBase) => tend(tokBase + ((~entry) >>> 2)), - textOf: (entry, source, charBase, tokBase) => entry >= 0 + leafKindOf: (entry: number) => (~entry) & 3, + leafTokKindOf: (entry: number, tokBase: number) => tkK[tokBase + ((~entry) >>> 2)], + leafOffsetOf: (entry: number, tokBase: number) => toff(tokBase + ((~entry) >>> 2)), + leafEndOf: (entry: number, tokBase: number) => tend(tokBase + ((~entry) >>> 2)), + textOf: (entry: number, source: string, charBase: number, tokBase: number) => entry >= 0 ? source.slice(charBase, charBase + rowLen[entry]) : source.slice(toff(tokBase + ((~entry) >>> 2)), tend(tokBase + ((~entry) >>> 2))), }; @@ -2579,22 +2584,23 @@ export const tree = { // Depth-first traversal threading the RED coordinates: enter/leave receive the // node's absolute (charBase, tokBase); leaf receives its absolute token index. // Call with the root only — the bases default from the root's rel fields. 
-function visitCore(entry, fns, charBase, tokBase) { +type _VisitFns = { enter?: (id: number, charBase: number, tokBase: number) => boolean | void; leave?: (id: number, charBase: number, tokBase: number) => void; leaf?: (entry: number, tok: number) => void }; +function visitCore(entry: number, fns: _VisitFns, charBase?: number, tokBase?: number) { if (charBase === undefined) { charBase = rootCharBase; tokBase = rootTokBase; } - if (entry < 0) { if (fns.leaf) fns.leaf(entry, tokBase + ((~entry) >>> 2)); return; } - if (fns.enter && fns.enter(entry, charBase, tokBase) === false) return; + if (entry < 0) { if (fns.leaf) fns.leaf(entry, tokBase! + ((~entry) >>> 2)); return; } + if (fns.enter && fns.enter(entry, charBase, tokBase!) === false) return; const n = rowCount[entry]; const cs = rowStart[entry]; for (let i = 0; i < n; i++) { const e = kids[cs + i]; - if (e < 0) { if (fns.leaf) fns.leaf(e, tokBase + ((~e) >>> 2)); } - else visitCore(e, fns, charBase + kcr(entry, cs + i), tokBase + ktr(entry, cs + i)); + if (e < 0) { if (fns.leaf) fns.leaf(e, tokBase! + ((~e) >>> 2)); } + else visitCore(e, fns, charBase + kcr(entry, cs + i), tokBase! + ktr(entry, cs + i)); } - if (fns.leave) fns.leave(entry, charBase, tokBase); + if (fns.leave) fns.leave(entry, charBase, tokBase!); } // Parse to the ARENA: returns the root node id. -function lexInto(source) { +function lexInto(source: string) { ${e.soa ? ` tokenize(source); docEmptyPops = lexEmptyPops.slice();` : String.raw` docPieces = [source]; docPieceOff = [0]; docLen = source.length; docFlat = source; docCur = 0; const _toks = tokenize(source); @@ -2611,14 +2617,14 @@ ${e.soa ? 
` tokenize(source); tokN = _n;`} } -function farthest(errPos) { +function farthest(errPos: number) { if (maxPos <= errPos || maxPos >= tokN) return ''; return ' [farthest: offset ' + toff(maxPos) + " near '" + tokTextAt(maxPos).slice(0, 20) + "']"; } // Run the entry rule over the CURRENT token stream (shared by parse / parseEdited — // everything per-parse EXCEPT the memo and the arena cursor, which parseEdited carries). -function runParse(entryRule) { +function runParse(entryRule?: string) { pos = 0; maxPos = 0; frameMax = 0; @@ -2691,15 +2697,15 @@ let adoptDmgStart = 0; // damage window in OLD token coords: [adoptDmgStar let adoptDmgOldEnd = 0; let adoptDelta = 0; // new-minus-old token delta past the damage // cached descent path (top-down): ids + their absolute old token bases -let adoptPath = []; -let adoptBase = []; +let adoptPath: number[] = []; +let adoptBase: number[] = []; // run-extension state: where the last single adoption sat in the old tree (its // parent row / kid index / parent token base), published by adoptSeek, plus the // (pos, rid, generation) signature a repetition must present to consume it. 
let adoptHitP = -1, adoptHitKid = 0, adoptHitBase = 0; let adoptRunPos = -1, adoptRunRid = -1, adoptRunGen = -1; let adoptRunP = -1, adoptRunKid = 0, adoptRunOq = 0, adoptRunBase = 0; -function adoptSeek(q, rid) { +function adoptSeek(q: number, rid: number) { // reuse the cached path while it still CONTAINS q (strictly inside, not at start) let depth = 0; while (depth < adoptPath.length) { @@ -2710,7 +2716,7 @@ function adoptSeek(q, rid) { } adoptPath.length = depth; adoptBase.length = depth; - let id, base; + let id: number, base: number; if (depth === 0) { if (q < adoptRootTok || q >= adoptRootTok + rowTokLen[adoptRoot]) return -1; id = adoptRoot; base = adoptRootTok; @@ -2779,11 +2785,11 @@ let recovering = false; // adoption reused this pass (a recovering pass adopts error regions wholesale, // so per-pass collection alone would silently drop their diagnostics). docPar // keeps the formatted result for the paths that do not re-parse (surgery). -let docDiags = []; -let docLex = []; -let docPar = []; +let docDiags: Diag[] = []; +let docLex: LexDiag[] = []; +let docPar: Diag[] = []; -function lexMsg(g) { +function lexMsg(g: LexDiag) { if (g.kind === 0) return "Unexpected character at offset " + g.offset + ": '" + g.ch + "'"; if (g.kind === 1) return 'Invalid escape sequence in template at offset ' + g.offset; if (g.kind === 2) return 'Unterminated template literal at offset ' + g.offset; @@ -2801,7 +2807,7 @@ function lexMsg(g) { // past the last bar aborts the attempt, appends the new farthest-fail bar, and the // pass re-runs (adoption keeps re-runs cheap). Bars are text-determined, so fresh // and incremental recovering parses are byte-identical by construction. -let recoverBars = []; +let recoverBars: number[] = []; // (rule, pos) frames currently ON THE STACK during a recovering run, keyed to // their entry SERIAL. 
Token synthesis makes zero-width matches possible, so a rule // can re-enter itself at the SAME position through a synthesized leading token — @@ -2828,7 +2834,7 @@ let cycleMinSerial = 0x7fffffff; // non-consuming probes, so the frame behaved strictly: a pure function of the // window text, stable under any bar list that stays out of the window. let memoRecFloor = 0x7fffffff; -function barFreeWin(s, m) { +function barFreeWin(s: number, m: number) { const hi = m + 2; for (let i = 0; i < recoverBars.length; i++) { const b = recoverBars[i]; @@ -2855,7 +2861,7 @@ let probing = 0; // group is allowed only once the group consumed past this (committed) — failures // of an uncommitted probe are ordinary "the optional thing isn't there". let probeBase = -1; -function missAt(p2) { +function missAt(p2: number) { for (let i = 0; i < recoverBars.length; i++) { const b = recoverBars[i]; if (b > p2 + 2) break; @@ -2863,7 +2869,7 @@ function missAt(p2) { } return false; } -function missTok(t, vs) { +function missTok(t: number, vs?: number) { if (probing !== 0 || pos <= probeBase || recoverFree || !missAt(pos)) return false; const id = finishNode(RID_MISSING, scn); rowStart[id] = vs ? t | (vs << 21) : t; @@ -2881,7 +2887,7 @@ function missTok(t, vs) { // row carrying the rule identity. Same purity rules as missTok. Returns the node // id (not pushed — call sites differ) or -1. const RULE_MISS_BASE = 1 << 20; -function missRule(rid) { +function missRule(rid: number) { if (probing !== 0 || pos <= probeBase || recoverFree || !missAt(pos)) return -1; const id = finishNode(RID_MISSING, scn); rowStart[id] = RULE_MISS_BASE + rid; @@ -2897,11 +2903,11 @@ function missRule(rid) { // Decode a $missing row's packed expected identity (see missTok): bits 21+ carry // the call site's viable-set id; bit 20 marks a missing nonterminal; else a plain // literal int (>0) or a named token kind (<0). 
-function missLit(v) { +function missLit(v: number) { if (v >= 1 << 21) return v & 0xFFFFF; return v > 0 && v < RULE_MISS_BASE ? v : 0; } -function missEntry(v, kb) { +function missEntry(v: number, kb: number): Diag { let message; if (v >= 1 << 21) message = 'expected ' + VSETS[v >>> 21]; else if (v >= RULE_MISS_BASE) message = 'expected ' + RULE_DISPLAY[v - RULE_MISS_BASE]; @@ -2909,7 +2915,7 @@ function missEntry(v, kb) { else message = "expected '" + (K_NAMES[-v] ?? '?') + "'"; return { offset: kb, end: kb, message }; } -function collectErrRows(id, charBase, tokBase) { +function collectErrRows(id: number, charBase: number, tokBase: number) { if (rowRule[id] === RID_MISSING) { docPar.push(missEntry(rowStart[id], charBase)); return; @@ -2990,16 +2996,16 @@ function rebuildDiagView() { // stray closer beyond balance. The shifted lexer resync's dominant q=0 case needs // exactly one fact about the whole old suffix ("no pop-on-empty beyond the // candidate"), which this list answers O(1) instead of an O(suffix) min-build. -let docEmptyPops = []; +let docEmptyPops: number[] = []; // Bar list that built lastRoot (that run's token coords); null = free-fire built // (free-fire decisions are not bar-pure — such a tree is never adoptable while // recovering). Strict trees carry []. -let lastBars = []; +let lastBars: number[] | null = []; // A row replays identically in a recovering run iff its window sees the SAME bars // (shifted) the build run saw there — every recovery decision (hook arming, // missTok/missRule, the cycle sentinel) is position-pure, so window text + window // bars determine the frame's behavior completely. 
-function barsWindowEq(s, q, ext) { +function barsWindowEq(s: number, q: number, ext: number) { if (lastBars === null) return false; const hiN = s + ext + 2, hiO = q + ext + 2; let i = 0, j = 0; @@ -3013,7 +3019,7 @@ function barsWindowEq(s, q, ext) { i++; j++; } } -function recoverArmed(from, reach) { +function recoverArmed(from: number, reach: number) { // armed iff THE FAILING ELEMENT is stuck at a bar: it starts at/before the bar // and its OWN farthest probe sits ON it (+2 read slack). The reach is the // element's frame-local watermark, NOT the global maxPos — a global frontier @@ -3028,7 +3034,7 @@ function recoverArmed(from, reach) { } return false; } -function recoverSkip(canStart, closerT, from0, reach) { +function recoverSkip(canStart: ((p: number) => boolean) | null, closerT: number, from0: number, reach: number) { if (!recoverArmed(from0, reach)) return false; if (pos >= cap) return false; if (closerT >= 0 && tkK[pos] === K_PUNCT && tkT[pos] === closerT) return false; @@ -3055,7 +3061,7 @@ function recoverSkip(canStart, closerT, from0, reach) { // proves the loop's FIRST-set guard true at its position (its first token starts // the rule), and the loop's own continuation checks run again after the run // breaks. Members get no memo entries — a backtracking re-probe just re-adopts. -function runExtend(rid) { +function runExtend(rid: number) { if (rid !== adoptRunRid || memoGenCur !== adoptRunGen) { adoptRunPos = -1; return; } adoptRunPos = -1; const P = adoptRunP; @@ -3100,10 +3106,10 @@ function runExtend(rid) { // re-parse. Prefix kids are kept under the same watermark rule single adoption // uses, made transitive by rowKC: each kid's probe watermark stays at/below the // next kid's start, so checking the LAST kept kid bounds them all. 
-let surgX = [], surgBase = [], surgA = [], surgB = []; +let surgX: number[] = [], surgBase: number[] = [], surgA: number[] = [], surgB: number[] = []; // composed change envelope handed from the text-application step to the window relex let editDmgS = 0, editDmgE = 0; -function rowKCof(id) { +function rowKCof(id: number) { const c = rowKC[id]; if (c !== 0) return c; const cs = rowStart[id], n = rowCount[id]; @@ -3117,7 +3123,7 @@ function rowKCof(id) { rowKC[id] = ok; return ok; } -function trySurgery(dmgA, dmgB, tokD, chrD) { +function trySurgery(dmgA: number, dmgB: number, tokD: number, chrD: number) { if (adoptRoot < 0) return -1; if (rowRule[adoptRoot] >= RID_ERROR) return -1; // A recovery-made tree (rowRM root) CAN take a strict splice when the edit @@ -3240,8 +3246,8 @@ function trySurgery(dmgA, dmgB, tokD, chrD) { if (recTree) { // the strict re-parse stands for the fresh recovering parse of this span only // if no bar window touches anything it read (probes included) - for (let i = 0; i < lastBars.length; i++) { - const b = lastBars[i]; + for (let i = 0; i < lastBars!.length; i++) { + const b = lastBars![i]; const bn = b < dmgA ? b : b + tokD; if (bn + 2 >= s0 && bn <= maxPos + 2) return -1; } @@ -3458,7 +3464,7 @@ function trySurgery(dmgA, dmgB, tokD, chrD) { // The spare token-column buffer set (parseEdited ping-pongs between the live set and // this one, so steady-state edits never allocate columns). 
-let altK = null, altT = null, altOff = null, altEnd = null, altFl = null, altDp = null, altPd = null; +let altK: typeof tkK | null = null, altT: typeof tkT | null = null, altOff: typeof tkOff | null = null, altEnd: typeof tkEnd | null = null, altFl: typeof tkFl | null = null, altDp: typeof tkDp | null = null, altPd: typeof tkPd | null = null; let altCap = 0; let altN = 0; // old-stream token count while a window lex runs (lexCore's resync bound) @@ -3469,9 +3475,28 @@ let altN = 0; // old-stream token count while a window lex runs (lexCore's res // variables are the truth, and is written back only when another doc activates. // Per-PARSE transients (pos/maxPos/scratch/adopt*/surg*) reset on every entry and // are shared safely. -function makeDoc() { +type Diag = { offset: number; end: number; message: string; related?: { offset: number; end: number; message: string } }; +type LexDiag = { offset: number; end: number; kind: number; ch: string }; +type Edit = { start: number; end: number; text: string }; +type Doc = { + tkK: typeof tkK; tkT: typeof tkT; tkOff: typeof tkOff; tkEnd: typeof tkEnd; tkFl: typeof tkFl; tkDp: typeof tkDp; tkPd: typeof tkPd; + tkCap: number; tokN: number; srcLenP1: number; negFrom: number; + rowRule: typeof rowRule; rowLen: typeof rowLen; rowTokLen: typeof rowTokLen; rowStart: typeof rowStart; rowCount: typeof rowCount; rowExt: typeof rowExt; + rowOK: typeof rowOK; rowKC: typeof rowKC; rowNF: typeof rowNF; rowRM: typeof rowRM; absChar: typeof absChar; absTok: typeof absTok; + rowCap: number; nodeN: number; + kids: typeof kids; kidRel: typeof kidRel; kidTokRel: typeof kidTokRel; kidCap: number; kidN: number; + memoNode: number[][]; memoEnd: number[][]; memoExt: number[][]; memoGen: Int32Array[]; memoGenCur: number; + docDiags: Diag[]; docLex: LexDiag[]; docPar: Diag[]; + docPieces: string[] | null; docPieceOff: number[] | null; docLen: number; docFlat: string | null; docCur: number; + rootCharBase: number; rootTokBase: number; lastRoot: 
number; lastRootTok: number; lastBars: number[] | null; docEmptyPops: number[]; +${e.soa ? ' parenCachePos: number; parenCacheStack: boolean[];' : ''} + altK: typeof tkK | null; altT: typeof tkT | null; altOff: typeof tkOff | null; altEnd: typeof tkEnd | null; altFl: typeof tkFl | null; altDp: typeof tkDp | null; altPd: typeof tkPd | null; + altCap: number; altN: number; +}; +type Handle = { d: Doc; gen: number; root: number; errors: Diag[] }; +function makeDoc(): Doc { return { - tkK: new tkK.constructor(4096), tkT: new tkT.constructor(4096), + tkK: new (tkK.constructor as any)(4096), tkT: new (tkT.constructor as any)(4096), tkOff: new Int32Array(4096), tkEnd: new Int32Array(4096), tkFl: new Uint8Array(4096), tkDp: new Uint8Array(4096), tkPd: new Uint16Array(4096), tkCap: 4096, tokN: 0, srcLenP1: 1, negFrom: 0x7fffffff, @@ -3487,13 +3512,13 @@ function makeDoc() { memoNode: [], memoEnd: [], memoExt: [], memoGen: [], memoGenCur: 0, docDiags: [], docLex: [], docPar: [], docPieces: null, docPieceOff: null, docLen: 0, docFlat: null, docCur: 0, - rootCharBase: 0, rootTokBase: 0, lastRoot: -1, lastRootTok: 0, docEmptyPops: [], + rootCharBase: 0, rootTokBase: 0, lastRoot: -1, lastRootTok: 0, lastBars: null, docEmptyPops: [], ${e.soa ? ' parenCachePos: -1, parenCacheStack: [],' : ''} altK: null, altT: null, altOff: null, altEnd: null, altFl: null, altDp: null, altPd: null, altCap: 0, altN: 0, }; } -function saveDoc(d) { +function saveDoc(d: Doc) { d.tkK = tkK; d.tkT = tkT; d.tkOff = tkOff; d.tkEnd = tkEnd; d.tkFl = tkFl; d.tkDp = tkDp; d.tkPd = tkPd; d.tkCap = tkCap; d.tokN = tokN; d.srcLenP1 = srcLenP1; d.negFrom = negFrom; @@ -3511,7 +3536,7 @@ ${e.soa ? 
' d.parenCachePos = parenCachePos; d.parenCacheStack = parenCacheStac d.altK = altK; d.altT = altT; d.altOff = altOff; d.altEnd = altEnd; d.altFl = altFl; d.altDp = altDp; d.altPd = altPd; d.altCap = altCap; d.altN = altN; } -function loadDoc(d) { +function loadDoc(d: Doc) { tkK = d.tkK; tkT = d.tkT; tkOff = d.tkOff; tkEnd = d.tkEnd; tkFl = d.tkFl; tkDp = d.tkDp; tkPd = d.tkPd; tkCap = d.tkCap; tokN = d.tokN; srcLenP1 = d.srcLenP1; negFrom = d.negFrom; @@ -3532,26 +3557,26 @@ ${e.soa ? ' parenCachePos = d.parenCachePos; parenCacheStack = d.parenCacheStac const docDefault = makeDoc(); let curDoc = docDefault; loadDoc(docDefault); -function activate(d) { +function activate(d: Doc) { if (d === curDoc) return; saveDoc(curDoc); loadDoc(d); curDoc = d; } function swapBuffers() { - let x; - x = tkK; tkK = altK; altK = x; - x = tkT; tkT = altT; altT = x; - x = tkOff; tkOff = altOff; altOff = x; - x = tkEnd; tkEnd = altEnd; altEnd = x; - x = tkFl; tkFl = altFl; altFl = x; - x = tkDp; tkDp = altDp; altDp = x; - x = tkPd; tkPd = altPd; altPd = x; + let x: any; + x = tkK; tkK = altK!; altK = x; + x = tkT; tkT = altT!; altT = x; + x = tkOff; tkOff = altOff!; altOff = x; + x = tkEnd; tkEnd = altEnd!; altEnd = x; + x = tkFl; tkFl = altFl!; altFl = x; + x = tkDp; tkDp = altDp!; altDp = x; + x = tkPd; tkPd = altPd!; altPd = x; x = tkCap; tkCap = altCap; altCap = x; } ${e.soa ? '' : 'let altText = [];'} -function parseCore(source, entryRule) { +function parseCore(source: string, entryRule?: string) { adoptRoot = -1; adoptRunPos = -1; lexInto(source); @@ -3578,7 +3603,7 @@ function parseCore(source, entryRule) { // Parser-diag shift for the LOCALLY-strict paths (surgery / strict success): the // LEXER list is maintained by the window block (which already dropped the re-lexed // range and shifted the suffix — shifting here would double-apply the delta). 
-function shiftDiags(a, b, delta) { +function shiftDiags(a: number, b: number, delta: number) { let w = 0; for (let i = 0; i < docPar.length; i++) { const g = docPar[i]; @@ -3617,7 +3642,7 @@ function shiftDiags(a, b, delta) { // Last-resort totality net: a layer without recovery support threw — the handle // API still never crashes. Zero-width $error root + the thrown message as the // diagnostic; the next successful parse/edit resumes normal service. -function totalNet(e) { +function totalNet(e: any) { // the message lives in the SOURCE layer (docLex kind 4) — a later settle rebuilds // the view from the sources, and a view-only push would be wiped by it docLex.length = 0; @@ -3633,12 +3658,12 @@ function totalNet(e) { rootTokBase = 0; return root; } -function apiMisuse(msg) { - const e = new Error(msg); +function apiMisuse(msg: string) { + const e: any = new Error(msg); e.apiMisuse = true; return e; } -function editCore(entryRule, edits) { +function editCore(entryRule: string | undefined, edits?: Edit[]) { if (edits === undefined || edits.length === 0) { throw apiMisuse('edit() requires the changes: [{ start, end, text }] (LSP-style - each edit in the coordinates of the document AFTER the preceding edits in the array)'); } @@ -3711,7 +3736,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── } // Lex the window into the spare buffers (the old stream stays live for resync). if (altK === null || altCap < tkCap) { - altK = new tkK.constructor(tkCap); altT = new tkT.constructor(tkCap); + altK = new (tkK.constructor as any)(tkCap); altT = new (tkT.constructor as any)(tkCap); altOff = new Int32Array(tkCap); altEnd = new Int32Array(tkCap); altFl = new Uint8Array(tkCap); altDp = new Uint8Array(tkCap); altPd = new Uint16Array(tkCap); altCap = tkCap; @@ -3720,7 +3745,7 @@ ${e.soa ? 
String.raw` // ── M1: WINDOWED re-lex ── altSuffMin = null; // the old-suffix min-depth cache follows the alt stream swapBuffers(); // live = scratch, alt = OLD stream tokN = 0; - const startOff = B >= 0 ? (altEnd[B] < 0 ? altEnd[B] + srcLenP1 : altEnd[B]) : 0; + const startOff = B >= 0 ? (altEnd![B] < 0 ? altEnd![B] + srcLenP1 : altEnd![B]) : 0; // Window-materialized relex: lexCore reads a SMALL flat slice of the pieces with // an absolute bias; -2 = ran off the window end before resyncing — re-materialize // a larger window and retry (the common case fits the first one). @@ -3736,7 +3761,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── docLex.length = preLexN; // an aborted attempt re-lexes: drop its pushes tokN = 0; try { - R0 = lexCore(windowStr, 0, B >= 0 ? altK[B] : -1, B >= 0 ? altT[B] : 0, r0, ceNew, charDelta, cs, initParens.slice(), startOff, wHi < docLen); + R0 = lexCore(windowStr, 0, B >= 0 ? altK![B] : -1, B >= 0 ? altT![B] : 0, r0, ceNew, charDelta, cs, initParens.slice(), startOff, wHi < docLen); } catch (e2) { if (e2 !== LEX_RETRY) { if (recovering) throw e2; // a recovering lexer never throws — a bug @@ -3796,8 +3821,8 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── // p is real damage (compared BEFORE the splice clobbers the old slots). let p = B + 1; { let i = 0; - while (i < W && p < R && altK[i] === tkK[p] && altT[i] === tkT[p] && altOff[i] === tkOff[p] - && altEnd[i] === tkEnd[p] && altFl[i] === tkFl[p]) { i++; p++; } + while (i < W && p < R && altK![i] === tkK[p] && altT![i] === tkT[p] && altOff![i] === tkOff[p] + && altEnd![i] === tkEnd[p] && altFl![i] === tkFl[p]) { i++; p++; } } const dOldEnd = R; const tokenDelta = (B + 1 + W) - R; @@ -3810,9 +3835,9 @@ ${e.soa ? 
String.raw` // ── M1: WINDOWED re-lex ── tkFl.copyWithin(B + 1 + W, R, oN); tkDp.copyWithin(B + 1 + W, R, oN); tkPd.copyWithin(B + 1 + W, R, oN); } if (W > 0) { - tkK.set(altK.subarray(0, W), B + 1); tkT.set(altT.subarray(0, W), B + 1); - tkOff.set(altOff.subarray(0, W), B + 1); tkEnd.set(altEnd.subarray(0, W), B + 1); - tkFl.set(altFl.subarray(0, W), B + 1); tkDp.set(altDp.subarray(0, W), B + 1); tkPd.set(altPd.subarray(0, W), B + 1); + tkK.set(altK!.subarray(0, W), B + 1); tkT.set(altT!.subarray(0, W), B + 1); + tkOff.set(altOff!.subarray(0, W), B + 1); tkEnd.set(altEnd!.subarray(0, W), B + 1); + tkFl.set(altFl!.subarray(0, W), B + 1); tkDp.set(altDp!.subarray(0, W), B + 1); tkPd.set(altPd!.subarray(0, W), B + 1); } negFrom = B + 1 + W; srcLenP1 = newLen + 1; @@ -3837,7 +3862,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── const oK = tkK, oT = tkT, oOff = tkOff, oEnd = tkEnd, oFl = tkFl, oN = tokN; const oText = tkText; if (altK === null || altK.length !== tkCap) { - altK = new tkK.constructor(tkCap); altT = new tkT.constructor(tkCap); + altK = new (tkK.constructor as any)(tkCap); altT = new (tkT.constructor as any)(tkCap); altOff = new Int32Array(tkCap); altEnd = new Int32Array(tkCap); altFl = new Uint8Array(tkCap); altDp = new Uint8Array(tkCap); altPd = new Uint16Array(tkCap); } @@ -3906,7 +3931,7 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── shiftDiags(cs, ceOld, charDelta); return sroot; } - let root; + let root!: number; { // recovering may already be true here (the window relex recovered a lex error // and pushed its diagnostics): the first attempt then runs with EMPTY bars — @@ -4008,14 +4033,14 @@ ${e.soa ? 
String.raw` // ── M1: WINDOWED re-lex ── export { tokenize }; // ── Module-level API: the DEFAULT document (one shared session; tokenize and the // raw tree/tokenAt views read the ACTIVE doc — they are gate/debug surfaces) ── -export function parse(source, entryRule) { activate(docDefault); return parseCore(source, entryRule); } -export function parseEdited(entryRule, edits) { activate(docDefault); return editCore(entryRule, edits); } +export function parse(source: string, entryRule?: string) { activate(docDefault); return parseCore(source, entryRule); } +export function parseEdited(entryRule?: string, edits?: Edit[]) { activate(docDefault); return editCore(entryRule, edits); } // Arena reclamation introspection + budget override — TEST HOOKS (issue #45 C1). __arenaStats // reports the live arena, the compacted-size baseline, and how many edits re-parsed to reclaim; // __setArenaBudget lowers the factor/min so a gate can force compaction deterministically. export function __arenaStats() { return { nodeN, kidN, baseline: arenaLiveBaseline, compactions: arenaCompactions, inPlaceShrink: arenaInPlaceShrink }; } -export function __setArenaBudget(factor, min) { arenaCompactFactor = factor; arenaCompactMin = min; } -export function visit(entry, fns, charBase, tokBase) { activate(docDefault); return visitCore(entry, fns, charBase, tokBase); } +export function __setArenaBudget(factor: number, min: number) { arenaCompactFactor = factor; arenaCompactMin = min; } +export function visit(entry: number, fns: _VisitFns, charBase?: number, tokBase?: number) { activate(docDefault); return visitCore(entry, fns, charBase, tokBase); } // ── Handle API: explicit trees over per-instance documents ── // const p = createParser(); const cst = p.parse(text); p.edit(cst, next[, edits]); // The handle is the STABLE IDENTITY of this document's tree: edit() mutates it in @@ -4026,25 +4051,25 @@ export function visit(entry, fns, charBase, tokBase) { activate(docDefault); ret export function 
createParser() { const d = makeDoc(); let gen = 0; - let entryUsed; - const chk = (cst) => { + let entryUsed: string | undefined; + const chk = (cst: Handle | null | undefined) => { if (cst === null || cst === undefined || cst.d !== d) throw new Error('foreign tree handle: it belongs to another parser instance'); if (cst.gen !== gen) throw new Error('stale tree handle: parse() re-opened this document - use the handle from the latest parse()'); }; - const view = {}; + const view: Record any> = {}; for (const k of Object.keys(tree)) { - const f = tree[k]; - view[k] = (a, b) => { activate(d); return f(a, b); }; + const f = (tree as any)[k]; + view[k] = (a: number, b: number) => { activate(d); return f(a, b); }; } return { - parse(source, entryRule) { + parse(source: string, entryRule?: string) { activate(d); entryUsed = entryRule; gen++; // re-opening resets the arena: old handles die regardless of outcome docDiags.length = 0; docLex.length = 0; docPar.length = 0; - let root; + let root!: number; try { root = parseCore(source, entryRule); lastBars = []; @@ -4095,17 +4120,17 @@ export function createParser() { } return { d, gen, root, errors: docDiags }; }, - edit(cst, edits) { + edit(cst: Handle, edits?: Edit[]) { chk(cst); activate(d); try { cst.root = editCore(entryUsed, edits); } catch (e) { - if (e instanceof RangeError || (e && e.apiMisuse)) throw e; + if (e instanceof RangeError || (e && (e as any).apiMisuse)) throw e; cst.root = totalNet(e); } }, - visit(cst, fns) { chk(cst); activate(d); return visitCore(cst.root, fns); }, + visit(cst: Handle, fns: _VisitFns) { chk(cst); activate(d); return visitCore(cst.root, fns); }, tree: view, }; } diff --git a/test/check.ts b/test/check.ts index 1658343..cf61a77 100644 --- a/test/check.ts +++ b/test/check.ts @@ -26,6 +26,7 @@ const GATES: Gate[] = [ { group: 'emit-parity', name: 'emit-parser-verify', args: ['test/emit-parser-verify.ts'] }, { group: 'emit-parity', name: 'emit-reject-messages', args: 
['test/emit-reject-messages.ts'] }, { group: 'emit-parity', name: 'emit-lexer-verify', args: ['test/emit-lexer-verify.ts'] }, + { group: 'emit-parity', name: 'emit-tsc-gate', args: ['test/emit-tsc-gate.ts'] }, { group: 'core', name: 'multi-doc', args: ['test/multi-doc.ts'] }, { group: 'core', name: 'recovery', args: ['test/recovery.ts'] }, { group: 'core', name: 'incremental-grammars', args: ['test/incremental-grammars.ts'] }, diff --git a/test/cst-match-totality.ts b/test/cst-match-totality.ts index d6e382c..25c0d8b 100644 --- a/test/cst-match-totality.ts +++ b/test/cst-match-totality.ts @@ -51,7 +51,7 @@ function checkTree(em: Emitted, root: number, src: string, matchers: Record = [ + ['typescript', '../typescript.ts'], + ['javascript', '../javascript.ts'], + ['typescriptreact', '../typescriptreact.ts'], + ['javascriptreact', '../javascriptreact.ts'], +]; +// Deferred: the fallback-lexer / non-soa path (logged, not gated yet). +const DEFERRED = ['yaml', 'html']; + +const TSC_FLAGS = [ + '--strict', '--noEmit', '--target', 'ES2022', + '--module', 'ES2022', '--moduleResolution', 'Bundler', '--skipLibCheck', +]; + +let failures = 0; +for (const [name, path] of CHECKED) { + let grammar: CstGrammar; + try { + grammar = (await import(path)).default; + } catch { + console.log(` ${name}: (grammar not present — skipped)`); + continue; + } + const out = `/tmp/emit-tsc-gate-${name}.ts`; + writeFileSync(out, emitParser(grammar)); + try { + execFileSync('npx', ['tsc', ...TSC_FLAGS, out], { stdio: 'pipe' }); + console.log(` ${name}: ✓ emitted parser type-checks (tsc --strict)`); + } catch (e: any) { + failures++; + const log = (e.stdout?.toString() ?? '') + (e.stderr?.toString() ?? 
''); + const errs = log.split('\n').filter((l: string) => l.includes('error TS')); + console.log(` ${name}: ✗ ${errs.length} tsc error(s):`); + for (const l of errs.slice(0, 30)) console.log(` ${l.replace(out, `emit(${name})`)}`); + if (errs.length > 30) console.log(` … and ${errs.length - 30} more`); + } +} +console.log(` deferred (fallback-lexer / non-soa path, not yet typed): ${DEFERRED.join(', ')}`); + +if (failures > 0) { + console.error(`\n✗ emitted parser fails strict type-check for ${failures} grammar(s)`); + process.exit(1); +} +console.log('\n✓ emitted parser type-checks under tsc --strict (soa + emitted-lexer family)'); diff --git a/test/exhaustive-edits.ts b/test/exhaustive-edits.ts index 1485a4f..72a8ca9 100644 --- a/test/exhaustive-edits.ts +++ b/test/exhaustive-edits.ts @@ -31,7 +31,7 @@ const g = defineGrammar({ rules: { Expr, Stmt, Program }, entry: Program, }); -const emPath = '/tmp/emitted-exhaustive.mjs'; +const emPath = '/tmp/emitted-exhaustive.mts'; writeFileSync(emPath, emitParser(g)); type Cst = { root: number; errors: object[] }; type Parser = { parse(s: string): Cst; edit(c: Cst, e: object[]): void; visit(c: Cst, fns: object): void; tree: import('./emitted-obj.ts').TreeView }; diff --git a/test/head-to-head.ts b/test/head-to-head.ts index 4613e67..15f913b 100644 --- a/test/head-to-head.ts +++ b/test/head-to-head.ts @@ -26,7 +26,7 @@ const TreeSitter = require(TS_BENCH + '/node_modules/tree-sitter'); const TSLang = require(TS_BENCH + '/node_modules/tree-sitter-typescript').typescript; const grammar = (await import('../typescript.ts')).default; -const emPath = '/tmp/emitted-h2h.mjs'; +const emPath = '/tmp/emitted-h2h.mts'; writeFileSync(emPath, emitParser(grammar)); const { createParser } = await import(emPath + '?v=' + process.pid); diff --git a/test/incremental-grammars.ts b/test/incremental-grammars.ts index 6c2bbd0..404272b 100644 --- a/test/incremental-grammars.ts +++ b/test/incremental-grammars.ts @@ -84,7 +84,7 @@ let fails = 0; const 
failures: string[] = []; for (const name of GRAMMARS) { const grammar = (await import(`../${name}.ts`)).default; - const emPath = `/tmp/emitted-incr-${name}.mjs`; + const emPath = `/tmp/emitted-incr-${name}.mts`; writeFileSync(emPath, emitParser(grammar)); const em = (await import(emPath + '?v=' + process.pid)) as Em; const session = em.createParser(); @@ -183,7 +183,7 @@ function replaceOnce(text: string, find: string, repl: string): { next: string; return { next: text.slice(0, at) + repl + text.slice(at + find.length), edit: { start: at, end: at + find.length, text: repl } }; } for (const name of ['javascript', 'typescript']) { - const em = (await import(`/tmp/emitted-incr-${name}.mjs?v=` + process.pid)) as Em; + const em = (await import(`/tmp/emitted-incr-${name}.mts?v=` + process.pid)) as Em; const session = em.createParser(); const fresh = em.createParser(); for (const doc of FORK_DOCS) { diff --git a/test/incremental-verify.ts b/test/incremental-verify.ts index 04fdf3b..cd01c17 100644 --- a/test/incremental-verify.ts +++ b/test/incremental-verify.ts @@ -11,7 +11,7 @@ import { existsSync, readFileSync, writeFileSync } from 'node:fs'; import { emitParser } from '../src/emit-parser.ts'; const grammar = (await import('../typescript.ts')).default; -const emPath = '/tmp/emitted-incremental.mjs'; +const emPath = '/tmp/emitted-incremental.mts'; writeFileSync(emPath, emitParser(grammar)); type Edit = { start: number; end: number; text: string }; type Cst = { root: number; errors: { offset: number; end: number; message: string }[] }; diff --git a/test/multi-doc.ts b/test/multi-doc.ts index f5af760..c3d844d 100644 --- a/test/multi-doc.ts +++ b/test/multi-doc.ts @@ -13,7 +13,7 @@ import { emitParser } from '../src/emit-parser.ts'; import { objectify } from './emitted-obj.ts'; const grammar = (await import('../typescript.ts')).default; -const emPath = '/tmp/emitted-multidoc.mjs'; +const emPath = '/tmp/emitted-multidoc.mts'; writeFileSync(emPath, emitParser(grammar)); type 
Edit = { start: number; end: number; text: string }; type Cst = { root: number; errors: { offset: number; end: number; message: string }[] }; diff --git a/test/recovery-conformance.ts b/test/recovery-conformance.ts index 8f1f28c..a2eda45 100644 --- a/test/recovery-conformance.ts +++ b/test/recovery-conformance.ts @@ -17,7 +17,7 @@ import { emitParser } from '../src/emit-parser.ts'; import ts from 'typescript'; const grammar = (await import('../typescript.ts')).default; -const emPath = '/tmp/emitted-recovery-conf.mjs'; +const emPath = '/tmp/emitted-recovery-conf.mts'; writeFileSync(emPath, emitParser(grammar)); type Cst = { root: number; errors: { offset: number; end: number; message: string }[] }; const em = (await import(emPath + '?v=' + process.pid)) as { createParser(): { parse(s: string): Cst } }; diff --git a/test/recovery.ts b/test/recovery.ts index 5e1d721..9215c46 100644 --- a/test/recovery.ts +++ b/test/recovery.ts @@ -17,7 +17,7 @@ import { emitParser } from '../src/emit-parser.ts'; import { objectify } from './emitted-obj.ts'; const grammar = (await import('../typescript.ts')).default; -const emPath = '/tmp/emitted-recovery.mjs'; +const emPath = '/tmp/emitted-recovery.mts'; writeFileSync(emPath, emitParser(grammar)); type Edit = { start: number; end: number; text: string }; type Diag = { offset: number; end: number; message: string; related?: { offset: number; end: number; message: string } }; From 2c87267492133b95cc4a8e4fa9ac24fc8d9eb5ad Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Sun, 21 Jun 2026 11:48:47 +0800 Subject: [PATCH 02/27] emit: extend the tsc gate to the fallback-lexer / non-soa path (yaml, html) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Brings the yaml/html emit path under the same strict type-check as the ts/js family, so the gate now covers every grammar. Three things this required: - Hoist the edit-damage envelope (newLen/cs/ceNew/ceOld/charDelta) out of the e.soa window branch. 
shiftDiags(cs, ceOld, charDelta) runs in the SHARED post-fork settle, but those names were declared only in the soa branch — so the non-soa branch referenced undeclared variables. The path is unreached at runtime for the fallback grammars (they full-relex), which is why it stayed invisible; the tsc gate surfaced it. They derive only from shared inputs, so hoisting is behavior-neutral for soa and correct for non-soa. Same fix gates the soa-only parenCachePos cache-invalidation in the '>'-split. - Type the non-soa piece-text columns (tkText/altText: string[]), assert the fallback column swap against the nullable spare buffers, and cast the baked LEX_GRAMMAR at the createLexer boundary. - Give every baked Map/Set an explicit element type at emission. They inferred correctly only when non-empty (ts/js); an empty vocabulary set (yaml/html) collapsed to a Map / Set with no usable inferred element type. emit-lexer-verify's TYPE_KIND/LIT_KW/LIT_PU extraction regex now tolerates the `new Map<…>(` generic. Full suite 41/41; emitted CST byte-identical across all 6 grammars (incremental-grammars 610/610). --- src/emit-parser.ts | 47 ++++++++++++++++++++------------------- test/emit-lexer-verify.ts | 6 ++--- test/emit-tsc-gate.ts | 36 +++++++++++++++--------------- 3 files changed, 45 insertions(+), 44 deletions(-) diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 7bd889b..6368898 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -656,7 +656,7 @@ class Emitter { // A suppress-carrying group stages the LED-connector exclusion for the next // parseRule, then matches its body (same as matchExpr 'group'). const pre = (expr.suppress && expr.suppress.length) - ? `suppressNext = new Set(${J(expr.suppress)});` + ? `suppressNext = new Set(${J(expr.suppress)});` : ``; return [pre, this.matchInto(expr.body, onFail)].filter(Boolean).join('\n'); } @@ -1143,9 +1143,9 @@ export function emitParser(grammar: CstGrammar): string { // TYPE_KIND: tok.type → int. LIT_KW / LIT_PU: tok.text → keyword / punct literal int. 
// Every token is BORN with tok.k (type kind) + tok.t (literal kind) and the stamp // flags — one monomorphic shape, one allocation, no post-pass. - e.emit(`const TYPE_KIND = new Map(${J([...st.typeKind])});`); - e.emit(`const LIT_KW = new Map(${J([...st.kwLitKind])});`); - e.emit(`const LIT_PU = new Map(${J([...st.puLitKind])});`); + e.emit(`const TYPE_KIND = new Map(${J([...st.typeKind])});`); + e.emit(`const LIT_KW = new Map(${J([...st.kwLitKind])});`); + e.emit(`const LIT_PU = new Map(${J([...st.puLitKind])});`); e.emit(`const K_PUNCT = ${st.KIND_PUNCT};`); e.emit(`const K_TEMPLATE_HEAD = ${st.KIND_TEMPLATE_HEAD};`); e.emit(`const K_TEMPLATE_MIDDLE = ${st.KIND_TEMPLATE_HEAD + 1};`); @@ -1158,15 +1158,15 @@ export function emitParser(grammar: CstGrammar): string { if (lexSrc) { e.emit(lexSrc); } else { - e.emit(`const { tokenize } = createLexer(LEX_GRAMMAR, {`); + e.emit(`const { tokenize } = createLexer(LEX_GRAMMAR as any, {`); e.emit(` typeKind: TYPE_KIND, kwLit: LIT_KW, puLit: LIT_PU,`); e.emit(` punctKind: K_PUNCT, namedFallback: K_NAMED_FALLBACK,`); e.emit(`});`); } e.emit(``); // Baked maps. Emit as object literals → Map. - e.emit(`const opTable = new Map(${J([...a.opTable])});`); - e.emit(`const prefixOps = new Map(${J([...a.prefixOps])});`); + e.emit(`const opTable = new Map(${J([...a.opTable])});`); + e.emit(`const prefixOps = new Map(${J([...a.prefixOps])});`); // The same op tables re-keyed by the literal int (tok.t): the Pratt loops look an // operator up for EVERY token they reach, and tok.t is already interned — an array // load replaces the string-keyed Map.get. 
Equivalent because a token's text can equal @@ -1189,7 +1189,7 @@ export function emitParser(grammar: CstGrammar): string { e.emit(`const OP_BY_T: (OpInfo | null)[] = ${J(byT(a.opTable))};`); e.emit(`const PREFIX_BY_T: (OpInfo | null)[] = ${J(byT(a.prefixOps))};`); } - e.emit(`const noUnaryLhsOps = new Set(${J([...a.noUnaryLhsOps])});`); + e.emit(`const noUnaryLhsOps = new Set(${J([...a.noUnaryLhsOps])});`); { let tSize = 1; for (const v of st.kwLitKind.values()) tSize = Math.max(tSize, v + 1); @@ -1214,8 +1214,8 @@ export function emitParser(grammar: CstGrammar): string { } e.emit(`const REQTGT_T = Uint8Array.from([${rt.join(',')}]);`); } - e.emit(`const postfixOpValues = new Set(${J([...a.postfixOpValues])});`); - e.emit(`const binaryConnectors = new Set(${J([...a.binaryConnectors])});`); + e.emit(`const postfixOpValues = new Set(${J([...a.postfixOpValues])});`); + e.emit(`const binaryConnectors = new Set(${J([...a.binaryConnectors])});`); // Assignment-target shape test (ECMAScript AssignmentTargetType): a node id is NOT a // valid LHS target iff its outermost form is a prefix-op (prefix-unary OR prefix-update // `++x`) — head kid is an operator-tag leaf in prefixOps — or a postfix-update (`x++`) — @@ -1255,8 +1255,8 @@ export function emitParser(grammar: CstGrammar): string { e.emit(` }`); e.emit(` return '';`); e.emit(`}`); - e.emit(`const tokenNames = new Set(${J([...a.tokenNames])});`); - e.emit(`const templateTokenNames = new Set(${J([...a.templateTokenNames])});`); + e.emit(`const tokenNames = new Set(${J([...a.tokenNames])});`); + e.emit(`const templateTokenNames = new Set(${J([...a.templateTokenNames])});`); e.emit(`const templateTokenName = ${J(a.templateTokenName ?? 
null)};`); e.emit(`const maxBp = ${a.maxBp};`); e.emit(`const ENTRY = ${J(entry)};`); @@ -1280,7 +1280,7 @@ export function emitParser(grammar: CstGrammar): string { } // (recovery sync closers are threaded per-loop from the enclosing seq — see // quantFollowT; a global closer table froze top-level recovery at any ']'.) - e.emit(`const prattRuleNames = new Set(${J([...a.prattRules])});`); + e.emit(`const prattRuleNames = new Set(${J([...a.prattRules])});`); // The expression rule the template-interpolation fallback (findExprRule) picks: // first pratt rule that isn't Type, in declaration order. Bake the resolved name. const exprRuleName = (() => { @@ -1432,7 +1432,7 @@ let srcLenP1 = 1; let negFrom = 0x7fffffff; function toff(i: number) { const v = tkOff[i]; return v < 0 ? v + srcLenP1 : v; } function tend(i: number) { const v = tkEnd[i]; return v < 0 ? v + srcLenP1 : v; } -${e.soa ? '' : 'let tkText = []; // fallback-lexer text column (synthetic tokens are not source spans)'} +${e.soa ? '' : 'let tkText: string[] = []; // fallback-lexer text column (synthetic tokens are not source spans)'} function growTok() { tkCap *= 2; const k = new ${K_ARR}(tkCap); k.set(tkK); tkK = k; @@ -1743,7 +1743,7 @@ function matchPuLitGT(pu: number, vs?: number) { const end0 = tend(pos); ${e.soa ? '' : 'const restText = tkText[pos].slice(1);'} if (tokN === tkCap) growTok(); - parenCachePos = -1; + ${e.soa ? 
'parenCachePos = -1;' : ''} // invalidate the paren-stack cache (soa emitted lexer only) // token indices shift past this point: the OLD-TREE adoption mapping // (adoptDmg*/adoptDelta, frozen at edit start) is no longer valid — turn // adoption off for the remainder of this parse (the '>' split is rare; the @@ -2003,7 +2003,7 @@ function emitLeftRecRule(e: Emitter, a: ReturnType, rule: RuleDe e.emit(`function ${ruleFn}() { return parseRuleEntry(${e.memoIndex(rule.name)}, ${rid}, ${J(rule.name)}, ${ruleFn}_lr); }`); // notLeftLeaf head-leaf word sets (module-level, built once) for this rule's gated continuations. contNotLeftLeaf.forEach((words, i) => { - if (words) e.emit(`const _NLLC_${sn}_${i} = new Set(${J(words)});`); + if (words) e.emit(`const _NLLC_${sn}_${i} = new Set(${J(words)});`); }); e.emit(`function ${ruleFn}_lr(_minBp: number) {`); e.emit(` const saved = pos; const mark = scn;`); @@ -2068,7 +2068,7 @@ function emitPrattRule(e: Emitter, a: ReturnType, rule: RuleDecl e.emit(`function ${ruleFn}() { return parseRuleEntry(${e.memoIndex(rule.name)}, ${rid}, ${J(rule.name)}, ${ruleFn}_pratt); }`); // notLeftLeaf head-leaf word sets (module-level, built once) for this rule's gated LED arms. meta.notLeftLeaf.forEach((words, i) => { - if (words) e.emit(`const _NLL_${sn}_${i} = new Set(${J(words)});`); + if (words) e.emit(`const _NLL_${sn}_${i} = new Set(${J(words)});`); }); e.emit(`function ${ruleFn}_pratt(minBp: number) {`); e.emit(` const saved = pos; const mark = scn;`); @@ -3574,7 +3574,7 @@ function swapBuffers() { x = tkPd; tkPd = altPd!; altPd = x; x = tkCap; tkCap = altCap; altCap = x; } -${e.soa ? '' : 'let altText = [];'} +${e.soa ? '' : 'let altText: string[] = [];'} function parseCore(source: string, entryRule?: string) { adoptRoot = -1; @@ -3696,14 +3696,16 @@ function editCore(entryRule: string | undefined, edits?: Edit[]) { editDmgE = dE; } -${e.soa ? 
String.raw` // ── M1: WINDOWED re-lex ── // Damage envelope from the composed changes: prefix coordinates are shared, the - // old end comes back through the total delta. + // old end comes back through the total delta. The shared post-fork settle + // (shiftDiags) and the soa window both read these, so they live OUTSIDE the + // lex fork — the non-soa branch reads cs/ceOld/charDelta too. const newLen = docLen; const cs = editDmgS < newLen ? editDmgS : newLen; const ceNew = editDmgE < cs ? cs : editDmgE; const ceOld = ceNew - (newLen - oldLen); const charDelta = newLen - oldLen; +${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── // Restart anchor: the last token B ending at/before the damage whose recorded // depths are zero and whose shape carries no cross-token lexer flag (')' control- // head, postfix-ambiguous op). B = -1 restarts at the file head — always sound. @@ -3866,8 +3868,8 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── altOff = new Int32Array(tkCap); altEnd = new Int32Array(tkCap); altFl = new Uint8Array(tkCap); altDp = new Uint8Array(tkCap); altPd = new Uint16Array(tkCap); } - tkK = altK; tkT = altT; tkOff = altOff; tkEnd = altEnd; tkFl = altFl; - { const _d = tkDp; tkDp = altDp; altDp = _d; const _q = tkPd; tkPd = altPd; altPd = _q; } + tkK = altK!; tkT = altT!; tkOff = altOff!; tkEnd = altEnd!; tkFl = altFl!; + { const _d = tkDp; tkDp = altDp!; altDp = _d; const _q = tkPd; tkPd = altPd!; altPd = _q; } tkText = altText; tkText.length = 0; altK = oK; altT = oT; altOff = oOff; altEnd = oEnd; altFl = oFl; altText = oText; @@ -3876,7 +3878,6 @@ ${e.soa ? String.raw` // ── M1: WINDOWED re-lex ── // from an earlier totality-net edit would go stale lexInto(flattenDoc()); const nN = tokN; - const charDelta = docLen - oldLen; const minN = oN < nN ? 
oN : nN; let p = 0; while (p < minN && oK[p] === tkK[p] && oT[p] === tkT[p] && oFl[p] === tkFl[p] diff --git a/test/emit-lexer-verify.ts b/test/emit-lexer-verify.ts index d4128d0..e0ab8a5 100644 --- a/test/emit-lexer-verify.ts +++ b/test/emit-lexer-verify.ts @@ -26,9 +26,9 @@ if (src.includes('createLexer(')) { // Rebuild the intern config from the emitted tables' source of truth: re-emit via the // analyzer is private, so read the reference lexer through a tiny probe grammar parse — // simplest faithful route: intern maps are exactly the emitted TYPE_KIND/LIT_KW/LIT_PU. -const tk = new Map(JSON.parse(src.match(/const TYPE_KIND = new Map\((.*)\);/)![1])); -const kw = new Map(JSON.parse(src.match(/const LIT_KW = new Map\((.*)\);/)![1])); -const pu = new Map(JSON.parse(src.match(/const LIT_PU = new Map\((.*)\);/)![1])); +const tk = new Map(JSON.parse(src.match(/const TYPE_KIND = new Map(?:<[^>]*>)?\((.*)\);/)![1])); +const kw = new Map(JSON.parse(src.match(/const LIT_KW = new Map(?:<[^>]*>)?\((.*)\);/)![1])); +const pu = new Map(JSON.parse(src.match(/const LIT_PU = new Map(?:<[^>]*>)?\((.*)\);/)![1])); const kPunct = Number(src.match(/const K_PUNCT = (\d+);/)![1]); const kFallback = Number(src.match(/const K_NAMED_FALLBACK = (\d+);/)![1]); const ref = createLexer(grammar, { typeKind: tk, kwLit: kw, puLit: pu, punctKind: kPunct, namedFallback: kFallback }); diff --git a/test/emit-tsc-gate.ts b/test/emit-tsc-gate.ts index 713d251..e6df929 100644 --- a/test/emit-tsc-gate.ts +++ b/test/emit-tsc-gate.ts @@ -11,37 +11,38 @@ // emitted parser by stripping types, and the CST-identity gate (emit-parser-verify) // proves the stripped runtime is byte-for-byte the interpreter. // -// SCOPE: the self-contained emit path — soa token columns + an emitted lexer — which -// is every grammar WITHOUT markup / indent / newline modes (emitLexer covers them). -// The ts/js family (+ the jsx/tsx variants) goes through it and is enforced here. 
-// yaml / html take the FALLBACK path (emitLexer returns null → the parser imports -// createLexer) plus the non-soa piece-text layer; that path carries additional -// untyped surface and a pre-existing latent scope issue the gate surfaced (the -// non-soa editCore branch references cs/ceOld/parenCachePos declared only in the soa -// branch). Typing it is tracked separately — listed as DEFERRED below, not silently -// dropped. +// Both emit paths are covered: the self-contained path (soa columns + an emitted +// lexer — the ts/js family) and the fallback path (yaml/html: emitLexer returns null +// so the parser imports createLexer, plus the non-soa piece-text layer). Checking +// every grammar is what forces grammar-specific emission (token width, soa vs piece +// layer, empty vocab sets, the fallback createLexer contract) to stay type-sound — +// and it already paid off: the fallback editCore branch referenced cs/ceOld/ +// parenCachePos declared only in the soa branch (unreached at runtime, invisible +// until this gate), now hoisted/gated correctly. import { emitParser } from '../src/emit-parser.ts'; import { writeFileSync } from 'node:fs'; import { execFileSync } from 'node:child_process'; import type { CstGrammar } from '../src/types.ts'; -// Enforced: the self-contained soa + emitted-lexer path. -const CHECKED: Array<[string, string]> = [ +const GRAMMARS: Array<[string, string]> = [ ['typescript', '../typescript.ts'], ['javascript', '../javascript.ts'], ['typescriptreact', '../typescriptreact.ts'], ['javascriptreact', '../javascriptreact.ts'], + ['yaml', '../yaml.ts'], + ['html', '../html.ts'], ]; -// Deferred: the fallback-lexer / non-soa path (logged, not gated yet). -const DEFERRED = ['yaml', 'html']; +// --allowImportingTsExtensions: the fallback-lexer grammars import createLexer from +// '…/src/gen-lexer.ts' (an absolute path baked at emit time); harmless for the +// self-contained grammars, which import nothing. 
const TSC_FLAGS = [ - '--strict', '--noEmit', '--target', 'ES2022', - '--module', 'ES2022', '--moduleResolution', 'Bundler', '--skipLibCheck', + '--strict', '--noEmit', '--target', 'ES2022', '--module', 'ES2022', + '--moduleResolution', 'Bundler', '--skipLibCheck', '--allowImportingTsExtensions', ]; let failures = 0; -for (const [name, path] of CHECKED) { +for (const [name, path] of GRAMMARS) { let grammar: CstGrammar; try { grammar = (await import(path)).default; @@ -63,10 +64,9 @@ for (const [name, path] of CHECKED) { if (errs.length > 30) console.log(` … and ${errs.length - 30} more`); } } -console.log(` deferred (fallback-lexer / non-soa path, not yet typed): ${DEFERRED.join(', ')}`); if (failures > 0) { console.error(`\n✗ emitted parser fails strict type-check for ${failures} grammar(s)`); process.exit(1); } -console.log('\n✓ emitted parser type-checks under tsc --strict (soa + emitted-lexer family)'); +console.log('\n✓ emitted parser type-checks under tsc --strict for every grammar'); From 7d47ca3a9f77e06f929094e45b90623639b43fb6 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Sun, 21 Jun 2026 12:20:02 +0800 Subject: [PATCH 03/27] =?UTF-8?q?emit:=20target-agnostic=20emitter=20?= =?UTF-8?q?=E2=80=94=20derived=20Go=20+=20Rust=20parsers=20(issue=20#6)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The agnosticism payoff of #6, proven by EXECUTION. emit-portable.ts adds `emitPortableParser(grammar, target)`: one analysis → one plain-data IR → a parser rendered in each target language through a `Target` interface. The same grammar (examples/calc.ts) derives a TypeScript, a Go, and a Rust parser; the Go and Rust sources are compiled (`go build` / `rustc`) and run, and every parser's CST is compared node-for-node against the createParser interpreter. This is a SEPARATE, minimal emitter from the optimized emit-parser.ts (no incremental/recovery/arena — each target supplies its own runtime, as the issue frames it). 
It is the real Target seam: adding a language is implementing one `render(ir)`; buildIR is untouched. Scope = the verifiable core: char-class tokens, recursive descent with backtracking alternation and `*`, and a Pratt expression engine with operator precedence / associativity, prefix unary, and parenthesised grouping. The portable lexer is a dependency-free char scanner (no regex), so the emitted Go/Rust compile offline — sidestepping both the full-TS lexer's lookahead (which Go's RE2 and Rust's regex crate reject) and any crate fetch. buildIR THROWS on a construct it does not model rather than emit a wrong parser; mixfix/postfix LEDs, sep/opt, and lexer lookahead are the documented next increment. Gate: test/portable-targets.ts (group emit-parity) — typescript + go + rust each 21/21 accept ≡ oracle and 7/7 reject ≡ oracle over an adversarial corpus (precedence both directions, left-associativity, prefix chains, nested grouping, multi-statement programs, the empty program, malformed input). Go/Rust toolchains are optional — a missing `go`/`rustc` is skipped (the TS rendering needs only node). Full suite 42/42. --- examples/calc.ts | 57 +++++++++++ src/emit-portable.ts | 173 +++++++++++++++++++++++++++++++++ src/target-go.ts | 203 +++++++++++++++++++++++++++++++++++++++ src/target-rust.ts | 188 ++++++++++++++++++++++++++++++++++++ src/target-ts.ts | 163 +++++++++++++++++++++++++++++++ test/check.ts | 1 + test/portable-targets.ts | 116 ++++++++++++++++++++++ 7 files changed, 901 insertions(+) create mode 100644 examples/calc.ts create mode 100644 src/emit-portable.ts create mode 100644 src/target-go.ts create mode 100644 src/target-rust.ts create mode 100644 src/target-ts.ts create mode 100644 test/portable-targets.ts diff --git a/examples/calc.ts b/examples/calc.ts new file mode 100644 index 0000000..2bfcfff --- /dev/null +++ b/examples/calc.ts @@ -0,0 +1,57 @@ +// A small Pratt grammar — the cross-language target proof for issue #6. 
+// +// Deliberately minimal but it exercises the constructs that make parsing-as- +// derivation non-trivial: token kinds (Ident/Number), literal keywords, sequences, +// backtracking alternation, the `*` quantifier (many), recursion (grouping), and — +// the crux — a Pratt expression engine with operator PRECEDENCE and associativity +// (`1 + 2 * 3` must group as `1 + (2 * 3)`), prefix unary, and left-associative +// binary operators. emitPortableParser derives a TS, Go, and Rust parser +// from THIS one definition; the cross-language gate proves all three produce the +// byte-identical CST the interpreter (createParser) does. +// +// No lexer lookahead (the full TS grammar's number tokens use `(?!…)`, which Go's +// RE2 and Rust's regex crate reject) — the portable lexer is a dependency-free +// char-class scanner, so the emitted Go/Rust compile offline with no regex engine. +import { + token, rule, defineGrammar, left, right, op, prefix, + seq, oneOf, range, star, many, +} from '../src/api.ts'; + +const digit = range('0', '9'); +const identStart = oneOf(range('a', 'z'), range('A', 'Z'), '_'); +const identPart = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_'); + +const Ident = token(seq(identStart, star(identPart)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(digit, star(digit)), { scope: 'constant.numeric' }); + +// Precedence ladder (earlier = looser): `+` `-` loosest, then `*` `/`, then prefix +// `-` tightest — so `1 + 2 * 3` is `1 + (2 * 3)` and `-a * b` is `(-a) * b`. 
+const calcPrec = [ + left('+', '-'), + left('*', '/'), + right(prefix('-')), +]; + +const Expr = rule(($) => [ + Number_, + Ident, + ['(', $, ')'], // grouping (recursion) + [prefix, $], // prefix unary minus (operators from the ladder) + [$, op, $], // binary infix, precedence from the ladder +]); + +const Stmt = rule(($) => [ + ['let', Ident, '=', Expr, ';'], + [Expr, ';'], +]); + +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'calc', + scopeName: 'source.calc', + tokens: { Ident, Number: Number_ }, + prec: calcPrec, + // findEntryRule = the LAST rule, so Program is the entry point. + rules: { Expr, Stmt, Program }, +}); diff --git a/src/emit-portable.ts b/src/emit-portable.ts new file mode 100644 index 0000000..900beec --- /dev/null +++ b/src/emit-portable.ts @@ -0,0 +1,173 @@ +// ── emit-portable ── +// +// The target-agnostic emitter (issue #6). `emitPortableParser(grammar, target)` derives +// a COMPLETE, self-contained parser in the target's language from the same CstGrammar the +// TS engine uses. It is the agnosticism proof: ONE analysis → ONE intermediate form (IR) +// → N language renderings, all producing the byte-identical CST the interpreter does. +// +// SHARED + target-agnostic (here): the grammar ANALYSIS (reused from grammar-analysis.ts) +// and `buildIR` — the parse plan as plain data (recursive-descent rules as alternative +// step-lists, the Pratt rule as NUD-atom / prefix / binary tables, the char-class lexer +// specs, the literal vocabulary, the entry rule). PER-TARGET (a Target): `render(ir)` — +// the language's lexer + CST runtime + the rendering of each IR node. Adding a language is +// implementing one Target; nothing here changes. +// +// SCOPE (the verifiable core): char-class tokens (`charClass` then `star(charClass)`), a +// recursive-descent + backtracking-alternation + `*` body, and a Pratt expression engine +// with operator PRECEDENCE/associativity + prefix unary + parenthesised grouping. 
The +// portable lexer is a dependency-free char scanner (no regex), so the emitted Go/Rust +// compile offline. Richer surface (mixfix/postfix LEDs, `sep`/`opt`, lexer lookahead, +// left-recursion beyond Pratt) is the documented next increment; buildIR THROWS on a +// construct it does not model rather than emit a wrong parser. +import type { CstGrammar, RuleExpr, TokenDecl, TokenPattern } from './types.ts'; +import { analyzeGrammar, findEntryRule } from './grammar-analysis.ts'; +import { collectLiterals, isKeywordLiteral } from './grammar-utils.ts'; + +// ── Intermediate representation (plain data; every Target renders THIS) ── + +export type CharRange = [number, number]; // inclusive char-code range +export type TokenSpec = { name: string; first: CharRange[]; cont: CharRange[] }; + +export type Step = + | { t: 'lit'; value: string; ttype: '$keyword' | '$punct' } // match a literal by text + | { t: 'tok'; name: string } // match a token kind + | { t: 'rule'; name: string } // call a rule, append its node + | { t: 'star'; step: Step }; // repeat the inner step 0+ times +export type Alt = Step[]; + +export type RdRule = { kind: 'rd'; name: string; alts: Alt[] }; +export type PrattRule = { + kind: 'pratt'; + name: string; + atomToks: string[]; // NUD: a bare token (Number/Ident) wrapped in a node + group: { open: string; close: string } | null; // NUD: '(' Expr ')' + prefix: Array<{ op: string; rbp: number }>; // NUD: prefix op then operand parsed at rbp + binary: Array<{ op: string; lbp: number; rbp: number }>; // LED: infix op, bind iff lbp > minBp, rhs at rbp +}; +export type RuleIR = RdRule | PrattRule; + +export type ParserIR = { + grammarName: string; + entry: string; + tokens: TokenSpec[]; // named tokens, for the char scanner (tried in declaration order) + puncts: string[]; // punctuation literals, sorted longest-first (maximal munch) + rules: RuleIR[]; +}; + +export interface Target { + name: string; + ext: string; // emitted file extension (no dot) + 
render(ir: ParserIR): string; // the complete, compilable source +} + +export function emitPortableParser(grammar: CstGrammar, target: Target): string { + return target.render(buildIR(grammar)); +} + +// ── buildIR: grammar + analysis → the target-agnostic parse plan ── + +function buildIR(grammar: CstGrammar): ParserIR { + const a = analyzeGrammar(grammar); + const tokenNames = a.tokenNames; + + // Lexer token specs: each token must be `charClass` then `star(charClass)` (the portable + // scanner's shape). Anything else is out of the verifiable core → throw, don't mis-lex. + const tokens: TokenSpec[] = grammar.tokens.map((t) => { + const { first, cont } = charClassFirstCont(t); + return { name: t.name, first, cont }; + }); + + // Literal vocabulary, split keyword (alpha — lexed as an identifier, matched by text) vs + // punctuation (lexed as its own token). Puncts longest-first for maximal munch. + const lits = new Set(); + for (const r of grammar.rules) for (const l of collectLiterals(r.body)) lits.add(l); + for (const lv of grammar.precs) for (const o of lv.operators) lits.add(o.value); + const puncts = [...lits].filter((l) => !isKeywordLiteral(l)).sort((x, y) => y.length - x.length); + + const litTtype = (v: string): '$keyword' | '$punct' => (isKeywordLiteral(v) ? '$keyword' : '$punct'); + + const rules: RuleIR[] = grammar.rules.map((r) => { + if (a.prattRules.has(r.name)) return buildPratt(r.name, r.body, a); + return { kind: 'rd', name: r.name, alts: buildRdAlts(r.body) }; + }); + + function buildRdAlts(body: RuleExpr): Alt[] { + if (body.type === 'alt') return body.items.map(altSteps); + return [altSteps(body)]; + } + function altSteps(e: RuleExpr): Step[] { + if (e.type === 'seq') return e.items.flatMap(stepOf); + return stepOf(e); + } + function stepOf(e: RuleExpr): Step[] { + switch (e.type) { + case 'literal': return [{ t: 'lit', value: e.value, ttype: litTtype(e.value) }]; + case 'ref': return [tokenNames.has(e.name) ? 
{ t: 'tok', name: e.name } : { t: 'rule', name: e.name }]; + case 'quantifier': { + if (e.kind !== '*') throw new Error(`portable: quantifier '${e.kind}' not in the verifiable core (only '*')`); + const inner = stepOf(e.body); + if (inner.length !== 1) throw new Error('portable: `*` body must be a single step (a rule/token ref)'); + return [{ t: 'star', step: inner[0] }]; + } + case 'group': return altSteps(e.body); + default: throw new Error(`portable: rd construct '${e.type}' not in the verifiable core`); + } + } + + return { grammarName: grammar.name ?? 'grammar', entry: findEntryRule(grammar), tokens, puncts, rules }; +} + +// A Pratt rule's alternatives, classified into NUD atoms / grouping / prefix and LED binary. +// The binding powers come from the analysis (opTable/prefixOps), so precedence is single- +// sourced with the interpreter. +function buildPratt(name: string, body: RuleExpr, a: ReturnType): PrattRule { + const alts = body.type === 'alt' ? body.items : [body]; + const atomToks: string[] = []; + let group: { open: string; close: string } | null = null; + let sawPrefix = false; + let sawBinary = false; + for (const alt of alts) { + const items = alt.type === 'seq' ? 
alt.items : [alt]; + if (items.length === 1 && items[0].type === 'ref' && a.tokenNames.has(items[0].name)) { + atomToks.push(items[0].name); // [Token] + } else if (items.length === 3 && items[0].type === 'literal' && items[2].type === 'literal' + && items[1].type === 'ref' && items[1].name === name) { + group = { open: items[0].value, close: items[2].value }; // [ '(' $ ')' ] + } else if (items.length === 2 && items[0].type === 'prefix' && items[1].type === 'ref' && items[1].name === name) { + sawPrefix = true; // [ prefix $ ] + } else if (items.length === 3 && items[0].type === 'ref' && items[0].name === name + && items[1].type === 'op' && items[2].type === 'ref' && items[2].name === name) { + sawBinary = true; // [ $ op $ ] + } else { + throw new Error(`portable: Pratt alt shape not in the verifiable core (rule ${name})`); + } + } + const prefix = sawPrefix + ? [...a.prefixOps.entries()].map(([op, info]) => ({ op, rbp: info.rbp })) + : []; + const binary = sawBinary + ? [...a.opTable.entries()] + .filter(([, info]) => info.position === 'infix') + .map(([op, info]) => ({ op, lbp: info.lbp, rbp: info.rbp })) + : []; + return { kind: 'pratt', name, atomToks, group, prefix, binary }; +} + +// Extract a token's (first-char, continue-char) code ranges from a `charClass` then +// `star(charClass)` pattern. Throws for any other shape (out of the verifiable core). 
+function charClassFirstCont(t: TokenDecl): { first: CharRange[]; cont: CharRange[] } { + const p = t.pattern; + if (typeof p === 'string' || p.type !== 'seq' || p.items.length !== 2) throw new Error(`portable: token ${t.name} not [charClass, star(charClass)]`); + const head = p.items[0]; + const tail = p.items[1]; + if (typeof tail === 'string' || tail.type !== 'repeat' || tail.min !== 0) throw new Error(`portable: token ${t.name} tail is not star(charClass)`); + return { first: classRanges(head, t.name), cont: classRanges(tail.body, t.name) }; +} +function classRanges(p: TokenPattern, tok: string): CharRange[] { + if (typeof p === 'string' || p.type !== 'charClass' || p.negate) throw new Error(`portable: token ${tok} uses a non-positive char class`); + return p.items.map((it): CharRange => { + if (it.type === 'char') return [it.value.charCodeAt(0), it.value.charCodeAt(0)]; + if (it.type === 'range') return [it.from.charCodeAt(0), it.to.charCodeAt(0)]; + throw new Error(`portable: token ${tok} char-class item '${(it as { type: string }).type}' unsupported`); + }); +} diff --git a/src/target-go.ts b/src/target-go.ts new file mode 100644 index 0000000..bc81629 --- /dev/null +++ b/src/target-go.ts @@ -0,0 +1,203 @@ +// The Go Target for emit-portable. Renders the same language-agnostic ParserIR as tsTarget +// into a self-contained Go program (Go stdlib only — the char-class lexer is regex-free, so +// it compiles with no module dependencies). Its CST JSON is checked byte-for-byte against +// the interpreter, so `emitPortableParser(grammar, goTarget)` is a real, verified Go parser +// derived from the same grammar definition. +import type { ParserIR, RdRule, PrattRule, Step, CharRange, Target } from './emit-portable.ts'; + +const J = (v: unknown) => JSON.stringify(v); +const goStr = (s: string) => J(s); // Go and JSON string literals coincide for our ASCII vocab +const rangeCond = (v: string, rs: CharRange[]) => + rs.map(([lo, hi]) => (lo === hi ? 
`${v} == ${lo}` : `${v} >= ${lo} && ${v} <= ${hi}`)).join(' || '); + +function lexer(ir: ParserIR): string { + const cases = ir.tokens.map((t) => `\t\tif ${rangeCond('c', t.first)} { +\t\t\te := pos + 1 +\t\t\tfor e < n { cc := int(src[e]); if !(${rangeCond('cc', t.cont)}) { break }; e++ } +\t\t\ttoks = append(toks, Tok{${goStr(t.name)}, src[pos:e], pos, e}); pos = e; continue +\t\t}`).join('\n'); + const punctChecks = ir.puncts.map((p) => + `\t\tif strings.HasPrefix(src[pos:], ${goStr(p)}) { toks = append(toks, Tok{"", ${goStr(p)}, pos, pos + ${p.length}}); pos += ${p.length}; continue }`).join('\n'); + return `func lex(src string) []Tok { +\ttoks := []Tok{} +\tn := len(src) +\tpos := 0 +\tfor pos < n { +\t\tc := int(src[pos]) +\t\tif c == 32 || c == 9 || c == 10 || c == 13 { pos++; continue } +${cases} +${punctChecks} +\t\tpanic(fmt.Sprintf("lex error at %d", pos)) +\t} +\treturn toks +}`; +} + +function rdRule(r: RdRule): string { + const alt = (steps: Step[]) => { + const conds = steps.map(stepCond).join(' && '); + return `\t{ kids := []*Cst{}; if ${conds} { return branch(${goStr(r.name)}, kids, save) }; pos = save }`; + }; + return `func parse${r.name}() *Cst { +\tsave := pos +${r.alts.map(alt).join('\n')} +\treturn nil +}`; +} +function stepCond(s: Step): string { + switch (s.t) { + case 'lit': return `matchLit(${goStr(s.value)}, ${goStr(s.ttype)}, &kids)`; + case 'tok': return `matchTok(${goStr(s.name)}, &kids)`; + case 'rule': return `callRule(parse${s.name}, &kids)`; + case 'star': return `star(func() bool { return ${stepCond(s.step)} }, &kids)`; + } +} + +function prattRule(r: PrattRule): string { + const bin = r.binary.map((b) => `${goStr(b.op)}: {${b.lbp}, ${b.rbp}}`).join(', '); + const pre = r.prefix.map((p) => `${goStr(p.op)}: ${p.rbp}`).join(', '); + const atoms = r.atomToks.map((k) => `${goStr(k)}: true`).join(', '); + const g = r.group; + return `var ${r.name}BIN = map[string]bp{${bin}} +var ${r.name}PRE = map[string]int{${pre}} +var ${r.name}ATOM 
= map[string]bool{${atoms}} +func parse${r.name}() *Cst { return ${r.name}bp(0) } +func ${r.name}bp(minBp int) *Cst { +\tleft := ${r.name}nud() +\tif left == nil { return nil } +\tfor { +\t\tt := peek() +\t\tif t == nil { break } +\t\tinfo, ok := ${r.name}BIN[t.Text] +\t\tif !ok || info.lbp <= minBp { break } +\t\tledSave := pos +\t\tpos++ +\t\topLeaf := &Cst{IsLeaf: true, TokenType: "$operator", Offset: t.Off, End: t.End} +\t\trhs := ${r.name}bp(info.rbp) +\t\tif rhs == nil { pos = ledSave; break } +\t\tleft = &Cst{Rule: ${goStr(r.name)}, Children: []*Cst{left, opLeaf, rhs}, Offset: left.Offset, End: rhs.End} +\t} +\treturn left +} +func ${r.name}nud() *Cst { +\tt := peek() +\tif t == nil { return nil } +\tif ${r.name}ATOM[t.Kind] { +\t\tpos++ +\t\treturn &Cst{Rule: ${goStr(r.name)}, Children: []*Cst{{IsLeaf: true, TokenType: t.Kind, Offset: t.Off, End: t.End}}, Offset: t.Off, End: t.End} +\t} +${g ? `\tif t.Text == ${goStr(g.open)} { +\t\tsave := pos; pos++ +\t\tinner := ${r.name}bp(0) +\t\tc := peek() +\t\tif inner == nil || c == nil || c.Text != ${goStr(g.close)} { pos = save; return nil } +\t\tpos++ +\t\treturn &Cst{Rule: ${goStr(r.name)}, Children: []*Cst{{IsLeaf: true, TokenType: "$punct", Offset: t.Off, End: t.End}, inner, {IsLeaf: true, TokenType: "$punct", Offset: c.Off, End: c.End}}, Offset: t.Off, End: c.End} +\t}` : ''} +\tif pbp, ok := ${r.name}PRE[t.Text]; ok { +\t\tsave := pos; pos++ +\t\topLeaf := &Cst{IsLeaf: true, TokenType: "$operator", Offset: t.Off, End: t.End} +\t\toperand := ${r.name}bp(pbp) +\t\tif operand == nil { pos = save; return nil } +\t\treturn &Cst{Rule: ${goStr(r.name)}, Children: []*Cst{opLeaf, operand}, Offset: t.Off, End: operand.End} +\t} +\treturn nil +}`; +} + +export const goTarget: Target = { + name: 'go', + ext: 'go', + render(ir: ParserIR): string { + const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? 
prattRule(r) : rdRule(r))).join('\n\n'); + return `// GENERATED by emit-portable.ts (goTarget) — parser for grammar "${ir.grammarName}". +package main + +import ( +\t"fmt" +\t"io" +\t"os" +\t"strings" +) + +type Tok struct { +\tKind, Text string +\tOff, End int +} +type Cst struct { +\tRule string +\tChildren []*Cst +\tIsLeaf bool +\tTokenType string +\tOffset int +\tEnd int +} +type bp struct{ lbp, rbp int } + +${lexer(ir)} + +var toks []Tok +var pos int + +func peek() *Tok { +\tif pos < len(toks) { return &toks[pos] } +\treturn nil +} +func branch(rule string, kids []*Cst, save int) *Cst { +\toffset := 0 +\tif len(kids) > 0 { offset = kids[0].Offset } else if save < len(toks) { offset = toks[save].Off } else if len(toks) > 0 { offset = toks[len(toks)-1].End } +\tend := offset +\tif len(kids) > 0 { end = kids[len(kids)-1].End } +\treturn &Cst{Rule: rule, Children: kids, Offset: offset, End: end} +} +func matchLit(value, ttype string, kids *[]*Cst) bool { +\tt := peek() +\tif t == nil || t.Text != value { return false } +\t*kids = append(*kids, &Cst{IsLeaf: true, TokenType: ttype, Offset: t.Off, End: t.End}); pos++; return true +} +func matchTok(name string, kids *[]*Cst) bool { +\tt := peek() +\tif t == nil || t.Kind != name { return false } +\t*kids = append(*kids, &Cst{IsLeaf: true, TokenType: name, Offset: t.Off, End: t.End}); pos++; return true +} +func callRule(fn func() *Cst, kids *[]*Cst) bool { +\tn := fn() +\tif n == nil { return false } +\t*kids = append(*kids, n); return true +} +func star(once func() bool, kids *[]*Cst) bool { +\tfor { sp := pos; before := len(*kids); if !once() { pos = sp; *kids = (*kids)[:before]; break } } +\treturn true +} + +${ruleFns} + +func writeJSON(c *Cst, b *strings.Builder) { +\tif c.IsLeaf { +\t\tfmt.Fprintf(b, "{\\"tokenType\\":%q,\\"offset\\":%d,\\"end\\":%d}", c.TokenType, c.Offset, c.End) +\t\treturn +\t} +\tfmt.Fprintf(b, "{\\"rule\\":%q,\\"children\\":[", c.Rule) +\tfor i, k := range c.Children { +\t\tif i > 0 { 
b.WriteByte(',') } +\t\twriteJSON(k, b) +\t} +\tfmt.Fprintf(b, "],\\"offset\\":%d,\\"end\\":%d}", c.Offset, c.End) +} + +func main() { +\tdata, _ := io.ReadAll(os.Stdin) +\ttoks = lex(string(data)) +\tpos = 0 +\troot := parse${ir.entry}() +\tif root == nil || pos != len(toks) { +\t\tfmt.Fprintf(os.Stderr, "parse error (pos %d/%d)\\n", pos, len(toks)) +\t\tos.Exit(1) +\t} +\tvar b strings.Builder +\twriteJSON(root, &b) +\tos.Stdout.WriteString(b.String()) +} +`; + }, +}; diff --git a/src/target-rust.ts b/src/target-rust.ts new file mode 100644 index 0000000..726ff1a --- /dev/null +++ b/src/target-rust.ts @@ -0,0 +1,188 @@ +// The Rust Target for emit-portable. Renders the same language-agnostic ParserIR as +// tsTarget/goTarget into a self-contained Rust program (no external crates — the char-class +// lexer is regex-free, so it compiles with rustc alone, no Cargo/network). Its CST JSON is +// checked byte-for-byte against the interpreter, so `emitPortableParser(grammar, rustTarget)` +// is a real, verified Rust parser derived from the same grammar definition. +import type { ParserIR, RdRule, PrattRule, Step, CharRange, Target } from './emit-portable.ts'; + +const J = (v: unknown) => JSON.stringify(v); +const rsStr = (s: string) => J(s); // Rust and JSON string literals coincide for our ASCII vocab +const rangeCond = (v: string, rs: CharRange[]) => + rs.map(([lo, hi]) => (lo === hi ? 
`${v} == ${lo}` : `(${lo}..=${hi}).contains(&${v})`)).join(' || '); + +function lexer(ir: ParserIR): string { + const cases = ir.tokens.map((t) => ` if ${rangeCond('c', t.first)} { + let mut e = pos + 1; + while e < n { let cc = b[e] as u32; if !(${rangeCond('cc', t.cont)}) { break } e += 1; } + toks.push(Tok { kind: ${rsStr(t.name)}.to_string(), text: src[pos..e].to_string(), off: pos, end: e }); pos = e; continue; + }`).join('\n'); + const punctChecks = ir.puncts.map((p) => + ` if src[pos..].starts_with(${rsStr(p)}) { toks.push(Tok { kind: String::new(), text: ${rsStr(p)}.to_string(), off: pos, end: pos + ${p.length} }); pos += ${p.length}; continue; }`).join('\n'); + return `fn lex(src: &str) -> Vec { + let b = src.as_bytes(); + let n = b.len(); + let mut toks: Vec = Vec::new(); + let mut pos = 0usize; + while pos < n { + let c = b[pos] as u32; + if c == 32 || c == 9 || c == 10 || c == 13 { pos += 1; continue; } +${cases} +${punctChecks} + panic!("lex error at {}", pos); + } + toks +}`; +} + +function rdRule(r: RdRule): string { + const alt = (steps: Step[]) => { + const conds = steps.map(stepCond).join(' && '); + return ` { let mut kids: Vec = Vec::new(); if ${conds} { return Some(self.branch(${rsStr(r.name)}, kids, save)); } self.pos = save; }`; + }; + return ` fn parse_${r.name}(&mut self) -> Option { + let save = self.pos; +${r.alts.map(alt).join('\n')} + None + }`; +} +function stepCond(s: Step): string { + switch (s.t) { + case 'lit': return `self.match_lit(${rsStr(s.value)}, ${rsStr(s.ttype)}, &mut kids)`; + case 'tok': return `self.match_tok(${rsStr(s.name)}, &mut kids)`; + case 'rule': return `self.call_rule(Parser::parse_${s.name}, &mut kids)`; + case 'star': return `self.star(|p, k| ${starInner(s.step)}, &mut kids)`; + } +} +function starInner(s: Step): string { + switch (s.t) { + case 'lit': return `p.match_lit(${rsStr(s.value)}, ${rsStr(s.ttype)}, k)`; + case 'tok': return `p.match_tok(${rsStr(s.name)}, k)`; + case 'rule': return 
`p.call_rule(Parser::parse_${s.name}, k)`; + case 'star': throw new Error('portable: nested star unsupported'); + } +} + +function prattRule(r: PrattRule): string { + const binArms = r.binary.map((b) => `${rsStr(b.op)} => Some((${b.lbp}, ${b.rbp}))`).join(', '); + const preArms = r.prefix.map((p) => `${rsStr(p.op)} => Some(${p.rbp})`).join(', '); + const atomArm = r.atomToks.map(rsStr).join(' | '); + const g = r.group; + return ` fn parse_${r.name}(&mut self) -> Option { self.${r.name}_bp(0) } + fn ${r.name}_bin(op: &str) -> Option<(i64, i64)> { match op { ${binArms}${binArms ? ', ' : ''}_ => None } } + fn ${r.name}_pre(op: &str) -> Option { match op { ${preArms}${preArms ? ', ' : ''}_ => None } } + fn ${r.name}_atom(kind: &str) -> bool { matches!(kind, ${atomArm || '""'}) } + fn ${r.name}_bp(&mut self, min_bp: i64) -> Option { + let mut left = self.${r.name}_nud()?; + loop { + let t = match self.peek() { Some(t) => t, None => break }; + let (lbp, rbp) = match Parser::${r.name}_bin(&t.text) { Some(x) => x, None => break }; + if lbp <= min_bp { break; } + let led_save = self.pos; + self.pos += 1; + let op_leaf = Cst::leaf("$operator", t.off, t.end); + let rhs = match self.${r.name}_bp(rbp) { Some(r) => r, None => { self.pos = led_save; break; } }; + let (off, end) = (left.offset, rhs.end); + left = Cst::node(${rsStr(r.name)}, vec![left, op_leaf, rhs], off, end); + } + Some(left) + } + fn ${r.name}_nud(&mut self) -> Option { + let t = self.peek()?; + if Parser::${r.name}_atom(&t.kind) { + self.pos += 1; + return Some(Cst::node(${rsStr(r.name)}, vec![Cst::leaf(&t.kind, t.off, t.end)], t.off, t.end)); + } +${g ? 
` if t.text == ${rsStr(g.open)} { + let save = self.pos; self.pos += 1; + let inner = self.${r.name}_bp(0); + let c = self.peek(); + match (inner, c) { + (Some(inner), Some(c)) if c.text == ${rsStr(g.close)} => { + self.pos += 1; + let (off, end) = (t.off, c.end); + return Some(Cst::node(${rsStr(r.name)}, vec![Cst::leaf("$punct", t.off, t.end), inner, Cst::leaf("$punct", c.off, c.end)], off, end)); + } + _ => { self.pos = save; return None; } + } + }` : ''} + if let Some(pbp) = Parser::${r.name}_pre(&t.text) { + let save = self.pos; self.pos += 1; + let op_leaf = Cst::leaf("$operator", t.off, t.end); + match self.${r.name}_bp(pbp) { + Some(operand) => { let (off, end) = (t.off, operand.end); return Some(Cst::node(${rsStr(r.name)}, vec![op_leaf, operand], off, end)); } + None => { self.pos = save; return None; } + } + } + None + }`; +} + +export const rustTarget: Target = { + name: 'rust', + ext: 'rs', + render(ir: ParserIR): string { + const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? prattRule(r) : rdRule(r))).join('\n\n'); + return `// GENERATED by emit-portable.ts (rustTarget) — parser for grammar "${ir.grammarName}". 
+#![allow(non_snake_case)] +use std::io::Read; + +#[derive(Clone)] +struct Tok { kind: String, text: String, off: usize, end: usize } + +struct Cst { rule: String, children: Vec, is_leaf: bool, token_type: String, offset: usize, end: usize } +impl Cst { + fn leaf(tt: &str, off: usize, end: usize) -> Cst { Cst { rule: String::new(), children: Vec::new(), is_leaf: true, token_type: tt.to_string(), offset: off, end } } + fn node(rule: &str, children: Vec, offset: usize, end: usize) -> Cst { Cst { rule: rule.to_string(), children, is_leaf: false, token_type: String::new(), offset, end } } +} + +${lexer(ir)} + +struct Parser { toks: Vec, pos: usize } +impl Parser { + fn peek(&self) -> Option { if self.pos < self.toks.len() { Some(self.toks[self.pos].clone()) } else { None } } + fn branch(&self, rule: &str, kids: Vec, save: usize) -> Cst { + let offset = if !kids.is_empty() { kids[0].offset } else if save < self.toks.len() { self.toks[save].off } else if !self.toks.is_empty() { self.toks[self.toks.len() - 1].end } else { 0 }; + let end = if !kids.is_empty() { kids[kids.len() - 1].end } else { offset }; + Cst::node(rule, kids, offset, end) + } + fn match_lit(&mut self, value: &str, ttype: &str, kids: &mut Vec) -> bool { + match self.peek() { Some(t) if t.text == value => { kids.push(Cst::leaf(ttype, t.off, t.end)); self.pos += 1; true } _ => false } + } + fn match_tok(&mut self, name: &str, kids: &mut Vec) -> bool { + match self.peek() { Some(t) if t.kind == name => { kids.push(Cst::leaf(name, t.off, t.end)); self.pos += 1; true } _ => false } + } + fn call_rule(&mut self, f: fn(&mut Parser) -> Option, kids: &mut Vec) -> bool { + match f(self) { Some(n) => { kids.push(n); true } None => false } + } + fn star(&mut self, once: fn(&mut Parser, &mut Vec) -> bool, kids: &mut Vec) -> bool { + loop { let sp = self.pos; let before = kids.len(); if !once(self, kids) { self.pos = sp; kids.truncate(before); break; } } + true + } + +${ruleFns} +} + +fn write_json(c: &Cst, out: &mut 
String) { + if c.is_leaf { + out.push_str(&format!("{{\\"tokenType\\":\\"{}\\",\\"offset\\":{},\\"end\\":{}}}", c.token_type, c.offset, c.end)); + return; + } + out.push_str(&format!("{{\\"rule\\":\\"{}\\",\\"children\\":[", c.rule)); + for (i, k) in c.children.iter().enumerate() { if i > 0 { out.push(','); } write_json(k, out); } + out.push_str(&format!("],\\"offset\\":{},\\"end\\":{}}}", c.offset, c.end)); +} + +fn main() { + let mut src = String::new(); + std::io::stdin().read_to_string(&mut src).unwrap(); + let toks = lex(&src); + let n = toks.len(); + let mut p = Parser { toks, pos: 0 }; + match p.parse_${ir.entry}() { + Some(root) if p.pos == n => { let mut out = String::new(); write_json(&root, &mut out); print!("{}", out); } + _ => { eprintln!("parse error (pos {}/{})", p.pos, n); std::process::exit(1); } + } +} +`; + }, +}; diff --git a/src/target-ts.ts b/src/target-ts.ts new file mode 100644 index 0000000..ab37220 --- /dev/null +++ b/src/target-ts.ts @@ -0,0 +1,163 @@ +// The TypeScript Target for emit-portable. Renders the language-agnostic ParserIR into a +// self-contained TS parser: a char-class lexer, a backtracking recursive-descent core, a +// Pratt expression engine, and a CST→JSON printer over stdin. It is the reference rendering +// — its CST is checked byte-for-byte against the interpreter (createParser), so a divergence +// in the portable logic shows up here before Go/Rust are even compiled. +import type { ParserIR, RdRule, PrattRule, Step, CharRange, Target } from './emit-portable.ts'; + +const J = (v: unknown) => JSON.stringify(v); +const rangeCond = (v: string, rs: CharRange[]) => + rs.map(([lo, hi]) => (lo === hi ? 
`${v} === ${lo}` : `${v} >= ${lo} && ${v} <= ${hi}`)).join(' || '); + +function lexer(ir: ParserIR): string { + const cases = ir.tokens.map((t) => ` if (${rangeCond('c', t.first)}) { + let e = pos + 1; + while (e < n) { const cc = src.charCodeAt(e); if (!(${rangeCond('cc', t.cont)})) break; e++; } + toks.push({ kind: ${J(t.name)}, text: src.slice(pos, e), off: pos, end: e }); pos = e; continue; + }`).join('\n'); + const punctChecks = ir.puncts.map((p) => + ` if (src.startsWith(${J(p)}, pos)) { toks.push({ kind: '', text: ${J(p)}, off: pos, end: pos + ${p.length} }); pos += ${p.length}; continue; }`).join('\n'); + return `function lex(src: string): Tok[] { + const toks: Tok[] = []; + const n = src.length; + let pos = 0; + while (pos < n) { + const c = src.charCodeAt(pos); + if (c === 32 || c === 9 || c === 10 || c === 13) { pos++; continue; } +${cases} +${punctChecks} + throw new Error('lex error at ' + pos + ': ' + JSON.stringify(src[pos])); + } + return toks; +}`; +} + +function rdRule(r: RdRule): string { + const alt = (steps: Step[]) => { + const conds = steps.map(stepCond).join(' && '); + return ` { const kids: Cst[] = []; if (${conds}) return branch(${J(r.name)}, kids, save); pos = save; }`; + }; + return `function parse${r.name}(): Node | null { + const save = pos; +${r.alts.map(alt).join('\n')} + return null; +}`; +} +function stepCond(s: Step): string { + switch (s.t) { + case 'lit': return `matchLit(${J(s.value)}, ${J(s.ttype)}, kids)`; + case 'tok': return `matchTok(${J(s.name)}, kids)`; + case 'rule': return `callRule(parse${s.name}, kids)`; + case 'star': return `star(() => ${stepCond(s.step)}, kids)`; + } +} + +function prattRule(r: PrattRule): string { + const BIN = `{ ${r.binary.map((b) => `${J(b.op)}: { lbp: ${b.lbp}, rbp: ${b.rbp} }`).join(', ')} }`; + const PRE = `{ ${r.prefix.map((p) => `${J(p.op)}: ${p.rbp}`).join(', ')} }`; + const atomSet = `new Set([${r.atomToks.map(J).join(', ')}])`; + const group = r.group; + return `const ${r.name}_BIN: 
Record = ${BIN}; +const ${r.name}_PRE: Record = ${PRE}; +const ${r.name}_ATOM = ${atomSet}; +function parse${r.name}(): Node | null { return ${r.name}_bp(0); } +function ${r.name}_bp(minBp: number): Node | null { + let left = ${r.name}_nud(); + if (left === null) return null; + for (;;) { + const t = peek(); + if (t === null) break; + const info = ${r.name}_BIN[t.text]; + if (info === undefined || info.lbp <= minBp) break; + const ledSave = pos; + pos++; + const opLeaf: Leaf = { tokenType: '$operator', offset: t.off, end: t.end }; + const rhs = ${r.name}_bp(info.rbp); + if (rhs === null) { pos = ledSave; break; } + left = { rule: ${J(r.name)}, children: [left, opLeaf, rhs], offset: left.offset, end: rhs.end }; + } + return left; +} +function ${r.name}_nud(): Node | null { + const t = peek(); + if (t === null) return null; + if (${r.name}_ATOM.has(t.kind)) { pos++; return { rule: ${J(r.name)}, children: [{ tokenType: t.kind, offset: t.off, end: t.end }], offset: t.off, end: t.end }; } +${group ? ` if (t.text === ${J(group.open)}) { + const save = pos; pos++; + const inner = ${r.name}_bp(0); + const c = peek(); + if (inner === null || c === null || c.text !== ${J(group.close)}) { pos = save; return null; } + pos++; + return { rule: ${J(r.name)}, children: [{ tokenType: '$punct', offset: t.off, end: t.end }, inner, { tokenType: '$punct', offset: c.off, end: c.end }], offset: t.off, end: c.end }; + }` : ''} + const pbp = ${r.name}_PRE[t.text]; + if (pbp !== undefined) { + const save = pos; pos++; + const opLeaf: Leaf = { tokenType: '$operator', offset: t.off, end: t.end }; + const operand = ${r.name}_bp(pbp); + if (operand === null) { pos = save; return null; } + return { rule: ${J(r.name)}, children: [opLeaf, operand], offset: t.off, end: operand.end }; + } + return null; +}`; +} + +export const tsTarget: Target = { + name: 'typescript', + ext: 'ts', + render(ir: ParserIR): string { + const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? 
prattRule(r) : rdRule(r))).join('\n\n'); + return `// GENERATED by emit-portable.ts (tsTarget) — parser for grammar "${ir.grammarName}". +import { readFileSync } from 'node:fs'; + +type Tok = { kind: string; text: string; off: number; end: number }; +type Leaf = { tokenType: string; offset: number; end: number }; +type Node = { rule: string; children: Cst[]; offset: number; end: number }; +type Cst = Node | Leaf; + +${lexer(ir)} + +let toks: Tok[] = []; +let pos = 0; +function peek(): Tok | null { return pos < toks.length ? toks[pos] : null; } +function curOff(): number { return pos < toks.length ? toks[pos].off : (toks.length > 0 ? toks[toks.length - 1].end : 0); } +function branch(rule: string, kids: Cst[], save: number): Node { + const offset = kids.length > 0 ? kids[0].offset : (save < toks.length ? toks[save].off : curOff()); + const end = kids.length > 0 ? kids[kids.length - 1].end : offset; + return { rule, children: kids, offset, end }; +} +function matchLit(value: string, ttype: string, kids: Cst[]): boolean { + const t = peek(); + if (t === null || t.text !== value) return false; + kids.push({ tokenType: ttype, offset: t.off, end: t.end }); pos++; return true; +} +function matchTok(name: string, kids: Cst[]): boolean { + const t = peek(); + if (t === null || t.kind !== name) return false; + kids.push({ tokenType: name, offset: t.off, end: t.end }); pos++; return true; +} +function callRule(fn: () => Node | null, kids: Cst[]): boolean { + const n = fn(); + if (n === null) return false; + kids.push(n); return true; +} +function star(once: () => boolean, kids: Cst[]): boolean { + for (;;) { const sp = pos; const before = kids.length; if (!once()) { pos = sp; kids.length = before; break; } } + return true; +} + +${ruleFns} + +function offsetEnd(n: Cst): number { return n.end; } +const src = readFileSync(0, 'utf8'); +toks = lex(src); +pos = 0; +const root = parse${ir.entry}(); +if (root === null || pos !== toks.length) { + process.stderr.write('parse error 
(pos ' + pos + '/' + toks.length + ')\\n'); + process.exit(1); +} +process.stdout.write(JSON.stringify(root)); +`; + }, +}; diff --git a/test/check.ts b/test/check.ts index cf61a77..97b5a27 100644 --- a/test/check.ts +++ b/test/check.ts @@ -27,6 +27,7 @@ const GATES: Gate[] = [ { group: 'emit-parity', name: 'emit-reject-messages', args: ['test/emit-reject-messages.ts'] }, { group: 'emit-parity', name: 'emit-lexer-verify', args: ['test/emit-lexer-verify.ts'] }, { group: 'emit-parity', name: 'emit-tsc-gate', args: ['test/emit-tsc-gate.ts'] }, + { group: 'emit-parity', name: 'portable-targets', args: ['test/portable-targets.ts'] }, { group: 'core', name: 'multi-doc', args: ['test/multi-doc.ts'] }, { group: 'core', name: 'recovery', args: ['test/recovery.ts'] }, { group: 'core', name: 'incremental-grammars', args: ['test/incremental-grammars.ts'] }, diff --git a/test/portable-targets.ts b/test/portable-targets.ts new file mode 100644 index 0000000..5d4e3b0 --- /dev/null +++ b/test/portable-targets.ts @@ -0,0 +1,116 @@ +// Gate: the TARGET-AGNOSTIC emitter (issue #6) — `emitPortableParser(grammar, target)` +// derives a parser in EACH target language that produces the byte-identical CST the +// interpreter (createParser) does. This is the agnosticism proof by EXECUTION: the same +// examples/calc.ts grammar is rendered to TypeScript, Go, and Rust; the Go and Rust +// sources are COMPILED and RUN, and every parser's CST output is compared, node-for-node, +// against the createParser oracle over an adversarial corpus (operator precedence / +// associativity, prefix chains, nested grouping, multi-statement programs, and the empty +// program), plus reject-parity on malformed input. +// +// Go/Rust toolchains are optional: a missing `go` or `rustc` is logged and skipped (the +// TS rendering, which needs only node, always runs) — the same graceful-degrade pattern +// the external-corpus gates use, so this stays green on a machine without them. 
+import { execFileSync } from 'node:child_process'; +import { mkdirSync, rmSync, writeFileSync } from 'node:fs'; +import { createParser } from '../src/gen-parser.ts'; +import { emitPortableParser } from '../src/emit-portable.ts'; +import { tsTarget } from '../src/target-ts.ts'; +import { goTarget } from '../src/target-go.ts'; +import { rustTarget } from '../src/target-rust.ts'; + +const grammar = (await import('../examples/calc.ts')).default; +const oracle = createParser(grammar); + +// Accepted inputs — each must parse to the SAME CST in every language. +const ACCEPT = [ + '1;', 'a;', '', // atoms + the empty program + '1 + 2 * 3;', '1 * 2 + 3;', // precedence both directions + '1 - 2 - 3;', 'a / b / c;', '1 + 2 + 3 + 4;', // left-associativity + '-a;', '-(-a);', '- - a;', // prefix + prefix chains + '-a * b;', '-a + b * c;', '-(a + b) * c;', // prefix vs infix vs grouping + '(1);', '((a));', '(1 + 2) * (3 - 4);', // nested grouping + 'a * b + c * d - e / f;', // mixed precedence ladder + 'let x = 1; let y = x + 2 * x; (y);', // multi-statement program + 'let z = -(a * b) / (c - -d);', 'foo; bar; baz;', +]; +// Malformed inputs — every parser must REJECT (the oracle throws; the emitted parsers exit 1). +const REJECT = ['1 +;', '(1;', '1 2;', 'let = 1;', ') ;', '* a;', 'let x 1;']; + +type Json = unknown; +const sortKeys = (o: Json): Json => + Array.isArray(o) ? o.map(sortKeys) + : (o && typeof o === 'object') ? 
Object.fromEntries(Object.keys(o as object).sort().map((k) => [k, sortKeys((o as Record)[k])])) + : o; +const canon = (o: Json) => JSON.stringify(sortKeys(o)); + +function oracleOutcome(src: string): { ok: true; cst: string } | { ok: false } { + try { return { ok: true, cst: canon(oracle.parse(src)) }; } + catch { return { ok: false }; } +} + +const TMP = '/tmp/portable-targets'; +rmSync(TMP, { recursive: true, force: true }); +mkdirSync(TMP, { recursive: true }); + +function have(cmd: string, args: string[]): boolean { + try { execFileSync(cmd, args, { stdio: 'pipe' }); return true; } catch { return false; } +} + +// A runnable target: writes its source, (optionally) compiles, and returns a `run(src)->{ok,cst?}`. +type Runner = { label: string; run: (src: string) => { ok: true; cst: string } | { ok: false } }; + +function tsRunner(): Runner { + const f = `${TMP}/calc.ts`; + writeFileSync(f, emitPortableParser(grammar, tsTarget)); + return { label: 'typescript', run: (src) => runProc('node', [f], src) }; +} +function goRunner(): Runner | null { + if (!have('go', ['version'])) { console.log(' go: (toolchain absent — skipped)'); return null; } + const dir = `${TMP}/go`; mkdirSync(dir, { recursive: true }); + writeFileSync(`${dir}/main.go`, emitPortableParser(grammar, goTarget)); + writeFileSync(`${dir}/go.mod`, 'module calc\n\ngo 1.21\n'); + execFileSync('go', ['build', '-o', `${dir}/calc`, '.'], { cwd: dir, stdio: 'pipe' }); + return { label: 'go', run: (src) => runProc(`${dir}/calc`, [], src) }; +} +function rustRunner(): Runner | null { + if (!have('rustc', ['--version'])) { console.log(' rust: (toolchain absent — skipped)'); return null; } + const dir = `${TMP}/rust`; mkdirSync(dir, { recursive: true }); + const f = `${dir}/main.rs`; + writeFileSync(f, emitPortableParser(grammar, rustTarget)); + execFileSync('rustc', ['-O', f, '-o', `${dir}/calc`], { stdio: 'pipe' }); + return { label: 'rust', run: (src) => runProc(`${dir}/calc`, [], src) }; +} +function 
runProc(cmd: string, args: string[], src: string): { ok: true; cst: string } | { ok: false } { + try { return { ok: true, cst: canon(JSON.parse(execFileSync(cmd, args, { input: src, stdio: ['pipe', 'pipe', 'pipe'] }).toString())) }; } + catch { return { ok: false }; } +} + +const runners: Runner[] = [tsRunner(), goRunner(), rustRunner()].filter((r): r is Runner => r !== null); + +let failures = 0; +for (const r of runners) { + let acc = 0, rej = 0; + for (const src of ACCEPT) { + const want = oracleOutcome(src); + const got = r.run(src); + if (want.ok && got.ok && want.cst === got.cst) { acc++; continue; } + failures++; + console.log(` ${r.label}: ACCEPT mismatch on ${JSON.stringify(src)}`); + if (want.ok && got.ok) { console.log(` want ${want.cst.slice(0, 140)}`); console.log(` got ${got.cst.slice(0, 140)}`); } + else console.log(` want.ok=${want.ok} got.ok=${got.ok}`); + } + for (const src of REJECT) { + const want = oracleOutcome(src); + const got = r.run(src); + if (!want.ok && !got.ok) { rej++; continue; } + failures++; + console.log(` ${r.label}: REJECT mismatch on ${JSON.stringify(src)} (oracle ok=${want.ok}, ${r.label} ok=${got.ok})`); + } + console.log(` ${r.label}: ${acc}/${ACCEPT.length} accept ≡ oracle · ${rej}/${REJECT.length} reject ≡ oracle`); +} + +if (failures > 0) { + console.error(`\n✗ portable targets diverge from the interpreter (${failures} case(s))`); + process.exit(1); +} +console.log(`\n✓ ${runners.map((r) => r.label).join(' + ')} parsers derived from one grammar ≡ interpreter CST (compiled & run)`); From 3059804a49879d2f799e02ff4ce773a46c958574 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Sun, 21 Jun 2026 18:47:46 +0800 Subject: [PATCH 04/27] Remove gen-ast-types: the typed-CST generator had no load-bearing consumer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `gen-ast-types.ts` emitted `.cst-types.ts` (discriminated-union typing of the CST). 
Those artifacts are gitignored build outputs, and nothing depended on them: the only consumer was a non-gated smoke test, and gen-cst-match's `importFrom` parameter (the cst-types path) was never used in its body — so the gated cst-match subsystem is fully independent of cst-types. Removed the generator and its smoke test, dropped the cst-types emit + the dead `importFrom` parameter from the gen pipeline, and cleaned the references (.gitignore/.gitattributes/CI comment/README diagram/emit-corpus filter). Verified: `npm run gen` emits no cst-types and the committed artifacts stay in sync; src type-checks; cst-match-totality 31356/0 and the full suite 42/42. --- .gitattributes | 4 +- .github/workflows/ci.yml | 6 +- .gitignore | 3 +- README.md | 3 +- src/cli.ts | 8 +- src/gen-ast-types.ts | 277 --------------------------------------- src/gen-cst-match.ts | 5 +- test/ast-types-smoke.ts | 184 -------------------------- test/emit-corpus.ts | 8 +- 9 files changed, 15 insertions(+), 483 deletions(-) delete mode 100644 src/gen-ast-types.ts delete mode 100644 test/ast-types-smoke.ts diff --git a/.gitattributes b/.gitattributes index eb61982..e141d46 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,8 +1,8 @@ # Generated artifacts (npm run gen) — committed for consumers, CI-gated for # staleness, collapsed in GitHub diffs. The grammar sources (*.ts at the repo # root) are the hand-written truth; everything below is derived from them. -# (*.cst-types.ts / *.cst-match.ts are generated too but NOT committed — see -# .gitignore; they regenerate locally and in CI before typecheck/gates.) +# (*.cst-match.ts is generated too but NOT committed — see .gitignore; +# it regenerates locally and in CI before typecheck/gates.) 
*.tmLanguage.json linguist-generated=true *.language-configuration.json linguist-generated=true *.monarch.json linguist-generated=true diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4816031..c710619 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,9 +29,9 @@ jobs: - run: npm ci - # Regenerate every grammar's artifacts FIRST: the uncommitted ones - # (*.cst-types.ts / *.cst-match.ts, gitignored) must exist before Typecheck - # and the gates, which import them. Then fail if any COMMITTED artifact + # Regenerate every grammar's artifacts FIRST: the uncommitted one + # (*.cst-match.ts, gitignored) must exist before Typecheck + # and the gates, which import it. Then fail if any COMMITTED artifact # drifts from the regenerated output (someone edited a grammar but forgot # to regenerate). Covers all grammars (sources at the repo root) + the # tree-sitter packages. diff --git a/.gitignore b/.gitignore index bb05bd4..72189b4 100644 --- a/.gitignore +++ b/.gitignore @@ -18,7 +18,6 @@ tree-sitter/*/src/node-types.json tree-sitter/*/src/tree_sitter/ tree-sitter/*/*.wasm -# Generated CST consumer artifacts (npm run gen) — derived from the grammar, not +# Generated CST consumer artifact (npm run gen) — derived from the grammar, not # committed: generate locally / in CI before typecheck and gates. 
-*.cst-types.ts *.cst-match.ts diff --git a/README.md b/README.md index ea56dd7..a8f69cf 100644 --- a/README.md +++ b/README.md @@ -375,8 +375,7 @@ typescript.ts one grammar (TypeScript combinator API) ├─ src/gen-tm.ts ───────────▶ typescript.tmLanguage.json (TextMate highlighter) ├─ src/gen-vscode-config.ts ▶ typescript.language-configuration.json (editor behavior) ├─ src/gen-treesitter.ts ───▶ tree-sitter/ (grammar.js + highlights.scm + scanner.c) - ├─ src/gen-monarch.ts ──────▶ typescript.monarch.json - └─ src/gen-ast-types.ts ────▶ typescript.cst-types.ts + └─ src/gen-monarch.ts ──────▶ typescript.monarch.json shared src/grammar-utils.ts structural helpers used across stages src/api.ts, types.ts the grammar's combinator + type surface diff --git a/src/cli.ts b/src/cli.ts index 9752e16..6a567df 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -4,7 +4,6 @@ import { generateTmLanguage, generateMarkupInjection, generateAliasGrammar, gene import { generateLanguageConfig } from './gen-vscode-config.ts'; import { generateTreeSitter } from './gen-treesitter.ts'; import { generateMonarch } from './gen-monarch.ts'; -import { generateAstTypes } from './gen-ast-types.ts'; import { generateCstMatch } from './gen-cst-match.ts'; import type { CstGrammar, RuleExpr } from './types.ts'; import { tokenPatternSource } from './token-pattern.ts'; @@ -115,11 +114,8 @@ emit(`tree-sitter/${langName}/package.json`, // Monaco Monarch tokenizer (markup-aware: emits a tag/text/raw-text state machine). emit(`${langName}.monarch.json`, JSON.stringify(generateMonarch(grammar), null, 2)); -// CST node types (TypeScript) — generic over rules, fine for markup too. -emit(`${langName}.cst-types.ts`, generateAstTypes(grammar)); - -// Per-arm CST destructurers (value-level sibling of the types above). -emit(`${langName}.cst-match.ts`, generateCstMatch(grammar, `./${langName}.cst-types.ts`)); +// Per-arm CST destructurers. 
+emit(`${langName}.cst-match.ts`, generateCstMatch(grammar)); function formatExpr(expr: RuleExpr): string { switch (expr.type) { diff --git a/src/gen-ast-types.ts b/src/gen-ast-types.ts deleted file mode 100644 index d76d124..0000000 --- a/src/gen-ast-types.ts +++ /dev/null @@ -1,277 +0,0 @@ -// Generate a TypeScript `.d.ts`-style source describing the *typed* CST that -// `createParser(grammar).parse()` (gen-parser.ts) produces for THIS grammar. -// -// The runtime CST is generic — `CstNode.rule` and `CstLeaf.tokenType` are both -// `string`. This generator specialises those `string`s into the actual set of -// rule names / token types the grammar can yield, so a consumer gets: -// * a discriminated union `Node` keyed on the `rule` literal (exhaustive switch) -// * a `TokenType` union for `CstLeaf.tokenType` -// * per-rule structural typing of `children` (which child kinds can appear) -// -// Everything is DERIVED from the grammar (CstGrammar) — nothing TypeScript- or -// language-specific is hardcoded, matching the engine's language-agnostic rule. -// Field NAMES are deliberately absent: the grammar has no labels on elements -// (e.g. `[$, '.', Ident]`), so children are typed positionally-by-kind, not as -// named accessors. See the note emitted into the output + the report. - -import type { CstGrammar, RuleExpr } from './types.ts'; -import { isKeywordLiteral } from './grammar-utils.ts'; - -// The synthetic leaf/node `tokenType`s the lexer + parser emit in addition to -// the grammar's declared token names. Kept in sync with gen-lexer.ts / gen-parser.ts -// (grep the literal `$...` strings there). `$template` is a *node* rule the parser -// builds for interpolated templates, but it surfaces in `CstChild` positions and -// as a `tokenType` is harmless to include; we also emit a `$template` Node below. 
-const SYNTHETIC_LEAF_TYPES = [ - '$keyword', // matchLiteral: keyword-shaped literal - '$punct', // matchLiteral: punctuation literal - '$operator', // Pratt: infix/prefix/postfix operator leaf - '$templateHead', // lexer: template up to first `${` - '$templateMiddle', // lexer: `}` … `${` - '$templateTail', // lexer: `}` … closing backtick -] as const; - -// `$template` is the synthetic *node* the parser emits for an interpolated -// template literal (gen-parser.ts parseTemplateExpr). -const SYNTHETIC_TEMPLATE_NODE = '$template'; - -/** A child element a node can contain: either a Node (by rule) or a Leaf (by token type). */ -type ChildKind = - | { kind: 'node'; rule: string } - | { kind: 'leaf'; tokenType: string }; - -/** - * Generate the typed-CST TypeScript source for `grammar`. - * Returns a self-contained module string (no imports) suitable for writing to a - * `.ts`/`.d.ts` file and `import`ing or type-checking. - */ -export function generateAstTypes(grammar: CstGrammar): string { - // The grammar's template token (if any): a ref to it can yield either a plain - // leaf of that token type OR a `$template` node (parseTemplateExpr) / a - // `$templateHead` leaf — mirror gen-parser's matchExpr 'ref' branch. - const templateTokenNames = new Set(grammar.tokens.filter(t => t.template).map(t => t.name)); - - // ── 1. Token-type union ── - // Declared token names + the synthetic leaf types the engine injects. - const leafTokenTypes = [ - ...grammar.tokens.map(t => t.name).sort(), - ...SYNTHETIC_LEAF_TYPES, - ]; - - // ── 2/3. Per-rule child kinds ── - // For each rule, the set of child element kinds a node of that rule can hold, - // collapsed across the rule body's structure. The parser flattens quantifiers, - // `sep`, `alt`, and `group` straight into the parent's `children` array, so a - // node's children are a *sequence drawn from* this set (not a fixed tuple). 
- const childKindsByRule = new Map(); - for (const rule of grammar.rules) { - childKindsByRule.set(rule.name, deriveChildKinds(rule.name, grammar)); - } - - // ── Emit ── - const lines: string[] = []; - lines.push('// AUTO-GENERATED by src/gen-ast-types.ts — do not edit by hand.'); - lines.push('// Typed CST for the parser output of this grammar.'); - lines.push('//'); - lines.push('// LIMITATION — no named fields: the grammar carries no labels on rule'); - lines.push('// elements (e.g. `[$, \'.\', Ident]` has no field names), so children are'); - lines.push('// typed by *kind* (which Node rules / leaf token types can appear), not as'); - lines.push('// named accessors like `node.name`/`node.value`. Named-field accessors would'); - lines.push('// require adding field labels to the grammar DSL (a future enrichment).'); - lines.push(''); - - // Position info (mirrors CstNode/CstLeaf in gen-parser.ts). - lines.push('export interface CstPos {'); - lines.push(' offset: number;'); - lines.push(' end: number;'); - lines.push('}'); - lines.push(''); - - // Token-type union. - lines.push('/** Every `tokenType` a CstLeaf in this grammar can carry. */'); - lines.push(`export type TokenType =\n${unionBody(leafTokenTypes.map(quote))};`); - lines.push(''); - - // Generic leaf — narrowable on `tokenType`. - lines.push('/** A terminal: one lexer token (or synthetic keyword/punct/operator leaf). */'); - lines.push('export interface CstLeaf extends CstPos {'); - lines.push(' tokenType: TokenType;'); - lines.push('}'); - lines.push(''); - - // The `$template` synthetic node, if the grammar has a template token. - const hasTemplate = templateTokenNames.size > 0; - if (hasTemplate) { - lines.push('/** Synthetic node the parser builds for an interpolated template literal. 
*/'); - lines.push(`export interface ${nodeIfaceName(SYNTHETIC_TEMPLATE_NODE)} extends CstPos {`); - lines.push(` rule: ${quote(SYNTHETIC_TEMPLATE_NODE)};`); - // A $template node holds template leaves plus interpolated expression nodes. - lines.push(' children: CstChild[];'); - lines.push('}'); - lines.push(''); - } - - // Per-rule interfaces. - for (const rule of grammar.rules) { - const kinds = childKindsByRule.get(rule.name)!; - lines.push(`/** \`${rule.name}\` node. Children (flattened, in source order) are drawn from: */`); - lines.push(`export interface ${nodeIfaceName(rule.name)} extends CstPos {`); - lines.push(` rule: ${quote(rule.name)};`); - lines.push(` children: ${childArrayType(kinds)};`); - lines.push('}'); - lines.push(''); - } - - // Discriminated union of all node interfaces (keyed on `rule`). - const nodeMembers = [ - ...(hasTemplate ? [nodeIfaceName(SYNTHETIC_TEMPLATE_NODE)] : []), - ...grammar.rules.map(r => nodeIfaceName(r.name)), - ]; - lines.push('/** Discriminated union of every node kind. Switch on `node.rule` for exhaustiveness. */'); - lines.push(`export type CstNode =\n${unionBody(nodeMembers)};`); - lines.push(''); - - // The `rule` discriminant as a standalone union (handy for callers). - const ruleLiterals = [ - ...(hasTemplate ? [SYNTHETIC_TEMPLATE_NODE] : []), - ...grammar.rules.map(r => r.name), - ]; - lines.push('/** Every `rule` discriminant value (the keys of the CstNode union). */'); - lines.push(`export type RuleName =\n${unionBody(ruleLiterals.map(quote))};`); - lines.push(''); - - // CstChild. - lines.push('/** Any CST element: a node or a leaf. */'); - lines.push('export type CstChild = CstNode | CstLeaf;'); - lines.push(''); - - // A by-rule lookup type, so callers can write `NodeOf<\'Expr\'>`. - lines.push('/** Narrow the CstNode union to the node for a given rule name. 
*/'); - lines.push('export type NodeOf = Extract;'); - lines.push(''); - - return lines.join('\n'); -} - -// ── Child-kind derivation ── - -/** - * The set of child element kinds a node of `ruleName` can directly contain. - * - * The parser (gen-parser.ts) flattens `seq`/`alt`/`group`/`quantifier`/`sep` - * straight into the parent node's `children` array, so we walk the whole body - * and union every terminal/ref it can reach as a direct child: - * - literal → `$keyword` (keyword-shaped) or `$punct` (punctuation) leaf - * - ref → token → a leaf of that token name (+ `$template`/`$templateHead` - * if it is the template token) - * - ref → rule → that rule's Node - * - op/prefix/postfix → handled below for Pratt rules (operator leaves + self-ref) - * - * Pratt / left-recursive rules additionally build children the body doesn't show - * literally — `[lhs, opLeaf, rhs]`, `[opLeaf, rhs]`, `[lhs, opLeaf]` — where `lhs` - * and `rhs` are nodes of the SAME rule and `opLeaf` is a `$operator` leaf. We add - * a self Node-ref and `$operator` whenever the body contains an op/prefix/postfix - * marker, so the type matches what the parser actually emits. - */ -function deriveChildKinds(ruleName: string, grammar: CstGrammar): ChildKind[] { - const tokenNames = new Set(grammar.tokens.map(t => t.name)); - const ruleNames = new Set(grammar.rules.map(r => r.name)); - const templateTokenNames = new Set(grammar.tokens.filter(t => t.template).map(t => t.name)); - const rule = grammar.rules.find(r => r.name === ruleName)!; - - // De-dup by a stable key. - const seen = new Map(); - const add = (c: ChildKind) => { - const key = c.kind === 'node' ? `n:${c.rule}` : `l:${c.tokenType}`; - if (!seen.has(key)) seen.set(key, c); - }; - - let sawMarker = false; - - function walk(expr: RuleExpr): void { - switch (expr.type) { - case 'literal': - add({ kind: 'leaf', tokenType: isKeywordLiteral(expr.value) ? 
'$keyword' : '$punct' }); - return; - case 'ref': - if (tokenNames.has(expr.name)) { - add({ kind: 'leaf', tokenType: expr.name }); - // A ref to the template token can instead yield a `$template` node or a - // `$templateHead` leaf (parser's matchExpr 'ref' → parseTemplateExpr). - if (templateTokenNames.has(expr.name)) { - add({ kind: 'node', rule: SYNTHETIC_TEMPLATE_NODE }); - add({ kind: 'leaf', tokenType: '$templateHead' }); - } - } else if (ruleNames.has(expr.name)) { - add({ kind: 'node', rule: expr.name }); - } - return; - case 'seq': - case 'alt': - for (const item of expr.items) walk(item); - return; - case 'quantifier': - case 'group': - walk(expr.body); - return; - case 'sep': - // `sep(el, ',')` → repeated `el` interleaved with the `,` delimiter leaf. - walk(expr.element); - add({ kind: 'leaf', tokenType: isKeywordLiteral(expr.delimiter) ? '$keyword' : '$punct' }); - return; - case 'op': - case 'prefix': - case 'postfix': - sawMarker = true; - return; - } - } - - walk(rule.body); - - // Pratt synthesis: operator leaves + self node-refs the parser injects. - if (sawMarker) { - add({ kind: 'leaf', tokenType: '$operator' }); - add({ kind: 'node', rule: ruleName }); - } - - return [...seen.values()]; -} - -// ── Emit helpers ── - -function nodeIfaceName(ruleName: string): string { - // `$template` → `$templateNode`; `Expr` → `ExprNode`. The `$` is a legal TS - // identifier char, so `$templateNode` is a valid interface name. - return `${ruleName}Node`; -} - -/** A single-quoted string literal type, with internal quotes/backslashes escaped. */ -function quote(s: string): string { - return `'${s.replace(/\\/g, '\\\\').replace(/'/g, "\\'")}'`; -} - -/** Render a list of member type strings as an indented `| a | b | c` union body. */ -function unionBody(members: string[]): string { - if (members.length === 0) return ' never'; - return members.map(m => ` | ${m}`).join('\n'); -} - -/** The `children` array type for a set of child kinds. 
*/ -function childArrayType(kinds: ChildKind[]): string { - if (kinds.length === 0) { - // No derivable children (e.g. an empty/marker-only body) — still an array. - return 'CstChild[]'; - } - const members = kinds.map(k => - k.kind === 'node' ? nodeIfaceName(k.rule) : leafOf(k.tokenType), - ); - // Sort for stable output; nodes and leaves intermixed is fine. - members.sort(); - return `Array<\n${members.map(m => ` | ${m}`).join('\n')}\n >`; -} - -/** A `CstLeaf` narrowed to a specific tokenType. */ -function leafOf(tokenType: string): string { - return `(CstLeaf & { tokenType: ${quote(tokenType)} })`; -} diff --git a/src/gen-cst-match.ts b/src/gen-cst-match.ts index a2dca89..c0b3148 100644 --- a/src/gen-cst-match.ts +++ b/src/gen-cst-match.ts @@ -1,5 +1,4 @@ -// Generate per-rule, per-ARM destructurers for a grammar's CST — the VALUE-level -// sibling of gen-ast-types.ts. For every rule it emits +// Generate per-rule, per-ARM destructurers for a grammar's CST. For every rule it emits // // export type Match = { arm: 'if', expr: NodeEntry<'Expr'>, … } | … // export function match(t: TreeAccess, n: NodeEntry<'Rule'>, src: string): Match @@ -74,7 +73,7 @@ function sanitizeIdent(s: string): string { const J = (v: unknown) => JSON.stringify(v); -export function generateCstMatch(grammar: CstGrammar, importFrom: string): string { +export function generateCstMatch(grammar: CstGrammar): string { // Same [Await]/[Yield] fork the parsers apply, so the rule-id space (ruleIdOf) // agrees with the tree. Matchers/types are emitted for BASE rules only (a fork // collapses to its base via RULE_CANON); no-op without ctx markers. diff --git a/test/ast-types-smoke.ts b/test/ast-types-smoke.ts deleted file mode 100644 index 1b5714c..0000000 --- a/test/ast-types-smoke.ts +++ /dev/null @@ -1,184 +0,0 @@ -// Smoke test for src/gen-ast-types.ts. -// -// 1. Generate the typed-CST source from the real TypeScript grammar. -// 2. Write it to a temp `.ts` file. -// 3. 
Write a consumer module that (a) imports the generated types, (b) does an -// exhaustive `switch (node.rule)` proving the discriminated union narrows -// and is complete (a `never` assertion in `default`), and (c) narrows a -// leaf on `tokenType`. -// 4. Type-check BOTH with `tsc --noEmit --strict`. A non-empty diagnostic = -// the generated types are wrong (or not exhaustive) → fail. -// 5. Also assert a few structural facts about the generated string directly. -// -// Run: `node test/ast-types-smoke.ts`. (This file lives under test/, which the -// project tsconfig excludes, so it does not affect `npx tsc --noEmit` for src.) - -import { generateAstTypes } from '../src/gen-ast-types.ts'; -import { execFileSync } from 'node:child_process'; -import { mkdtempSync, writeFileSync, rmSync, existsSync } from 'node:fs'; -import { tmpdir } from 'node:os'; -import { dirname, join, parse as parsePath } from 'node:path'; - -const grammar = (await import('../typescript.ts')).default; - -// Resolve the workspace `tsc` so the temp dir uses the same compiler. Walk up -// from the cwd — under a git worktree, node_modules lives in the parent repo. -function resolveTsc(): string { - let dir = process.cwd(); - while (true) { - const candidate = join(dir, 'node_modules', '.bin', 'tsc'); - if (existsSync(candidate)) return candidate; - const parent = dirname(dir); - if (parent === dir || dir === parsePath(dir).root) break; - dir = parent; - } - return 'tsc'; // fall back to PATH -} -const tscBin = resolveTsc(); - -let fail = 0; -const check = (label: string, cond: boolean) => { - if (cond) console.log(' ok ', label); - else { fail++; console.log(' FAIL', label); } -}; - -// ── 1. Generate ── -const src = generateAstTypes(grammar); - -// ── 5. 
Direct structural assertions on the generated text ── -check('emits a CstNode discriminated union', /export type CstNode =/.test(src)); -check('emits a TokenType union', /export type TokenType =/.test(src)); -check('emits a RuleName union', /export type RuleName =/.test(src)); -check('emits NodeOf helper', /export type NodeOf/.test(src)); - -// Every declared rule gets an interface with a literal `rule` discriminant. -const missingRule = grammar.rules.find( - r => !src.includes(`export interface ${r.name}Node `) || !src.includes(`rule: '${r.name}'`), -); -check('every grammar rule has a Node interface + literal rule', missingRule === undefined); - -// Synthetic leaf token types are present in the TokenType union. -for (const t of ['$keyword', '$punct', '$operator', '$templateHead', '$templateMiddle', '$templateTail']) { - check(`TokenType includes ${t}`, src.includes(`'${t}'`)); -} -// Declared token names are present too. -check('TokenType includes a declared token (Ident)', src.includes("'Ident'")); - -// The grammar has a template token → a `$template` node interface should exist. -check('emits $templateNode (grammar has a template token)', src.includes("rule: '$template'")); - -// ── 2/3/4. Type-check the generated types + a consumer ── -const dir = mkdtempSync(join(tmpdir(), 'monogram-ast-types-')); -const typesPath = join(dir, 'cst-types.ts'); -const consumerPath = join(dir, 'consumer.ts'); -const tsconfigPath = join(dir, 'tsconfig.json'); - -writeFileSync(typesPath, src); - -// Pick a few real rule names from the grammar to exercise narrowing. -const ruleSample = grammar.rules.slice(0, 3).map(r => r.name); - -// Consumer: exhaustive switch over EVERY rule (built from the grammar so it -// stays complete as the grammar grows), plus explicit narrowing on a couple of -// sampled rules and a leaf. 
If the union is missing a member, the per-case -// access fails; if it has an EXTRA member we don't handle, the `default` -// `never` assignment fails — both prove the union is exactly right. -const allRuleNames = [ - '$template', - ...grammar.rules.map(r => r.name), -]; -const cases = allRuleNames.map(name => - ` case '${name}': { const _c: CstNode = node; void _c; return node.children.length; }`, -).join('\n'); - -const consumer = `import type { CstNode, CstLeaf, NodeOf, RuleName, TokenType } from './cst-types.ts'; - -// (a) Exhaustive switch on the \`rule\` discriminant: narrows, and \`default\` -// proves completeness via a \`never\` assignment. -export function childCount(node: CstNode): number { - switch (node.rule) { -${cases} - default: { - const _exhaustive: never = node; - return _exhaustive; - } - } -} - -// (b) NodeOf narrows the union to one rule's node. -function sampleNarrowing(n: CstNode) { - ${ruleSample.map((r, i) => `if (n.rule === '${r}') { const x${i}: NodeOf<'${r}'> = n; void x${i}; }`).join('\n ')} -} -void sampleNarrowing; - -// (c) A RuleName value is assignable from a literal in the union. -const someRule: RuleName = '${ruleSample[0]}'; -void someRule; - -// (d) Leaf narrowing on tokenType. -function leafText(leaf: CstLeaf): string { - if (leaf.tokenType === '$keyword') return leaf.text; - const t: TokenType = leaf.tokenType; - void t; - return leaf.text; -} -void leafText; -`; -writeFileSync(consumerPath, consumer); - -writeFileSync(tsconfigPath, JSON.stringify({ - compilerOptions: { - target: 'ES2022', - module: 'Node16', - moduleResolution: 'Node16', - allowImportingTsExtensions: true, - noEmit: true, - strict: true, - skipLibCheck: true, - }, - include: ['cst-types.ts', 'consumer.ts'], -}, null, 2)); - -let tscOut = ''; -let tscOk = true; -try { - execFileSync(tscBin, ['--noEmit', '-p', tsconfigPath], { stdio: 'pipe' }); -} catch (e: any) { - tscOk = false; - tscOut = `${e.stdout?.toString() ?? ''}${e.stderr?.toString() ?? 
''}`; -} -check('generated types + exhaustive-switch consumer type-check under tsc --strict', tscOk); -if (!tscOk) { - console.log('\n--- tsc diagnostics ---\n' + tscOut + '\n--- generated source ---\n' + src + '\n--- consumer ---\n' + consumer); -} - -// Negative control: a bogus rule literal must NOT be assignable to RuleName, -// confirming RuleName is a closed union (not widened to `string`). -const badConsumerPath = join(dir, 'bad.ts'); -writeFileSync(badConsumerPath, `import type { RuleName } from './cst-types.ts'; -const bad: RuleName = '___definitely_not_a_rule___'; -void bad; -`); -writeFileSync(join(dir, 'tsconfig.bad.json'), JSON.stringify({ - compilerOptions: { - target: 'ES2022', module: 'Node16', moduleResolution: 'Node16', - allowImportingTsExtensions: true, noEmit: true, strict: true, skipLibCheck: true, - }, - include: ['cst-types.ts', 'bad.ts'], -}, null, 2)); -let bogusRejected = false; -try { - execFileSync(tscBin, ['--noEmit', '-p', join(dir, 'tsconfig.bad.json')], { stdio: 'pipe' }); -} catch { - bogusRejected = true; // tsc errored → the bogus literal was correctly rejected -} -check('RuleName is a closed union (rejects an unknown rule literal)', bogusRejected); - -rmSync(dir, { recursive: true, force: true }); - -console.log( - fail === 0 - ? `\n${grammar.rules.length} rules typed; all AST-type smoke checks pass` - : `\n${fail} FAILED`, -); -process.exit(fail === 0 ? 
0 : 1); diff --git a/test/emit-corpus.ts b/test/emit-corpus.ts index 6fca455..95d48c4 100644 --- a/test/emit-corpus.ts +++ b/test/emit-corpus.ts @@ -136,11 +136,11 @@ export const CURATED_TS_INVALID: string[] = [ ]; // ── 2) The repo's own hand-written .ts sources ────────────────────────────────────────── -// Excludes generated artifacts (*.cst-match.ts / *.cst-types.ts) and caps file size so the -// gate stays fast (the byte-identical CST compare is O(tree size); a 250 KB cap keeps the -// rich, deeply-nested sources like emit-parser.ts while dropping the multi-hundred-KB ones). +// Excludes generated artifacts (*.cst-match.ts) and caps file size so the gate stays fast +// (the byte-identical CST compare is O(tree size); a 250 KB cap keeps the rich, deeply- +// nested sources like emit-parser.ts while dropping the multi-hundred-KB ones). const SIZE_CAP = 250 * 1024; -const isGenerated = (f: string) => f.endsWith('.cst-match.ts') || f.endsWith('.cst-types.ts') || f.endsWith('.d.ts'); +const isGenerated = (f: string) => f.endsWith('.cst-match.ts') || f.endsWith('.d.ts'); export function repoTsFiles(): string[] { const out: string[] = []; From 070b965a726ce843c091834888c15c878bfdb140 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Sun, 21 Jun 2026 20:26:26 +0800 Subject: [PATCH 05/27] emit-portable: grow to a real JS subset; derived Rust matches oxc throughput MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends the target-agnostic emitter from the calc proof to examples/minijs.ts — a real JavaScript subset (string/comment lexer, the full operator-precedence ladder, call/member/index mixfix chains, arrays, and the common statement forms) — so the emitted Go/Rust parsers can be benchmarked against oxc on the same bytes. What grew: - Lexer: driven by token-pattern.ts's structural recognizers (char runs, quote strings, line/block comments) — still a regex-free char scanner, so Go/Rust compile offline. 
- Parser IR: opt/sep/inline-literal-alternation, Pratt bracket NUDs (grouping, array), and mixfix LEDs (call/member/index) tried before operators. - Rust target: zero-allocation tokens (`&str` slices, Copy) and `&'static str` CST labels — no per-token/per-node String. This is decisive: the first naive version (String everywhere, a clone per peek) ran at 9 MB/s, slower than Go; the fix took it to 39 MB/s. Verified: test/portable-targets.ts now covers calc + minijs; ts/go/rust each ≡ the createParser CST (minijs 29/29 accept + 7/7 reject) and byte-identical on a 2.92 MB corpus. Full suite 42/42. Benchmark (oxc-parser 0.137, 2.92 MB JS-subset both engines accept, self-timed lex+parse with black_box): derived-Rust 39 MB/s (0.97x oxc — parity), derived-Go 19 MB/s (2x), oxc 38 MB/s. A grammar-DERIVED, un-hand-tuned Rust parser matches the fastest hand-tuned native JS parser, while building a full CST. minijs is a subset (oxc parses full JS), but both parse the same corpus, so it is a fair throughput comparison on that work; the bench harness is not committed (it needs the external oxc-parser package). 
--- examples/minijs.ts | 77 +++++++++++++ src/emit-portable.ts | 227 ++++++++++++++++++++++----------------- src/target-go.ts | 148 ++++++++++++++++--------- src/target-rust.ts | 208 ++++++++++++++++++++++------------- src/target-ts.ts | 119 +++++++++++++------- test/portable-targets.ts | 177 ++++++++++++++++-------------- 6 files changed, 613 insertions(+), 343 deletions(-) create mode 100644 examples/minijs.ts diff --git a/examples/minijs.ts b/examples/minijs.ts new file mode 100644 index 0000000..6de468a --- /dev/null +++ b/examples/minijs.ts @@ -0,0 +1,77 @@ +// A real JavaScript SUBSET — the grammar that makes the portable Go/Rust targets +// "comparable with oxc": rich enough that parsing a corpus is realistic work +// (strings, comments, the full operator-precedence ladder, call/member/index +// chains, arrays, and the common statement forms), so the emitted Rust parser can +// be benchmarked against oxc on the same bytes. +// +// Derived from ONE definition by emitPortableParser into TypeScript, Go, and Rust; +// the cross-language gate proves all three produce the byte-identical CST that the +// interpreter (createParser) does. The portable lexer is regex-free (char scanner +// driven by token-pattern.ts's structural recognizers), so the Go/Rust output +// compiles offline. +// +// Deliberately omitted (ambiguity / scope, not capability): object literals (the +// `{`-block-vs-object split), ternary, template literals, regex literals, keyword +// operators (typeof/void/...), and `for`. The subset stays unambiguous and real. 
+import { + token, rule, defineGrammar, left, right, op, prefix, alt, + seq, oneOf, range, star, sep, opt, many, altPattern, noneOf, anyChar, +} from '../src/api.ts'; + +const digit = range('0', '9'); +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); + +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(digit, star(digit)), { scope: 'constant.numeric' }); +const Str = token(seq('"', star(altPattern(noneOf('"', '\\'), seq('\\', anyChar()))), '"'), { scope: 'string.quoted.double' }); +const LineComment = token(seq('//', star(noneOf('\n'))), { skip: true, scope: 'comment.line' }); +const BlockComment = token(seq('/*', star(altPattern(noneOf('*'), seq('*', noneOf('/')))), '*/'), { skip: true, scope: 'comment.block' }); + +// Operator-precedence ladder (earlier = looser), mirroring JavaScript. +const jsPrec = [ + right('='), + left('||'), left('&&'), + left('|'), left('^'), left('&'), + left('==', '!=', '===', '!=='), + left('<', '>', '<=', '>='), + left('<<', '>>'), + left('+', '-'), + left('*', '/', '%'), + right(prefix('!', '-', '+', '~')), +]; + +const Expr = rule(($) => [ + Number_, + Str, + Ident, + ['(', $, ')'], // grouping + ['[', opt(sep($, ',')), ']'], // array literal + [prefix, $], // prefix unary + [$, op, $], // binary infix (precedence from the ladder) + [$, '(', opt(sep($, ',')), ')'], // call + [$, '.', Ident], // member access + [$, '[', $, ']'], // computed index +]); + +const Block = rule(($) => [['{', many(Stmt), '}']]); + +const Stmt = rule(($) => [ + Block, + [alt('var', 'let', 'const'), Ident, opt('=', Expr), ';'], + ['if', '(', Expr, ')', Stmt, opt('else', Stmt)], + ['while', '(', Expr, ')', Stmt], + ['return', opt(Expr), ';'], + ['function', Ident, '(', opt(sep(Ident, ',')), ')', Block], + [Expr, ';'], +]); + +const Program = rule(($) => [many(Stmt)]); + +export default 
defineGrammar({ + name: 'minijs', + scopeName: 'source.minijs', + tokens: { Ident, Number: Number_, Str, LineComment, BlockComment }, + prec: jsPrec, + rules: { Expr, Block, Stmt, Program }, +}); diff --git a/src/emit-portable.ts b/src/emit-portable.ts index 900beec..e445339 100644 --- a/src/emit-portable.ts +++ b/src/emit-portable.ts @@ -5,52 +5,67 @@ // TS engine uses. It is the agnosticism proof: ONE analysis → ONE intermediate form (IR) // → N language renderings, all producing the byte-identical CST the interpreter does. // -// SHARED + target-agnostic (here): the grammar ANALYSIS (reused from grammar-analysis.ts) -// and `buildIR` — the parse plan as plain data (recursive-descent rules as alternative -// step-lists, the Pratt rule as NUD-atom / prefix / binary tables, the char-class lexer -// specs, the literal vocabulary, the entry rule). PER-TARGET (a Target): `render(ir)` — -// the language's lexer + CST runtime + the rendering of each IR node. Adding a language is -// implementing one Target; nothing here changes. +// SHARED + target-agnostic (here): the grammar ANALYSIS (reused from grammar-analysis.ts), +// the LEXER specs (derived from token-pattern.ts's structural recognizers — char runs, +// quote-delimited strings, line/block comments — so NO regex engine is needed and the +// emitted Go/Rust compile offline), and `buildIR` — the parse plan as plain data +// (recursive-descent rules as alternative step-lists; the Pratt rule as NUD atoms/brackets/ +// prefix + binary tables + mixfix LEDs). PER-TARGET (a Target): `render(ir)` — the +// language's lexer + CST runtime + the rendering of each IR node. Adding a language is +// implementing one Target. // -// SCOPE (the verifiable core): char-class tokens (`charClass` then `star(charClass)`), a -// recursive-descent + backtracking-alternation + `*` body, and a Pratt expression engine -// with operator PRECEDENCE/associativity + prefix unary + parenthesised grouping. 
The -// portable lexer is a dependency-free char scanner (no regex), so the emitted Go/Rust -// compile offline. Richer surface (mixfix/postfix LEDs, `sep`/`opt`, lexer lookahead, -// left-recursion beyond Pratt) is the documented next increment; buildIR THROWS on a -// construct it does not model rather than emit a wrong parser. -import type { CstGrammar, RuleExpr, TokenDecl, TokenPattern } from './types.ts'; +// SCOPE: char-run / quote-string / line+block-comment tokens; recursive descent with +// backtracking alternation, `*`/`?` quantifiers, `sep`, and inline literal-alternation; +// and a Pratt expression engine with operator precedence/associativity, prefix unary, +// bracket NUDs (grouping, array), and mixfix LEDs (call / member / index) tried before +// operators. buildIR THROWS on a construct outside this set rather than emit a wrong +// parser. This is enough to derive a real JavaScript-subset parser (examples/minijs.ts). +import type { CstGrammar, RuleExpr, TokenDecl } from './types.ts'; import { analyzeGrammar, findEntryRule } from './grammar-analysis.ts'; import { collectLiterals, isKeywordLiteral } from './grammar-utils.ts'; +import { + tokenPatternCharLoop, tokenPatternQuoteDelimAndEscape, + tokenPatternBlockDelimiters, tokenPatternLiteralPrefix, +} from './token-pattern.ts'; // ── Intermediate representation (plain data; every Target renders THIS) ── export type CharRange = [number, number]; // inclusive char-code range -export type TokenSpec = { name: string; first: CharRange[]; cont: CharRange[] }; +export type LexTok = + | { kind: 'run'; name: string; first: CharRange[]; cont: CharRange[]; skip: boolean } // ident/number char run + | { kind: 'string'; name: string; delim: string; skip: boolean } // delim..delim, `\` escapes next + | { kind: 'line'; name: string; prefix: string; skip: boolean } // prefix..end-of-line + | { kind: 'block'; name: string; open: string; close: string; skip: boolean }; // open..close +export type Lit = { value: string; 
ttype: '$keyword' | '$punct' }; export type Step = | { t: 'lit'; value: string; ttype: '$keyword' | '$punct' } // match a literal by text | { t: 'tok'; name: string } // match a token kind | { t: 'rule'; name: string } // call a rule, append its node - | { t: 'star'; step: Step }; // repeat the inner step 0+ times + | { t: 'star'; step: Step } // repeat inner 0+ + | { t: 'opt'; steps: Step[] } // optional sub-sequence + | { t: 'sep'; elem: Step; delim: string } // elem (delim elem)* + | { t: 'altlit'; opts: Lit[] }; // inline alternation of literals export type Alt = Step[]; export type RdRule = { kind: 'rd'; name: string; alts: Alt[] }; +export type Bracket = { first: string; steps: Step[] }; // a literal-led sequence (grouping/array; LED call/index) export type PrattRule = { kind: 'pratt'; name: string; - atomToks: string[]; // NUD: a bare token (Number/Ident) wrapped in a node - group: { open: string; close: string } | null; // NUD: '(' Expr ')' - prefix: Array<{ op: string; rbp: number }>; // NUD: prefix op then operand parsed at rbp + nudToks: string[]; // NUD: a bare token wrapped in a node + nudBrackets: Bracket[]; // NUD: '(' … ')' / '[' … ']' + prefix: Array<{ op: string; rbp: number }>; // NUD: prefix op then operand at rbp binary: Array<{ op: string; lbp: number; rbp: number }>; // LED: infix op, bind iff lbp > minBp, rhs at rbp + leds: Bracket[]; // LED: mixfix continuation (call/member/index), tried before operators }; export type RuleIR = RdRule | PrattRule; export type ParserIR = { grammarName: string; entry: string; - tokens: TokenSpec[]; // named tokens, for the char scanner (tried in declaration order) - puncts: string[]; // punctuation literals, sorted longest-first (maximal munch) + tokens: LexTok[]; // for the char scanner, tried in declaration order + puncts: string[]; // punctuation literals, longest-first (maximal munch) rules: RuleIR[]; }; @@ -70,15 +85,7 @@ function buildIR(grammar: CstGrammar): ParserIR { const a = 
analyzeGrammar(grammar); const tokenNames = a.tokenNames; - // Lexer token specs: each token must be `charClass` then `star(charClass)` (the portable - // scanner's shape). Anything else is out of the verifiable core → throw, don't mis-lex. - const tokens: TokenSpec[] = grammar.tokens.map((t) => { - const { first, cont } = charClassFirstCont(t); - return { name: t.name, first, cont }; - }); - - // Literal vocabulary, split keyword (alpha — lexed as an identifier, matched by text) vs - // punctuation (lexed as its own token). Puncts longest-first for maximal munch. + const tokens: LexTok[] = grammar.tokens.map((t) => lexTok(t)); const lits = new Set(); for (const r of grammar.rules) for (const l of collectLiterals(r.body)) lits.add(l); for (const lv of grammar.precs) for (const o of lv.operators) lits.add(o.value); @@ -86,88 +93,110 @@ function buildIR(grammar: CstGrammar): ParserIR { const litTtype = (v: string): '$keyword' | '$punct' => (isKeywordLiteral(v) ? '$keyword' : '$punct'); - const rules: RuleIR[] = grammar.rules.map((r) => { - if (a.prattRules.has(r.name)) return buildPratt(r.name, r.body, a); - return { kind: 'rd', name: r.name, alts: buildRdAlts(r.body) }; - }); - - function buildRdAlts(body: RuleExpr): Alt[] { - if (body.type === 'alt') return body.items.map(altSteps); - return [altSteps(body)]; - } - function altSteps(e: RuleExpr): Step[] { - if (e.type === 'seq') return e.items.flatMap(stepOf); - return stepOf(e); - } - function stepOf(e: RuleExpr): Step[] { + // RuleExpr → Step. `selfName` (when set) maps a self-ref to a fresh rule call. + function stepOf(e: RuleExpr): Step { switch (e.type) { - case 'literal': return [{ t: 'lit', value: e.value, ttype: litTtype(e.value) }]; - case 'ref': return [tokenNames.has(e.name) ? 
{ t: 'tok', name: e.name } : { t: 'rule', name: e.name }]; - case 'quantifier': { - if (e.kind !== '*') throw new Error(`portable: quantifier '${e.kind}' not in the verifiable core (only '*')`); - const inner = stepOf(e.body); - if (inner.length !== 1) throw new Error('portable: `*` body must be a single step (a rule/token ref)'); - return [{ t: 'star', step: inner[0] }]; + case 'literal': return { t: 'lit', value: e.value, ttype: litTtype(e.value) }; + case 'ref': return tokenNames.has(e.name) ? { t: 'tok', name: e.name } : { t: 'rule', name: e.name }; + case 'group': { const ss = altSteps(e.body); if (ss.length !== 1) throw new Error('portable: group must reduce to a single step'); return ss[0]; } + case 'sep': return { t: 'sep', elem: stepOf(e.element), delim: e.delimiter }; + case 'quantifier': + if (e.kind === '*') return { t: 'star', step: stepOf(e.body) }; + if (e.kind === '?') return { t: 'opt', steps: altSteps(e.body) }; + if (e.kind === '+') throw new Error("portable: '+' not yet modeled (use '*')"); + break; + case 'alt': { + const opts: Lit[] = []; + for (const it of e.items) { + if (it.type !== 'literal') throw new Error('portable: inline alt must be all literals'); + opts.push({ value: it.value, ttype: litTtype(it.value) }); + } + return { t: 'altlit', opts }; } - case 'group': return altSteps(e.body); - default: throw new Error(`portable: rd construct '${e.type}' not in the verifiable core`); } + throw new Error(`portable: rd construct '${e.type}' not in scope`); + } + function altSteps(e: RuleExpr): Step[] { + if (e.type === 'seq') return e.items.map(stepOf); + return [stepOf(e)]; } + const rules: RuleIR[] = grammar.rules.map((r) => { + if (a.prattRules.has(r.name)) return buildPratt(r.name, r.body, a, stepOf, altSteps, litTtype); + return { kind: 'rd', name: r.name, alts: r.body.type === 'alt' ? r.body.items.map(altSteps) : [altSteps(r.body)] }; + }); + return { grammarName: grammar.name ?? 
'grammar', entry: findEntryRule(grammar), tokens, puncts, rules }; } -// A Pratt rule's alternatives, classified into NUD atoms / grouping / prefix and LED binary. -// The binding powers come from the analysis (opTable/prefixOps), so precedence is single- -// sourced with the interpreter. -function buildPratt(name: string, body: RuleExpr, a: ReturnType): PrattRule { +// Classify a token into a portable scanner spec via the structural recognizers. +function lexTok(t: TokenDecl): LexTok { + const skip = t.flags.includes('skip'); + const qs = tokenPatternQuoteDelimAndEscape(t); + if (qs) return { kind: 'string', name: t.name, delim: qs.delim, skip }; + const bd = tokenPatternBlockDelimiters(t); + if (bd) return { kind: 'block', name: t.name, open: bd[0], close: bd[1], skip }; + const loop = tokenPatternCharLoop(t); + if (loop) { + if (loop.bail.length > 0 || loop.bailNonAscii) throw new Error(`portable: token ${t.name} has a complex continuation (bail) — out of scope`); + return { kind: 'run', name: t.name, first: codesToRanges(loop.first), cont: codesToRanges(loop.cont), skip }; + } + const prefix = tokenPatternLiteralPrefix(t); + if (prefix) return { kind: 'line', name: t.name, prefix, skip }; // prefix with no distinct suffix → to end-of-line + throw new Error(`portable: token ${t.name} shape not recognized by the portable lexer`); +} + +function codesToRanges(codes: number[]): CharRange[] { + const s = [...new Set(codes)].sort((x, y) => x - y); + const out: CharRange[] = []; + for (const c of s) { + const last = out[out.length - 1]; + if (last && c === last[1] + 1) last[1] = c; + else out.push([c, c]); + } + return out; +} + +// A Pratt rule's alternatives → NUD atoms/brackets/prefix + binary + mixfix LEDs. +// Binding powers come from the analysis (opTable/prefixOps), single-sourced with the interpreter. 
+function buildPratt( + name: string, body: RuleExpr, a: ReturnType, + stepOf: (e: RuleExpr) => Step, altSteps: (e: RuleExpr) => Step[], + litTtype: (v: string) => '$keyword' | '$punct', +): PrattRule { const alts = body.type === 'alt' ? body.items : [body]; - const atomToks: string[] = []; - let group: { open: string; close: string } | null = null; - let sawPrefix = false; - let sawBinary = false; + const nudToks: string[] = []; + const nudBrackets: Bracket[] = []; + let sawPrefix = false, sawBinary = false; + const leds: Bracket[] = []; for (const alt of alts) { const items = alt.type === 'seq' ? alt.items : [alt]; - if (items.length === 1 && items[0].type === 'ref' && a.tokenNames.has(items[0].name)) { - atomToks.push(items[0].name); // [Token] - } else if (items.length === 3 && items[0].type === 'literal' && items[2].type === 'literal' - && items[1].type === 'ref' && items[1].name === name) { - group = { open: items[0].value, close: items[2].value }; // [ '(' $ ')' ] - } else if (items.length === 2 && items[0].type === 'prefix' && items[1].type === 'ref' && items[1].name === name) { - sawPrefix = true; // [ prefix $ ] - } else if (items.length === 3 && items[0].type === 'ref' && items[0].name === name - && items[1].type === 'op' && items[2].type === 'ref' && items[2].name === name) { - sawBinary = true; // [ $ op $ ] - } else { - throw new Error(`portable: Pratt alt shape not in the verifiable core (rule ${name})`); + const startsSelf = items[0].type === 'ref' && items[0].name === name; + if (!startsSelf) { + // NUD + if (items.length === 1 && items[0].type === 'ref' && a.tokenNames.has(items[0].name)) { nudToks.push(items[0].name); continue; } + if (items[0].type === 'prefix') { sawPrefix = true; continue; } + if (items[0].type === 'literal') { nudBrackets.push({ first: items[0].value, steps: items.map((it) => stepOfPratt(it)) }); continue; } + throw new Error(`portable: Pratt NUD shape not in scope (rule ${name})`); } + // LED (starts with self): `$ op $` 
(binary, op slot + trailing self) or `$ …` (mixfix) + const rest = items.slice(1); + if (rest[0].type === 'op') { sawBinary = true; continue; } + if (rest[0].type === 'literal') { leds.push({ first: rest[0].value, steps: rest.map((it) => stepOfPratt(it)) }); continue; } + throw new Error(`portable: Pratt LED shape not in scope (rule ${name})`); } - const prefix = sawPrefix - ? [...a.prefixOps.entries()].map(([op, info]) => ({ op, rbp: info.rbp })) - : []; + // a self-ref inside a NUD/LED sub-sequence is a fresh parse of this rule + function stepOfPratt(e: RuleExpr): Step { + if (e.type === 'ref' && e.name === name) return { t: 'rule', name }; + if (e.type === 'sep') return { t: 'sep', elem: stepOfPratt(e.element), delim: e.delimiter }; + if (e.type === 'quantifier' && e.kind === '?') return { t: 'opt', steps: (e.body.type === 'seq' ? e.body.items : [e.body]).map(stepOfPratt) }; + if (e.type === 'quantifier' && e.kind === '*') return { t: 'star', step: stepOfPratt(e.body) }; + if (e.type === 'literal') return { t: 'lit', value: e.value, ttype: litTtype(e.value) }; + return stepOf(e); + } + const prefix = sawPrefix ? [...a.prefixOps.entries()].map(([op, info]) => ({ op, rbp: info.rbp })) : []; const binary = sawBinary - ? [...a.opTable.entries()] - .filter(([, info]) => info.position === 'infix') - .map(([op, info]) => ({ op, lbp: info.lbp, rbp: info.rbp })) + ? [...a.opTable.entries()].filter(([, info]) => info.position === 'infix').map(([op, info]) => ({ op, lbp: info.lbp, rbp: info.rbp })) : []; - return { kind: 'pratt', name, atomToks, group, prefix, binary }; -} - -// Extract a token's (first-char, continue-char) code ranges from a `charClass` then -// `star(charClass)` pattern. Throws for any other shape (out of the verifiable core). 
-function charClassFirstCont(t: TokenDecl): { first: CharRange[]; cont: CharRange[] } { - const p = t.pattern; - if (typeof p === 'string' || p.type !== 'seq' || p.items.length !== 2) throw new Error(`portable: token ${t.name} not [charClass, star(charClass)]`); - const head = p.items[0]; - const tail = p.items[1]; - if (typeof tail === 'string' || tail.type !== 'repeat' || tail.min !== 0) throw new Error(`portable: token ${t.name} tail is not star(charClass)`); - return { first: classRanges(head, t.name), cont: classRanges(tail.body, t.name) }; -} -function classRanges(p: TokenPattern, tok: string): CharRange[] { - if (typeof p === 'string' || p.type !== 'charClass' || p.negate) throw new Error(`portable: token ${tok} uses a non-positive char class`); - return p.items.map((it): CharRange => { - if (it.type === 'char') return [it.value.charCodeAt(0), it.value.charCodeAt(0)]; - if (it.type === 'range') return [it.from.charCodeAt(0), it.to.charCodeAt(0)]; - throw new Error(`portable: token ${tok} char-class item '${(it as { type: string }).type}' unsupported`); - }); + return { kind: 'pratt', name, nudToks, nudBrackets, prefix, binary, leds }; } diff --git a/src/target-go.ts b/src/target-go.ts index bc81629..02af630 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -1,23 +1,43 @@ // The Go Target for emit-portable. Renders the same language-agnostic ParserIR as tsTarget -// into a self-contained Go program (Go stdlib only — the char-class lexer is regex-free, so -// it compiles with no module dependencies). Its CST JSON is checked byte-for-byte against -// the interpreter, so `emitPortableParser(grammar, goTarget)` is a real, verified Go parser -// derived from the same grammar definition. -import type { ParserIR, RdRule, PrattRule, Step, CharRange, Target } from './emit-portable.ts'; +// into a self-contained Go program (Go stdlib only — the lexer is regex-free, so it compiles +// with no module dependencies). 
Its CST JSON is checked byte-for-byte against the interpreter, +// so `emitPortableParser(grammar, goTarget)` is a real, verified Go parser derived from the +// same grammar definition. +import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target } from './emit-portable.ts'; const J = (v: unknown) => JSON.stringify(v); -const goStr = (s: string) => J(s); // Go and JSON string literals coincide for our ASCII vocab const rangeCond = (v: string, rs: CharRange[]) => - rs.map(([lo, hi]) => (lo === hi ? `${v} == ${lo}` : `${v} >= ${lo} && ${v} <= ${hi}`)).join(' || '); + '(' + rs.map(([lo, hi]) => (lo === hi ? `${v} == ${lo}` : `${v} >= ${lo} && ${v} <= ${hi}`)).join(' || ') + ')'; -function lexer(ir: ParserIR): string { - const cases = ir.tokens.map((t) => `\t\tif ${rangeCond('c', t.first)} { +function scanTok(t: LexTok): string { + const push = t.skip ? '' : `toks = append(toks, Tok{${J((t as { name: string }).name)}, src[pos:e], pos, e}); `; + if (t.kind === 'run') return `\t\tif ${rangeCond('c', t.first)} { +\t\t\te := pos + 1 +\t\t\tfor e < n { cc := int(src[e]); if !${rangeCond('cc', t.cont)} { break }; e++ } +\t\t\t${push}pos = e; continue +\t\t}`; + if (t.kind === 'string') return `\t\tif c == ${t.delim.charCodeAt(0)} { \t\t\te := pos + 1 -\t\t\tfor e < n { cc := int(src[e]); if !(${rangeCond('cc', t.cont)}) { break }; e++ } -\t\t\ttoks = append(toks, Tok{${goStr(t.name)}, src[pos:e], pos, e}); pos = e; continue -\t\t}`).join('\n'); - const punctChecks = ir.puncts.map((p) => - `\t\tif strings.HasPrefix(src[pos:], ${goStr(p)}) { toks = append(toks, Tok{"", ${goStr(p)}, pos, pos + ${p.length}}); pos += ${p.length}; continue }`).join('\n'); +\t\t\tfor e < n { ch := int(src[e]); if ch == 92 { e += 2; continue }; if ch == ${t.delim.charCodeAt(0)} { e++; break }; e++ } +\t\t\t${push}pos = e; continue +\t\t}`; + if (t.kind === 'line') return `\t\tif strings.HasPrefix(src[pos:], ${J(t.prefix)}) { +\t\t\te := pos + ${t.prefix.length} +\t\t\tfor e < 
n && src[e] != 10 { e++ } +\t\t\t${push}pos = e; continue +\t\t}`; + return `\t\tif strings.HasPrefix(src[pos:], ${J(t.open)}) { +\t\t\te := pos + ${t.open.length} +\t\t\tfor e < n && !strings.HasPrefix(src[e:], ${J(t.close)}) { e++ } +\t\t\tif e < n { e += ${t.close.length} } +\t\t\t${push}pos = e; continue +\t\t}`; +} + +function lexer(ir: ParserIR): string { + const toks = ir.tokens.map(scanTok).join('\n'); + const puncts = ir.puncts.map((p) => + `\t\tif strings.HasPrefix(src[pos:], ${J(p)}) { toks = append(toks, Tok{"", ${J(p)}, pos, pos + ${p.length}}); pos += ${p.length}; continue }`).join('\n'); return `func lex(src string) []Tok { \ttoks := []Tok{} \tn := len(src) @@ -25,39 +45,50 @@ function lexer(ir: ParserIR): string { \tfor pos < n { \t\tc := int(src[pos]) \t\tif c == 32 || c == 9 || c == 10 || c == 13 { pos++; continue } -${cases} -${punctChecks} +${toks} +${puncts} \t\tpanic(fmt.Sprintf("lex error at %d", pos)) \t} \treturn toks }`; } +function stepCond(s: Step): string { + switch (s.t) { + case 'lit': return `matchLit(${J(s.value)}, ${J(s.ttype)}, &kids)`; + case 'tok': return `matchTok(${J(s.name)}, &kids)`; + case 'rule': return `callRule(parse${s.name}, &kids)`; + case 'star': return `star(func() bool { return ${stepCond(s.step)} }, &kids)`; + case 'opt': return `opt(func() bool { return ${s.steps.map(stepCond).join(' && ')} }, &kids)`; + case 'sep': return `sepBy(func() bool { return ${stepCond(s.elem)} }, ${J(s.delim)}, &kids)`; + case 'altlit': return `altLit([][2]string{${s.opts.map((o) => `{${J(o.value)}, ${J(o.ttype)}}`).join(', ')}}, &kids)`; + } +} + function rdRule(r: RdRule): string { - const alt = (steps: Step[]) => { - const conds = steps.map(stepCond).join(' && '); - return `\t{ kids := []*Cst{}; if ${conds} { return branch(${goStr(r.name)}, kids, save) }; pos = save }`; - }; + const alt = (steps: Step[]) => + `\t{ kids := []*Cst{}; if ${steps.map(stepCond).join(' && ')} { return branch(${J(r.name)}, kids, save) }; pos = save }`; 
return `func parse${r.name}() *Cst { \tsave := pos ${r.alts.map(alt).join('\n')} \treturn nil }`; } -function stepCond(s: Step): string { - switch (s.t) { - case 'lit': return `matchLit(${goStr(s.value)}, ${goStr(s.ttype)}, &kids)`; - case 'tok': return `matchTok(${goStr(s.name)}, &kids)`; - case 'rule': return `callRule(parse${s.name}, &kids)`; - case 'star': return `star(func() bool { return ${stepCond(s.step)} }, &kids)`; - } -} function prattRule(r: PrattRule): string { - const bin = r.binary.map((b) => `${goStr(b.op)}: {${b.lbp}, ${b.rbp}}`).join(', '); - const pre = r.prefix.map((p) => `${goStr(p.op)}: ${p.rbp}`).join(', '); - const atoms = r.atomToks.map((k) => `${goStr(k)}: true`).join(', '); - const g = r.group; + const bin = r.binary.map((b) => `${J(b.op)}: {${b.lbp}, ${b.rbp}}`).join(', '); + const pre = r.prefix.map((p) => `${J(p.op)}: ${p.rbp}`).join(', '); + const atoms = r.nudToks.map((k) => `${J(k)}: true`).join(', '); + const bracketNud = (b: Bracket) => `\tif t.Text == ${J(b.first)} { +\t\tsave := pos; kids := []*Cst{} +\t\tif ${b.steps.map(stepCond).join(' && ')} { return node(${J(r.name)}, kids) } +\t\tpos = save; return nil +\t}`; + const ledArm = (b: Bracket) => `\t\tif t.Text == ${J(b.first)} { +\t\t\tledSave := pos; kids := []*Cst{left} +\t\t\tif ${b.steps.map(stepCond).join(' && ')} { left = node(${J(r.name)}, kids); continue } +\t\t\tpos = ledSave; break +\t\t}`; return `var ${r.name}BIN = map[string]bp{${bin}} var ${r.name}PRE = map[string]int{${pre}} var ${r.name}ATOM = map[string]bool{${atoms}} @@ -68,6 +99,7 @@ func ${r.name}bp(minBp int) *Cst { \tfor { \t\tt := peek() \t\tif t == nil { break } +${r.leds.map(ledArm).join('\n')} \t\tinfo, ok := ${r.name}BIN[t.Text] \t\tif !ok || info.lbp <= minBp { break } \t\tledSave := pos @@ -75,7 +107,7 @@ func ${r.name}bp(minBp int) *Cst { \t\topLeaf := &Cst{IsLeaf: true, TokenType: "$operator", Offset: t.Off, End: t.End} \t\trhs := ${r.name}bp(info.rbp) \t\tif rhs == nil { pos = ledSave; break } 
-\t\tleft = &Cst{Rule: ${goStr(r.name)}, Children: []*Cst{left, opLeaf, rhs}, Offset: left.Offset, End: rhs.End} +\t\tleft = &Cst{Rule: ${J(r.name)}, Children: []*Cst{left, opLeaf, rhs}, Offset: left.Offset, End: rhs.End} \t} \treturn left } @@ -84,22 +116,15 @@ func ${r.name}nud() *Cst { \tif t == nil { return nil } \tif ${r.name}ATOM[t.Kind] { \t\tpos++ -\t\treturn &Cst{Rule: ${goStr(r.name)}, Children: []*Cst{{IsLeaf: true, TokenType: t.Kind, Offset: t.Off, End: t.End}}, Offset: t.Off, End: t.End} +\t\treturn &Cst{Rule: ${J(r.name)}, Children: []*Cst{{IsLeaf: true, TokenType: t.Kind, Offset: t.Off, End: t.End}}, Offset: t.Off, End: t.End} \t} -${g ? `\tif t.Text == ${goStr(g.open)} { -\t\tsave := pos; pos++ -\t\tinner := ${r.name}bp(0) -\t\tc := peek() -\t\tif inner == nil || c == nil || c.Text != ${goStr(g.close)} { pos = save; return nil } -\t\tpos++ -\t\treturn &Cst{Rule: ${goStr(r.name)}, Children: []*Cst{{IsLeaf: true, TokenType: "$punct", Offset: t.Off, End: t.End}, inner, {IsLeaf: true, TokenType: "$punct", Offset: c.Off, End: c.End}}, Offset: t.Off, End: c.End} -\t}` : ''} +${r.nudBrackets.map(bracketNud).join('\n')} \tif pbp, ok := ${r.name}PRE[t.Text]; ok { \t\tsave := pos; pos++ \t\topLeaf := &Cst{IsLeaf: true, TokenType: "$operator", Offset: t.Off, End: t.End} \t\toperand := ${r.name}bp(pbp) \t\tif operand == nil { pos = save; return nil } -\t\treturn &Cst{Rule: ${goStr(r.name)}, Children: []*Cst{opLeaf, operand}, Offset: t.Off, End: operand.End} +\t\treturn &Cst{Rule: ${J(r.name)}, Children: []*Cst{opLeaf, operand}, Offset: t.Off, End: operand.End} \t} \treturn nil }`; @@ -117,7 +142,9 @@ import ( \t"fmt" \t"io" \t"os" +\t"strconv" \t"strings" +\t"time" ) type Tok struct { @@ -145,11 +172,14 @@ func peek() *Tok { } func branch(rule string, kids []*Cst, save int) *Cst { \toffset := 0 -\tif len(kids) > 0 { offset = kids[0].Offset } else if save < len(toks) { offset = toks[save].Off } else if len(toks) > 0 { offset = toks[len(toks)-1].End } +\tif 
len(kids) > 0 { offset = kids[0].Offset } else if save < len(toks) { offset = toks[save].Off } \tend := offset \tif len(kids) > 0 { end = kids[len(kids)-1].End } \treturn &Cst{Rule: rule, Children: kids, Offset: offset, End: end} } +func node(rule string, kids []*Cst) *Cst { +\treturn &Cst{Rule: rule, Children: kids, Offset: kids[0].Offset, End: kids[len(kids)-1].End} +} func matchLit(value, ttype string, kids *[]*Cst) bool { \tt := peek() \tif t == nil || t.Text != value { return false } @@ -169,6 +199,18 @@ func star(once func() bool, kids *[]*Cst) bool { \tfor { sp := pos; before := len(*kids); if !once() { pos = sp; *kids = (*kids)[:before]; break } } \treturn true } +func opt(body func() bool, kids *[]*Cst) bool { +\tsp := pos; before := len(*kids); if !body() { pos = sp; *kids = (*kids)[:before] }; return true +} +func sepBy(elem func() bool, delim string, kids *[]*Cst) bool { +\tif !elem() { return false } +\tfor { sp := pos; before := len(*kids); if matchLit(delim, "$punct", kids) && elem() { continue }; pos = sp; *kids = (*kids)[:before]; break } +\treturn true +} +func altLit(opts [][2]string, kids *[]*Cst) bool { +\tfor _, o := range opts { if matchLit(o[0], o[1], kids) { return true } } +\treturn false +} ${ruleFns} @@ -178,16 +220,24 @@ func writeJSON(c *Cst, b *strings.Builder) { \t\treturn \t} \tfmt.Fprintf(b, "{\\"rule\\":%q,\\"children\\":[", c.Rule) -\tfor i, k := range c.Children { -\t\tif i > 0 { b.WriteByte(',') } -\t\twriteJSON(k, b) -\t} +\tfor i, k := range c.Children { if i > 0 { b.WriteByte(',') }; writeJSON(k, b) } \tfmt.Fprintf(b, "],\\"offset\\":%d,\\"end\\":%d}", c.Offset, c.End) } func main() { \tdata, _ := io.ReadAll(os.Stdin) -\ttoks = lex(string(data)) +\tsrc := string(data) +\t// Self-bench: a numeric arg N times the lex+parse loop and prints ms/iteration. 
+\tif len(os.Args) > 1 { +\t\tif iters, err := strconv.Atoi(os.Args[1]); err == nil && iters > 0 { +\t\t\tfor i := 0; i < 3; i++ { toks = lex(src); pos = 0; parse${ir.entry}() } +\t\t\tt0 := time.Now() +\t\t\tfor i := 0; i < iters; i++ { toks = lex(src); pos = 0; parse${ir.entry}() } +\t\t\tfmt.Printf("%.4f\\n", float64(time.Since(t0).Nanoseconds())/1e6/float64(iters)) +\t\t\treturn +\t\t} +\t} +\ttoks = lex(src) \tpos = 0 \troot := parse${ir.entry}() \tif root == nil || pos != len(toks) { diff --git a/src/target-rust.ts b/src/target-rust.ts index 726ff1a..fb0c641 100644 --- a/src/target-rust.ts +++ b/src/target-rust.ts @@ -1,24 +1,52 @@ // The Rust Target for emit-portable. Renders the same language-agnostic ParserIR as -// tsTarget/goTarget into a self-contained Rust program (no external crates — the char-class -// lexer is regex-free, so it compiles with rustc alone, no Cargo/network). Its CST JSON is -// checked byte-for-byte against the interpreter, so `emitPortableParser(grammar, rustTarget)` -// is a real, verified Rust parser derived from the same grammar definition. -import type { ParserIR, RdRule, PrattRule, Step, CharRange, Target } from './emit-portable.ts'; +// tsTarget/goTarget into a self-contained Rust program (no external crates — the lexer is +// regex-free, so it compiles with rustc alone, no Cargo/network). Its CST JSON is checked +// byte-for-byte against the interpreter, so `emitPortableParser(grammar, rustTarget)` is a +// real, verified Rust parser derived from the same grammar definition. +// +// Rust ownership note: a CST node is OWNED (moved), unlike the TS/Go pointer trees. In the +// Pratt LED loop `left` can only be moved into a child vec once the continuation is known to +// match — so a mixfix LED matches its steps into a SEPARATE kids vec first, then (on success) +// moves `left` to the front and reassigns; on failure `left` is untouched and the loop +// returns it. 
Sub-sequence combinators (star/opt/sep) take non-capturing fn pointers +// `fn(&mut Parser, &mut Vec) -> bool`, threading the parser + kids as params (so nothing +// is captured, sidestepping the borrow checker). +import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target } from './emit-portable.ts'; const J = (v: unknown) => JSON.stringify(v); -const rsStr = (s: string) => J(s); // Rust and JSON string literals coincide for our ASCII vocab const rangeCond = (v: string, rs: CharRange[]) => - rs.map(([lo, hi]) => (lo === hi ? `${v} == ${lo}` : `(${lo}..=${hi}).contains(&${v})`)).join(' || '); + '(' + rs.map(([lo, hi]) => (lo === hi ? `${v} == ${lo}` : `(${lo}..=${hi}).contains(&${v})`)).join(' || ') + ')'; -function lexer(ir: ParserIR): string { - const cases = ir.tokens.map((t) => ` if ${rangeCond('c', t.first)} { +function scanTok(t: LexTok): string { + const push = t.skip ? '' : `toks.push(Tok { kind: ${J((t as { name: string }).name)}, text: &src[pos..e], off: pos, end: e }); `; + if (t.kind === 'run') return ` if ${rangeCond('c', t.first)} { + let mut e = pos + 1; + while e < n { let cc = b[e] as u32; if !${rangeCond('cc', t.cont)} { break } e += 1; } + ${push}pos = e; continue; + }`; + if (t.kind === 'string') return ` if c == ${t.delim.charCodeAt(0)} { let mut e = pos + 1; - while e < n { let cc = b[e] as u32; if !(${rangeCond('cc', t.cont)}) { break } e += 1; } - toks.push(Tok { kind: ${rsStr(t.name)}.to_string(), text: src[pos..e].to_string(), off: pos, end: e }); pos = e; continue; - }`).join('\n'); - const punctChecks = ir.puncts.map((p) => - ` if src[pos..].starts_with(${rsStr(p)}) { toks.push(Tok { kind: String::new(), text: ${rsStr(p)}.to_string(), off: pos, end: pos + ${p.length} }); pos += ${p.length}; continue; }`).join('\n'); - return `fn lex(src: &str) -> Vec { + while e < n { let ch = b[e] as u32; if ch == 92 { e += 2; continue } if ch == ${t.delim.charCodeAt(0)} { e += 1; break } e += 1; } + ${push}pos = e; continue; + 
}`; + if (t.kind === 'line') return ` if src[pos..].starts_with(${J(t.prefix)}) { + let mut e = pos + ${t.prefix.length}; + while e < n && b[e] != 10 { e += 1; } + ${push}pos = e; continue; + }`; + return ` if src[pos..].starts_with(${J(t.open)}) { + let mut e = pos + ${t.open.length}; + while e < n && !src[e..].starts_with(${J(t.close)}) { e += 1; } + if e < n { e += ${t.close.length}; } + ${push}pos = e; continue; + }`; +} + +function lexer(ir: ParserIR): string { + const toks = ir.tokens.map(scanTok).join('\n'); + const puncts = ir.puncts.map((p) => + ` if src[pos..].starts_with(${J(p)}) { toks.push(Tok { kind: "", text: &src[pos..pos + ${p.length}], off: pos, end: pos + ${p.length} }); pos += ${p.length}; continue; }`).join('\n'); + return `fn lex<'a>(src: &'a str) -> Vec> { let b = src.as_bytes(); let n = b.len(); let mut toks: Vec = Vec::new(); @@ -26,47 +54,66 @@ function lexer(ir: ParserIR): string { while pos < n { let c = b[pos] as u32; if c == 32 || c == 9 || c == 10 || c == 13 { pos += 1; continue; } -${cases} -${punctChecks} +${toks} +${puncts} panic!("lex error at {}", pos); } toks }`; } -function rdRule(r: RdRule): string { - const alt = (steps: Step[]) => { - const conds = steps.map(stepCond).join(' && '); - return ` { let mut kids: Vec = Vec::new(); if ${conds} { return Some(self.branch(${rsStr(r.name)}, kids, save)); } self.pos = save; }`; - }; - return ` fn parse_${r.name}(&mut self) -> Option { - let save = self.pos; -${r.alts.map(alt).join('\n')} - None - }`; -} +// Top-level step: uses `self` and `&mut kids`. 
function stepCond(s: Step): string { switch (s.t) { - case 'lit': return `self.match_lit(${rsStr(s.value)}, ${rsStr(s.ttype)}, &mut kids)`; - case 'tok': return `self.match_tok(${rsStr(s.name)}, &mut kids)`; + case 'lit': return `self.match_lit(${J(s.value)}, ${J(s.ttype)}, &mut kids)`; + case 'tok': return `self.match_tok(${J(s.name)}, &mut kids)`; case 'rule': return `self.call_rule(Parser::parse_${s.name}, &mut kids)`; - case 'star': return `self.star(|p, k| ${starInner(s.step)}, &mut kids)`; + case 'star': return `self.star(|p, k| ${stepCondP(s.step)}, &mut kids)`; + case 'opt': return `self.opt(|p, k| ${s.steps.map(stepCondP).join(' && ')}, &mut kids)`; + case 'sep': return `self.sep_by(|p, k| ${stepCondP(s.elem)}, ${J(s.delim)}, &mut kids)`; + case 'altlit': return `self.alt_lit(&[${s.opts.map((o) => `(${J(o.value)}, ${J(o.ttype)})`).join(', ')}], &mut kids)`; } } -function starInner(s: Step): string { +// Inside a closure: uses `p` and `k`. +function stepCondP(s: Step): string { switch (s.t) { - case 'lit': return `p.match_lit(${rsStr(s.value)}, ${rsStr(s.ttype)}, k)`; - case 'tok': return `p.match_tok(${rsStr(s.name)}, k)`; + case 'lit': return `p.match_lit(${J(s.value)}, ${J(s.ttype)}, k)`; + case 'tok': return `p.match_tok(${J(s.name)}, k)`; case 'rule': return `p.call_rule(Parser::parse_${s.name}, k)`; - case 'star': throw new Error('portable: nested star unsupported'); + case 'star': return `p.star(|p, k| ${stepCondP(s.step)}, k)`; + case 'opt': return `p.opt(|p, k| ${s.steps.map(stepCondP).join(' && ')}, k)`; + case 'sep': return `p.sep_by(|p, k| ${stepCondP(s.elem)}, ${J(s.delim)}, k)`; + case 'altlit': return `p.alt_lit(&[${s.opts.map((o) => `(${J(o.value)}, ${J(o.ttype)})`).join(', ')}], k)`; } } +function rdRule(r: RdRule): string { + const alt = (steps: Step[]) => + ` { let mut kids: Vec = Vec::new(); if ${steps.map(stepCond).join(' && ')} { return Some(self.branch(${J(r.name)}, kids, save)); } self.pos = save; }`; + return ` fn 
parse_${r.name}(&mut self) -> Option { + let save = self.pos; +${r.alts.map(alt).join('\n')} + None + }`; +} + function prattRule(r: PrattRule): string { - const binArms = r.binary.map((b) => `${rsStr(b.op)} => Some((${b.lbp}, ${b.rbp}))`).join(', '); - const preArms = r.prefix.map((p) => `${rsStr(p.op)} => Some(${p.rbp})`).join(', '); - const atomArm = r.atomToks.map(rsStr).join(' | '); - const g = r.group; + const binArms = r.binary.map((b) => `${J(b.op)} => Some((${b.lbp}, ${b.rbp}))`).join(', '); + const preArms = r.prefix.map((p) => `${J(p.op)} => Some(${p.rbp})`).join(', '); + const atomArm = r.nudToks.map(J).join(' | '); + const bracketNud = (b: Bracket) => ` if t.text == ${J(b.first)} { + let save = self.pos; let mut kids: Vec = Vec::new(); + if ${b.steps.map(stepCond).join(' && ')} { return Some(node(${J(r.name)}, kids)); } + self.pos = save; return None; + }`; + const ledArm = (b: Bracket) => ` if t.text == ${J(b.first)} { + let led_save = self.pos; let mut kids: Vec = Vec::new(); + if ${b.steps.map(stepCond).join(' && ')} { + let mut full = vec![left]; full.append(&mut kids); + left = node(${J(r.name)}, full); continue; + } + self.pos = led_save; break; + }`; return ` fn parse_${r.name}(&mut self) -> Option { self.${r.name}_bp(0) } fn ${r.name}_bin(op: &str) -> Option<(i64, i64)> { match op { ${binArms}${binArms ? ', ' : ''}_ => None } } fn ${r.name}_pre(op: &str) -> Option { match op { ${preArms}${preArms ? 
', ' : ''}_ => None } } @@ -75,41 +122,29 @@ function prattRule(r: PrattRule): string { let mut left = self.${r.name}_nud()?; loop { let t = match self.peek() { Some(t) => t, None => break }; - let (lbp, rbp) = match Parser::${r.name}_bin(&t.text) { Some(x) => x, None => break }; +${r.leds.map(ledArm).join('\n')} + let (lbp, rbp) = match Parser::${r.name}_bin(t.text) { Some(x) => x, None => break }; if lbp <= min_bp { break; } let led_save = self.pos; self.pos += 1; let op_leaf = Cst::leaf("$operator", t.off, t.end); let rhs = match self.${r.name}_bp(rbp) { Some(r) => r, None => { self.pos = led_save; break; } }; - let (off, end) = (left.offset, rhs.end); - left = Cst::node(${rsStr(r.name)}, vec![left, op_leaf, rhs], off, end); + left = node(${J(r.name)}, vec![left, op_leaf, rhs]); } Some(left) } fn ${r.name}_nud(&mut self) -> Option { let t = self.peek()?; - if Parser::${r.name}_atom(&t.kind) { + if Parser::${r.name}_atom(t.kind) { self.pos += 1; - return Some(Cst::node(${rsStr(r.name)}, vec![Cst::leaf(&t.kind, t.off, t.end)], t.off, t.end)); + return Some(Cst::node(${J(r.name)}, vec![Cst::leaf(t.kind, t.off, t.end)], t.off, t.end)); } -${g ? 
` if t.text == ${rsStr(g.open)} { - let save = self.pos; self.pos += 1; - let inner = self.${r.name}_bp(0); - let c = self.peek(); - match (inner, c) { - (Some(inner), Some(c)) if c.text == ${rsStr(g.close)} => { - self.pos += 1; - let (off, end) = (t.off, c.end); - return Some(Cst::node(${rsStr(r.name)}, vec![Cst::leaf("$punct", t.off, t.end), inner, Cst::leaf("$punct", c.off, c.end)], off, end)); - } - _ => { self.pos = save; return None; } - } - }` : ''} - if let Some(pbp) = Parser::${r.name}_pre(&t.text) { +${r.nudBrackets.map(bracketNud).join('\n')} + if let Some(pbp) = Parser::${r.name}_pre(t.text) { let save = self.pos; self.pos += 1; let op_leaf = Cst::leaf("$operator", t.off, t.end); match self.${r.name}_bp(pbp) { - Some(operand) => { let (off, end) = (t.off, operand.end); return Some(Cst::node(${rsStr(r.name)}, vec![op_leaf, operand], off, end)); } + Some(operand) => { let (o, e) = (t.off, operand.end); return Some(Cst::node(${J(r.name)}, vec![op_leaf, operand], o, e)); } None => { self.pos = save; return None; } } } @@ -126,38 +161,56 @@ export const rustTarget: Target = { #![allow(non_snake_case)] use std::io::Read; -#[derive(Clone)] -struct Tok { kind: String, text: String, off: usize, end: usize } +// Zero-alloc tokens: kind is a known grammar name (&'static str), text is a slice of the +// source. Tok is Copy, so peek() copies pointers — no per-peek heap work. +#[derive(Clone, Copy)] +struct Tok<'a> { kind: &'static str, text: &'a str, off: usize, end: usize } -struct Cst { rule: String, children: Vec, is_leaf: bool, token_type: String, offset: usize, end: usize } +// CST nodes hold only &'static str labels (rule names / token-type tags are all literals) +// + usize spans — no per-node String allocation. 
+struct Cst { rule: &'static str, children: Vec, is_leaf: bool, token_type: &'static str, offset: usize, end: usize } impl Cst { - fn leaf(tt: &str, off: usize, end: usize) -> Cst { Cst { rule: String::new(), children: Vec::new(), is_leaf: true, token_type: tt.to_string(), offset: off, end } } - fn node(rule: &str, children: Vec, offset: usize, end: usize) -> Cst { Cst { rule: rule.to_string(), children, is_leaf: false, token_type: String::new(), offset, end } } + fn leaf(tt: &'static str, off: usize, end: usize) -> Cst { Cst { rule: "", children: Vec::new(), is_leaf: true, token_type: tt, offset: off, end } } + fn node(rule: &'static str, children: Vec, offset: usize, end: usize) -> Cst { Cst { rule, children, is_leaf: false, token_type: "", offset, end } } } +// offset/end inferred from first/last child (children non-empty). +fn node(rule: &'static str, kids: Vec) -> Cst { let o = kids[0].offset; let e = kids[kids.len() - 1].end; Cst::node(rule, kids, o, e) } ${lexer(ir)} -struct Parser { toks: Vec, pos: usize } -impl Parser { - fn peek(&self) -> Option { if self.pos < self.toks.len() { Some(self.toks[self.pos].clone()) } else { None } } - fn branch(&self, rule: &str, kids: Vec, save: usize) -> Cst { - let offset = if !kids.is_empty() { kids[0].offset } else if save < self.toks.len() { self.toks[save].off } else if !self.toks.is_empty() { self.toks[self.toks.len() - 1].end } else { 0 }; +struct Parser<'a> { toks: Vec>, pos: usize } +impl<'a> Parser<'a> { + fn peek(&self) -> Option> { if self.pos < self.toks.len() { Some(self.toks[self.pos]) } else { None } } + fn branch(&self, rule: &'static str, kids: Vec, save: usize) -> Cst { + let offset = if !kids.is_empty() { kids[0].offset } else if save < self.toks.len() { self.toks[save].off } else { 0 }; let end = if !kids.is_empty() { kids[kids.len() - 1].end } else { offset }; Cst::node(rule, kids, offset, end) } - fn match_lit(&mut self, value: &str, ttype: &str, kids: &mut Vec) -> bool { + fn match_lit(&mut self, 
value: &str, ttype: &'static str, kids: &mut Vec) -> bool { match self.peek() { Some(t) if t.text == value => { kids.push(Cst::leaf(ttype, t.off, t.end)); self.pos += 1; true } _ => false } } - fn match_tok(&mut self, name: &str, kids: &mut Vec) -> bool { + fn match_tok(&mut self, name: &'static str, kids: &mut Vec) -> bool { match self.peek() { Some(t) if t.kind == name => { kids.push(Cst::leaf(name, t.off, t.end)); self.pos += 1; true } _ => false } } - fn call_rule(&mut self, f: fn(&mut Parser) -> Option, kids: &mut Vec) -> bool { + fn call_rule(&mut self, f: fn(&mut Parser<'a>) -> Option, kids: &mut Vec) -> bool { match f(self) { Some(n) => { kids.push(n); true } None => false } } - fn star(&mut self, once: fn(&mut Parser, &mut Vec) -> bool, kids: &mut Vec) -> bool { + fn star(&mut self, once: fn(&mut Parser<'a>, &mut Vec) -> bool, kids: &mut Vec) -> bool { loop { let sp = self.pos; let before = kids.len(); if !once(self, kids) { self.pos = sp; kids.truncate(before); break; } } true } + fn opt(&mut self, body: fn(&mut Parser<'a>, &mut Vec) -> bool, kids: &mut Vec) -> bool { + let sp = self.pos; let before = kids.len(); if !body(self, kids) { self.pos = sp; kids.truncate(before); } true + } + fn sep_by(&mut self, elem: fn(&mut Parser<'a>, &mut Vec) -> bool, delim: &str, kids: &mut Vec) -> bool { + if !elem(self, kids) { return false; } + loop { let sp = self.pos; let before = kids.len(); if self.match_lit(delim, "$punct", kids) && elem(self, kids) { continue; } self.pos = sp; kids.truncate(before); break; } + true + } + fn alt_lit(&mut self, opts: &[(&str, &'static str)], kids: &mut Vec) -> bool { + for (v, tt) in opts { if self.match_lit(v, tt, kids) { return true; } } + false + } ${ruleFns} } @@ -175,6 +228,15 @@ fn write_json(c: &Cst, out: &mut String) { fn main() { let mut src = String::new(); std::io::stdin().read_to_string(&mut src).unwrap(); + // Self-bench: a numeric arg N times the lex+parse loop and prints ms/iteration. 
+ if let Some(iters) = std::env::args().nth(1).and_then(|a| a.parse::().ok()) { + // black_box on the input + result so the optimizer can't elide the lex/parse. + for _ in 0..3 { let toks = lex(std::hint::black_box(&src)); let mut p = Parser { toks, pos: 0 }; std::hint::black_box(p.parse_${ir.entry}()); } + let t = std::time::Instant::now(); + for _ in 0..iters { let toks = lex(std::hint::black_box(&src)); let mut p = Parser { toks, pos: 0 }; std::hint::black_box(p.parse_${ir.entry}()); } + println!("{:.4}", t.elapsed().as_secs_f64() * 1000.0 / iters as f64); + return; + } let toks = lex(&src); let n = toks.len(); let mut p = Parser { toks, pos: 0 }; diff --git a/src/target-ts.ts b/src/target-ts.ts index ab37220..304eef5 100644 --- a/src/target-ts.ts +++ b/src/target-ts.ts @@ -1,21 +1,43 @@ // The TypeScript Target for emit-portable. Renders the language-agnostic ParserIR into a -// self-contained TS parser: a char-class lexer, a backtracking recursive-descent core, a -// Pratt expression engine, and a CST→JSON printer over stdin. It is the reference rendering -// — its CST is checked byte-for-byte against the interpreter (createParser), so a divergence -// in the portable logic shows up here before Go/Rust are even compiled. -import type { ParserIR, RdRule, PrattRule, Step, CharRange, Target } from './emit-portable.ts'; +// self-contained TS parser: a char-class/string/comment lexer, a backtracking recursive- +// descent core, a Pratt expression engine (prefix + binary precedence + mixfix call/member/ +// index LEDs), and a CST→JSON printer over stdin. It is the reference rendering — its CST +// is checked byte-for-byte against the interpreter (createParser), so a divergence in the +// portable logic surfaces here before Go/Rust are compiled. 
+import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target } from './emit-portable.ts'; const J = (v: unknown) => JSON.stringify(v); const rangeCond = (v: string, rs: CharRange[]) => - rs.map(([lo, hi]) => (lo === hi ? `${v} === ${lo}` : `${v} >= ${lo} && ${v} <= ${hi}`)).join(' || '); + '(' + rs.map(([lo, hi]) => (lo === hi ? `${v} === ${lo}` : `${v} >= ${lo} && ${v} <= ${hi}`)).join(' || ') + ')'; -function lexer(ir: ParserIR): string { - const cases = ir.tokens.map((t) => ` if (${rangeCond('c', t.first)}) { +function scanTok(t: LexTok): string { + const push = t.skip ? '' : `toks.push({ kind: ${J((t as { name: string }).name)}, text: src.slice(pos, e), off: pos, end: e }); `; + if (t.kind === 'run') return ` if (${rangeCond('c', t.first)}) { + let e = pos + 1; + while (e < n) { const cc = src.charCodeAt(e); if (!${rangeCond('cc', t.cont)}) break; e++; } + ${push}pos = e; continue; + }`; + if (t.kind === 'string') return ` if (c === ${t.delim.charCodeAt(0)}) { let e = pos + 1; - while (e < n) { const cc = src.charCodeAt(e); if (!(${rangeCond('cc', t.cont)})) break; e++; } - toks.push({ kind: ${J(t.name)}, text: src.slice(pos, e), off: pos, end: e }); pos = e; continue; - }`).join('\n'); - const punctChecks = ir.puncts.map((p) => + while (e < n) { const ch = src.charCodeAt(e); if (ch === 92) { e += 2; continue; } if (ch === ${t.delim.charCodeAt(0)}) { e++; break; } e++; } + ${push}pos = e; continue; + }`; + if (t.kind === 'line') return ` if (src.startsWith(${J(t.prefix)}, pos)) { + let e = pos + ${t.prefix.length}; + while (e < n && src.charCodeAt(e) !== 10) e++; + ${push}pos = e; continue; + }`; + return ` if (src.startsWith(${J(t.open)}, pos)) { + let e = pos + ${t.open.length}; + while (e < n && !src.startsWith(${J(t.close)}, e)) e++; + if (e < n) e += ${t.close.length}; + ${push}pos = e; continue; + }`; +} + +function lexer(ir: ParserIR): string { + const toks = ir.tokens.map(scanTok).join('\n'); + const puncts = ir.puncts.map((p) => 
` if (src.startsWith(${J(p)}, pos)) { toks.push({ kind: '', text: ${J(p)}, off: pos, end: pos + ${p.length} }); pos += ${p.length}; continue; }`).join('\n'); return `function lex(src: string): Tok[] { const toks: Tok[] = []; @@ -24,42 +46,54 @@ function lexer(ir: ParserIR): string { while (pos < n) { const c = src.charCodeAt(pos); if (c === 32 || c === 9 || c === 10 || c === 13) { pos++; continue; } -${cases} -${punctChecks} +${toks} +${puncts} throw new Error('lex error at ' + pos + ': ' + JSON.stringify(src[pos])); } return toks; }`; } -function rdRule(r: RdRule): string { - const alt = (steps: Step[]) => { - const conds = steps.map(stepCond).join(' && '); - return ` { const kids: Cst[] = []; if (${conds}) return branch(${J(r.name)}, kids, save); pos = save; }`; - }; - return `function parse${r.name}(): Node | null { - const save = pos; -${r.alts.map(alt).join('\n')} - return null; -}`; -} +// A Step as a boolean expression (appends to the in-scope `kids`). function stepCond(s: Step): string { switch (s.t) { case 'lit': return `matchLit(${J(s.value)}, ${J(s.ttype)}, kids)`; case 'tok': return `matchTok(${J(s.name)}, kids)`; case 'rule': return `callRule(parse${s.name}, kids)`; case 'star': return `star(() => ${stepCond(s.step)}, kids)`; + case 'opt': return `opt(() => ${s.steps.map(stepCond).join(' && ')}, kids)`; + case 'sep': return `sepBy(() => ${stepCond(s.elem)}, ${J(s.delim)}, kids)`; + case 'altlit': return `altLit([${s.opts.map((o) => `[${J(o.value)}, ${J(o.ttype)}]`).join(', ')}], kids)`; } } +function rdRule(r: RdRule): string { + const alt = (steps: Step[]) => + ` { const kids: Cst[] = []; if (${steps.map(stepCond).join(' && ')}) return branch(${J(r.name)}, kids, save); pos = save; }`; + return `function parse${r.name}(): Node | null { + const save = pos; +${r.alts.map(alt).join('\n')} + return null; +}`; +} + function prattRule(r: PrattRule): string { const BIN = `{ ${r.binary.map((b) => `${J(b.op)}: { lbp: ${b.lbp}, rbp: ${b.rbp} }`).join(', ')} }`; 
const PRE = `{ ${r.prefix.map((p) => `${J(p.op)}: ${p.rbp}`).join(', ')} }`; - const atomSet = `new Set([${r.atomToks.map(J).join(', ')}])`; - const group = r.group; + const atom = `new Set([${r.nudToks.map(J).join(', ')}])`; + const bracketNud = (b: Bracket) => ` if (t.text === ${J(b.first)}) { + const save = pos; const kids: Cst[] = []; + if (${b.steps.map(stepCond).join(' && ')}) return node(${J(r.name)}, kids); + pos = save; return null; + }`; + const ledArm = (b: Bracket) => ` if (t.text === ${J(b.first)}) { + const ledSave = pos; const kids: Cst[] = [left]; + if (${b.steps.map(stepCond).join(' && ')}) { left = node(${J(r.name)}, kids); continue; } + pos = ledSave; break; + }`; return `const ${r.name}_BIN: Record = ${BIN}; const ${r.name}_PRE: Record = ${PRE}; -const ${r.name}_ATOM = ${atomSet}; +const ${r.name}_ATOM = ${atom}; function parse${r.name}(): Node | null { return ${r.name}_bp(0); } function ${r.name}_bp(minBp: number): Node | null { let left = ${r.name}_nud(); @@ -67,6 +101,7 @@ function ${r.name}_bp(minBp: number): Node | null { for (;;) { const t = peek(); if (t === null) break; +${r.leds.map(ledArm).join('\n')} const info = ${r.name}_BIN[t.text]; if (info === undefined || info.lbp <= minBp) break; const ledSave = pos; @@ -82,14 +117,7 @@ function ${r.name}_nud(): Node | null { const t = peek(); if (t === null) return null; if (${r.name}_ATOM.has(t.kind)) { pos++; return { rule: ${J(r.name)}, children: [{ tokenType: t.kind, offset: t.off, end: t.end }], offset: t.off, end: t.end }; } -${group ? 
` if (t.text === ${J(group.open)}) { - const save = pos; pos++; - const inner = ${r.name}_bp(0); - const c = peek(); - if (inner === null || c === null || c.text !== ${J(group.close)}) { pos = save; return null; } - pos++; - return { rule: ${J(r.name)}, children: [{ tokenType: '$punct', offset: t.off, end: t.end }, inner, { tokenType: '$punct', offset: c.off, end: c.end }], offset: t.off, end: c.end }; - }` : ''} +${r.nudBrackets.map(bracketNud).join('\n')} const pbp = ${r.name}_PRE[t.text]; if (pbp !== undefined) { const save = pos; pos++; @@ -120,12 +148,14 @@ ${lexer(ir)} let toks: Tok[] = []; let pos = 0; function peek(): Tok | null { return pos < toks.length ? toks[pos] : null; } -function curOff(): number { return pos < toks.length ? toks[pos].off : (toks.length > 0 ? toks[toks.length - 1].end : 0); } function branch(rule: string, kids: Cst[], save: number): Node { - const offset = kids.length > 0 ? kids[0].offset : (save < toks.length ? toks[save].off : curOff()); + const offset = kids.length > 0 ? kids[0].offset : (save < toks.length ? toks[save].off : 0); const end = kids.length > 0 ? 
kids[kids.length - 1].end : offset; return { rule, children: kids, offset, end }; } +function node(rule: string, kids: Cst[]): Node { + return { rule, children: kids, offset: kids[0].offset, end: kids[kids.length - 1].end }; +} function matchLit(value: string, ttype: string, kids: Cst[]): boolean { const t = peek(); if (t === null || t.text !== value) return false; @@ -145,10 +175,21 @@ function star(once: () => boolean, kids: Cst[]): boolean { for (;;) { const sp = pos; const before = kids.length; if (!once()) { pos = sp; kids.length = before; break; } } return true; } +function opt(body: () => boolean, kids: Cst[]): boolean { + const sp = pos; const before = kids.length; if (!body()) { pos = sp; kids.length = before; } return true; +} +function sepBy(elem: () => boolean, delim: string, kids: Cst[]): boolean { + if (!elem()) return false; + for (;;) { const sp = pos; const before = kids.length; if (matchLit(delim, '$punct', kids) && elem()) continue; pos = sp; kids.length = before; break; } + return true; +} +function altLit(opts: [string, string][], kids: Cst[]): boolean { + for (const [v, tt] of opts) if (matchLit(v, tt, kids)) return true; + return false; +} ${ruleFns} -function offsetEnd(n: Cst): number { return n.end; } const src = readFileSync(0, 'utf8'); toks = lex(src); pos = 0; diff --git a/test/portable-targets.ts b/test/portable-targets.ts index 5d4e3b0..8c5384e 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -1,15 +1,17 @@ // Gate: the TARGET-AGNOSTIC emitter (issue #6) — `emitPortableParser(grammar, target)` // derives a parser in EACH target language that produces the byte-identical CST the -// interpreter (createParser) does. 
This is the agnosticism proof by EXECUTION: the same -// examples/calc.ts grammar is rendered to TypeScript, Go, and Rust; the Go and Rust -// sources are COMPILED and RUN, and every parser's CST output is compared, node-for-node, -// against the createParser oracle over an adversarial corpus (operator precedence / -// associativity, prefix chains, nested grouping, multi-statement programs, and the empty -// program), plus reject-parity on malformed input. +// interpreter (createParser) does. The agnosticism proof by EXECUTION: every grammar is +// rendered to TypeScript, Go, and Rust; the Go/Rust sources are COMPILED and RUN, and each +// parser's CST output is compared, node-for-node, against the createParser oracle over an +// adversarial corpus, plus reject-parity on malformed input. // -// Go/Rust toolchains are optional: a missing `go` or `rustc` is logged and skipped (the -// TS rendering, which needs only node, always runs) — the same graceful-degrade pattern -// the external-corpus gates use, so this stays green on a machine without them. +// - calc: operator precedence/associativity, prefix unary, nested grouping. +// - minijs: a real JavaScript SUBSET — a string/comment lexer, the full operator ladder, +// call/member/index chains, arrays, and statement forms (the grammar the Go/Rust +// output is benchmarked against oxc with). +// +// Go/Rust toolchains are optional: a missing `go`/`rustc` is logged and skipped (the TS +// rendering, which needs only node, always runs). 
import { execFileSync } from 'node:child_process'; import { mkdirSync, rmSync, writeFileSync } from 'node:fs'; import { createParser } from '../src/gen-parser.ts'; @@ -17,100 +19,109 @@ import { emitPortableParser } from '../src/emit-portable.ts'; import { tsTarget } from '../src/target-ts.ts'; import { goTarget } from '../src/target-go.ts'; import { rustTarget } from '../src/target-rust.ts'; +import type { CstGrammar } from '../src/types.ts'; -const grammar = (await import('../examples/calc.ts')).default; -const oracle = createParser(grammar); - -// Accepted inputs — each must parse to the SAME CST in every language. -const ACCEPT = [ - '1;', 'a;', '', // atoms + the empty program - '1 + 2 * 3;', '1 * 2 + 3;', // precedence both directions - '1 - 2 - 3;', 'a / b / c;', '1 + 2 + 3 + 4;', // left-associativity - '-a;', '-(-a);', '- - a;', // prefix + prefix chains - '-a * b;', '-a + b * c;', '-(a + b) * c;', // prefix vs infix vs grouping - '(1);', '((a));', '(1 + 2) * (3 - 4);', // nested grouping - 'a * b + c * d - e / f;', // mixed precedence ladder - 'let x = 1; let y = x + 2 * x; (y);', // multi-statement program - 'let z = -(a * b) / (c - -d);', 'foo; bar; baz;', +type Case = { grammar: string; path: string; accept: string[]; reject: string[] }; +const CASES: Case[] = [ + { + grammar: 'calc', path: '../examples/calc.ts', + accept: [ + '1;', 'a;', '', '1 + 2 * 3;', '1 * 2 + 3;', '1 - 2 - 3;', 'a / b / c;', '1 + 2 + 3 + 4;', + '-a;', '-(-a);', '- - a;', '-a * b;', '-a + b * c;', '-(a + b) * c;', + '(1);', '((a));', '(1 + 2) * (3 - 4);', 'a * b + c * d - e / f;', + 'let x = 1; let y = x + 2 * x; (y);', 'let z = -(a * b) / (c - -d);', 'foo; bar; baz;', + ], + reject: ['1 +;', '(1;', '1 2;', 'let = 1;', ') ;', '* a;', 'let x 1;'], + }, + { + grammar: 'minijs', path: '../examples/minijs.ts', + accept: [ + '1;', 'a;', '', 'x = 1 + 2 * 3;', '-a * b + 1;', '(1 + 2) * 3;', + 'foo(a, b);', 'a.b.c;', 'a[0][1];', 'f()()();', 'a.b(c).d[e];', + 'let x = 1; let y = x + 2;', 
'[1, 2, 3];', '[];', '[a, [b, c]];', + 'if (x < 10) { x = x + 1; } else { y(); }', 'while (i) { i = i - 1; }', + 'function f(a, b) { return a + b; }', 'var s = "hi\\"x"; // c\n s.length;', + '/* block */ a;', 'a === b !== c;', 'a && b || c;', '!a && -b;', + 'return;', 'return a + b;', 'const PI = 3;', '{ a; b; }', + 'f(g(h(x)), [1, 2], y.z);', 'while (a < b) { if (c) { d(); } e = e + 1; }', + ], + // (note: `let = 1;` is VALID minijs — no reserved-word guard, so `let` is an + // identifier and it's an assignment expression; the oracle accepts it too.) + reject: ['1 +;', '(1;', 'if x {}', 'foo(a,;', 'a.;', '[1,', 'function (){}'], + }, ]; -// Malformed inputs — every parser must REJECT (the oracle throws; the emitted parsers exit 1). -const REJECT = ['1 +;', '(1;', '1 2;', 'let = 1;', ') ;', '* a;', 'let x 1;']; -type Json = unknown; -const sortKeys = (o: Json): Json => +const sortKeys = (o: unknown): unknown => Array.isArray(o) ? o.map(sortKeys) - : (o && typeof o === 'object') ? Object.fromEntries(Object.keys(o as object).sort().map((k) => [k, sortKeys((o as Record)[k])])) + : (o && typeof o === 'object') ? 
Object.fromEntries(Object.keys(o as object).sort().map((k) => [k, sortKeys((o as Record)[k])])) : o; -const canon = (o: Json) => JSON.stringify(sortKeys(o)); - -function oracleOutcome(src: string): { ok: true; cst: string } | { ok: false } { - try { return { ok: true, cst: canon(oracle.parse(src)) }; } - catch { return { ok: false }; } -} +const canon = (o: unknown) => JSON.stringify(sortKeys(o)); const TMP = '/tmp/portable-targets'; rmSync(TMP, { recursive: true, force: true }); mkdirSync(TMP, { recursive: true }); +const have = (cmd: string, args: string[]) => { try { execFileSync(cmd, args, { stdio: 'pipe' }); return true; } catch { return false; } }; +const HAS_GO = have('go', ['version']); +const HAS_RUST = have('rustc', ['--version']); +if (!HAS_GO) console.log(' go: (toolchain absent — skipped)'); +if (!HAS_RUST) console.log(' rust: (toolchain absent — skipped)'); -function have(cmd: string, args: string[]): boolean { - try { execFileSync(cmd, args, { stdio: 'pipe' }); return true; } catch { return false; } -} - -// A runnable target: writes its source, (optionally) compiles, and returns a `run(src)->{ok,cst?}`. 
-type Runner = { label: string; run: (src: string) => { ok: true; cst: string } | { ok: false } }; - -function tsRunner(): Runner { - const f = `${TMP}/calc.ts`; - writeFileSync(f, emitPortableParser(grammar, tsTarget)); - return { label: 'typescript', run: (src) => runProc('node', [f], src) }; -} -function goRunner(): Runner | null { - if (!have('go', ['version'])) { console.log(' go: (toolchain absent — skipped)'); return null; } - const dir = `${TMP}/go`; mkdirSync(dir, { recursive: true }); - writeFileSync(`${dir}/main.go`, emitPortableParser(grammar, goTarget)); - writeFileSync(`${dir}/go.mod`, 'module calc\n\ngo 1.21\n'); - execFileSync('go', ['build', '-o', `${dir}/calc`, '.'], { cwd: dir, stdio: 'pipe' }); - return { label: 'go', run: (src) => runProc(`${dir}/calc`, [], src) }; -} -function rustRunner(): Runner | null { - if (!have('rustc', ['--version'])) { console.log(' rust: (toolchain absent — skipped)'); return null; } - const dir = `${TMP}/rust`; mkdirSync(dir, { recursive: true }); - const f = `${dir}/main.rs`; - writeFileSync(f, emitPortableParser(grammar, rustTarget)); - execFileSync('rustc', ['-O', f, '-o', `${dir}/calc`], { stdio: 'pipe' }); - return { label: 'rust', run: (src) => runProc(`${dir}/calc`, [], src) }; -} -function runProc(cmd: string, args: string[], src: string): { ok: true; cst: string } | { ok: false } { +type Outcome = { ok: true; cst: string } | { ok: false }; +function runProc(cmd: string, args: string[], src: string): Outcome { try { return { ok: true, cst: canon(JSON.parse(execFileSync(cmd, args, { input: src, stdio: ['pipe', 'pipe', 'pipe'] }).toString())) }; } catch { return { ok: false }; } } -const runners: Runner[] = [tsRunner(), goRunner(), rustRunner()].filter((r): r is Runner => r !== null); - let failures = 0; -for (const r of runners) { - let acc = 0, rej = 0; - for (const src of ACCEPT) { - const want = oracleOutcome(src); - const got = r.run(src); - if (want.ok && got.ok && want.cst === got.cst) { acc++; 
continue; } - failures++; - console.log(` ${r.label}: ACCEPT mismatch on ${JSON.stringify(src)}`); - if (want.ok && got.ok) { console.log(` want ${want.cst.slice(0, 140)}`); console.log(` got ${got.cst.slice(0, 140)}`); } - else console.log(` want.ok=${want.ok} got.ok=${got.ok}`); +for (const c of CASES) { + const grammar: CstGrammar = (await import(c.path)).default; + const oracle = createParser(grammar); + const oracleOut = (src: string): Outcome => { try { return { ok: true, cst: canon(oracle.parse(src)) }; } catch { return { ok: false }; } }; + + const dir = `${TMP}/${c.grammar}`; + mkdirSync(dir, { recursive: true }); + const runners: Array<{ label: string; run: (src: string) => Outcome }> = []; + + const tsFile = `${dir}/p.ts`; + writeFileSync(tsFile, emitPortableParser(grammar, tsTarget)); + runners.push({ label: 'typescript', run: (src) => runProc('node', [tsFile], src) }); + + if (HAS_GO) { + const gdir = `${dir}/go`; mkdirSync(gdir, { recursive: true }); + writeFileSync(`${gdir}/main.go`, emitPortableParser(grammar, goTarget)); + writeFileSync(`${gdir}/go.mod`, 'module p\n\ngo 1.21\n'); + execFileSync('go', ['build', '-o', `${gdir}/p`, '.'], { cwd: gdir, stdio: 'pipe' }); + runners.push({ label: 'go', run: (src) => runProc(`${gdir}/p`, [], src) }); } - for (const src of REJECT) { - const want = oracleOutcome(src); - const got = r.run(src); - if (!want.ok && !got.ok) { rej++; continue; } - failures++; - console.log(` ${r.label}: REJECT mismatch on ${JSON.stringify(src)} (oracle ok=${want.ok}, ${r.label} ok=${got.ok})`); + if (HAS_RUST) { + const rfile = `${dir}/main.rs`; + writeFileSync(rfile, emitPortableParser(grammar, rustTarget)); + execFileSync('rustc', ['-O', '-A', 'warnings', rfile, '-o', `${dir}/pr`], { stdio: 'pipe' }); + runners.push({ label: 'rust', run: (src) => runProc(`${dir}/pr`, [], src) }); + } + + for (const r of runners) { + let acc = 0, rej = 0; + for (const src of c.accept) { + const want = oracleOut(src), got = r.run(src); + if 
(want.ok && got.ok && want.cst === got.cst) { acc++; continue; } + failures++; + console.log(` ${c.grammar}/${r.label}: ACCEPT mismatch on ${JSON.stringify(src)}`); + if (want.ok && got.ok) { console.log(` want ${want.cst.slice(0, 140)}`); console.log(` got ${got.cst.slice(0, 140)}`); } + else console.log(` want.ok=${want.ok} got.ok=${got.ok}`); + } + for (const src of c.reject) { + const want = oracleOut(src), got = r.run(src); + if (!want.ok && !got.ok) { rej++; continue; } + failures++; + console.log(` ${c.grammar}/${r.label}: REJECT mismatch on ${JSON.stringify(src)} (oracle ok=${want.ok}, ${r.label} ok=${got.ok})`); + } + console.log(` ${c.grammar}/${r.label}: ${acc}/${c.accept.length} accept ≡ oracle · ${rej}/${c.reject.length} reject ≡ oracle`); } - console.log(` ${r.label}: ${acc}/${ACCEPT.length} accept ≡ oracle · ${rej}/${REJECT.length} reject ≡ oracle`); } if (failures > 0) { console.error(`\n✗ portable targets diverge from the interpreter (${failures} case(s))`); process.exit(1); } -console.log(`\n✓ ${runners.map((r) => r.label).join(' + ')} parsers derived from one grammar ≡ interpreter CST (compiled & run)`); +console.log('\n✓ portable parsers (ts/go/rust) derived from each grammar ≡ interpreter CST (compiled & run)'); From d1308d3c8f6482227d41e4bbe33d7b097f8b9030 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 00:32:01 +0800 Subject: [PATCH 06/27] emit-portable: arena-allocate the Go target (3.5x faster, vs tsgo) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Go target now allocates its CST from a flat arena instead of a heap *Cst per node: nodes live in `nodes []Node` (a node is an int32 index), children in a flat `kids []int32`, and in-progress children accumulate on a `scratch` stack. Backtracking truncates the three slices to saved lengths; the slices keep their capacity across parses, so a warmed parser allocates ~nothing. 
Indices (unlike the previous pointers) survive slice reallocation, which is what makes the arena work. This is the Go counterpart of the Rust target's zero-allocation change, and the same allocation lever the optimized emit-parser.ts pays for in JS: it took the derived Go parser from 19 MB/s to 67 MB/s (3.5x) on the 2.92 MB JS-subset corpus. Verified: CST byte-identical to the interpreter on the corpus + the portable gate (calc + minijs, ts/go/rust, 21/21+29/29 accept, 7/7 reject); the truncate-on- backtrack reclamation is exercised by the reject cases. Full suite 42/42. Benchmark vs tsgo (microsoft/typescript-go's native-Go parser, ParseSourceFile only, both parse the corpus clean): derived-Go 67 MB/s, tsgo 33 MB/s. The 3.5x arena win is the apples-to-apples result; the headline 2x-over-tsgo is partly because minijs is a subset of TypeScript (tsgo builds a richer AST — trivia, full node kinds — so it does more per node), not purely better codegen. Takeaway: a grammar-derived parser with arena allocation is in the same league as a hand-tuned native one; naive per-node allocation is what costs the 3.5x. --- src/target-go.ts | 187 ++++++++++++++++++++++++++--------------------- 1 file changed, 104 insertions(+), 83 deletions(-) diff --git a/src/target-go.ts b/src/target-go.ts index 02af630..85f9f30 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -3,6 +3,12 @@ // with no module dependencies). Its CST JSON is checked byte-for-byte against the interpreter, // so `emitPortableParser(grammar, goTarget)` is a real, verified Go parser derived from the // same grammar definition. +// +// ARENA allocation (to minimise GC pressure, as tsgo does): nodes live in a flat `nodes []Node`, +// their children in a flat `kids []int32`, and in-progress children accumulate on a `scratch` +// stack. A node is an int32 index, never a heap pointer. 
Backtracking truncates the three +// slices to saved lengths; the slices keep their capacity across parses (reset to len 0), so a +// warmed parser allocates ~nothing per parse. import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target } from './emit-portable.ts'; const J = (v: unknown) => JSON.stringify(v); @@ -39,7 +45,7 @@ function lexer(ir: ParserIR): string { const puncts = ir.puncts.map((p) => `\t\tif strings.HasPrefix(src[pos:], ${J(p)}) { toks = append(toks, Tok{"", ${J(p)}, pos, pos + ${p.length}}); pos += ${p.length}; continue }`).join('\n'); return `func lex(src string) []Tok { -\ttoks := []Tok{} +\ttoks := toks[:0] \tn := len(src) \tpos := 0 \tfor pos < n { @@ -55,23 +61,24 @@ ${puncts} function stepCond(s: Step): string { switch (s.t) { - case 'lit': return `matchLit(${J(s.value)}, ${J(s.ttype)}, &kids)`; - case 'tok': return `matchTok(${J(s.name)}, &kids)`; - case 'rule': return `callRule(parse${s.name}, &kids)`; - case 'star': return `star(func() bool { return ${stepCond(s.step)} }, &kids)`; - case 'opt': return `opt(func() bool { return ${s.steps.map(stepCond).join(' && ')} }, &kids)`; - case 'sep': return `sepBy(func() bool { return ${stepCond(s.elem)} }, ${J(s.delim)}, &kids)`; - case 'altlit': return `altLit([][2]string{${s.opts.map((o) => `{${J(o.value)}, ${J(o.ttype)}}`).join(', ')}}, &kids)`; + case 'lit': return `matchLit(${J(s.value)}, ${J(s.ttype)})`; + case 'tok': return `matchTok(${J(s.name)})`; + case 'rule': return `callRule(parse${s.name})`; + case 'star': return `star(func() bool { return ${stepCond(s.step)} })`; + case 'opt': return `opt(func() bool { return ${s.steps.map(stepCond).join(' && ')} })`; + case 'sep': return `sepBy(func() bool { return ${stepCond(s.elem)} }, ${J(s.delim)})`; + case 'altlit': return `altLit([][2]string{${s.opts.map((o) => `{${J(o.value)}, ${J(o.ttype)}}`).join(', ')}})`; } } function rdRule(r: RdRule): string { const alt = (steps: Step[]) => - `\t{ kids := []*Cst{}; if 
${steps.map(stepCond).join(' && ')} { return branch(${J(r.name)}, kids, save) }; pos = save }`; - return `func parse${r.name}() *Cst { -\tsave := pos + `\tif ${steps.map(stepCond).join(' && ')} { return finish(${J(r.name)}, sb, offAt(save)) } +\tpos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]`; + return `func parse${r.name}() int32 { +\tsave := pos; sb := len(scratch); nb := len(nodes); kb := len(kids) ${r.alts.map(alt).join('\n')} -\treturn nil +\treturn -1 }`; } @@ -80,53 +87,56 @@ function prattRule(r: PrattRule): string { const pre = r.prefix.map((p) => `${J(p.op)}: ${p.rbp}`).join(', '); const atoms = r.nudToks.map((k) => `${J(k)}: true`).join(', '); const bracketNud = (b: Bracket) => `\tif t.Text == ${J(b.first)} { -\t\tsave := pos; kids := []*Cst{} -\t\tif ${b.steps.map(stepCond).join(' && ')} { return node(${J(r.name)}, kids) } -\t\tpos = save; return nil +\t\tsave := pos; sb := len(scratch); nb := len(nodes); kb := len(kids) +\t\tif ${b.steps.map(stepCond).join(' && ')} { return finish(${J(r.name)}, sb, t.Off) } +\t\tpos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return -1 \t}`; const ledArm = (b: Bracket) => `\t\tif t.Text == ${J(b.first)} { -\t\t\tledSave := pos; kids := []*Cst{left} -\t\t\tif ${b.steps.map(stepCond).join(' && ')} { left = node(${J(r.name)}, kids); continue } -\t\t\tpos = ledSave; break +\t\t\tledSave := pos; sb := len(scratch); nb := len(nodes); kb := len(kids) +\t\t\tscratch = append(scratch, left) +\t\t\tif ${b.steps.map(stepCond).join(' && ')} { left = finish(${J(r.name)}, sb, nodes[left].Offset); continue } +\t\t\tpos = ledSave; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; break \t\t}`; return `var ${r.name}BIN = map[string]bp{${bin}} var ${r.name}PRE = map[string]int{${pre}} var ${r.name}ATOM = map[string]bool{${atoms}} -func parse${r.name}() *Cst { return ${r.name}bp(0) } -func ${r.name}bp(minBp int) *Cst { +func parse${r.name}() int32 { return ${r.name}bp(0) } 
+func ${r.name}bp(minBp int) int32 { \tleft := ${r.name}nud() -\tif left == nil { return nil } +\tif left < 0 { return -1 } \tfor { \t\tt := peek() \t\tif t == nil { break } ${r.leds.map(ledArm).join('\n')} \t\tinfo, ok := ${r.name}BIN[t.Text] \t\tif !ok || info.lbp <= minBp { break } -\t\tledSave := pos +\t\tledSave := pos; sb := len(scratch) +\t\tscratch = append(scratch, left, mkLeaf("$operator", t.Off, t.End)) \t\tpos++ -\t\topLeaf := &Cst{IsLeaf: true, TokenType: "$operator", Offset: t.Off, End: t.End} \t\trhs := ${r.name}bp(info.rbp) -\t\tif rhs == nil { pos = ledSave; break } -\t\tleft = &Cst{Rule: ${J(r.name)}, Children: []*Cst{left, opLeaf, rhs}, Offset: left.Offset, End: rhs.End} +\t\tif rhs < 0 { pos = ledSave; scratch = scratch[:sb]; break } +\t\tscratch = append(scratch, rhs) +\t\tleft = finish(${J(r.name)}, sb, nodes[left].Offset) \t} \treturn left } -func ${r.name}nud() *Cst { +func ${r.name}nud() int32 { \tt := peek() -\tif t == nil { return nil } +\tif t == nil { return -1 } \tif ${r.name}ATOM[t.Kind] { -\t\tpos++ -\t\treturn &Cst{Rule: ${J(r.name)}, Children: []*Cst{{IsLeaf: true, TokenType: t.Kind, Offset: t.Off, End: t.End}}, Offset: t.Off, End: t.End} +\t\tsb := len(scratch); scratch = append(scratch, mkLeaf(t.Kind, t.Off, t.End)); pos++ +\t\treturn finish(${J(r.name)}, sb, t.Off) \t} ${r.nudBrackets.map(bracketNud).join('\n')} \tif pbp, ok := ${r.name}PRE[t.Text]; ok { -\t\tsave := pos; pos++ -\t\topLeaf := &Cst{IsLeaf: true, TokenType: "$operator", Offset: t.Off, End: t.End} +\t\tsave := pos; sb := len(scratch); nb := len(nodes); kb := len(kids) +\t\tscratch = append(scratch, mkLeaf("$operator", t.Off, t.End)); pos++ \t\toperand := ${r.name}bp(pbp) -\t\tif operand == nil { pos = save; return nil } -\t\treturn &Cst{Rule: ${J(r.name)}, Children: []*Cst{opLeaf, operand}, Offset: t.Off, End: operand.End} +\t\tif operand < 0 { pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return -1 } +\t\tscratch = append(scratch, 
operand) +\t\treturn finish(${J(r.name)}, sb, t.Off) \t} -\treturn nil +\treturn -1 }`; } @@ -151,77 +161,90 @@ type Tok struct { \tKind, Text string \tOff, End int } -type Cst struct { -\tRule string -\tChildren []*Cst -\tIsLeaf bool -\tTokenType string -\tOffset int -\tEnd int +// Arena node: an int32 index into nodes; children are a flat range in kids. +type Node struct { +\tRule, TokenType string +\tIsLeaf bool +\tKidStart, KidCount, Offset, End int } type bp struct{ lbp, rbp int } -${lexer(ir)} - var toks []Tok var pos int +var nodes []Node +var kids []int32 +var scratch []int32 + +${lexer(ir)} func peek() *Tok { \tif pos < len(toks) { return &toks[pos] } \treturn nil } -func branch(rule string, kids []*Cst, save int) *Cst { -\toffset := 0 -\tif len(kids) > 0 { offset = kids[0].Offset } else if save < len(toks) { offset = toks[save].Off } -\tend := offset -\tif len(kids) > 0 { end = kids[len(kids)-1].End } -\treturn &Cst{Rule: rule, Children: kids, Offset: offset, End: end} +func offAt(i int) int { if i < len(toks) { return toks[i].Off }; return 0 } +func mkLeaf(ttype string, off, end int) int32 { +\tnodes = append(nodes, Node{TokenType: ttype, IsLeaf: true, Offset: off, End: end}) +\treturn int32(len(nodes) - 1) } -func node(rule string, kids []*Cst) *Cst { -\treturn &Cst{Rule: rule, Children: kids, Offset: kids[0].Offset, End: kids[len(kids)-1].End} +// Wrap the scratch entries [sb:] as one node's children (flattened into kids); truncate scratch. +func finish(rule string, sb, fallbackOff int) int32 { +\tnn := len(scratch) +\tkidStart := len(kids) +\toff, end := fallbackOff, fallbackOff +\tif nn > sb { off = nodes[scratch[sb]].Offset; end = nodes[scratch[nn-1]].End } +\tkids = append(kids, scratch[sb:nn]...) 
+\tscratch = scratch[:sb] +\tnodes = append(nodes, Node{Rule: rule, KidStart: kidStart, KidCount: nn - sb, Offset: off, End: end}) +\treturn int32(len(nodes) - 1) } -func matchLit(value, ttype string, kids *[]*Cst) bool { -\tt := peek() -\tif t == nil || t.Text != value { return false } -\t*kids = append(*kids, &Cst{IsLeaf: true, TokenType: ttype, Offset: t.Off, End: t.End}); pos++; return true +func matchLit(value, ttype string) bool { +\tif pos < len(toks) && toks[pos].Text == value { scratch = append(scratch, mkLeaf(ttype, toks[pos].Off, toks[pos].End)); pos++; return true } +\treturn false } -func matchTok(name string, kids *[]*Cst) bool { -\tt := peek() -\tif t == nil || t.Kind != name { return false } -\t*kids = append(*kids, &Cst{IsLeaf: true, TokenType: name, Offset: t.Off, End: t.End}); pos++; return true +func matchTok(name string) bool { +\tif pos < len(toks) && toks[pos].Kind == name { scratch = append(scratch, mkLeaf(name, toks[pos].Off, toks[pos].End)); pos++; return true } +\treturn false } -func callRule(fn func() *Cst, kids *[]*Cst) bool { -\tn := fn() -\tif n == nil { return false } -\t*kids = append(*kids, n); return true +func callRule(fn func() int32) bool { +\tid := fn() +\tif id < 0 { return false } +\tscratch = append(scratch, id); return true } -func star(once func() bool, kids *[]*Cst) bool { -\tfor { sp := pos; before := len(*kids); if !once() { pos = sp; *kids = (*kids)[:before]; break } } +func star(once func() bool) bool { +\tfor { sp := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if !once() { pos = sp; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; break } } \treturn true } -func opt(body func() bool, kids *[]*Cst) bool { -\tsp := pos; before := len(*kids); if !body() { pos = sp; *kids = (*kids)[:before] }; return true +func opt(body func() bool) bool { +\tsp := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if !body() { pos = sp; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] 
}; return true } -func sepBy(elem func() bool, delim string, kids *[]*Cst) bool { +func sepBy(elem func() bool, delim string) bool { \tif !elem() { return false } -\tfor { sp := pos; before := len(*kids); if matchLit(delim, "$punct", kids) && elem() { continue }; pos = sp; *kids = (*kids)[:before]; break } +\tfor { sp := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if matchLit(delim, "$punct") && elem() { continue }; pos = sp; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; break } \treturn true } -func altLit(opts [][2]string, kids *[]*Cst) bool { -\tfor _, o := range opts { if matchLit(o[0], o[1], kids) { return true } } +func altLit(opts [][2]string) bool { +\tfor _, o := range opts { if matchLit(o[0], o[1]) { return true } } \treturn false } ${ruleFns} -func writeJSON(c *Cst, b *strings.Builder) { -\tif c.IsLeaf { -\t\tfmt.Fprintf(b, "{\\"tokenType\\":%q,\\"offset\\":%d,\\"end\\":%d}", c.TokenType, c.Offset, c.End) +func writeJSON(id int32, b *strings.Builder) { +\tnd := &nodes[id] +\tif nd.IsLeaf { +\t\tfmt.Fprintf(b, "{\\"tokenType\\":%q,\\"offset\\":%d,\\"end\\":%d}", nd.TokenType, nd.Offset, nd.End) \t\treturn \t} -\tfmt.Fprintf(b, "{\\"rule\\":%q,\\"children\\":[", c.Rule) -\tfor i, k := range c.Children { if i > 0 { b.WriteByte(',') }; writeJSON(k, b) } -\tfmt.Fprintf(b, "],\\"offset\\":%d,\\"end\\":%d}", c.Offset, c.End) +\tfmt.Fprintf(b, "{\\"rule\\":%q,\\"children\\":[", nd.Rule) +\tfor i := 0; i < nd.KidCount; i++ { if i > 0 { b.WriteByte(',') }; writeJSON(kids[nd.KidStart+i], b) } +\tfmt.Fprintf(b, "],\\"offset\\":%d,\\"end\\":%d}", nd.Offset, nd.End) +} + +func parseOnce(src string) int32 { +\ttoks = lex(src) +\tpos = 0 +\tnodes = nodes[:0]; kids = kids[:0]; scratch = scratch[:0] +\treturn parse${ir.entry}() } func main() { @@ -230,17 +253,15 @@ func main() { \t// Self-bench: a numeric arg N times the lex+parse loop and prints ms/iteration. 
\tif len(os.Args) > 1 { \t\tif iters, err := strconv.Atoi(os.Args[1]); err == nil && iters > 0 { -\t\t\tfor i := 0; i < 3; i++ { toks = lex(src); pos = 0; parse${ir.entry}() } +\t\t\tfor i := 0; i < 3; i++ { parseOnce(src) } \t\t\tt0 := time.Now() -\t\t\tfor i := 0; i < iters; i++ { toks = lex(src); pos = 0; parse${ir.entry}() } +\t\t\tfor i := 0; i < iters; i++ { parseOnce(src) } \t\t\tfmt.Printf("%.4f\\n", float64(time.Since(t0).Nanoseconds())/1e6/float64(iters)) \t\t\treturn \t\t} \t} -\ttoks = lex(src) -\tpos = 0 -\troot := parse${ir.entry}() -\tif root == nil || pos != len(toks) { +\troot := parseOnce(src) +\tif root < 0 || pos != len(toks) { \t\tfmt.Fprintf(os.Stderr, "parse error (pos %d/%d)\\n", pos, len(toks)) \t\tos.Exit(1) \t} From 7314ddebccdc5ecad8c5e350a4e1c6f4b073b324 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 01:28:53 +0800 Subject: [PATCH 07/27] emit-portable: general token-pattern matcher (real-grammar lexer, stage 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Toward supporting the real grammar files, the portable lexer gains a GENERAL matcher: a token whose shape the four fast paths (run/string/line/block) don't cleanly recognise is now compiled, from its raw token-pattern AST, to a backtracking-free matcher (literal / charClass / seq / ordered-alt / greedy-repeat / zero-width lookahead+anchor) — no regex engine, so it stays portable. This replaces the previous over-eager `literalPrefix` heuristic that mis-classified numbers/strings/decorators as line comments. This handles the STATELESS real-JS token tier the fast paths could not: `\u`-escaped identifiers, the decimal/hex number family with a `(?!IdentChar)` boundary, and both-quote strings with escapes. examples/richtokens.ts exercises exactly these, and the emitted lexer is verified ≡ createLexer (the gate's richtokens case: 14/14 accept, 5/5 reject — including the Hex-vs-Number boundary disambiguation). 
Implemented in the TS target so far; Go/Rust throw a clear message on a `pattern` token (their matcher port is the next stage), so calc/minijs stay green in all three. Full suite 42/42. Remaining for the real grammar files (each a further stage): port the matcher to Go/Rust; the STATEFUL lexer (regex-vs-division context, template interpolation) that javascript/typescript need; the markup/indent lexers (html/yaml); and the full parser algebra (not/sameLine/exclude/ctxMode/tsRelax/+/…). --- examples/richtokens.ts | 40 +++++++++++++++++++++++++++++ src/emit-portable.ts | 33 ++++++++++++++++++------ src/target-go.ts | 3 ++- src/target-rust.ts | 3 ++- src/target-ts.ts | 55 ++++++++++++++++++++++++++++++++-------- test/portable-targets.ts | 18 ++++++++++--- 6 files changed, 129 insertions(+), 23 deletions(-) create mode 100644 examples/richtokens.ts diff --git a/examples/richtokens.ts b/examples/richtokens.ts new file mode 100644 index 0000000..ed10aec --- /dev/null +++ b/examples/richtokens.ts @@ -0,0 +1,40 @@ +// A token-stress grammar for the portable lexer's GENERAL matcher (stage 1 of real-grammar +// support). It uses the STATELESS real-JS token shapes the 4-shape fast paths can't handle — +// `\u`-escaped identifiers, the decimal/hex number family with a `(?!IdentChar)` boundary, +// both-quote strings with escapes, and comments — so the portable lexer must compile the raw +// token-pattern AST to a backtracking-free matcher. A trivial parser (a stream of value +// tokens) makes the emitted CST essentially the token stream, so checking it against +// createParser verifies the LEXER. (Stateful tokens — regex, templates — are NOT here; they +// need cross-token lexer state, the next stage.) 
+import { + token, rule, defineGrammar, + seq, oneOf, range, star, plus, repeat, optPattern, altPattern, noneOf, anyChar, notFollowedBy, many, +} from '../src/api.ts'; + +const digit = range('0', '9'); +const hexDigit = oneOf(digit, range('a', 'f'), range('A', 'F')); +const idChar = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); +const uEsc = altPattern(seq('\\u', repeat(hexDigit, 4, 4)), seq('\\u{', plus(hexDigit), '}')); +const boundary = notFollowedBy(idChar); // a number can't be glued to an identifier char + +const Hex = token(seq('0', oneOf('x', 'X'), plus(hexDigit), boundary), { scope: 'constant.numeric.hex' }); +const Number_ = token(seq(plus(digit), star(seq('_', plus(digit))), optPattern(seq('.', plus(digit))), boundary), { scope: 'constant.numeric' }); +const Ident = token(seq(altPattern(oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'), uEsc), star(altPattern(idChar, uEsc))), { identifier: true }); +const Str = token(altPattern( + seq('"', star(altPattern(noneOf('"', '\\'), seq('\\', anyChar()))), '"'), + seq("'", star(altPattern(noneOf("'", '\\'), seq('\\', anyChar()))), "'"), +), { scope: 'string.quoted' }); +const LineComment = token(seq('//', star(noneOf('\n'))), { skip: true, scope: 'comment.line' }); +const BlockComment = token(seq('/*', star(altPattern(noneOf('*'), seq('*', noneOf('/')))), '*/'), { skip: true, scope: 'comment.block' }); + +// Value = one value token; Program = a stream of them. (Lexer-level disambiguation — Hex vs +// Number — comes from token DECLARATION ORDER, which both engines follow.) 
+const Value = rule(($) => [Hex, Number_, Ident, Str]); +const Program = rule(($) => [many(Value)]); + +export default defineGrammar({ + name: 'richtokens', + scopeName: 'source.richtokens', + tokens: { Hex, Number: Number_, Ident, Str, LineComment, BlockComment }, + rules: { Value, Program }, +}); diff --git a/src/emit-portable.ts b/src/emit-portable.ts index e445339..8e8c4c3 100644 --- a/src/emit-portable.ts +++ b/src/emit-portable.ts @@ -20,7 +20,7 @@ // bracket NUDs (grouping, array), and mixfix LEDs (call / member / index) tried before // operators. buildIR THROWS on a construct outside this set rather than emit a wrong // parser. This is enough to derive a real JavaScript-subset parser (examples/minijs.ts). -import type { CstGrammar, RuleExpr, TokenDecl } from './types.ts'; +import type { CstGrammar, RuleExpr, TokenDecl, TokenPattern } from './types.ts'; import { analyzeGrammar, findEntryRule } from './grammar-analysis.ts'; import { collectLiterals, isKeywordLiteral } from './grammar-utils.ts'; import { @@ -35,7 +35,11 @@ export type LexTok = | { kind: 'run'; name: string; first: CharRange[]; cont: CharRange[]; skip: boolean } // ident/number char run | { kind: 'string'; name: string; delim: string; skip: boolean } // delim..delim, `\` escapes next | { kind: 'line'; name: string; prefix: string; skip: boolean } // prefix..end-of-line - | { kind: 'block'; name: string; open: string; close: string; skip: boolean }; // open..close + | { kind: 'block'; name: string; open: string; close: string; skip: boolean } // open..close + // The general case: the raw token-pattern AST, compiled to a backtracking-free matcher + // by the target (no regex engine). Subsumes the fast paths above; used for the token + // shapes they don't cleanly recognise (escaped identifiers, the number family, …). 
+ | { kind: 'pattern'; name: string; pattern: TokenPattern; skip: boolean }; export type Lit = { value: string; ttype: '$keyword' | '$punct' }; export type Step = @@ -129,7 +133,9 @@ function buildIR(grammar: CstGrammar): ParserIR { return { grammarName: grammar.name ?? 'grammar', entry: findEntryRule(grammar), tokens, puncts, rules }; } -// Classify a token into a portable scanner spec via the structural recognizers. +// Classify a token: a fast-path shape (run/string/line/block) when one cleanly matches, +// otherwise the general `pattern` matcher. The fast paths keep the common simple tokens +// (and the calc/minijs grammars) on tight, readable scan code in every target. function lexTok(t: TokenDecl): LexTok { const skip = t.flags.includes('skip'); const qs = tokenPatternQuoteDelimAndEscape(t); @@ -137,13 +143,24 @@ function lexTok(t: TokenDecl): LexTok { const bd = tokenPatternBlockDelimiters(t); if (bd) return { kind: 'block', name: t.name, open: bd[0], close: bd[1], skip }; const loop = tokenPatternCharLoop(t); - if (loop) { - if (loop.bail.length > 0 || loop.bailNonAscii) throw new Error(`portable: token ${t.name} has a complex continuation (bail) — out of scope`); + if (loop && loop.bail.length === 0 && !loop.bailNonAscii) { return { kind: 'run', name: t.name, first: codesToRanges(loop.first), cont: codesToRanges(loop.cont), skip }; } - const prefix = tokenPatternLiteralPrefix(t); - if (prefix) return { kind: 'line', name: t.name, prefix, skip }; // prefix with no distinct suffix → to end-of-line - throw new Error(`portable: token ${t.name} shape not recognized by the portable lexer`); + const line = lineCommentShape(t.pattern); // PRECISE: prefix-literal then star(non-newline) + if (line) return { kind: 'line', name: t.name, prefix: line, skip }; + return { kind: 'pattern', name: t.name, pattern: t.pattern, skip }; +} + +// A token is a line comment iff its pattern is `seq(prefix-literal, star(charClass excluding \n))`.
+function lineCommentShape(p: TokenPattern): string | null { + if (typeof p === 'string' || p.type !== 'seq' || p.items.length !== 2) return null; + const [head, tail] = p.items; + if (typeof head !== 'string') return null; + if (typeof tail === 'string' || tail.type !== 'repeat' || tail.min !== 0) return null; + const body = tail.body; + if (typeof body === 'string' || body.type !== 'charClass' || !body.negate) return null; + const excludesNl = body.items.some((it): boolean => it.type === 'char' && it.value === '\n'); + return excludesNl ? head : null; } function codesToRanges(codes: number[]): CharRange[] { diff --git a/src/target-go.ts b/src/target-go.ts index 85f9f30..9c809e1 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -32,12 +32,13 @@ function scanTok(t: LexTok): string { \t\t\tfor e < n && src[e] != 10 { e++ } \t\t\t${push}pos = e; continue \t\t}`; - return `\t\tif strings.HasPrefix(src[pos:], ${J(t.open)}) { + if (t.kind === 'block') return `\t\tif strings.HasPrefix(src[pos:], ${J(t.open)}) { \t\t\te := pos + ${t.open.length} \t\t\tfor e < n && !strings.HasPrefix(src[e:], ${J(t.close)}) { e++ } \t\t\tif e < n { e += ${t.close.length} } \t\t\t${push}pos = e; continue \t\t}`; + throw new Error(`portable Go lexer: general 'pattern' tokens not yet supported (token ${t.name}) — the stateless-token matcher is implemented in the TS target only so far`); } function lexer(ir: ParserIR): string { diff --git a/src/target-rust.ts b/src/target-rust.ts index fb0c641..2fd6a7f 100644 --- a/src/target-rust.ts +++ b/src/target-rust.ts @@ -34,12 +34,13 @@ function scanTok(t: LexTok): string { while e < n && b[e] != 10 { e += 1; } ${push}pos = e; continue; }`; - return ` if src[pos..].starts_with(${J(t.open)}) { + if (t.kind === 'block') return ` if src[pos..].starts_with(${J(t.open)}) { let mut e = pos + ${t.open.length}; while e < n && !src[e..].starts_with(${J(t.close)}) { e += 1; } if e < n { e += ${t.close.length}; } ${push}pos = e; continue; }`; + throw new 
Error(`portable Rust lexer: general 'pattern' tokens not yet supported (token ${t.name}) — the stateless-token matcher is implemented in the TS target only so far`); } function lexer(ir: ParserIR): string { diff --git a/src/target-ts.ts b/src/target-ts.ts index 304eef5..d39821b 100644 --- a/src/target-ts.ts +++ b/src/target-ts.ts @@ -10,40 +10,75 @@ const J = (v: unknown) => JSON.stringify(v); const rangeCond = (v: string, rs: CharRange[]) => '(' + rs.map(([lo, hi]) => (lo === hi ? `${v} === ${lo}` : `${v} >= ${lo} && ${v} <= ${hi}`)).join(' || ') + ')'; -function scanTok(t: LexTok): string { - const push = t.skip ? '' : `toks.push({ kind: ${J((t as { name: string }).name)}, text: src.slice(pos, e), off: pos, end: e }); `; +import type { TokenPattern } from './types.ts'; + +// Compile a token-pattern AST to backtracking-free matcher functions `_mN(p): number` +// (returns the new position, or -1 on no match). Greedy `repeat`, ordered `alt`, +// zero-width `lookahead`/`anchor` — the regex-free token-matcher tier. +function ccCond(p: Extract): string { + const parts = p.items.map((it) => + it.type === 'char' ? `cc === ${it.value.charCodeAt(0)}` : `cc >= ${it.from.charCodeAt(0)} && cc <= ${it.to.charCodeAt(0)}`); + const inSet = parts.length === 1 ? parts[0] : '(' + parts.join(' || ') + ')'; + return p.negate ? `!${inSet}` : inSet; +} +function compilePat(p: TokenPattern, defs: string[]): string { + const name = `_m${defs.length}`; + defs.push(''); // reserve the slot (keeps numbering stable across recursion) + let body: string; + if (typeof p === 'string') { + body = `=> _s.startsWith(${J(p)}, p) ? p + ${p.length} : -1`; + } else switch (p.type) { + case 'anyChar': body = `=> p < _s.length ? p + 1 : -1`; break; + case 'charClass': body = `=> { if (p >= _s.length) return -1; const cc = _s.charCodeAt(p); return ${ccCond(p)} ? 
p + 1 : -1; }`; break; + case 'seq': { const ms = p.items.map((x) => compilePat(x, defs)); body = `=> { ${ms.map((m) => `p = ${m}(p); if (p < 0) return -1;`).join(' ')} return p; }`; break; } + case 'alt': { const ms = p.items.map((x) => compilePat(x, defs)); body = `=> { ${ms.map((m) => `{ const r = ${m}(p); if (r >= 0) return r; }`).join(' ')} return -1; }`; break; } + case 'repeat': { const m = compilePat(p.body, defs); const mx = p.max !== undefined ? `if (c >= ${p.max}) break;` : ''; body = `=> { let q = p, c = 0; for (;;) { const r = ${m}(q); if (r < 0 || r === q) break; q = r; c++; ${mx} } return c >= ${p.min} ? q : -1; }`; break; } + case 'lookahead': { const m = compilePat(p.body, defs); body = `=> { const r = ${m}(p); return ${p.negate ? 'r < 0' : 'r >= 0'} ? p : -1; }`; break; } + case 'anchor': body = p.kind === 'start' ? `=> p === 0 ? p : -1` : `=> p === _s.length ? p : -1`; break; + default: throw new Error(`portable TS lexer: pattern '${(p as { type: string }).type}' unsupported`); + } + defs[Number(name.slice(2))] = `const ${name} = (p: number): number ${body};`; + return name; +} + +function scanTok(t: LexTok, defs: string[]): string { + const name = (t as { name: string }).name; + const push = (endExpr: string) => (t.skip ? 
'' : `toks.push({ kind: ${J(name)}, text: src.slice(pos, ${endExpr}), off: pos, end: ${endExpr} }); `); if (t.kind === 'run') return ` if (${rangeCond('c', t.first)}) { let e = pos + 1; while (e < n) { const cc = src.charCodeAt(e); if (!${rangeCond('cc', t.cont)}) break; e++; } - ${push}pos = e; continue; + ${push('e')}pos = e; continue; }`; if (t.kind === 'string') return ` if (c === ${t.delim.charCodeAt(0)}) { let e = pos + 1; while (e < n) { const ch = src.charCodeAt(e); if (ch === 92) { e += 2; continue; } if (ch === ${t.delim.charCodeAt(0)}) { e++; break; } e++; } - ${push}pos = e; continue; + ${push('e')}pos = e; continue; }`; if (t.kind === 'line') return ` if (src.startsWith(${J(t.prefix)}, pos)) { let e = pos + ${t.prefix.length}; while (e < n && src.charCodeAt(e) !== 10) e++; - ${push}pos = e; continue; + ${push('e')}pos = e; continue; }`; - return ` if (src.startsWith(${J(t.open)}, pos)) { + if (t.kind === 'block') return ` if (src.startsWith(${J(t.open)}, pos)) { let e = pos + ${t.open.length}; while (e < n && !src.startsWith(${J(t.close)}, e)) e++; if (e < n) e += ${t.close.length}; - ${push}pos = e; continue; + ${push('e')}pos = e; continue; }`; + const m = compilePat(t.pattern, defs); + return ` { const e = ${m}(pos); if (e > pos) { ${push('e')}pos = e; continue; } }`; } function lexer(ir: ParserIR): string { - const toks = ir.tokens.map(scanTok).join('\n'); + const defs: string[] = []; + const toks = ir.tokens.map((t) => scanTok(t, defs)).join('\n'); const puncts = ir.puncts.map((p) => ` if (src.startsWith(${J(p)}, pos)) { toks.push({ kind: '', text: ${J(p)}, off: pos, end: pos + ${p.length} }); pos += ${p.length}; continue; }`).join('\n'); - return `function lex(src: string): Tok[] { + return `${defs.length ? 'let _s = "";\n' + defs.join('\n') + '\n' : ''}function lex(src: string): Tok[] { const toks: Tok[] = []; const n = src.length; let pos = 0; - while (pos < n) { +${defs.length ? 
' _s = src;\n' : ''} while (pos < n) { const c = src.charCodeAt(pos); if (c === 32 || c === 9 || c === 10 || c === 13) { pos++; continue; } ${toks} diff --git a/test/portable-targets.ts b/test/portable-targets.ts index 8c5384e..afd1821 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -21,7 +21,7 @@ import { goTarget } from '../src/target-go.ts'; import { rustTarget } from '../src/target-rust.ts'; import type { CstGrammar } from '../src/types.ts'; -type Case = { grammar: string; path: string; accept: string[]; reject: string[] }; +type Case = { grammar: string; path: string; accept: string[]; reject: string[]; tsOnly?: boolean }; const CASES: Case[] = [ { grammar: 'calc', path: '../examples/calc.ts', @@ -49,6 +49,18 @@ const CASES: Case[] = [ // identifier and it's an assignment expression; the oracle accepts it too.) reject: ['1 +;', '(1;', 'if x {}', 'foo(a,;', 'a.;', '[1,', 'function (){}'], }, + { + // The general token-pattern matcher (stateless real-JS token tier): \u-escaped + // identifiers, the decimal/hex number family with a boundary, both-quote strings. + // TS-only for now — the Go/Rust port of the pattern matcher is the next stage. 
+ grammar: 'richtokens', path: '../examples/richtokens.ts', tsOnly: true, + accept: [ + '123', '0xFF', '1_000_000', '3.14', 'foo', 'bar_$x9', '"hi"', "'single'", + '"esc\\"q\\n"', '123 0xa foo "s" 3.14', '0xDEADbeef 42 _id $x cafe // line\n 7', + '/* block */ 99 x', 'caf\\u00e9 \\u0041bc', '1_2_3 0X1F 10.5 a1 b2', + ], + reject: ['12abc', '0x', '"unterminated', '3.', '#'], // ($ is a valid identifier start, not a reject) + }, ]; const sortKeys = (o: unknown): unknown => @@ -86,14 +98,14 @@ for (const c of CASES) { writeFileSync(tsFile, emitPortableParser(grammar, tsTarget)); runners.push({ label: 'typescript', run: (src) => runProc('node', [tsFile], src) }); - if (HAS_GO) { + if (HAS_GO && !c.tsOnly) { const gdir = `${dir}/go`; mkdirSync(gdir, { recursive: true }); writeFileSync(`${gdir}/main.go`, emitPortableParser(grammar, goTarget)); writeFileSync(`${gdir}/go.mod`, 'module p\n\ngo 1.21\n'); execFileSync('go', ['build', '-o', `${gdir}/p`, '.'], { cwd: gdir, stdio: 'pipe' }); runners.push({ label: 'go', run: (src) => runProc(`${gdir}/p`, [], src) }); } - if (HAS_RUST) { + if (HAS_RUST && !c.tsOnly) { const rfile = `${dir}/main.rs`; writeFileSync(rfile, emitPortableParser(grammar, rustTarget)); execFileSync('rustc', ['-O', '-A', 'warnings', rfile, '-o', `${dir}/pr`], { stdio: 'pipe' }); From 747b03950f138cc2b28bb65968dd1d87fa2f2aba Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 01:41:54 +0800 Subject: [PATCH 08/27] emit-portable: port the general token matcher to Go + Rust (lexer convergence) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The target-agnostic lexer is now uniform across all three targets: the general token-pattern matcher (stage 1, TS only) is ported to Go and Rust, so a `pattern` token compiles to a backtracking-free matcher in every language — Go as package-level `_mN(p int) int` funcs over a module-level source, Rust as named `_mN(s, p) -> i64` funcs (closures can't recurse) threading 
the source as a param. This is the lexer half of the issue-#6 target parameter: ONE target-agnostic lexer, rendered per language. The optimized emit-lexer.ts stays a separate, JS-perf path — it fills the arena parser's struct-of-arrays integer columns, a different token contract than the portable Tok list, so merging would deoptimize it; the two already share what should be shared (the token-pattern.ts algebra + recognizers). Verified: examples/richtokens.ts (escaped idents, the number family with a boundary, both-quote strings) now runs in ts/go/rust, each CST byte-identical to createParser (gate: 14/14 accept + 5/5 reject per target). Full suite 42/42. --- src/target-go.ts | 52 ++++++++++++++++++++++++++++++++-------- src/target-rust.ts | 51 ++++++++++++++++++++++++++++++++------- test/portable-targets.ts | 6 ++--- 3 files changed, 87 insertions(+), 22 deletions(-) diff --git a/src/target-go.ts b/src/target-go.ts index 9c809e1..b5f1926 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -10,46 +10,78 @@ // slices to saved lengths; the slices keep their capacity across parses (reset to len 0), so a // warmed parser allocates ~nothing per parse. import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target } from './emit-portable.ts'; +import type { TokenPattern } from './types.ts'; const J = (v: unknown) => JSON.stringify(v); const rangeCond = (v: string, rs: CharRange[]) => '(' + rs.map(([lo, hi]) => (lo === hi ? `${v} == ${lo}` : `${v} >= ${lo} && ${v} <= ${hi}`)).join(' || ') + ')'; -function scanTok(t: LexTok): string { - const push = t.skip ? '' : `toks = append(toks, Tok{${J((t as { name: string }).name)}, src[pos:e], pos, e}); `; +// Compile a token-pattern AST to backtracking-free package-level matcher funcs +// `_mN(p int) int` (new position, or -1) over the module-level source `_s`. +function ccCondGo(p: Extract): string { + const parts = p.items.map((it) => + it.type === 'char' ? 
`cc == ${it.value.charCodeAt(0)}` : `cc >= ${it.from.charCodeAt(0)} && cc <= ${it.to.charCodeAt(0)}`); + const inSet = parts.length === 1 ? parts[0] : '(' + parts.join(' || ') + ')'; + return p.negate ? `!${inSet}` : inSet; +} +function compilePat(p: TokenPattern, defs: string[]): string { + const name = `_m${defs.length}`; + defs.push(''); + let body: string; + if (typeof p === 'string') { + body = `{ if p <= len(_s) && strings.HasPrefix(_s[p:], ${J(p)}) { return p + ${p.length} }; return -1 }`; + } else switch (p.type) { + case 'anyChar': body = `{ if p < len(_s) { return p + 1 }; return -1 }`; break; + case 'charClass': body = `{ if p >= len(_s) { return -1 }; cc := int(_s[p]); if ${ccCondGo(p)} { return p + 1 }; return -1 }`; break; + case 'seq': { const ms = p.items.map((x) => compilePat(x, defs)); body = `{ ${ms.map((m) => `p = ${m}(p); if p < 0 { return -1 }`).join('; ')}; return p }`; break; } + case 'alt': { const ms = p.items.map((x) => compilePat(x, defs)); body = `{ ${ms.map((m) => `if r := ${m}(p); r >= 0 { return r }`).join('; ')}; return -1 }`; break; } + case 'repeat': { const m = compilePat(p.body, defs); const mx = p.max !== undefined ? `; if c >= ${p.max} { break }` : ''; body = `{ q, c := p, 0; for { r := ${m}(q); if r < 0 || r == q { break }; q = r; c++${mx} }; if c >= ${p.min} { return q }; return -1 }`; break; } + case 'lookahead': { const m = compilePat(p.body, defs); body = `{ r := ${m}(p); if ${p.negate ? 'r < 0' : 'r >= 0'} { return p }; return -1 }`; break; } + case 'anchor': body = p.kind === 'start' ? `{ if p == 0 { return p }; return -1 }` : `{ if p == len(_s) { return p }; return -1 }`; break; + default: throw new Error(`portable Go lexer: pattern '${(p as { type: string }).type}' unsupported`); + } + defs[Number(name.slice(2))] = `func ${name}(p int) int ${body}`; + return name; +} + +function scanTok(t: LexTok, defs: string[]): string { + const name = (t as { name: string }).name; + const push = (endE: string) => (t.skip ? 
'' : `toks = append(toks, Tok{${J(name)}, src[pos:${endE}], pos, ${endE}}); `); if (t.kind === 'run') return `\t\tif ${rangeCond('c', t.first)} { \t\t\te := pos + 1 \t\t\tfor e < n { cc := int(src[e]); if !${rangeCond('cc', t.cont)} { break }; e++ } -\t\t\t${push}pos = e; continue +\t\t\t${push('e')}pos = e; continue \t\t}`; if (t.kind === 'string') return `\t\tif c == ${t.delim.charCodeAt(0)} { \t\t\te := pos + 1 \t\t\tfor e < n { ch := int(src[e]); if ch == 92 { e += 2; continue }; if ch == ${t.delim.charCodeAt(0)} { e++; break }; e++ } -\t\t\t${push}pos = e; continue +\t\t\t${push('e')}pos = e; continue \t\t}`; if (t.kind === 'line') return `\t\tif strings.HasPrefix(src[pos:], ${J(t.prefix)}) { \t\t\te := pos + ${t.prefix.length} \t\t\tfor e < n && src[e] != 10 { e++ } -\t\t\t${push}pos = e; continue +\t\t\t${push('e')}pos = e; continue \t\t}`; if (t.kind === 'block') return `\t\tif strings.HasPrefix(src[pos:], ${J(t.open)}) { \t\t\te := pos + ${t.open.length} \t\t\tfor e < n && !strings.HasPrefix(src[e:], ${J(t.close)}) { e++ } \t\t\tif e < n { e += ${t.close.length} } -\t\t\t${push}pos = e; continue +\t\t\t${push('e')}pos = e; continue \t\t}`; - throw new Error(`portable Go lexer: general 'pattern' tokens not yet supported (token ${t.name}) — the stateless-token matcher is implemented in the TS target only so far`); + const m = compilePat(t.pattern, defs); + return `\t\tif e := ${m}(pos); e > pos { ${push('e')}pos = e; continue }`; } function lexer(ir: ParserIR): string { - const toks = ir.tokens.map(scanTok).join('\n'); + const defs: string[] = []; + const toks = ir.tokens.map((t) => scanTok(t, defs)).join('\n'); const puncts = ir.puncts.map((p) => `\t\tif strings.HasPrefix(src[pos:], ${J(p)}) { toks = append(toks, Tok{"", ${J(p)}, pos, pos + ${p.length}}); pos += ${p.length}; continue }`).join('\n'); - return `func lex(src string) []Tok { + return `${defs.length ? 
'var _s string\n' + defs.join('\n') + '\n' : ''}func lex(src string) []Tok { \ttoks := toks[:0] \tn := len(src) \tpos := 0 -\tfor pos < n { +${defs.length ? '\t_s = src\n' : ''}\tfor pos < n { \t\tc := int(src[pos]) \t\tif c == 32 || c == 9 || c == 10 || c == 13 { pos++; continue } ${toks} diff --git a/src/target-rust.ts b/src/target-rust.ts index 2fd6a7f..7ad6382 100644 --- a/src/target-rust.ts +++ b/src/target-rust.ts @@ -12,42 +12,75 @@ // `fn(&mut Parser, &mut Vec) -> bool`, threading the parser + kids as params (so nothing // is captured, sidestepping the borrow checker). import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target } from './emit-portable.ts'; +import type { TokenPattern } from './types.ts'; const J = (v: unknown) => JSON.stringify(v); const rangeCond = (v: string, rs: CharRange[]) => '(' + rs.map(([lo, hi]) => (lo === hi ? `${v} == ${lo}` : `(${lo}..=${hi}).contains(&${v})`)).join(' || ') + ')'; -function scanTok(t: LexTok): string { - const push = t.skip ? '' : `toks.push(Tok { kind: ${J((t as { name: string }).name)}, text: &src[pos..e], off: pos, end: e }); `; +// Compile a token-pattern AST to backtracking-free matcher fns `_mN(s, p) -> i64` +// (new position, or -1). Named functions (Rust closures can't recurse); the source is +// threaded as a param (Rust has no convenient module-level mutable string). +function ccCondRs(p: Extract): string { + const parts = p.items.map((it) => + it.type === 'char' ? `cc == ${it.value.charCodeAt(0)}` : `(${it.from.charCodeAt(0)}..=${it.to.charCodeAt(0)}).contains(&cc)`); + const inSet = parts.length === 1 ? parts[0] : '(' + parts.join(' || ') + ')'; + return p.negate ? 
`!${inSet}` : inSet; +} +function compilePat(p: TokenPattern, defs: string[]): string { + const name = `_m${defs.length}`; + defs.push(''); + let body: string; + if (typeof p === 'string') { + body = `if (p as usize) <= s.len() && s[p as usize..].starts_with(${J(p)}) { p + ${p.length} } else { -1 }`; + } else switch (p.type) { + case 'anyChar': body = `if (p as usize) < s.len() { p + 1 } else { -1 }`; break; + case 'charClass': body = `let u = p as usize; if u >= s.len() { return -1; } let cc = s.as_bytes()[u] as u32; if ${ccCondRs(p)} { p + 1 } else { -1 }`; break; + case 'seq': { const ms = p.items.map((x) => compilePat(x, defs)); body = `let mut p = p; ${ms.map((m) => `p = ${m}(s, p); if p < 0 { return -1; }`).join(' ')} p`; break; } + case 'alt': { const ms = p.items.map((x) => compilePat(x, defs)); body = `${ms.map((m) => `{ let r = ${m}(s, p); if r >= 0 { return r; } }`).join(' ')} -1`; break; } + case 'repeat': { const m = compilePat(p.body, defs); const mx = p.max !== undefined ? ` if c >= ${p.max} { break; }` : ''; body = `let mut q = p; let mut c = 0i64; loop { let r = ${m}(s, q); if r < 0 || r == q { break; } q = r; c += 1;${mx} } if c >= ${p.min} { q } else { -1 }`; break; } + case 'lookahead': { const m = compilePat(p.body, defs); body = `let r = ${m}(s, p); if ${p.negate ? 'r < 0' : 'r >= 0'} { p } else { -1 }`; break; } + case 'anchor': body = p.kind === 'start' ? `if p == 0 { p } else { -1 }` : `if p as usize == s.len() { p } else { -1 }`; break; + default: throw new Error(`portable Rust lexer: pattern '${(p as { type: string }).type}' unsupported`); + } + defs[Number(name.slice(2))] = `fn ${name}(s: &str, p: i64) -> i64 { ${body} }`; + return name; +} + +function scanTok(t: LexTok, defs: string[]): string { + const name = (t as { name: string }).name; + const push = (endE: string) => (t.skip ? 
'' : `toks.push(Tok { kind: ${J(name)}, text: &src[pos..${endE}], off: pos, end: ${endE} }); `); if (t.kind === 'run') return ` if ${rangeCond('c', t.first)} { let mut e = pos + 1; while e < n { let cc = b[e] as u32; if !${rangeCond('cc', t.cont)} { break } e += 1; } - ${push}pos = e; continue; + ${push('e')}pos = e; continue; }`; if (t.kind === 'string') return ` if c == ${t.delim.charCodeAt(0)} { let mut e = pos + 1; while e < n { let ch = b[e] as u32; if ch == 92 { e += 2; continue } if ch == ${t.delim.charCodeAt(0)} { e += 1; break } e += 1; } - ${push}pos = e; continue; + ${push('e')}pos = e; continue; }`; if (t.kind === 'line') return ` if src[pos..].starts_with(${J(t.prefix)}) { let mut e = pos + ${t.prefix.length}; while e < n && b[e] != 10 { e += 1; } - ${push}pos = e; continue; + ${push('e')}pos = e; continue; }`; if (t.kind === 'block') return ` if src[pos..].starts_with(${J(t.open)}) { let mut e = pos + ${t.open.length}; while e < n && !src[e..].starts_with(${J(t.close)}) { e += 1; } if e < n { e += ${t.close.length}; } - ${push}pos = e; continue; + ${push('e')}pos = e; continue; }`; - throw new Error(`portable Rust lexer: general 'pattern' tokens not yet supported (token ${t.name}) — the stateless-token matcher is implemented in the TS target only so far`); + const m = compilePat(t.pattern, defs); + return ` { let e = ${m}(src, pos as i64); if e > pos as i64 { let e = e as usize; ${push('e')}pos = e; continue; } }`; } function lexer(ir: ParserIR): string { - const toks = ir.tokens.map(scanTok).join('\n'); + const defs: string[] = []; + const toks = ir.tokens.map((t) => scanTok(t, defs)).join('\n'); const puncts = ir.puncts.map((p) => ` if src[pos..].starts_with(${J(p)}) { toks.push(Tok { kind: "", text: &src[pos..pos + ${p.length}], off: pos, end: pos + ${p.length} }); pos += ${p.length}; continue; }`).join('\n'); - return `fn lex<'a>(src: &'a str) -> Vec> { + return `${defs.length ? 
defs.join('\n') + '\n' : ''}fn lex<'a>(src: &'a str) -> Vec> { let b = src.as_bytes(); let n = b.len(); let mut toks: Vec = Vec::new(); diff --git a/test/portable-targets.ts b/test/portable-targets.ts index afd1821..4522023 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -51,9 +51,9 @@ const CASES: Case[] = [ }, { // The general token-pattern matcher (stateless real-JS token tier): \u-escaped - // identifiers, the decimal/hex number family with a boundary, both-quote strings. - // TS-only for now — the Go/Rust port of the pattern matcher is the next stage. - grammar: 'richtokens', path: '../examples/richtokens.ts', tsOnly: true, + // identifiers, the decimal/hex number family with a boundary, both-quote strings — + // compiled to a backtracking-free matcher in all three targets. + grammar: 'richtokens', path: '../examples/richtokens.ts', accept: [ '123', '0xFF', '1_000_000', '3.14', 'foo', 'bar_$x9', '"hi"', "'single'", '"esc\\"q\\n"', '123 0xa foo "s" 3.14', '0xDEADbeef 42 _id $x cafe // line\n 7', From b10cfddd9bb44c6828c3c99a158ac1c46fa80ba8 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 02:04:12 +0800 Subject: [PATCH 09/27] emit-portable: stateful regex-vs-division lexer in all three targets (stage 3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The portable lexer gains its first STATEFUL capability — the JS `/` problem. A `/` starts a regex literal in expression context but is division after a value; the lexer now threads the previous token plus a control-head paren stack to decide, gating the regex token on the same prevIsValue predicate gen-lexer.ts uses. 
The regexContext sets (division-after type/text, expression-start keywords, control-head keywords, member accessors, ambiguous postfix ops) are baked from the grammar into an IR.regexCtx and rendered per target: TS/Go via closures over the lex state, Rust via a LexState struct (two closures can't co-capture the same mutable state). examples/regexjs.ts (minijs + regex literals) verifies it: `a / b` is division, `/re/` after `=`/keyword is a regex, `if (x) /re/` is a regex (control head), `obj.for(x) / y` is division (member name, not a head), `[1,2] / 3` is division — all ts/go/rust CSTs byte-identical to createParser (gate: 15/15 accept, 5/5 reject per target). Full suite 42/42. Also fixes a single-item negated char-class losing its parens (`!cc == 10` instead of `!(cc == 10)`) in all three matchers — surfaced by the Go compiler, and by adding regex-escape cases the earlier corpus had missed (an aggregate that passed for the wrong reason). Remaining for the real grammar files: template interpolation, the markup/indent lexers, and the full parser algebra. --- examples/regexjs.ts | 77 ++++++++++++++++++++++++++++++++++++++++ src/emit-portable.ts | 35 +++++++++++++++++- src/target-go.ts | 58 ++++++++++++++++++++++++------ src/target-rust.ts | 59 +++++++++++++++++++++++------- src/target-ts.ts | 49 +++++++++++++++++++------ test/portable-targets.ts | 14 ++++++++ 6 files changed, 256 insertions(+), 36 deletions(-) create mode 100644 examples/regexjs.ts diff --git a/examples/regexjs.ts b/examples/regexjs.ts new file mode 100644 index 0000000..b9ad82d --- /dev/null +++ b/examples/regexjs.ts @@ -0,0 +1,77 @@ +// minijs + REGEX literals — exercises the portable lexer's STATEFUL regex-vs-division +// disambiguation (stage 3). A `/` is a regex in expression context but division after a +// value; `if (x) /re/` is a regex (control-head paren), `obj.for(x) / y` is division +// (member name, not a head). 
The regexContext config + paren-head/bang state are ported +// from createLexer; the gate checks the emitted CST is byte-identical on inputs that mix +// regex literals and division. +import { + token, rule, defineGrammar, left, right, op, prefix, alt, + seq, oneOf, range, star, sep, opt, many, altPattern, noneOf, anyChar, +} from '../src/api.ts'; + +const digit = range('0', '9'); +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); + +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(digit, star(digit)), { scope: 'constant.numeric' }); +const Str = token(seq('"', star(altPattern(noneOf('"', '\\'), seq('\\', anyChar()))), '"'), { scope: 'string.quoted.double' }); +const LineComment = token(seq('//', star(noneOf('\n'))), { skip: true, scope: 'comment.line' }); +const BlockComment = token(seq('/*', star(altPattern(noneOf('*'), seq('*', noneOf('/')))), '*/'), { skip: true, scope: 'comment.block' }); + +// Regex literal: `/ body / flags`, body is non-(/\[)newline chars, escapes, or `[...]` classes. 
+const rxClass = seq('[', star(altPattern(noneOf(']', '\\', '\n'), seq('\\', noneOf('\n')))), ']'); +const rxChar = altPattern(noneOf('/', '\\', '[', '\n'), seq('\\', noneOf('\n')), rxClass); +const rxFirst = altPattern(noneOf('/', '\\', '[', '*', '\n'), seq('\\', noneOf('\n')), rxClass); +const Regex = token(seq('/', rxFirst, star(rxChar), '/', star(idCont)), { + regex: true, scope: 'string.regexp', + regexContext: { + divisionAfterTypes: ['Ident', 'Number', 'Str'], + divisionAfterTexts: [')', ']', 'this', 'true', 'false', 'null'], + regexAfterTexts: ['return', 'typeof', 'delete', 'void', 'in', 'instanceof', 'new', 'do', 'else'], + regexAfterParenKeywords: ['if', 'while', 'for'], + memberAccessTexts: ['.'], + postfixAfterValueTexts: [], + }, +}); + +const jsPrec = [ + right('='), + left('||'), left('&&'), + left('==', '!=', '===', '!=='), + left('<', '>', '<=', '>='), + left('+', '-'), + left('*', '/', '%'), + right(prefix('!', '-', '+', '~')), +]; + +const Expr = rule(($) => [ + Number_, Str, Ident, Regex, + ['(', $, ')'], + ['[', opt(sep($, ',')), ']'], + [prefix, $], + [$, op, $], + [$, '(', opt(sep($, ',')), ')'], + [$, '.', Ident], + [$, '[', $, ']'], +]); + +const Block = rule(($) => [['{', many(Stmt), '}']]); +const Stmt = rule(($) => [ + Block, + [alt('var', 'let', 'const'), Ident, opt('=', Expr), ';'], + ['if', '(', Expr, ')', Stmt, opt('else', Stmt)], + ['while', '(', Expr, ')', Stmt], + ['return', opt(Expr), ';'], + ['function', Ident, '(', opt(sep(Ident, ',')), ')', Block], + [Expr, ';'], +]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'regexjs', + scopeName: 'source.regexjs', + tokens: { LineComment, BlockComment, Number: Number_, Str, Regex, Ident }, + prec: jsPrec, + rules: { Expr, Block, Stmt, Program }, +}); diff --git a/src/emit-portable.ts b/src/emit-portable.ts index 8e8c4c3..30881ca 100644 --- a/src/emit-portable.ts +++ b/src/emit-portable.ts @@ -65,12 +65,28 @@ export type PrattRule = { }; export 
type RuleIR = RdRule | PrattRule; +// Stateful regex-vs-division disambiguation (the JS `/` problem): a `/` starts a regex +// literal in expression context but is division after a value. The lexer threads the +// previous token + a control-head paren stack to decide; the predicate sets are baked +// from the grammar's `regexContext`. Mirrors gen-lexer.ts's prevIsValue exactly. +export type RegexCtx = { + regexToken: string; // the token flagged `regex`, gated on expression context + identToken: string; // identifier token kind (for the keyword-vs-value test) + divisionTypes: string[]; // prev TOKEN KINDS after which `/` is division + divisionTexts: string[]; // prev TEXTS after which `/` is division + regexTexts: string[]; // expression-start keywords (a `/` after them is a regex) + parenHeadKw: string[]; // keywords whose `(` is a control head (regex after its `)`) + memberAccess: string[]; // accessors that make a following keyword a member name, not a head + postfixAfterValue: string[]; // ambiguous postfix/prefix ops (e.g. `!`): value only in postfix +}; + export type ParserIR = { grammarName: string; entry: string; tokens: LexTok[]; // for the char scanner, tried in declaration order puncts: string[]; // punctuation literals, longest-first (maximal munch) rules: RuleIR[]; + regexCtx: RegexCtx | null; // null unless the grammar has a regex token with context }; export interface Target { @@ -130,7 +146,24 @@ function buildIR(grammar: CstGrammar): ParserIR { return { kind: 'rd', name: r.name, alts: r.body.type === 'alt' ? r.body.items.map(altSteps) : [altSteps(r.body)] }; }); - return { grammarName: grammar.name ?? 'grammar', entry: findEntryRule(grammar), tokens, puncts, rules }; + // Regex-vs-division context (only if the grammar declares a regex token + config). 
+ let regexCtx: RegexCtx | null = null; + const rxTok = grammar.tokens.find((t) => t.flags.includes('regex')); + const rxCfg = grammar.tokens.find((t) => t.regexContext)?.regexContext; + if (rxTok && rxCfg) { + regexCtx = { + regexToken: rxTok.name, + identToken: grammar.tokens.find((t) => t.identifier)?.name ?? '', + divisionTypes: [...(rxCfg.divisionAfterTypes ?? [])], + divisionTexts: [...(rxCfg.divisionAfterTexts ?? [])], + regexTexts: [...(rxCfg.regexAfterTexts ?? [])], + parenHeadKw: [...(rxCfg.regexAfterParenKeywords ?? [])], + memberAccess: [...(rxCfg.memberAccessTexts ?? [])], + postfixAfterValue: [...(rxCfg.postfixAfterValueTexts ?? [])], + }; + } + + return { grammarName: grammar.name ?? 'grammar', entry: findEntryRule(grammar), tokens, puncts, rules, regexCtx }; } // Classify a token: a fast-path shape (run/string/line/block) when one cleanly matches, diff --git a/src/target-go.ts b/src/target-go.ts index b5f1926..b39a811 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -21,7 +21,7 @@ const rangeCond = (v: string, rs: CharRange[]) => function ccCondGo(p: Extract): string { const parts = p.items.map((it) => it.type === 'char' ? `cc == ${it.value.charCodeAt(0)}` : `cc >= ${it.from.charCodeAt(0)} && cc <= ${it.to.charCodeAt(0)}`); - const inSet = parts.length === 1 ? parts[0] : '(' + parts.join(' || ') + ')'; + const inSet = '(' + parts.join(' || ') + ')'; return p.negate ? `!${inSet}` : inSet; } function compilePat(p: TokenPattern, defs: string[]): string { @@ -44,44 +44,80 @@ function compilePat(p: TokenPattern, defs: string[]): string { return name; } -function scanTok(t: LexTok, defs: string[]): string { +function scanTok(t: LexTok, defs: string[], rxTok?: string): string { const name = (t as { name: string }).name; - const push = (endE: string) => (t.skip ? 
'' : `toks = append(toks, Tok{${J(name)}, src[pos:${endE}], pos, ${endE}}); `); - if (t.kind === 'run') return `\t\tif ${rangeCond('c', t.first)} { + const stateful = rxTok !== undefined; + const push = (endE: string) => (t.skip ? '' : stateful ? `emit(${J(name)}, src[pos:${endE}], pos, ${endE}); ` : `toks = append(toks, Tok{${J(name)}, src[pos:${endE}], pos, ${endE}}); `); + const gate = stateful && name === rxTok ? '!prevIsValue() && ' : ''; + if (t.kind === 'run') return `\t\tif ${gate}${rangeCond('c', t.first)} { \t\t\te := pos + 1 \t\t\tfor e < n { cc := int(src[e]); if !${rangeCond('cc', t.cont)} { break }; e++ } \t\t\t${push('e')}pos = e; continue \t\t}`; - if (t.kind === 'string') return `\t\tif c == ${t.delim.charCodeAt(0)} { + if (t.kind === 'string') return `\t\tif ${gate}c == ${t.delim.charCodeAt(0)} { \t\t\te := pos + 1 \t\t\tfor e < n { ch := int(src[e]); if ch == 92 { e += 2; continue }; if ch == ${t.delim.charCodeAt(0)} { e++; break }; e++ } \t\t\t${push('e')}pos = e; continue \t\t}`; - if (t.kind === 'line') return `\t\tif strings.HasPrefix(src[pos:], ${J(t.prefix)}) { + if (t.kind === 'line') return `\t\tif ${gate}strings.HasPrefix(src[pos:], ${J(t.prefix)}) { \t\t\te := pos + ${t.prefix.length} \t\t\tfor e < n && src[e] != 10 { e++ } \t\t\t${push('e')}pos = e; continue \t\t}`; - if (t.kind === 'block') return `\t\tif strings.HasPrefix(src[pos:], ${J(t.open)}) { + if (t.kind === 'block') return `\t\tif ${gate}strings.HasPrefix(src[pos:], ${J(t.open)}) { \t\t\te := pos + ${t.open.length} \t\t\tfor e < n && !strings.HasPrefix(src[e:], ${J(t.close)}) { e++ } \t\t\tif e < n { e += ${t.close.length} } \t\t\t${push('e')}pos = e; continue \t\t}`; const m = compilePat(t.pattern, defs); - return `\t\tif e := ${m}(pos); e > pos { ${push('e')}pos = e; continue }`; + return `\t\tif ${gate ? 
gate + 'true' : 'true'} { if e := ${m}(pos); e > pos { ${push('e')}pos = e; continue } }`; } function lexer(ir: ParserIR): string { const defs: string[] = []; - const toks = ir.tokens.map((t) => scanTok(t, defs)).join('\n'); + const rx = ir.regexCtx; + const toks = ir.tokens.map((t) => scanTok(t, defs, rx?.regexToken)).join('\n'); + const pushPunct = rx ? (p: string) => `emit("", ${J(p)}, pos, pos + ${p.length})` : (p: string) => `toks = append(toks, Tok{"", ${J(p)}, pos, pos + ${p.length}})`; const puncts = ir.puncts.map((p) => - `\t\tif strings.HasPrefix(src[pos:], ${J(p)}) { toks = append(toks, Tok{"", ${J(p)}, pos, pos + ${p.length}}); pos += ${p.length}; continue }`).join('\n'); + `\t\tif strings.HasPrefix(src[pos:], ${J(p)}) { ${pushPunct(p)}; pos += ${p.length}; continue }`).join('\n'); + const goMap = (a: string[]) => `map[string]bool{${a.map((x) => `${J(x)}: true`).join(', ')}}`; + const stateBlock = rx ? `\tprevText, prevKind, bpText := "", "", "" +\thasPrev, hasPrev2 := false, false +\tparenHead := []bool{} +\tlastClose, lastBang := false, false +\t_divT := ${goMap(rx.divisionTexts)} +\t_divK := ${goMap(rx.divisionTypes)} +\t_rxT := ${goMap(rx.regexTexts)} +\t_phK := ${goMap(rx.parenHeadKw)} +\t_mem := ${goMap(rx.memberAccess)} +\t_pav := ${goMap(rx.postfixAfterValue)} +\tconst IDENT = ${J(rx.identToken)} +\tprevIsValue := func() bool { +\t\tif !hasPrev { return false } +\t\tif _pav[prevText] { return lastBang } +\t\tisExprKw := prevKind == IDENT && _rxT[prevText] +\t\tisParenHead := prevText == ")" && lastClose +\t\treturn !isExprKw && !isParenHead && (_divK[prevKind] || _divT[prevText]) +\t} +\temit := func(kind, text string, off, end int) { +\t\tif text == "(" { +\t\t\tisMember := hasPrev2 && _mem[bpText] +\t\t\tparenHead = append(parenHead, !isMember && prevKind == IDENT && _phK[prevText]) +\t\t} else if text == ")" { +\t\t\tif len(parenHead) > 0 { lastClose = parenHead[len(parenHead)-1]; parenHead = parenHead[:len(parenHead)-1] } else { lastClose = 
false } +\t\t} +\t\tif _pav[text] { lastBang = prevIsValue() } +\t\ttoks = append(toks, Tok{kind, text, off, end}) +\t\tbpText = prevText; hasPrev2 = hasPrev; prevKind = kind; prevText = text; hasPrev = true +\t} +\t_ = bpText; _ = hasPrev2; _ = lastBang; _ = prevIsValue +` : ''; return `${defs.length ? 'var _s string\n' + defs.join('\n') + '\n' : ''}func lex(src string) []Tok { \ttoks := toks[:0] \tn := len(src) \tpos := 0 -${defs.length ? '\t_s = src\n' : ''}\tfor pos < n { +${stateBlock}${defs.length ? '\t_s = src\n' : ''}\tfor pos < n { \t\tc := int(src[pos]) \t\tif c == 32 || c == 9 || c == 10 || c == 13 { pos++; continue } ${toks} diff --git a/src/target-rust.ts b/src/target-rust.ts index 7ad6382..ba78f6f 100644 --- a/src/target-rust.ts +++ b/src/target-rust.ts @@ -24,7 +24,7 @@ const rangeCond = (v: string, rs: CharRange[]) => function ccCondRs(p: Extract): string { const parts = p.items.map((it) => it.type === 'char' ? `cc == ${it.value.charCodeAt(0)}` : `(${it.from.charCodeAt(0)}..=${it.to.charCodeAt(0)}).contains(&cc)`); - const inSet = parts.length === 1 ? parts[0] : '(' + parts.join(' || ') + ')'; + const inSet = '(' + parts.join(' || ') + ')'; return p.negate ? `!${inSet}` : inSet; } function compilePat(p: TokenPattern, defs: string[]): string { @@ -47,43 +47,76 @@ function compilePat(p: TokenPattern, defs: string[]): string { return name; } -function scanTok(t: LexTok, defs: string[]): string { +function scanTok(t: LexTok, defs: string[], rxTok?: string): string { const name = (t as { name: string }).name; - const push = (endE: string) => (t.skip ? '' : `toks.push(Tok { kind: ${J(name)}, text: &src[pos..${endE}], off: pos, end: ${endE} }); `); - if (t.kind === 'run') return ` if ${rangeCond('c', t.first)} { + const stateful = rxTok !== undefined; + const push = (endE: string) => (t.skip ? '' : stateful ? 
`st.emit(${J(name)}, &src[pos..${endE}], pos, ${endE}); ` : `toks.push(Tok { kind: ${J(name)}, text: &src[pos..${endE}], off: pos, end: ${endE} }); `); + const gate = stateful && name === rxTok ? '!st.prev_is_value() && ' : ''; + if (t.kind === 'run') return ` if ${gate}${rangeCond('c', t.first)} { let mut e = pos + 1; while e < n { let cc = b[e] as u32; if !${rangeCond('cc', t.cont)} { break } e += 1; } ${push('e')}pos = e; continue; }`; - if (t.kind === 'string') return ` if c == ${t.delim.charCodeAt(0)} { + if (t.kind === 'string') return ` if ${gate}c == ${t.delim.charCodeAt(0)} { let mut e = pos + 1; while e < n { let ch = b[e] as u32; if ch == 92 { e += 2; continue } if ch == ${t.delim.charCodeAt(0)} { e += 1; break } e += 1; } ${push('e')}pos = e; continue; }`; - if (t.kind === 'line') return ` if src[pos..].starts_with(${J(t.prefix)}) { + if (t.kind === 'line') return ` if ${gate}src[pos..].starts_with(${J(t.prefix)}) { let mut e = pos + ${t.prefix.length}; while e < n && b[e] != 10 { e += 1; } ${push('e')}pos = e; continue; }`; - if (t.kind === 'block') return ` if src[pos..].starts_with(${J(t.open)}) { + if (t.kind === 'block') return ` if ${gate}src[pos..].starts_with(${J(t.open)}) { let mut e = pos + ${t.open.length}; while e < n && !src[e..].starts_with(${J(t.close)}) { e += 1; } if e < n { e += ${t.close.length}; } ${push('e')}pos = e; continue; }`; const m = compilePat(t.pattern, defs); - return ` { let e = ${m}(src, pos as i64); if e > pos as i64 { let e = e as usize; ${push('e')}pos = e; continue; } }`; + return ` if ${gate}true { let e = ${m}(src, pos as i64); if e > pos as i64 { let e = e as usize; ${push('e')}pos = e; continue; } }`; } function lexer(ir: ParserIR): string { const defs: string[] = []; - const toks = ir.tokens.map((t) => scanTok(t, defs)).join('\n'); + const rx = ir.regexCtx; + const toks = ir.tokens.map((t) => scanTok(t, defs, rx?.regexToken)).join('\n'); const puncts = ir.puncts.map((p) => - ` if src[pos..].starts_with(${J(p)}) 
{ toks.push(Tok { kind: "", text: &src[pos..pos + ${p.length}], off: pos, end: pos + ${p.length} }); pos += ${p.length}; continue; }`).join('\n'); - return `${defs.length ? defs.join('\n') + '\n' : ''}fn lex<'a>(src: &'a str) -> Vec> { + ` if src[pos..].starts_with(${J(p)}) { ${rx ? `st.emit("", &src[pos..pos + ${p.length}], pos, pos + ${p.length});` : `toks.push(Tok { kind: "", text: &src[pos..pos + ${p.length}], off: pos, end: pos + ${p.length} });`} pos += ${p.length}; continue; }`).join('\n'); + const rsArr = (a: string[]) => `&[${a.map(J).join(', ')}]`; + const rxPreamble = rx ? `const _DIVT: &[&str] = ${rsArr(rx.divisionTexts)}; +const _DIVK: &[&str] = ${rsArr(rx.divisionTypes)}; +const _RXT: &[&str] = ${rsArr(rx.regexTexts)}; +const _PHK: &[&str] = ${rsArr(rx.parenHeadKw)}; +const _MEM: &[&str] = ${rsArr(rx.memberAccess)}; +const _PAV: &[&str] = ${rsArr(rx.postfixAfterValue)}; +const _IDENT: &str = ${J(rx.identToken)}; +fn _in(set: &[&str], x: &str) -> bool { set.iter().any(|s| *s == x) } +struct LexState<'a> { toks: Vec>, prev_text: &'a str, prev_kind: &'static str, bp_text: &'a str, has_prev: bool, has_prev2: bool, paren_head: Vec, last_close: bool, last_bang: bool } +impl<'a> LexState<'a> { + fn prev_is_value(&self) -> bool { + if !self.has_prev { return false; } + if _in(_PAV, self.prev_text) { return self.last_bang; } + let is_expr_kw = self.prev_kind == _IDENT && _in(_RXT, self.prev_text); + let is_paren_head = self.prev_text == ")" && self.last_close; + !is_expr_kw && !is_paren_head && (_in(_DIVK, self.prev_kind) || _in(_DIVT, self.prev_text)) + } + fn emit(&mut self, kind: &'static str, text: &'a str, off: usize, end: usize) { + if text == "(" { let is_member = self.has_prev2 && _in(_MEM, self.bp_text); self.paren_head.push(!is_member && self.prev_kind == _IDENT && _in(_PHK, self.prev_text)); } + else if text == ")" { self.last_close = self.paren_head.pop().unwrap_or(false); } + if _in(_PAV, text) { self.last_bang = self.prev_is_value(); } + 
self.toks.push(Tok { kind, text, off, end }); + self.bp_text = self.prev_text; self.has_prev2 = self.has_prev; self.prev_kind = kind; self.prev_text = text; self.has_prev = true; + } +} +` : ''; + const open = rx + ? ` let mut st = LexState { toks: Vec::new(), prev_text: "", prev_kind: "", bp_text: "", has_prev: false, has_prev2: false, paren_head: Vec::new(), last_close: false, last_bang: false };` + : ` let mut toks: Vec = Vec::new();`; + return `${defs.length ? defs.join('\n') + '\n' : ''}${rxPreamble}fn lex<'a>(src: &'a str) -> Vec> { let b = src.as_bytes(); let n = b.len(); - let mut toks: Vec = Vec::new(); +${open} let mut pos = 0usize; while pos < n { let c = b[pos] as u32; @@ -92,7 +125,7 @@ ${toks} ${puncts} panic!("lex error at {}", pos); } - toks + ${rx ? 'st.toks' : 'toks'} }`; } diff --git a/src/target-ts.ts b/src/target-ts.ts index d39821b..08acf52 100644 --- a/src/target-ts.ts +++ b/src/target-ts.ts @@ -18,7 +18,7 @@ import type { TokenPattern } from './types.ts'; function ccCond(p: Extract): string { const parts = p.items.map((it) => it.type === 'char' ? `cc === ${it.value.charCodeAt(0)}` : `cc >= ${it.from.charCodeAt(0)} && cc <= ${it.to.charCodeAt(0)}`); - const inSet = parts.length === 1 ? parts[0] : '(' + parts.join(' || ') + ')'; + const inSet = '(' + parts.join(' || ') + ')'; return p.negate ? `!${inSet}` : inSet; } function compilePat(p: TokenPattern, defs: string[]): string { @@ -41,44 +41,71 @@ function compilePat(p: TokenPattern, defs: string[]): string { return name; } -function scanTok(t: LexTok, defs: string[]): string { +function scanTok(t: LexTok, defs: string[], rxTok?: string): string { const name = (t as { name: string }).name; - const push = (endExpr: string) => (t.skip ? 
'' : `toks.push({ kind: ${J(name)}, text: src.slice(pos, ${endExpr}), off: pos, end: ${endExpr} }); `); - if (t.kind === 'run') return ` if (${rangeCond('c', t.first)}) { + const stateful = rxTok !== undefined; + // `emit(...)` threads the regex-context state in stateful mode; a plain push otherwise. + const push = (endExpr: string) => (t.skip ? '' : `${stateful ? 'emit' : 'push'}(${J(name)}, src.slice(pos, ${endExpr}), pos, ${endExpr}); `); + const gate = stateful && name === rxTok ? '!prevIsValue() && ' : ''; + if (t.kind === 'run') return ` if (${gate}${rangeCond('c', t.first)}) { let e = pos + 1; while (e < n) { const cc = src.charCodeAt(e); if (!${rangeCond('cc', t.cont)}) break; e++; } ${push('e')}pos = e; continue; }`; - if (t.kind === 'string') return ` if (c === ${t.delim.charCodeAt(0)}) { + if (t.kind === 'string') return ` if (${gate}c === ${t.delim.charCodeAt(0)}) { let e = pos + 1; while (e < n) { const ch = src.charCodeAt(e); if (ch === 92) { e += 2; continue; } if (ch === ${t.delim.charCodeAt(0)}) { e++; break; } e++; } ${push('e')}pos = e; continue; }`; - if (t.kind === 'line') return ` if (src.startsWith(${J(t.prefix)}, pos)) { + if (t.kind === 'line') return ` if (${gate}src.startsWith(${J(t.prefix)}, pos)) { let e = pos + ${t.prefix.length}; while (e < n && src.charCodeAt(e) !== 10) e++; ${push('e')}pos = e; continue; }`; - if (t.kind === 'block') return ` if (src.startsWith(${J(t.open)}, pos)) { + if (t.kind === 'block') return ` if (${gate}src.startsWith(${J(t.open)}, pos)) { let e = pos + ${t.open.length}; while (e < n && !src.startsWith(${J(t.close)}, e)) e++; if (e < n) e += ${t.close.length}; ${push('e')}pos = e; continue; }`; const m = compilePat(t.pattern, defs); - return ` { const e = ${m}(pos); if (e > pos) { ${push('e')}pos = e; continue; } }`; + return ` if (${gate}true) { const e = ${m}(pos); if (e > pos) { ${push('e')}pos = e; continue; } }`; } function lexer(ir: ParserIR): string { const defs: string[] = []; - const toks = 
ir.tokens.map((t) => scanTok(t, defs)).join('\n'); + const rx = ir.regexCtx; + const toks = ir.tokens.map((t) => scanTok(t, defs, rx?.regexToken)).join('\n'); + const pushFn = rx ? 'emit' : 'push'; const puncts = ir.puncts.map((p) => - ` if (src.startsWith(${J(p)}, pos)) { toks.push({ kind: '', text: ${J(p)}, off: pos, end: pos + ${p.length} }); pos += ${p.length}; continue; }`).join('\n'); + ` if (src.startsWith(${J(p)}, pos)) { ${pushFn}('', ${J(p)}, pos, pos + ${p.length}); pos += ${p.length}; continue; }`).join('\n'); + const set = (a: string[]) => `new Set([${a.map(J).join(', ')}])`; + const stateBlock = rx ? ` let prevText = '', prevKind = '', bpText = '', hasPrev = false, hasPrev2 = false; + const parenHead: boolean[] = []; + let lastClose = false, lastBang = false; + const _divT = ${set(rx.divisionTexts)}, _divK = ${set(rx.divisionTypes)}, _rxT = ${set(rx.regexTexts)}; + const _phK = ${set(rx.parenHeadKw)}, _mem = ${set(rx.memberAccess)}, _pav = ${set(rx.postfixAfterValue)}; + const IDENT = ${J(rx.identToken)}; + function prevIsValue(): boolean { + if (!hasPrev) return false; + if (_pav.has(prevText)) return lastBang; + const isExprKw = prevKind === IDENT && _rxT.has(prevText); + const isParenHead = prevText === ')' && lastClose; + return !isExprKw && !isParenHead && (_divK.has(prevKind) || _divT.has(prevText)); + } + function emit(kind: string, text: string, off: number, end: number): void { + if (text === '(') { const isMember = hasPrev2 && _mem.has(bpText); parenHead.push(!isMember && prevKind === IDENT && _phK.has(prevText)); } + else if (text === ')') { lastClose = parenHead.pop() ?? false; } + if (_pav.has(text)) lastBang = prevIsValue(); + toks.push({ kind, text, off, end }); + bpText = prevText; hasPrev2 = hasPrev; prevKind = kind; prevText = text; hasPrev = true; + } +` : ''; return `${defs.length ? 
'let _s = "";\n' + defs.join('\n') + '\n' : ''}function lex(src: string): Tok[] { const toks: Tok[] = []; const n = src.length; let pos = 0; -${defs.length ? ' _s = src;\n' : ''} while (pos < n) { +${defs.length ? ' _s = src;\n' : ''}${stateBlock}${rx ? '' : ' const push = (kind: string, text: string, off: number, end: number) => { toks.push({ kind, text, off, end }); };\n'} while (pos < n) { const c = src.charCodeAt(pos); if (c === 32 || c === 9 || c === 10 || c === 13) { pos++; continue; } ${toks} diff --git a/test/portable-targets.ts b/test/portable-targets.ts index 4522023..bf0e6ea 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -61,6 +61,20 @@ const CASES: Case[] = [ ], reject: ['12abc', '0x', '"unterminated', '3.', '#'], // ($ is a valid identifier start, not a reject) }, + { + // The STATEFUL regex-vs-division lexer: `/` is a regex in expression context, division + // after a value. Exercises every branch of prevIsValue — after `=`/keyword/`(`-head + // (regex) vs after value/`)`/`]`/member/call (division), plus regex escapes & classes. + grammar: 'regexjs', path: '../examples/regexjs.ts', + accept: [ + 'a / b;', 'var r = /abc/g;', 'return /re/;', 'if (x) /re/;', '(a + b) / c;', + 'a.b / c;', 'foo(x) / y;', '[1, 2] / 3;', 'var x = a / b / c;', + 'var re = /[a-z]+/i; x / y;', 'f(/re/, a / b);', 'var z = /a\\/b/;', + 'var d = /\\d+\\w/g;', 'var k = /[\\]]/;', 'if (a) /x/; else b / c;', + ], + // (`var ;` is VALID — `var` is an identifier, so it's the expression statement `var;`.) 
+ reject: ['a / ;', 'if (x /re/;', '/re/', '* a;', 'a = = b;'], + }, ]; const sortKeys = (o: unknown): unknown => From c0d84d004a4295f6495042130ab5254e5be4e51f Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 02:30:41 +0800 Subject: [PATCH 10/27] emit-portable: template-literal interpolation in all three targets (stage 4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The portable lexer's second stateful feature: `${…}` interpolation. A `` ` `` opens a span scanned to the next `${` (emit $templateHead) or closing `` ` `` (the whole token, no substitution); a `}` that closes a hole resumes the span ($templateMiddle / Tail). A templateStack of brace-depths decides which `}` closes the hole versus a nested `{…}` (object/block) or nested template inside it. The parser's Pratt nud sees a $templateHead and assembles head·expr·(middle·expr)*·tail into a synthetic $template node, parsing each hole with the Pratt expression rule. The lexer state machine generalises cleanly with the regex one — a grammar can have regex, templates, or both share one emit() / LexState (Rust: a struct that now also carries the template_stack). examples/templatejs.ts (minijs + templates + a shorthand object so a hole can hold `{…}`) verifies it: no-substitution, adjacent/multiple holes, expressions in holes, NESTED templates, and an object inside a hole (the brace-depth counter) — all ts/go/rust CSTs byte-identical to createParser (gate: 11/11 accept, 4/4 reject per target). Full suite 42/42. Tagged templates (`` tag`…` `` — a postfix-token Pratt LED) are out of scope here; that's a parser-algebra gap, the remaining work alongside the markup/indent lexers. 
--- examples/templatejs.ts | 61 +++++++++++++++++++++ src/emit-portable.ts | 34 +++++++++++- src/target-go.ts | 96 ++++++++++++++++++++++++++------- src/target-rust.ts | 111 ++++++++++++++++++++++++++++++--------- src/target-ts.ts | 95 ++++++++++++++++++++++++++------- test/portable-targets.ts | 13 +++++ 6 files changed, 346 insertions(+), 64 deletions(-) create mode 100644 examples/templatejs.ts diff --git a/examples/templatejs.ts b/examples/templatejs.ts new file mode 100644 index 0000000..e2f2367 --- /dev/null +++ b/examples/templatejs.ts @@ -0,0 +1,61 @@ +// minijs + TEMPLATE LITERALS — exercises the portable lexer's second STATEFUL feature +// (stage 4): `${…}` interpolation. The lexer splits `` `a${x}b${y}c` `` into +// $templateHead·$templateMiddle·$templateTail around the holes, tracking a brace-depth +// stack so a nested `{…}` (or a nested template) inside a hole doesn't close it; the +// parser assembles the pieces and interpolated expressions into a `$template` node. +import { + token, rule, defineGrammar, left, right, op, prefix, alt, + seq, oneOf, range, star, sep, opt, many, altPattern, noneOf, notFollowedBy, +} from '../src/api.ts'; + +const digit = range('0', '9'); +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); + +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(digit, star(digit)), { scope: 'constant.numeric' }); +const Str = token(seq('"', star(altPattern(noneOf('"', '\\'), seq('\\', noneOf('\n')))), '"'), { scope: 'string.quoted.double' }); +const LineComment = token(seq('//', star(noneOf('\n'))), { skip: true, scope: 'comment.line' }); + +// NoSubstitution template: backtick body excludes a real `${` (a `$` not followed by `{` +// stays literal); the `template` config drives the interpolated split in the lexer. 
+const Template = token( + seq('`', star(altPattern(noneOf('`', '\\', '$'), seq('\\', noneOf('\n')), seq('$', notFollowedBy('{')))), '`'), + { scope: 'string.template', template: { open: '`', interpOpen: '${', interpClose: '}' } }, +); + +const jsPrec = [ + right('='), + left('||'), left('&&'), + left('+', '-'), + left('*', '/', '%'), + right(prefix('!', '-', '+')), +]; + +const Expr = rule(($) => [ + Number_, Str, Template, Ident, + ['(', $, ')'], + ['{', opt(sep(Ident, ',')), '}'], // shorthand object — gives a hole a nested `{ … }` + [prefix, $], + [$, op, $], + [$, '(', opt(sep($, ',')), ')'], + [$, '.', Ident], +]); + +const Block = rule(($) => [['{', many(Stmt), '}']]); +const Stmt = rule(($) => [ + Block, + [alt('var', 'let', 'const'), Ident, opt('=', Expr), ';'], + ['if', '(', Expr, ')', Stmt, opt('else', Stmt)], + ['return', opt(Expr), ';'], + [Expr, ';'], +]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'templatejs', + scopeName: 'source.templatejs', + tokens: { Ident, Number: Number_, Str, Template, LineComment }, + prec: jsPrec, + rules: { Expr, Block, Stmt, Program }, +}); diff --git a/src/emit-portable.ts b/src/emit-portable.ts index 30881ca..bfdaf0b 100644 --- a/src/emit-portable.ts +++ b/src/emit-portable.ts @@ -80,6 +80,20 @@ export type RegexCtx = { postfixAfterValue: string[]; // ambiguous postfix/prefix ops (e.g. `!`): value only in postfix }; +// Template literals with `${…}` interpolation: a STATEFUL lexer split. A `` ` `` opens a +// span scanned to the next `${` (→ $templateHead) or closing `` ` `` (→ the whole token, +// no substitution); a `}` that closes a hole resumes the span (→ $templateMiddle / Tail). +// A `templateStack` of brace-depths tracks which `}` closes the hole vs. a nested `{…}`. +// The parser assembles head·expr·(middle·expr)*·tail into a synthetic `$template` node. 
+export type TplCfg = { + token: string; // the token flagged `template`; its NoSubstitution form is a plain leaf + open: string; // `` ` `` + interpOpen: string; // `${` + interpClose: string; // `}` + braceOpen: string; // `{` — a nested one deepens the hole, so its `}` is not the closer + interpRule: string; // the rule that parses each `${…}` hole (the Pratt expression rule) +}; + export type ParserIR = { grammarName: string; entry: string; @@ -87,6 +101,7 @@ export type ParserIR = { puncts: string[]; // punctuation literals, longest-first (maximal munch) rules: RuleIR[]; regexCtx: RegexCtx | null; // null unless the grammar has a regex token with context + tpl: TplCfg | null; // null unless the grammar has a template token }; export interface Target { @@ -163,7 +178,24 @@ function buildIR(grammar: CstGrammar): ParserIR { }; } - return { grammarName: grammar.name ?? 'grammar', entry: findEntryRule(grammar), tokens, puncts, rules, regexCtx }; + // Template literals (only if the grammar declares a template token). The interpolation + // holes are parsed by the Pratt expression rule — the rule that carries operator leds. + let tpl: TplCfg | null = null; + const tplTok = grammar.tokens.find((t) => t.template); + if (tplTok && tplTok.template) { + const prattName = rules.find((r) => r.kind === 'pratt')?.name; + if (!prattName) throw new Error('portable: a template token needs a Pratt expression rule to parse its interpolations'); + tpl = { + token: tplTok.name, + open: tplTok.template.open, + interpOpen: tplTok.template.interpOpen, + interpClose: tplTok.template.interpClose, + braceOpen: tplTok.template.interpOpen.slice(-1), + interpRule: prattName, + }; + } + + return { grammarName: grammar.name ?? 
'grammar', entry: findEntryRule(grammar), tokens, puncts, rules, regexCtx, tpl }; } // Classify a token: a fast-path shape (run/string/line/block) when one cleanly matches, diff --git a/src/target-go.ts b/src/target-go.ts index b39a811..9d74c46 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -9,7 +9,7 @@ // stack. A node is an int32 index, never a heap pointer. Backtracking truncates the three // slices to saved lengths; the slices keep their capacity across parses (reset to len 0), so a // warmed parser allocates ~nothing per parse. -import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target } from './emit-portable.ts'; +import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target, TplCfg } from './emit-portable.ts'; import type { TokenPattern } from './types.ts'; const J = (v: unknown) => JSON.stringify(v); @@ -44,11 +44,12 @@ function compilePat(p: TokenPattern, defs: string[]): string { return name; } -function scanTok(t: LexTok, defs: string[], rxTok?: string): string { +function scanTok(t: LexTok, defs: string[], rxTok?: string, tplTok?: string): string { const name = (t as { name: string }).name; - const stateful = rxTok !== undefined; + const stateful = rxTok !== undefined || tplTok !== undefined; + if (tplTok !== undefined && name === tplTok) return ''; // template token scanned by the state machine const push = (endE: string) => (t.skip ? '' : stateful ? `emit(${J(name)}, src[pos:${endE}], pos, ${endE}); ` : `toks = append(toks, Tok{${J(name)}, src[pos:${endE}], pos, ${endE}}); `); - const gate = stateful && name === rxTok ? '!prevIsValue() && ' : ''; + const gate = rxTok !== undefined && name === rxTok ? 
'!prevIsValue() && ' : ''; if (t.kind === 'run') return `\t\tif ${gate}${rangeCond('c', t.first)} { \t\t\te := pos + 1 \t\t\tfor e < n { cc := int(src[e]); if !${rangeCond('cc', t.cont)} { break }; e++ } @@ -77,12 +78,14 @@ function scanTok(t: LexTok, defs: string[], rxTok?: string): string { function lexer(ir: ParserIR): string { const defs: string[] = []; const rx = ir.regexCtx; - const toks = ir.tokens.map((t) => scanTok(t, defs, rx?.regexToken)).join('\n'); - const pushPunct = rx ? (p: string) => `emit("", ${J(p)}, pos, pos + ${p.length})` : (p: string) => `toks = append(toks, Tok{"", ${J(p)}, pos, pos + ${p.length}})`; + const tpl = ir.tpl; + const stateful = !!(rx || tpl); + const toks = ir.tokens.map((t) => scanTok(t, defs, rx?.regexToken, tpl?.token)).join('\n'); + const pushPunct = stateful ? (p: string) => `emit("", ${J(p)}, pos, pos + ${p.length})` : (p: string) => `toks = append(toks, Tok{"", ${J(p)}, pos, pos + ${p.length}})`; const puncts = ir.puncts.map((p) => `\t\tif strings.HasPrefix(src[pos:], ${J(p)}) { ${pushPunct(p)}; pos += ${p.length}; continue }`).join('\n'); const goMap = (a: string[]) => `map[string]bool{${a.map((x) => `${J(x)}: true`).join(', ')}}`; - const stateBlock = rx ? `\tprevText, prevKind, bpText := "", "", "" + const rxState = rx ? `\tprevText, prevKind, bpText := "", "", "" \thasPrev, hasPrev2 := false, false \tparenHead := []bool{} \tlastClose, lastBang := false, false @@ -100,27 +103,56 @@ function lexer(ir: ParserIR): string { \t\tisParenHead := prevText == ")" && lastClose \t\treturn !isExprKw && !isParenHead && (_divK[prevKind] || _divT[prevText]) \t} -\temit := func(kind, text string, off, end int) { -\t\tif text == "(" { +` : ''; + const tplState = tpl ? 
`\ttemplateStack := []int{} +\tscanTplSpan := func(p int) (bool, int) { +\t\tfor p < n { +\t\t\tif strings.HasPrefix(src[p:], ${J(tpl.interpOpen)}) { return true, p + ${tpl.interpOpen.length} } +\t\t\tif src[p] == 92 { p += 2; continue } +\t\t\tif strings.HasPrefix(src[p:], ${J(tpl.open)}) { return false, p + ${tpl.open.length} } +\t\t\tp++ +\t\t} +\t\treturn false, p +\t} +\t_ = scanTplSpan +` : ''; + const emitHooks = [ + rx ? `\t\tif text == "(" { \t\t\tisMember := hasPrev2 && _mem[bpText] \t\t\tparenHead = append(parenHead, !isMember && prevKind == IDENT && _phK[prevText]) \t\t} else if text == ")" { \t\t\tif len(parenHead) > 0 { lastClose = parenHead[len(parenHead)-1]; parenHead = parenHead[:len(parenHead)-1] } else { lastClose = false } \t\t} -\t\tif _pav[text] { lastBang = prevIsValue() } -\t\ttoks = append(toks, Tok{kind, text, off, end}) -\t\tbpText = prevText; hasPrev2 = hasPrev; prevKind = kind; prevText = text; hasPrev = true +\t\tif _pav[text] { lastBang = prevIsValue() }` : '', + tpl ? `\t\tif len(templateStack) > 0 { if text == ${J(tpl.braceOpen)} { templateStack[len(templateStack)-1]++ } else if text == ${J(tpl.interpClose)} { templateStack[len(templateStack)-1]-- } }` : '', + ].filter(Boolean).join('\n'); + const emitTail = rx ? `\n\t\tbpText = prevText; hasPrev2 = hasPrev; prevKind = kind; prevText = text; hasPrev = true` : ''; + const emitFn = stateful ? `\temit := func(kind, text string, off, end int) { +${emitHooks} +\t\ttoks = append(toks, Tok{kind, text, off, end})${emitTail} \t} -\t_ = bpText; _ = hasPrev2; _ = lastBang; _ = prevIsValue +\t_ = emit +` : ''; + const tplDispatch = tpl ? 
`\t\tif len(templateStack) > 0 && strings.HasPrefix(src[pos:], ${J(tpl.interpClose)}) && templateStack[len(templateStack)-1] == 0 { +\t\t\ttemplateStack = templateStack[:len(templateStack)-1] +\t\t\tinterp, e := scanTplSpan(pos + ${tpl.interpClose.length}) +\t\t\tif interp { emit("$templateMiddle", src[pos:e], pos, e); templateStack = append(templateStack, 0) } else { emit("$templateTail", src[pos:e], pos, e) } +\t\t\tpos = e; continue +\t\t} +\t\tif strings.HasPrefix(src[pos:], ${J(tpl.open)}) { +\t\t\tinterp, e := scanTplSpan(pos + ${tpl.open.length}) +\t\t\tif interp { emit("$templateHead", src[pos:e], pos, e); templateStack = append(templateStack, 0) } else { emit(${J(tpl.token)}, src[pos:e], pos, e) } +\t\t\tpos = e; continue +\t\t} ` : ''; return `${defs.length ? 'var _s string\n' + defs.join('\n') + '\n' : ''}func lex(src string) []Tok { \ttoks := toks[:0] \tn := len(src) \tpos := 0 -${stateBlock}${defs.length ? '\t_s = src\n' : ''}\tfor pos < n { +${rxState}${tplState}${emitFn}${defs.length ? '\t_s = src\n' : ''}\tfor pos < n { \t\tc := int(src[pos]) \t\tif c == 32 || c == 9 || c == 10 || c == 13 { pos++; continue } -${toks} +${tplDispatch}${toks} ${puncts} \t\tpanic(fmt.Sprintf("lex error at %d", pos)) \t} @@ -151,7 +183,15 @@ ${r.alts.map(alt).join('\n')} }`; } -function prattRule(r: PrattRule): string { +function prattRule(r: PrattRule, tpl: TplCfg | null): string { + const tplNud = tpl && r.nudToks.includes(tpl.token) + ? 
`\tif t.Kind == "$templateHead" { +\t\tnode := matchTemplate() +\t\tif node < 0 { return -1 } +\t\tsb := len(scratch); scratch = append(scratch, node) +\t\treturn finish(${J(r.name)}, sb, nodes[node].Offset) +\t}\n` + : ''; const bin = r.binary.map((b) => `${J(b.op)}: {${b.lbp}, ${b.rbp}}`).join(', '); const pre = r.prefix.map((p) => `${J(p.op)}: ${p.rbp}`).join(', '); const atoms = r.nudToks.map((k) => `${J(k)}: true`).join(', '); @@ -192,7 +232,7 @@ ${r.leds.map(ledArm).join('\n')} func ${r.name}nud() int32 { \tt := peek() \tif t == nil { return -1 } -\tif ${r.name}ATOM[t.Kind] { +${tplNud}\tif ${r.name}ATOM[t.Kind] { \t\tsb := len(scratch); scratch = append(scratch, mkLeaf(t.Kind, t.Off, t.End)); pos++ \t\treturn finish(${J(r.name)}, sb, t.Off) \t} @@ -213,7 +253,25 @@ export const goTarget: Target = { name: 'go', ext: 'go', render(ir: ParserIR): string { - const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? prattRule(r) : rdRule(r))).join('\n\n'); + const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? prattRule(r, ir.tpl) : rdRule(r))).join('\n\n'); + const matchTemplate = ir.tpl ? 
`func matchTemplate() int32 { +\tt := peek() +\tif t == nil || t.Kind != "$templateHead" { return -1 } +\tsb := len(scratch); nb := len(nodes); kb := len(kids); save := pos +\tscratch = append(scratch, mkLeaf("$templateHead", t.Off, t.End)); pos++ +\tfor { +\t\texpr := parse${ir.tpl.interpRule}() +\t\tif expr < 0 { pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return -1 } +\t\tscratch = append(scratch, expr) +\t\tnext := peek() +\t\tif next == nil { pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return -1 } +\t\tif next.Kind == "$templateMiddle" { scratch = append(scratch, mkLeaf("$templateMiddle", next.Off, next.End)); pos++; continue } +\t\tif next.Kind == "$templateTail" { scratch = append(scratch, mkLeaf("$templateTail", next.Off, next.End)); pos++; break } +\t\tpos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return -1 +\t} +\treturn finish("$template", sb, t.Off) +} +` : ''; return `// GENERATED by emit-portable.ts (goTarget) — parser for grammar "${ir.grammarName}". package main @@ -296,7 +354,7 @@ func altLit(opts [][2]string) bool { \treturn false } -${ruleFns} +${matchTemplate}${ruleFns} func writeJSON(id int32, b *strings.Builder) { \tnd := &nodes[id] diff --git a/src/target-rust.ts b/src/target-rust.ts index ba78f6f..51aebf5 100644 --- a/src/target-rust.ts +++ b/src/target-rust.ts @@ -11,7 +11,7 @@ // returns it. Sub-sequence combinators (star/opt/sep) take non-capturing fn pointers // `fn(&mut Parser, &mut Vec) -> bool`, threading the parser + kids as params (so nothing // is captured, sidestepping the borrow checker). 
-import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target } from './emit-portable.ts'; +import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target, TplCfg } from './emit-portable.ts'; import type { TokenPattern } from './types.ts'; const J = (v: unknown) => JSON.stringify(v); @@ -47,11 +47,12 @@ function compilePat(p: TokenPattern, defs: string[]): string { return name; } -function scanTok(t: LexTok, defs: string[], rxTok?: string): string { +function scanTok(t: LexTok, defs: string[], rxTok?: string, tplTok?: string): string { const name = (t as { name: string }).name; - const stateful = rxTok !== undefined; + const stateful = rxTok !== undefined || tplTok !== undefined; + if (tplTok !== undefined && name === tplTok) return ''; // template token scanned by the state machine const push = (endE: string) => (t.skip ? '' : stateful ? `st.emit(${J(name)}, &src[pos..${endE}], pos, ${endE}); ` : `toks.push(Tok { kind: ${J(name)}, text: &src[pos..${endE}], off: pos, end: ${endE} }); `); - const gate = stateful && name === rxTok ? '!st.prev_is_value() && ' : ''; + const gate = rxTok !== undefined && name === rxTok ? '!st.prev_is_value() && ' : ''; if (t.kind === 'run') return ` if ${gate}${rangeCond('c', t.first)} { let mut e = pos + 1; while e < n { let cc = b[e] as u32; if !${rangeCond('cc', t.cont)} { break } e += 1; } @@ -80,11 +81,15 @@ function scanTok(t: LexTok, defs: string[], rxTok?: string): string { function lexer(ir: ParserIR): string { const defs: string[] = []; const rx = ir.regexCtx; - const toks = ir.tokens.map((t) => scanTok(t, defs, rx?.regexToken)).join('\n'); + const tpl = ir.tpl; + const stateful = !!(rx || tpl); + const toks = ir.tokens.map((t) => scanTok(t, defs, rx?.regexToken, tpl?.token)).join('\n'); const puncts = ir.puncts.map((p) => - ` if src[pos..].starts_with(${J(p)}) { ${rx ? 
`st.emit("", &src[pos..pos + ${p.length}], pos, pos + ${p.length});` : `toks.push(Tok { kind: "", text: &src[pos..pos + ${p.length}], off: pos, end: pos + ${p.length} });`} pos += ${p.length}; continue; }`).join('\n'); + ` if src[pos..].starts_with(${J(p)}) { ${stateful ? `st.emit("", &src[pos..pos + ${p.length}], pos, pos + ${p.length});` : `toks.push(Tok { kind: "", text: &src[pos..pos + ${p.length}], off: pos, end: pos + ${p.length} });`} pos += ${p.length}; continue; }`).join('\n'); const rsArr = (a: string[]) => `&[${a.map(J).join(', ')}]`; - const rxPreamble = rx ? `const _DIVT: &[&str] = ${rsArr(rx.divisionTexts)}; + // Struct fields / emit hooks / init are assembled per-feature so a grammar can have regex, + // templates, or both share one LexState. + const rxConsts = rx ? `const _DIVT: &[&str] = ${rsArr(rx.divisionTexts)}; const _DIVK: &[&str] = ${rsArr(rx.divisionTypes)}; const _RXT: &[&str] = ${rsArr(rx.regexTexts)}; const _PHK: &[&str] = ${rsArr(rx.parenHeadKw)}; @@ -92,28 +97,62 @@ const _MEM: &[&str] = ${rsArr(rx.memberAccess)}; const _PAV: &[&str] = ${rsArr(rx.postfixAfterValue)}; const _IDENT: &str = ${J(rx.identToken)}; fn _in(set: &[&str], x: &str) -> bool { set.iter().any(|s| *s == x) } -struct LexState<'a> { toks: Vec>, prev_text: &'a str, prev_kind: &'static str, bp_text: &'a str, has_prev: bool, has_prev2: bool, paren_head: Vec, last_close: bool, last_bang: bool } -impl<'a> LexState<'a> { - fn prev_is_value(&self) -> bool { +` : ''; + const tplFn = tpl ? `fn _scan_tpl_span(s: &str, mut p: usize) -> (bool, usize) { + let n = s.len(); + while p < n { + if s[p..].starts_with(${J(tpl.interpOpen)}) { return (true, p + ${tpl.interpOpen.length}); } + if s.as_bytes()[p] == 92 { p += 2; continue; } + if s[p..].starts_with(${J(tpl.open)}) { return (false, p + ${tpl.open.length}); } + p += 1; + } + (false, p) +} +` : ''; + const fields = ['toks: Vec>', + rx ? 
'prev_text: &\'a str, prev_kind: &\'static str, bp_text: &\'a str, has_prev: bool, has_prev2: bool, paren_head: Vec, last_close: bool, last_bang: bool' : '', + tpl ? 'template_stack: Vec' : ''].filter(Boolean).join(', '); + const prevIsValue = rx ? ` fn prev_is_value(&self) -> bool { if !self.has_prev { return false; } if _in(_PAV, self.prev_text) { return self.last_bang; } let is_expr_kw = self.prev_kind == _IDENT && _in(_RXT, self.prev_text); let is_paren_head = self.prev_text == ")" && self.last_close; !is_expr_kw && !is_paren_head && (_in(_DIVK, self.prev_kind) || _in(_DIVT, self.prev_text)) } - fn emit(&mut self, kind: &'static str, text: &'a str, off: usize, end: usize) { - if text == "(" { let is_member = self.has_prev2 && _in(_MEM, self.bp_text); self.paren_head.push(!is_member && self.prev_kind == _IDENT && _in(_PHK, self.prev_text)); } +` : ''; + const emitHooks = [ + rx ? ` if text == "(" { let is_member = self.has_prev2 && _in(_MEM, self.bp_text); self.paren_head.push(!is_member && self.prev_kind == _IDENT && _in(_PHK, self.prev_text)); } else if text == ")" { self.last_close = self.paren_head.pop().unwrap_or(false); } - if _in(_PAV, text) { self.last_bang = self.prev_is_value(); } - self.toks.push(Tok { kind, text, off, end }); - self.bp_text = self.prev_text; self.has_prev2 = self.has_prev; self.prev_kind = kind; self.prev_text = text; self.has_prev = true; + if _in(_PAV, text) { self.last_bang = self.prev_is_value(); }` : '', + tpl ? ` if !self.template_stack.is_empty() { if text == ${J(tpl.braceOpen)} { *self.template_stack.last_mut().unwrap() += 1; } else if text == ${J(tpl.interpClose)} { *self.template_stack.last_mut().unwrap() -= 1; } }` : '', + ].filter(Boolean).join('\n'); + const emitTail = rx ? ` + self.bp_text = self.prev_text; self.has_prev2 = self.has_prev; self.prev_kind = kind; self.prev_text = text; self.has_prev = true;` : ''; + const stateImpl = stateful ? 
`struct LexState<'a> { ${fields} } +impl<'a> LexState<'a> { +${prevIsValue} fn emit(&mut self, kind: &'static str, text: &'a str, off: usize, end: usize) { +${emitHooks} + self.toks.push(Tok { kind, text, off, end });${emitTail} } } ` : ''; - const open = rx - ? ` let mut st = LexState { toks: Vec::new(), prev_text: "", prev_kind: "", bp_text: "", has_prev: false, has_prev2: false, paren_head: Vec::new(), last_close: false, last_bang: false };` - : ` let mut toks: Vec = Vec::new();`; - return `${defs.length ? defs.join('\n') + '\n' : ''}${rxPreamble}fn lex<'a>(src: &'a str) -> Vec> { + const initFields = ['toks: Vec::new()', + rx ? 'prev_text: "", prev_kind: "", bp_text: "", has_prev: false, has_prev2: false, paren_head: Vec::new(), last_close: false, last_bang: false' : '', + tpl ? 'template_stack: Vec::new()' : ''].filter(Boolean).join(', '); + const open = stateful ? ` let mut st = LexState { ${initFields} };` : ` let mut toks: Vec = Vec::new();`; + const tplDispatch = tpl ? ` if !st.template_stack.is_empty() && src[pos..].starts_with(${J(tpl.interpClose)}) && *st.template_stack.last().unwrap() == 0 { + st.template_stack.pop(); + let (interp, e) = _scan_tpl_span(src, pos + ${tpl.interpClose.length}); + if interp { st.emit("$templateMiddle", &src[pos..e], pos, e); st.template_stack.push(0); } else { st.emit("$templateTail", &src[pos..e], pos, e); } + pos = e; continue; + } + if src[pos..].starts_with(${J(tpl.open)}) { + let (interp, e) = _scan_tpl_span(src, pos + ${tpl.open.length}); + if interp { st.emit("$templateHead", &src[pos..e], pos, e); st.template_stack.push(0); } else { st.emit(${J(tpl.token)}, &src[pos..e], pos, e); } + pos = e; continue; + } +` : ''; + return `${defs.length ? 
defs.join('\n') + '\n' : ''}${rxConsts}${tplFn}${stateImpl}fn lex<'a>(src: &'a str) -> Vec> { let b = src.as_bytes(); let n = b.len(); ${open} @@ -121,11 +160,11 @@ ${open} while pos < n { let c = b[pos] as u32; if c == 32 || c == 9 || c == 10 || c == 13 { pos += 1; continue; } -${toks} +${tplDispatch}${toks} ${puncts} panic!("lex error at {}", pos); } - ${rx ? 'st.toks' : 'toks'} + ${stateful ? 'st.toks' : 'toks'} }`; } @@ -164,7 +203,12 @@ ${r.alts.map(alt).join('\n')} }`; } -function prattRule(r: PrattRule): string { +function prattRule(r: PrattRule, tpl: TplCfg | null): string { + const tplNud = tpl && r.nudToks.includes(tpl.token) + ? ` if t.kind == "$templateHead" { + return self.match_template().map(|n| { let (o, e) = (n.offset, n.end); Cst::node(${J(r.name)}, vec![n], o, e) }); + }\n` + : ''; const binArms = r.binary.map((b) => `${J(b.op)} => Some((${b.lbp}, ${b.rbp}))`).join(', '); const preArms = r.prefix.map((p) => `${J(p.op)} => Some(${p.rbp})`).join(', '); const atomArm = r.nudToks.map(J).join(' | '); @@ -202,7 +246,7 @@ ${r.leds.map(ledArm).join('\n')} } fn ${r.name}_nud(&mut self) -> Option { let t = self.peek()?; - if Parser::${r.name}_atom(t.kind) { +${tplNud} if Parser::${r.name}_atom(t.kind) { self.pos += 1; return Some(Cst::node(${J(r.name)}, vec![Cst::leaf(t.kind, t.off, t.end)], t.off, t.end)); } @@ -223,7 +267,24 @@ export const rustTarget: Target = { name: 'rust', ext: 'rs', render(ir: ParserIR): string { - const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? prattRule(r) : rdRule(r))).join('\n\n'); + const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? prattRule(r, ir.tpl) : rdRule(r))).join('\n\n'); + const matchTemplate = ir.tpl ? 
` fn match_template(&mut self) -> Option { + let t = self.peek()?; + if t.kind != "$templateHead" { return None; } + let save = self.pos; self.pos += 1; + let mut children: Vec = vec![Cst::leaf("$templateHead", t.off, t.end)]; + loop { + let expr = match self.parse_${ir.tpl.interpRule}() { Some(e) => e, None => { self.pos = save; return None; } }; + children.push(expr); + let next = match self.peek() { Some(x) => x, None => { self.pos = save; return None; } }; + if next.kind == "$templateMiddle" { children.push(Cst::leaf("$templateMiddle", next.off, next.end)); self.pos += 1; continue; } + if next.kind == "$templateTail" { children.push(Cst::leaf("$templateTail", next.off, next.end)); self.pos += 1; break; } + self.pos = save; return None; + } + let o = children[0].offset; let e = children[children.len() - 1].end; + Some(Cst::node("$template", children, o, e)) + } +` : ''; return `// GENERATED by emit-portable.ts (rustTarget) — parser for grammar "${ir.grammarName}". #![allow(non_snake_case)] use std::io::Read; @@ -279,7 +340,7 @@ impl<'a> Parser<'a> { false } -${ruleFns} +${matchTemplate}${ruleFns} } fn write_json(c: &Cst, out: &mut String) { diff --git a/src/target-ts.ts b/src/target-ts.ts index 08acf52..d9014d8 100644 --- a/src/target-ts.ts +++ b/src/target-ts.ts @@ -4,7 +4,7 @@ // index LEDs), and a CST→JSON printer over stdin. It is the reference rendering — its CST // is checked byte-for-byte against the interpreter (createParser), so a divergence in the // portable logic surfaces here before Go/Rust are compiled. 
-import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target } from './emit-portable.ts'; +import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target, TplCfg } from './emit-portable.ts'; const J = (v: unknown) => JSON.stringify(v); const rangeCond = (v: string, rs: CharRange[]) => @@ -41,12 +41,13 @@ function compilePat(p: TokenPattern, defs: string[]): string { return name; } -function scanTok(t: LexTok, defs: string[], rxTok?: string): string { +function scanTok(t: LexTok, defs: string[], rxTok?: string, tplTok?: string): string { const name = (t as { name: string }).name; - const stateful = rxTok !== undefined; - // `emit(...)` threads the regex-context state in stateful mode; a plain push otherwise. + const stateful = rxTok !== undefined || tplTok !== undefined; + if (tplTok !== undefined && name === tplTok) return ''; // template token is scanned by the state machine + // `emit(...)` threads the lexer state in stateful mode; a plain push otherwise. const push = (endExpr: string) => (t.skip ? '' : `${stateful ? 'emit' : 'push'}(${J(name)}, src.slice(pos, ${endExpr}), pos, ${endExpr}); `); - const gate = stateful && name === rxTok ? '!prevIsValue() && ' : ''; + const gate = rxTok !== undefined && name === rxTok ? '!prevIsValue() && ' : ''; if (t.kind === 'run') return ` if (${gate}${rangeCond('c', t.first)}) { let e = pos + 1; while (e < n) { const cc = src.charCodeAt(e); if (!${rangeCond('cc', t.cont)}) break; e++; } @@ -75,12 +76,15 @@ function scanTok(t: LexTok, defs: string[], rxTok?: string): string { function lexer(ir: ParserIR): string { const defs: string[] = []; const rx = ir.regexCtx; - const toks = ir.tokens.map((t) => scanTok(t, defs, rx?.regexToken)).join('\n'); - const pushFn = rx ? 'emit' : 'push'; + const tpl = ir.tpl; + const stateful = !!(rx || tpl); + const toks = ir.tokens.map((t) => scanTok(t, defs, rx?.regexToken, tpl?.token)).join('\n'); + const pushFn = stateful ? 
'emit' : 'push'; const puncts = ir.puncts.map((p) => ` if (src.startsWith(${J(p)}, pos)) { ${pushFn}('', ${J(p)}, pos, pos + ${p.length}); pos += ${p.length}; continue; }`).join('\n'); const set = (a: string[]) => `new Set([${a.map(J).join(', ')}])`; - const stateBlock = rx ? ` let prevText = '', prevKind = '', bpText = '', hasPrev = false, hasPrev2 = false; + // Per-feature pieces of the shared `emit`, so a grammar can have regex, templates, or both. + const rxState = rx ? ` let prevText = '', prevKind = '', bpText = '', hasPrev = false, hasPrev2 = false; const parenHead: boolean[] = []; let lastClose = false, lastBang = false; const _divT = ${set(rx.divisionTexts)}, _divK = ${set(rx.divisionTypes)}, _rxT = ${set(rx.regexTexts)}; @@ -93,22 +97,53 @@ function lexer(ir: ParserIR): string { const isParenHead = prevText === ')' && lastClose; return !isExprKw && !isParenHead && (_divK.has(prevKind) || _divT.has(prevText)); } - function emit(kind: string, text: string, off: number, end: number): void { - if (text === '(') { const isMember = hasPrev2 && _mem.has(bpText); parenHead.push(!isMember && prevKind === IDENT && _phK.has(prevText)); } +` : ''; + const tplState = tpl ? ` const templateStack: number[] = []; + function scanTplSpan(p: number): { interp: boolean; end: number } { + while (p < n) { + if (src.startsWith(${J(tpl.interpOpen)}, p)) return { interp: true, end: p + ${tpl.interpOpen.length} }; + if (src.charCodeAt(p) === 92) { p += 2; continue; } + if (src.startsWith(${J(tpl.open)}, p)) return { interp: false, end: p + ${tpl.open.length} }; + p++; + } + return { interp: false, end: p }; + } +` : ''; + const emitHooks = [ + rx ? ` if (text === '(') { const isMember = hasPrev2 && _mem.has(bpText); parenHead.push(!isMember && prevKind === IDENT && _phK.has(prevText)); } else if (text === ')') { lastClose = parenHead.pop() ?? 
false; } - if (_pav.has(text)) lastBang = prevIsValue(); - toks.push({ kind, text, off, end }); - bpText = prevText; hasPrev2 = hasPrev; prevKind = kind; prevText = text; hasPrev = true; + if (_pav.has(text)) lastBang = prevIsValue();` : '', + tpl ? ` if (templateStack.length > 0) { if (text === ${J(tpl.braceOpen)}) templateStack[templateStack.length - 1]++; else if (text === ${J(tpl.interpClose)}) templateStack[templateStack.length - 1]--; }` : '', + ].filter(Boolean).join('\n'); + const emitTail = rx ? `\n bpText = prevText; hasPrev2 = hasPrev; prevKind = kind; prevText = text; hasPrev = true;` : ''; + const emitFn = stateful ? ` function emit(kind: string, text: string, off: number, end: number): void { +${emitHooks} + toks.push({ kind, text, off, end });${emitTail} } +` : ''; + // Template dispatch runs at the top of the loop, before token/punct scanning. + const tplDispatch = tpl ? ` if (templateStack.length > 0 && src.startsWith(${J(tpl.interpClose)}, pos) && templateStack[templateStack.length - 1] === 0) { + templateStack.pop(); + const sp = scanTplSpan(pos + ${tpl.interpClose.length}); + if (sp.interp) { emit('$templateMiddle', src.slice(pos, sp.end), pos, sp.end); templateStack.push(0); } + else emit('$templateTail', src.slice(pos, sp.end), pos, sp.end); + pos = sp.end; continue; + } + if (src.startsWith(${J(tpl.open)}, pos)) { + const sp = scanTplSpan(pos + ${tpl.open.length}); + if (sp.interp) { emit('$templateHead', src.slice(pos, sp.end), pos, sp.end); templateStack.push(0); } + else emit(${J(tpl.token)}, src.slice(pos, sp.end), pos, sp.end); + pos = sp.end; continue; + } ` : ''; return `${defs.length ? 'let _s = "";\n' + defs.join('\n') + '\n' : ''}function lex(src: string): Tok[] { const toks: Tok[] = []; const n = src.length; let pos = 0; -${defs.length ? ' _s = src;\n' : ''}${stateBlock}${rx ? 
'' : ' const push = (kind: string, text: string, off: number, end: number) => { toks.push({ kind, text, off, end }); };\n'} while (pos < n) { +${defs.length ? ' _s = src;\n' : ''}${rxState}${tplState}${stateful ? emitFn : ' const push = (kind: string, text: string, off: number, end: number) => { toks.push({ kind, text, off, end }); };\n'} while (pos < n) { const c = src.charCodeAt(pos); if (c === 32 || c === 9 || c === 10 || c === 13) { pos++; continue; } -${toks} +${tplDispatch}${toks} ${puncts} throw new Error('lex error at ' + pos + ': ' + JSON.stringify(src[pos])); } @@ -139,7 +174,10 @@ ${r.alts.map(alt).join('\n')} }`; } -function prattRule(r: PrattRule): string { +function prattRule(r: PrattRule, tpl: TplCfg | null): string { + const tplNud = tpl && r.nudToks.includes(tpl.token) + ? ` if (t.kind === '$templateHead') { const node = matchTemplate(); return node === null ? null : { rule: ${J(r.name)}, children: [node], offset: node.offset, end: node.end }; }\n` + : ''; const BIN = `{ ${r.binary.map((b) => `${J(b.op)}: { lbp: ${b.lbp}, rbp: ${b.rbp} }`).join(', ')} }`; const PRE = `{ ${r.prefix.map((p) => `${J(p.op)}: ${p.rbp}`).join(', ')} }`; const atom = `new Set([${r.nudToks.map(J).join(', ')}])`; @@ -178,7 +216,7 @@ ${r.leds.map(ledArm).join('\n')} function ${r.name}_nud(): Node | null { const t = peek(); if (t === null) return null; - if (${r.name}_ATOM.has(t.kind)) { pos++; return { rule: ${J(r.name)}, children: [{ tokenType: t.kind, offset: t.off, end: t.end }], offset: t.off, end: t.end }; } +${tplNud} if (${r.name}_ATOM.has(t.kind)) { pos++; return { rule: ${J(r.name)}, children: [{ tokenType: t.kind, offset: t.off, end: t.end }], offset: t.off, end: t.end }; } ${r.nudBrackets.map(bracketNud).join('\n')} const pbp = ${r.name}_PRE[t.text]; if (pbp !== undefined) { @@ -196,7 +234,26 @@ export const tsTarget: Target = { name: 'typescript', ext: 'ts', render(ir: ParserIR): string { - const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? 
prattRule(r) : rdRule(r))).join('\n\n'); + const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? prattRule(r, ir.tpl) : rdRule(r))).join('\n\n'); + const matchTemplate = ir.tpl ? `function matchTemplate(): Cst | null { + const t = peek(); + if (t === null || t.kind !== '$templateHead') return null; + const children: Cst[] = []; + const save = pos; pos++; + children.push({ tokenType: '$templateHead', offset: t.off, end: t.end }); + for (;;) { + const expr = parse${ir.tpl.interpRule}(); + if (expr === null) { pos = save; return null; } + children.push(expr); + const next = peek(); + if (next === null) { pos = save; return null; } + if (next.kind === '$templateMiddle') { pos++; children.push({ tokenType: '$templateMiddle', offset: next.off, end: next.end }); continue; } + if (next.kind === '$templateTail') { pos++; children.push({ tokenType: '$templateTail', offset: next.off, end: next.end }); break; } + pos = save; return null; + } + return { rule: '$template', children, offset: children[0].offset, end: children[children.length - 1].end }; +} +` : ''; return `// GENERATED by emit-portable.ts (tsTarget) — parser for grammar "${ir.grammarName}". import { readFileSync } from 'node:fs'; @@ -250,7 +307,7 @@ function altLit(opts: [string, string][], kids: Cst[]): boolean { return false; } -${ruleFns} +${matchTemplate}${ruleFns} const src = readFileSync(0, 'utf8'); toks = lex(src); diff --git a/test/portable-targets.ts b/test/portable-targets.ts index bf0e6ea..8de7bb6 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -75,6 +75,19 @@ const CASES: Case[] = [ // (`var ;` is VALID — `var` is an identifier, so it's the expression statement `var;`.) 
reject: ['a / ;', 'if (x /re/;', '/re/', '* a;', 'a = = b;'], }, + { + // STATEFUL template literals: the `${…}` interpolation split (head/middle/tail) with a + // brace-depth stack — adjacent/multiple holes, exprs in holes, nested templates, and a + // nested `{…}` object inside a hole (which must NOT close the hole). + grammar: 'templatejs', path: '../examples/templatejs.ts', + accept: [ + 'var a = `hello`;', 'var b = `hi ${name}!`;', 'var c = `${x}${y}`;', + 'var d = `a${ x + 1 }b${ y * 2 }c`;', 'var e = `outer ${ `inner ${z}` } end`;', + 'var f = `${ {a} }`;', 'var f2 = `${ {a, b} } and ${ c }`;', 'var g = `no holes $ here`;', + 'f(`${a}`, `${b}`);', 'var h = `${a}${b}${c}`;', 'return `${ {x, y} }`;', + ], + reject: ['var x = `${ }`;', 'var y = `${a`;', '`${a} ${}`;', 'tag`${a}`;'], + }, ]; const sortKeys = (o: unknown): unknown => From c99a67d717c2f8e066b985f6e00e55e2a64cb485 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 02:40:52 +0800 Subject: [PATCH 11/27] =?UTF-8?q?emit-portable:=20postfix-token=20Pratt=20?= =?UTF-8?q?LED=20=E2=80=94=20tagged=20templates=20(stage=206=20begins)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first parser-algebra construct toward the real grammar files: a LED whose continuation is a single token, `$ X` (e.g. a tagged template `` tag`…` ``). buildPratt classified LEDs only as binary (`$ op $`) or mixfix-literal (`$ lit …`) and threw on this shape; it now collects such tokens into PrattRule.postfixToks, and each target renders an LED arm that wraps `left X` into a node — tried like a mixfix led (binds tight, no min-bp gate). When the postfix token is the template token the arm also accepts a `$templateHead` and runs matchTemplate, so a tagged template can itself be interpolated. 
examples/templatejs.ts restores `[$, Template]`; the gate now covers `` tag`…` ``, `` String.raw`a${b}c`.length ``, `` x.tag`${y}` `` (tagged after a member) across ts/go/rust (15/15 accept, 3/3 reject per target). Full suite 42/42. --- examples/templatejs.ts | 1 + src/emit-portable.ts | 5 ++++- src/target-go.ts | 12 ++++++++++++ src/target-rust.ts | 6 ++++++ src/target-ts.ts | 7 +++++++ test/portable-targets.ts | 3 ++- 6 files changed, 32 insertions(+), 2 deletions(-) diff --git a/examples/templatejs.ts b/examples/templatejs.ts index e2f2367..8cda83d 100644 --- a/examples/templatejs.ts +++ b/examples/templatejs.ts @@ -40,6 +40,7 @@ const Expr = rule(($) => [ [$, op, $], [$, '(', opt(sep($, ',')), ')'], [$, '.', Ident], + [$, Template], // tagged template — a postfix-token LED ]); const Block = rule(($) => [['{', many(Stmt), '}']]); diff --git a/src/emit-portable.ts b/src/emit-portable.ts index bfdaf0b..cd166a1 100644 --- a/src/emit-portable.ts +++ b/src/emit-portable.ts @@ -62,6 +62,7 @@ export type PrattRule = { prefix: Array<{ op: string; rbp: number }>; // NUD: prefix op then operand at rbp binary: Array<{ op: string; lbp: number; rbp: number }>; // LED: infix op, bind iff lbp > minBp, rhs at rbp leds: Bracket[]; // LED: mixfix continuation (call/member/index), tried before operators + postfixToks: string[]; // LED: a postfix token `$ X` (e.g. a tagged template), tried like a mixfix led }; export type RuleIR = RdRule | PrattRule; @@ -251,6 +252,7 @@ function buildPratt( const nudBrackets: Bracket[] = []; let sawPrefix = false, sawBinary = false; const leds: Bracket[] = []; + const postfixToks: string[] = []; for (const alt of alts) { const items = alt.type === 'seq' ? 
alt.items : [alt]; const startsSelf = items[0].type === 'ref' && items[0].name === name; @@ -265,6 +267,7 @@ function buildPratt( const rest = items.slice(1); if (rest[0].type === 'op') { sawBinary = true; continue; } if (rest[0].type === 'literal') { leds.push({ first: rest[0].value, steps: rest.map((it) => stepOfPratt(it)) }); continue; } + if (rest.length === 1 && rest[0].type === 'ref' && a.tokenNames.has(rest[0].name)) { postfixToks.push(rest[0].name); continue; } // postfix token (tagged template) throw new Error(`portable: Pratt LED shape not in scope (rule ${name})`); } // a self-ref inside a NUD/LED sub-sequence is a fresh parse of this rule @@ -280,5 +283,5 @@ function buildPratt( const binary = sawBinary ? [...a.opTable.entries()].filter(([, info]) => info.position === 'infix').map(([op, info]) => ({ op, lbp: info.lbp, rbp: info.rbp })) : []; - return { kind: 'pratt', name, nudToks, nudBrackets, prefix, binary, leds }; + return { kind: 'pratt', name, nudToks, nudBrackets, prefix, binary, leds, postfixToks }; } diff --git a/src/target-go.ts b/src/target-go.ts index 9d74c46..c60fd41 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -206,6 +206,17 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { \t\t\tif ${b.steps.map(stepCond).join(' && ')} { left = finish(${J(r.name)}, sb, nodes[left].Offset); continue } \t\t\tpos = ledSave; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; break \t\t}`; + const postfixArm = (tok: string) => { + const tplPart = tpl && tok === tpl.token ? 
` +\t\tif t.Kind == "$templateHead" { +\t\t\tnode := matchTemplate() +\t\t\tif node >= 0 { sb := len(scratch); scratch = append(scratch, left, node); left = finish(${J(r.name)}, sb, nodes[left].Offset); continue } +\t\t}` : ''; + return `\t\tif t.Kind == ${J(tok)} { +\t\t\tsb := len(scratch); scratch = append(scratch, left, mkLeaf(t.Kind, t.Off, t.End)); pos++ +\t\t\tleft = finish(${J(r.name)}, sb, nodes[left].Offset); continue +\t\t}${tplPart}`; + }; return `var ${r.name}BIN = map[string]bp{${bin}} var ${r.name}PRE = map[string]int{${pre}} var ${r.name}ATOM = map[string]bool{${atoms}} @@ -217,6 +228,7 @@ func ${r.name}bp(minBp int) int32 { \t\tt := peek() \t\tif t == nil { break } ${r.leds.map(ledArm).join('\n')} +${r.postfixToks.map(postfixArm).join('\n')} \t\tinfo, ok := ${r.name}BIN[t.Text] \t\tif !ok || info.lbp <= minBp { break } \t\tledSave := pos; sb := len(scratch) diff --git a/src/target-rust.ts b/src/target-rust.ts index 51aebf5..0ca28b1 100644 --- a/src/target-rust.ts +++ b/src/target-rust.ts @@ -225,6 +225,11 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { } self.pos = led_save; break; }`; + const postfixArm = (tok: string) => { + const tplPart = tpl && tok === tpl.token ? ` + if t.kind == "$templateHead" { if let Some(n) = self.match_template() { left = node(${J(r.name)}, vec![left, n]); continue; } }` : ''; + return ` if t.kind == ${J(tok)} { self.pos += 1; let leaf = Cst::leaf(t.kind, t.off, t.end); left = node(${J(r.name)}, vec![left, leaf]); continue; }${tplPart}`; + }; return ` fn parse_${r.name}(&mut self) -> Option { self.${r.name}_bp(0) } fn ${r.name}_bin(op: &str) -> Option<(i64, i64)> { match op { ${binArms}${binArms ? ', ' : ''}_ => None } } fn ${r.name}_pre(op: &str) -> Option { match op { ${preArms}${preArms ? 
', ' : ''}_ => None } } @@ -234,6 +239,7 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { loop { let t = match self.peek() { Some(t) => t, None => break }; ${r.leds.map(ledArm).join('\n')} +${r.postfixToks.map(postfixArm).join('\n')} let (lbp, rbp) = match Parser::${r.name}_bin(t.text) { Some(x) => x, None => break }; if lbp <= min_bp { break; } let led_save = self.pos; diff --git a/src/target-ts.ts b/src/target-ts.ts index d9014d8..7d9418e 100644 --- a/src/target-ts.ts +++ b/src/target-ts.ts @@ -191,6 +191,12 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { if (${b.steps.map(stepCond).join(' && ')}) { left = node(${J(r.name)}, kids); continue; } pos = ledSave; break; }`; + // A postfix token (e.g. a tagged template) binds like a mixfix led: `left X` → node(left, X). + const postfixArm = (tok: string) => { + const tplPart = tpl && tok === tpl.token ? ` + if (t.kind === '$templateHead') { const node = matchTemplate(); if (node !== null) { left = { rule: ${J(r.name)}, children: [left, node], offset: left.offset, end: node.end }; continue; } }` : ''; + return ` if (t.kind === ${J(tok)}) { const leaf: Leaf = { tokenType: t.kind, offset: t.off, end: t.end }; pos++; left = { rule: ${J(r.name)}, children: [left, leaf], offset: left.offset, end: leaf.end }; continue; }${tplPart}`; + }; return `const ${r.name}_BIN: Record = ${BIN}; const ${r.name}_PRE: Record = ${PRE}; const ${r.name}_ATOM = ${atom}; @@ -202,6 +208,7 @@ function ${r.name}_bp(minBp: number): Node | null { const t = peek(); if (t === null) break; ${r.leds.map(ledArm).join('\n')} +${r.postfixToks.map(postfixArm).join('\n')} const info = ${r.name}_BIN[t.text]; if (info === undefined || info.lbp <= minBp) break; const ledSave = pos; diff --git a/test/portable-targets.ts b/test/portable-targets.ts index 8de7bb6..2712214 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -85,8 +85,9 @@ const CASES: Case[] = [ 'var d = `a${ x + 1 }b${ y * 2 }c`;', 'var e = 
`outer ${ `inner ${z}` } end`;', 'var f = `${ {a} }`;', 'var f2 = `${ {a, b} } and ${ c }`;', 'var g = `no holes $ here`;', 'f(`${a}`, `${b}`);', 'var h = `${a}${b}${c}`;', 'return `${ {x, y} }`;', + 'tag`hello`;', 'tag`${a}${b}`;', 'String.raw`a${b}c`.length;', 'x.tag`${y}`;', // tagged (postfix-token LED) ], - reject: ['var x = `${ }`;', 'var y = `${a`;', '`${a} ${}`;', 'tag`${a}`;'], + reject: ['var x = `${ }`;', 'var y = `${a`;', '`${a} ${}`;'], }, ]; From 65498edbd4258ebd6a292aec9b3803cbf5966c10 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 02:49:39 +0800 Subject: [PATCH 12/27] emit-portable: general (non-literal) inline alt in all three targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit buildIR only accepted an inline `alt(...)` whose every branch was a literal (the altlit fast path) and threw otherwise — the first parser-algebra construct javascript.ts hits. It now compiles a non-literal alternation into an `alt` step whose branches are full sub-sequences, rendered as a backtracking try-each: each branch saves the position (and the arena lengths) and restores them on failure before the next branch. Rendered as an immediately-applied closure in every target (Go needs `;` between the consecutive block statements; Rust reuses the closure body in both the top-level and in-closure step contexts). examples/altjs.ts (object keys are `alt(Ident | Str | Number)`) verifies it across ts/go/rust — 9/9 accept, 4/4 reject per target, byte-identical to createParser. Full suite 42/42. With this, javascript.ts clears the inline-alt wall and advances to the next parser construct (a Pratt NUD shape). 
--- examples/altjs.ts | 37 +++++++++++++++++++++++++++++++++++++ src/emit-portable.ts | 11 +++++------ src/target-go.ts | 1 + src/target-rust.ts | 7 +++++++ src/target-ts.ts | 1 + test/portable-targets.ts | 10 ++++++++++ 6 files changed, 61 insertions(+), 6 deletions(-) create mode 100644 examples/altjs.ts diff --git a/examples/altjs.ts b/examples/altjs.ts new file mode 100644 index 0000000..d1f117d --- /dev/null +++ b/examples/altjs.ts @@ -0,0 +1,37 @@ +// Exercises the portable parser's general inline `alt(...)` of NON-literals (the first +// parser-algebra construct javascript.ts needs that buildIR previously rejected). Object +// keys are `alt(Ident, Str, Number)` — a backtracking alternation of token references +// inside a rule sequence, not the all-literal fast path. +import { + token, rule, defineGrammar, left, op, + seq, oneOf, range, star, sep, opt, many, alt, noneOf, +} from '../src/api.ts'; + +const digit = range('0', '9'); +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); + +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(digit, star(digit)), { scope: 'constant.numeric' }); +const Str = token(seq('"', star(noneOf('"', '\n')), '"'), { scope: 'string.quoted.double' }); + +const jsPrec = [left('+', '-'), left('*', '/')]; + +// key = a NON-literal inline alternation (Ident | Str | Number). 
+const KeyVal = rule(($) => [[alt(Ident, Str, Number_), ':', Expr]]); +const Expr = rule(($) => [ + Number_, Str, Ident, + ['(', $, ')'], + ['{', opt(sep(KeyVal, ',')), '}'], // object literal + [$, op, $], +]); +const Stmt = rule(($) => [[Expr, ';']]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'altjs', + scopeName: 'source.altjs', + tokens: { Ident, Number: Number_, Str }, + prec: jsPrec, + rules: { KeyVal, Expr, Stmt, Program }, +}); diff --git a/src/emit-portable.ts b/src/emit-portable.ts index cd166a1..b773b9f 100644 --- a/src/emit-portable.ts +++ b/src/emit-portable.ts @@ -49,7 +49,8 @@ export type Step = | { t: 'star'; step: Step } // repeat inner 0+ | { t: 'opt'; steps: Step[] } // optional sub-sequence | { t: 'sep'; elem: Step; delim: string } // elem (delim elem)* - | { t: 'altlit'; opts: Lit[] }; // inline alternation of literals + | { t: 'altlit'; opts: Lit[] } // inline alternation of literals (fast path) + | { t: 'alt'; branches: Step[][] }; // inline alternation of sub-sequences (backtracking) export type Alt = Step[]; export type RdRule = { kind: 'rd'; name: string; alts: Alt[] }; @@ -142,12 +143,10 @@ function buildIR(grammar: CstGrammar): ParserIR { if (e.kind === '+') throw new Error("portable: '+' not yet modeled (use '*')"); break; case 'alt': { - const opts: Lit[] = []; - for (const it of e.items) { - if (it.type !== 'literal') throw new Error('portable: inline alt must be all literals'); - opts.push({ value: it.value, ttype: litTtype(it.value) }); + if (e.items.every((it) => it.type === 'literal')) { // fast path: all-literal alternation + return { t: 'altlit', opts: e.items.map((it) => ({ value: (it as { value: string }).value, ttype: litTtype((it as { value: string }).value) })) }; } - return { t: 'altlit', opts }; + return { t: 'alt', branches: e.items.map(altSteps) }; // general: backtracking over sub-sequences } } throw new Error(`portable: rd construct '${e.type}' not in scope`); diff --git 
a/src/target-go.ts b/src/target-go.ts index c60fd41..e9bfb03 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -169,6 +169,7 @@ function stepCond(s: Step): string { case 'opt': return `opt(func() bool { return ${s.steps.map(stepCond).join(' && ')} })`; case 'sep': return `sepBy(func() bool { return ${stepCond(s.elem)} }, ${J(s.delim)})`; case 'altlit': return `altLit([][2]string{${s.opts.map((o) => `{${J(o.value)}, ${J(o.ttype)}}`).join(', ')}})`; + case 'alt': return `func() bool { ${s.branches.map((br) => `{ save := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if ${br.length ? br.map(stepCond).join(' && ') : 'true'} { return true }; pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] }`).join('; ')}; return false }()`; } } diff --git a/src/target-rust.ts b/src/target-rust.ts index 0ca28b1..381a1e3 100644 --- a/src/target-rust.ts +++ b/src/target-rust.ts @@ -178,8 +178,14 @@ function stepCond(s: Step): string { case 'opt': return `self.opt(|p, k| ${s.steps.map(stepCondP).join(' && ')}, &mut kids)`; case 'sep': return `self.sep_by(|p, k| ${stepCondP(s.elem)}, ${J(s.delim)}, &mut kids)`; case 'altlit': return `self.alt_lit(&[${s.opts.map((o) => `(${J(o.value)}, ${J(o.ttype)})`).join(', ')}], &mut kids)`; + case 'alt': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${altBody(s.branches)} })(self, &mut kids)`; } } +// A backtracking inline alternation rendered as an immediately-applied closure over (p, k), +// so it composes identically whether it sits at top level or already inside a closure. +function altBody(branches: Step[][]): string { + return `${branches.map((br) => `{ let sp = p.pos; let bk = k.len(); if ${br.length ? br.map(stepCondP).join(' && ') : 'true'} { return true; } p.pos = sp; k.truncate(bk); }`).join(' ')} false`; +} // Inside a closure: uses `p` and `k`. 
function stepCondP(s: Step): string { switch (s.t) { @@ -190,6 +196,7 @@ function stepCondP(s: Step): string { case 'opt': return `p.opt(|p, k| ${s.steps.map(stepCondP).join(' && ')}, k)`; case 'sep': return `p.sep_by(|p, k| ${stepCondP(s.elem)}, ${J(s.delim)}, k)`; case 'altlit': return `p.alt_lit(&[${s.opts.map((o) => `(${J(o.value)}, ${J(o.ttype)})`).join(', ')}], k)`; + case 'alt': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${altBody(s.branches)} })(p, k)`; } } diff --git a/src/target-ts.ts b/src/target-ts.ts index 7d9418e..8a4cc50 100644 --- a/src/target-ts.ts +++ b/src/target-ts.ts @@ -161,6 +161,7 @@ function stepCond(s: Step): string { case 'opt': return `opt(() => ${s.steps.map(stepCond).join(' && ')}, kids)`; case 'sep': return `sepBy(() => ${stepCond(s.elem)}, ${J(s.delim)}, kids)`; case 'altlit': return `altLit([${s.opts.map((o) => `[${J(o.value)}, ${J(o.ttype)}]`).join(', ')}], kids)`; + case 'alt': return `(() => { ${s.branches.map((br) => `{ const sp = pos; const bk = kids.length; if (${br.length ? br.map(stepCond).join(' && ') : 'true'}) return true; pos = sp; kids.length = bk; }`).join(' ')} return false; })()`; } } diff --git a/test/portable-targets.ts b/test/portable-targets.ts index 2712214..cf3d881 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -89,6 +89,16 @@ const CASES: Case[] = [ ], reject: ['var x = `${ }`;', 'var y = `${a`;', '`${a} ${}`;'], }, + { + // General (non-literal) inline alt: object keys are alt(Ident | Str | Number) — a + // backtracking alternation of token refs inside a rule sequence. 
+ grammar: 'altjs', path: '../examples/altjs.ts', + accept: [ + '{a: 1};', '{"k": 2};', '{1: x};', '{a: 1, "b": 2, 3: c};', '{x: 1 + 2 * 3};', + '({nested: {inner: 1}});', '{};', 'a + b;', '{k: (1 + 2)};', + ], + reject: ['{a:};', '{: 1};', '{a 1};', '{a: 1,, b: 2};'], + }, ]; const sortKeys = (o: unknown): unknown => From 8fc593a8c06ca3a979dd7127ee3bc935629372d3 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 03:02:35 +0800 Subject: [PATCH 13/27] emit-portable: `not` step + general Pratt NUD sequences MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two coupled parser-algebra constructs, the next javascript.ts wall after inline-alt: - A `not` step — zero-width negative lookahead: try the inner steps, restore the position (and arena/kids) unconditionally, succeed iff they did NOT match. Rendered as an immediately-applied closure in every target (Rust shares one body across the two step contexts, like `alt`). - General Pratt NUD sequences (PrattRule.nudSeqs) — a NUD that is neither a bare token, a prefix op, nor a literal-led bracket: a backtracking try-each sequence producing a node. Covers the reserved-word-guarded identifier (`not(kw)… Ident`) and the quantifier-first class expression (`Decorator? class Ident? … { … }`). A single transparent group unwraps to its body; a group carrying capBelow/ctxMode/suppress (arrow functions, await/yield context) is explicitly deferred with a clear message. examples/nudjs.ts verifies both across ts/go/rust — 11/11 accept, 4/4 reject per target, byte-identical to createParser. Full suite 42/42. javascript.ts now clears the NUD wall and advances to the next construct (a Pratt LED shape). 
--- examples/nudjs.ts | 41 ++++++++++++++++++++++++++++++++++++++++ src/emit-portable.ts | 22 ++++++++++++++++++--- src/target-go.ts | 2 ++ src/target-rust.ts | 7 +++++++ src/target-ts.ts | 2 ++ test/portable-targets.ts | 11 +++++++++++ 6 files changed, 82 insertions(+), 3 deletions(-) create mode 100644 examples/nudjs.ts diff --git a/examples/nudjs.ts b/examples/nudjs.ts new file mode 100644 index 0000000..a443b7b --- /dev/null +++ b/examples/nudjs.ts @@ -0,0 +1,41 @@ +// Exercises two general Pratt NUD shapes javascript.ts needs (beyond bare-token / prefix / +// bracket): a reserved-word-GUARDED identifier `[not(kw)… Ident]` (zero-width negative +// lookahead before a token) and a quantifier-first NUD `[Decorator? "class" Ident? …]` (a +// class expression). Both compile to a general backtracking NUD sequence; the `not` step +// consumes nothing. (Arrow functions — group{capBelow,ctxMode} — are deferred.) +import { + token, rule, defineGrammar, left, op, + seq, oneOf, range, star, sep, opt, many, alt, not, noneOf, +} from '../src/api.ts'; + +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(range('0', '9'), star(range('0', '9'))), { scope: 'constant.numeric' }); +const Decorator = token(seq('@', idStart, star(idCont)), { scope: 'meta.decorator' }); + +const reserved = alt('if', 'else', 'while', 'return', 'class', 'new', 'extends'); + +const Expr = rule(($) => [ + Number_, + [not(reserved), Ident], // reserved-word-guarded identifier + [opt(Decorator), 'class', opt(Ident), opt('extends', $), '{', many(ClassMember), '}'], // class expr (quantifier-first NUD) + ['new', $], // literal-led NUD (bracket) + ['(', $, ')'], + [$, op, $], + [$, '.', Ident], + [$, '(', opt(sep($, ',')), ')'], +]); +const ClassMember = rule(($) => [[opt(Decorator), Ident, '(', 
')', '{', '}']]); + +const jsPrec = [left('+', '-'), left('*', '/')]; +const Stmt = rule(($) => [[Expr, ';']]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'nudjs', + scopeName: 'source.nudjs', + tokens: { Decorator, Ident, Number: Number_ }, + prec: jsPrec, + rules: { Expr, ClassMember, Stmt, Program }, +}); diff --git a/src/emit-portable.ts b/src/emit-portable.ts index b773b9f..28a5196 100644 --- a/src/emit-portable.ts +++ b/src/emit-portable.ts @@ -50,7 +50,8 @@ export type Step = | { t: 'opt'; steps: Step[] } // optional sub-sequence | { t: 'sep'; elem: Step; delim: string } // elem (delim elem)* | { t: 'altlit'; opts: Lit[] } // inline alternation of literals (fast path) - | { t: 'alt'; branches: Step[][] }; // inline alternation of sub-sequences (backtracking) + | { t: 'alt'; branches: Step[][] } // inline alternation of sub-sequences (backtracking) + | { t: 'not'; steps: Step[] }; // zero-width negative lookahead (consumes nothing) export type Alt = Step[]; export type RdRule = { kind: 'rd'; name: string; alts: Alt[] }; @@ -60,6 +61,7 @@ export type PrattRule = { name: string; nudToks: string[]; // NUD: a bare token wrapped in a node nudBrackets: Bracket[]; // NUD: '(' … ')' / '[' … ']' + nudSeqs: Step[][]; // NUD: a general sequence (guarded ident, class expr), tried with backtracking prefix: Array<{ op: string; rbp: number }>; // NUD: prefix op then operand at rbp binary: Array<{ op: string; lbp: number; rbp: number }>; // LED: infix op, bind iff lbp > minBp, rhs at rbp leds: Bracket[]; // LED: mixfix continuation (call/member/index), tried before operators @@ -136,6 +138,7 @@ function buildIR(grammar: CstGrammar): ParserIR { case 'literal': return { t: 'lit', value: e.value, ttype: litTtype(e.value) }; case 'ref': return tokenNames.has(e.name) ? 
{ t: 'tok', name: e.name } : { t: 'rule', name: e.name }; case 'group': { const ss = altSteps(e.body); if (ss.length !== 1) throw new Error('portable: group must reduce to a single step'); return ss[0]; } + case 'not': return { t: 'not', steps: altSteps(e.body) }; // zero-width negative lookahead case 'sep': return { t: 'sep', elem: stepOf(e.element), delim: e.delimiter }; case 'quantifier': if (e.kind === '*') return { t: 'star', step: stepOf(e.body) }; @@ -249,6 +252,7 @@ function buildPratt( const alts = body.type === 'alt' ? body.items : [body]; const nudToks: string[] = []; const nudBrackets: Bracket[] = []; + const nudSeqs: Step[][] = []; let sawPrefix = false, sawBinary = false; const leds: Bracket[] = []; const postfixToks: string[] = []; @@ -260,7 +264,17 @@ function buildPratt( if (items.length === 1 && items[0].type === 'ref' && a.tokenNames.has(items[0].name)) { nudToks.push(items[0].name); continue; } if (items[0].type === 'prefix') { sawPrefix = true; continue; } if (items[0].type === 'literal') { nudBrackets.push({ first: items[0].value, steps: items.map((it) => stepOfPratt(it)) }); continue; } - throw new Error(`portable: Pratt NUD shape not in scope (rule ${name})`); + // A single transparent group unwraps to its body (an explicit grouping of the NUD sequence). + let nudItems = items; + if (items.length === 1 && items[0].type === 'group' && !items[0].capBelow && !items[0].ctxMode && !items[0].suppress) { + nudItems = items[0].body.type === 'seq' ? items[0].body.items : [items[0].body]; + } + // capBelow / ctxMode (arrow functions, await/yield context) are a deeper construct — defer. 
+ if (nudItems.some((it) => it.type === 'group' && (it.capBelow || it.ctxMode || it.suppress))) { + throw new Error(`portable: Pratt NUD with capBelow/ctxMode/suppress not yet in scope (rule ${name}) — arrow functions etc.`); + } + nudSeqs.push(nudItems.map((it) => stepOfPratt(it))); // general NUD sequence (guarded ident, class expr) + continue; } // LED (starts with self): `$ op $` (binary, op slot + trailing self) or `$ …` (mixfix) const rest = items.slice(1); @@ -272,6 +286,8 @@ function buildPratt( // a self-ref inside a NUD/LED sub-sequence is a fresh parse of this rule function stepOfPratt(e: RuleExpr): Step { if (e.type === 'ref' && e.name === name) return { t: 'rule', name }; + if (e.type === 'not') return { t: 'not', steps: (e.body.type === 'seq' ? e.body.items : [e.body]).map(stepOfPratt) }; + if (e.type === 'group' && !e.capBelow && !e.ctxMode && !e.suppress && e.body.type !== 'seq') return stepOfPratt(e.body); if (e.type === 'sep') return { t: 'sep', elem: stepOfPratt(e.element), delim: e.delimiter }; if (e.type === 'quantifier' && e.kind === '?') return { t: 'opt', steps: (e.body.type === 'seq' ? e.body.items : [e.body]).map(stepOfPratt) }; if (e.type === 'quantifier' && e.kind === '*') return { t: 'star', step: stepOfPratt(e.body) }; @@ -282,5 +298,5 @@ function buildPratt( const binary = sawBinary ? 
[...a.opTable.entries()].filter(([, info]) => info.position === 'infix').map(([op, info]) => ({ op, lbp: info.lbp, rbp: info.rbp })) : []; - return { kind: 'pratt', name, nudToks, nudBrackets, prefix, binary, leds, postfixToks }; + return { kind: 'pratt', name, nudToks, nudBrackets, nudSeqs, prefix, binary, leds, postfixToks }; } diff --git a/src/target-go.ts b/src/target-go.ts index e9bfb03..73357fd 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -170,6 +170,7 @@ function stepCond(s: Step): string { case 'sep': return `sepBy(func() bool { return ${stepCond(s.elem)} }, ${J(s.delim)})`; case 'altlit': return `altLit([][2]string{${s.opts.map((o) => `{${J(o.value)}, ${J(o.ttype)}}`).join(', ')}})`; case 'alt': return `func() bool { ${s.branches.map((br) => `{ save := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if ${br.length ? br.map(stepCond).join(' && ') : 'true'} { return true }; pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] }`).join('; ')}; return false }()`; + case 'not': return `func() bool { save := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); m := ${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'}; pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return !m }()`; } } @@ -258,6 +259,7 @@ ${r.nudBrackets.map(bracketNud).join('\n')} \t\tscratch = append(scratch, operand) \t\treturn finish(${J(r.name)}, sb, t.Off) \t} +${r.nudSeqs.map((seq) => `\t{ save := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if ${seq.length ? 
seq.map(stepCond).join(' && ') : 'true'} { return finish(${J(r.name)}, sb, offAt(save)) }; pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] }`).join('\n')} \treturn -1 }`; } diff --git a/src/target-rust.ts b/src/target-rust.ts index 381a1e3..9e253ff 100644 --- a/src/target-rust.ts +++ b/src/target-rust.ts @@ -179,6 +179,7 @@ function stepCond(s: Step): string { case 'sep': return `self.sep_by(|p, k| ${stepCondP(s.elem)}, ${J(s.delim)}, &mut kids)`; case 'altlit': return `self.alt_lit(&[${s.opts.map((o) => `(${J(o.value)}, ${J(o.ttype)})`).join(', ')}], &mut kids)`; case 'alt': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${altBody(s.branches)} })(self, &mut kids)`; + case 'not': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${notBody(s.steps)} })(self, &mut kids)`; } } // A backtracking inline alternation rendered as an immediately-applied closure over (p, k), @@ -186,6 +187,10 @@ function stepCond(s: Step): string { function altBody(branches: Step[][]): string { return `${branches.map((br) => `{ let sp = p.pos; let bk = k.len(); if ${br.length ? br.map(stepCondP).join(' && ') : 'true'} { return true; } p.pos = sp; k.truncate(bk); }`).join(' ')} false`; } +// Zero-width negative lookahead: try the steps, restore, succeed iff they did NOT all match. +function notBody(steps: Step[]): string { + return `let sp = p.pos; let bk = k.len(); let m = ${steps.length ? steps.map(stepCondP).join(' && ') : 'true'}; p.pos = sp; k.truncate(bk); !m`; +} // Inside a closure: uses `p` and `k`. 
function stepCondP(s: Step): string { switch (s.t) { @@ -197,6 +202,7 @@ function stepCondP(s: Step): string { case 'sep': return `p.sep_by(|p, k| ${stepCondP(s.elem)}, ${J(s.delim)}, k)`; case 'altlit': return `p.alt_lit(&[${s.opts.map((o) => `(${J(o.value)}, ${J(o.ttype)})`).join(', ')}], k)`; case 'alt': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${altBody(s.branches)} })(p, k)`; + case 'not': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${notBody(s.steps)} })(p, k)`; } } @@ -272,6 +278,7 @@ ${r.nudBrackets.map(bracketNud).join('\n')} None => { self.pos = save; return None; } } } +${r.nudSeqs.map((seq) => ` { let save = self.pos; let mut kids: Vec = Vec::new(); if ${seq.length ? seq.map(stepCond).join(' && ') : 'true'} { return Some(self.branch(${J(r.name)}, kids, save)); } self.pos = save; }`).join('\n')} None }`; } diff --git a/src/target-ts.ts b/src/target-ts.ts index 8a4cc50..f394f54 100644 --- a/src/target-ts.ts +++ b/src/target-ts.ts @@ -162,6 +162,7 @@ function stepCond(s: Step): string { case 'sep': return `sepBy(() => ${stepCond(s.elem)}, ${J(s.delim)}, kids)`; case 'altlit': return `altLit([${s.opts.map((o) => `[${J(o.value)}, ${J(o.ttype)}]`).join(', ')}], kids)`; case 'alt': return `(() => { ${s.branches.map((br) => `{ const sp = pos; const bk = kids.length; if (${br.length ? br.map(stepCond).join(' && ') : 'true'}) return true; pos = sp; kids.length = bk; }`).join(' ')} return false; })()`; + case 'not': return `(() => { const sp = pos; const bk = kids.length; const m = ${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'}; pos = sp; kids.length = bk; return !m; })()`; } } @@ -234,6 +235,7 @@ ${r.nudBrackets.map(bracketNud).join('\n')} if (operand === null) { pos = save; return null; } return { rule: ${J(r.name)}, children: [opLeaf, operand], offset: t.off, end: operand.end }; } +${r.nudSeqs.map((seq) => ` { const save = pos; const kids: Cst[] = []; if (${seq.length ? 
seq.map(stepCond).join(' && ') : 'true'}) return branch(${J(r.name)}, kids, save); pos = save; }`).join('\n')} return null; }`; } diff --git a/test/portable-targets.ts b/test/portable-targets.ts index cf3d881..429bd53 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -99,6 +99,17 @@ const CASES: Case[] = [ ], reject: ['{a:};', '{: 1};', '{a 1};', '{a: 1,, b: 2};'], }, + { + // General Pratt NUD sequences: a reserved-word-guarded identifier (`not(kw)… Ident`, + // a zero-width negative lookahead) and a quantifier-first class expression. + grammar: 'nudjs', path: '../examples/nudjs.ts', + accept: [ + 'x;', 'foo + bar;', 'class C {};', 'class {};', 'class C extends B {};', + '@dec class C { m(){} };', 'new Foo;', 'new C();', 'a.b.c;', + 'class C { @x m(){} n(){} };', 'x + class {} + y;', + ], + reject: ['if;', 'class;', 'new;', 'return + 1;'], // reserved words can't be bare identifiers + }, ]; const sortKeys = (o: unknown): unknown => From 22cfc5e5b7a5c63df1605ac28924275973943be1 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 03:21:27 +0800 Subject: [PATCH 14/27] emit-portable: postfix-operator Pratt LED + access-tail closure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The next javascript.ts construct after the NUD cluster: a postfix operator LED `[$, postfix]` (`x++`, `x--`) — consume the operator, no right operand, bind iff lbp > minBp. With it comes the access-tail CLOSURE that makes it correct: once a postfix binds, the operand is an update expression, so a further postfix or an access tail (`.x`, `[i]`, `(…)`, a tagged template) can no longer attach. The led loop now threads a `tailClosed` flag — set by a postfix, gating both further postfixes and the access-tail leds. 
An access-tail led is detected structurally (buildPratt): a led whose last step is not a fresh same-rule operand (closed, not an open binary/ternary) and whose connector is a punctuator, not a word operator — so `in`/`instanceof`/`?:` still bind after `a++`. examples/postjs.ts verifies it across ts/go/rust: `a++--`, `a++.b`, `a++ ++` are rejected, `(a++).b` and `x.y.z++` accepted — 11/11 accept, 4/4 reject per target, byte-identical to createParser. Full suite 42/42. javascript.ts now clears the LED wall and advances to the next construct (a nested `seq` rd step). --- examples/postjs.ts | 38 ++++++++++++++++++++++++++++++++++++++ src/emit-portable.ts | 23 +++++++++++++++++++---- src/target-go.ts | 15 +++++++++++---- src/target-rust.ts | 12 ++++++++---- src/target-ts.ts | 16 +++++++++++----- test/portable-targets.ts | 11 +++++++++++ 6 files changed, 98 insertions(+), 17 deletions(-) create mode 100644 examples/postjs.ts diff --git a/examples/postjs.ts b/examples/postjs.ts new file mode 100644 index 0000000..239fdec --- /dev/null +++ b/examples/postjs.ts @@ -0,0 +1,38 @@ +// Exercises the postfix-operator Pratt LED `[$, postfix]` (e.g. `x++`, `x--`) — a LED that +// consumes the operator and no right operand, binding tight. `++`/`--` are BOTH prefix (NUD, +// `++x`) and postfix (LED, `x++`); the engine resolves them by position. 
+import { + token, rule, defineGrammar, left, right, op, prefix, postfix, + seq, oneOf, range, star, many, +} from '../src/api.ts'; + +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(range('0', '9'), star(range('0', '9'))), { scope: 'constant.numeric' }); + +const jsPrec = [ + left('+', '-'), + left('*', '/'), + right(prefix('-', '!', '++', '--')), + left(postfix('++', '--')), +]; + +const Expr = rule(($) => [ + Number_, Ident, + ['(', $, ')'], + [prefix, $], + [$, op, $], + [$, '.', Ident], + [$, postfix], // postfix operator LED +]); +const Stmt = rule(($) => [[Expr, ';']]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'postjs', + scopeName: 'source.postjs', + tokens: { Ident, Number: Number_ }, + prec: jsPrec, + rules: { Expr, Stmt, Program }, +}); diff --git a/src/emit-portable.ts b/src/emit-portable.ts index 28a5196..1b642c8 100644 --- a/src/emit-portable.ts +++ b/src/emit-portable.ts @@ -65,7 +65,9 @@ export type PrattRule = { prefix: Array<{ op: string; rbp: number }>; // NUD: prefix op then operand at rbp binary: Array<{ op: string; lbp: number; rbp: number }>; // LED: infix op, bind iff lbp > minBp, rhs at rbp leds: Bracket[]; // LED: mixfix continuation (call/member/index), tried before operators - postfixToks: string[]; // LED: a postfix token `$ X` (e.g. a tagged template), tried like a mixfix led + ledAccessTail: boolean[]; // parallel to leds: a "closed punct-connector" tail (member/call/index) — disabled once a postfix binds + postfixToks: string[]; // LED: a postfix token `$ X` (e.g. 
a tagged template), tried like a mixfix led (also an access tail) + postfix: Array<{ op: string; lbp: number }>; // LED: a postfix operator `$ ++` — binds iff lbp > minBp + !tailClosed, no rhs, closes the tail }; export type RuleIR = RdRule | PrattRule; @@ -253,8 +255,9 @@ function buildPratt( const nudToks: string[] = []; const nudBrackets: Bracket[] = []; const nudSeqs: Step[][] = []; - let sawPrefix = false, sawBinary = false; + let sawPrefix = false, sawBinary = false, sawPostfix = false; const leds: Bracket[] = []; + const ledAccessTail: boolean[] = []; const postfixToks: string[] = []; for (const alt of alts) { const items = alt.type === 'seq' ? alt.items : [alt]; @@ -279,7 +282,16 @@ function buildPratt( // LED (starts with self): `$ op $` (binary, op slot + trailing self) or `$ …` (mixfix) const rest = items.slice(1); if (rest[0].type === 'op') { sawBinary = true; continue; } - if (rest[0].type === 'literal') { leds.push({ first: rest[0].value, steps: rest.map((it) => stepOfPratt(it)) }); continue; } + if (rest[0].type === 'postfix') { sawPostfix = true; continue; } // postfix operator (`x++`) + if (rest[0].type === 'literal') { + const steps = rest.map((it) => stepOfPratt(it)); + const last = steps[steps.length - 1]; + const lastIsOperand = last !== undefined && last.t === 'rule' && last.name === name; // open binary/ternary operand + const wordConnector = /^[A-Za-z]/.test(rest[0].value); // `in`/`instanceof`/`as` — not a tail + leds.push({ first: rest[0].value, steps }); + ledAccessTail.push(!lastIsOperand && !wordConnector); + continue; + } if (rest.length === 1 && rest[0].type === 'ref' && a.tokenNames.has(rest[0].name)) { postfixToks.push(rest[0].name); continue; } // postfix token (tagged template) throw new Error(`portable: Pratt LED shape not in scope (rule ${name})`); } @@ -298,5 +310,8 @@ function buildPratt( const binary = sawBinary ? 
[...a.opTable.entries()].filter(([, info]) => info.position === 'infix').map(([op, info]) => ({ op, lbp: info.lbp, rbp: info.rbp })) : []; - return { kind: 'pratt', name, nudToks, nudBrackets, nudSeqs, prefix, binary, leds, postfixToks }; + const postfix = sawPostfix + ? [...a.opTable.entries()].filter(([, info]) => info.position === 'postfix').map(([op, info]) => ({ op, lbp: info.lbp })) + : []; + return { kind: 'pratt', name, nudToks, nudBrackets, nudSeqs, prefix, binary, leds, ledAccessTail, postfixToks, postfix }; } diff --git a/src/target-go.ts b/src/target-go.ts index 73357fd..6c04e9e 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -202,7 +202,7 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { \t\tif ${b.steps.map(stepCond).join(' && ')} { return finish(${J(r.name)}, sb, t.Off) } \t\tpos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return -1 \t}`; - const ledArm = (b: Bracket) => `\t\tif t.Text == ${J(b.first)} { + const ledArm = (b: Bracket, accessTail: boolean) => `\t\tif ${accessTail ? '!tailClosed && ' : ''}t.Text == ${J(b.first)} { \t\t\tledSave := pos; sb := len(scratch); nb := len(nodes); kb := len(kids) \t\t\tscratch = append(scratch, left) \t\t\tif ${b.steps.map(stepCond).join(' && ')} { left = finish(${J(r.name)}, sb, nodes[left].Offset); continue } @@ -210,27 +210,34 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { \t\t}`; const postfixArm = (tok: string) => { const tplPart = tpl && tok === tpl.token ? 
` -\t\tif t.Kind == "$templateHead" { +\t\tif !tailClosed && t.Kind == "$templateHead" { \t\t\tnode := matchTemplate() \t\t\tif node >= 0 { sb := len(scratch); scratch = append(scratch, left, node); left = finish(${J(r.name)}, sb, nodes[left].Offset); continue } \t\t}` : ''; - return `\t\tif t.Kind == ${J(tok)} { + return `\t\tif !tailClosed && t.Kind == ${J(tok)} { \t\t\tsb := len(scratch); scratch = append(scratch, left, mkLeaf(t.Kind, t.Off, t.End)); pos++ \t\t\tleft = finish(${J(r.name)}, sb, nodes[left].Offset); continue \t\t}${tplPart}`; }; + const post = r.postfix.map((p) => `${J(p.op)}: ${p.lbp}`).join(', '); return `var ${r.name}BIN = map[string]bp{${bin}} var ${r.name}PRE = map[string]int{${pre}} +var ${r.name}POST = map[string]int{${post}} var ${r.name}ATOM = map[string]bool{${atoms}} func parse${r.name}() int32 { return ${r.name}bp(0) } func ${r.name}bp(minBp int) int32 { \tleft := ${r.name}nud() \tif left < 0 { return -1 } +\ttailClosed := false \tfor { \t\tt := peek() \t\tif t == nil { break } -${r.leds.map(ledArm).join('\n')} +${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i])).join('\n')} ${r.postfixToks.map(postfixArm).join('\n')} +\t\tif post, ok := ${r.name}POST[t.Text]; ok && !tailClosed && post > minBp { +\t\t\tsb := len(scratch); scratch = append(scratch, left, mkLeaf("$operator", t.Off, t.End)); pos++; tailClosed = true +\t\t\tleft = finish(${J(r.name)}, sb, nodes[left].Offset); continue +\t\t} \t\tinfo, ok := ${r.name}BIN[t.Text] \t\tif !ok || info.lbp <= minBp { break } \t\tledSave := pos; sb := len(scratch) diff --git a/src/target-rust.ts b/src/target-rust.ts index 9e253ff..9f14ab8 100644 --- a/src/target-rust.ts +++ b/src/target-rust.ts @@ -230,7 +230,7 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { if ${b.steps.map(stepCond).join(' && ')} { return Some(node(${J(r.name)}, kids)); } self.pos = save; return None; }`; - const ledArm = (b: Bracket) => ` if t.text == ${J(b.first)} { + const ledArm = (b: Bracket, 
accessTail: boolean) => ` if ${accessTail ? '!tail_closed && ' : ''}t.text == ${J(b.first)} { let led_save = self.pos; let mut kids: Vec = Vec::new(); if ${b.steps.map(stepCond).join(' && ')} { let mut full = vec![left]; full.append(&mut kids); @@ -240,19 +240,23 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { }`; const postfixArm = (tok: string) => { const tplPart = tpl && tok === tpl.token ? ` - if t.kind == "$templateHead" { if let Some(n) = self.match_template() { left = node(${J(r.name)}, vec![left, n]); continue; } }` : ''; - return ` if t.kind == ${J(tok)} { self.pos += 1; let leaf = Cst::leaf(t.kind, t.off, t.end); left = node(${J(r.name)}, vec![left, leaf]); continue; }${tplPart}`; + if !tail_closed && t.kind == "$templateHead" { if let Some(n) = self.match_template() { left = node(${J(r.name)}, vec![left, n]); continue; } }` : ''; + return ` if !tail_closed && t.kind == ${J(tok)} { self.pos += 1; let leaf = Cst::leaf(t.kind, t.off, t.end); left = node(${J(r.name)}, vec![left, leaf]); continue; }${tplPart}`; }; + const postArms = r.postfix.map((p) => `${J(p.op)} => Some(${p.lbp})`).join(', '); return ` fn parse_${r.name}(&mut self) -> Option { self.${r.name}_bp(0) } fn ${r.name}_bin(op: &str) -> Option<(i64, i64)> { match op { ${binArms}${binArms ? ', ' : ''}_ => None } } fn ${r.name}_pre(op: &str) -> Option { match op { ${preArms}${preArms ? ', ' : ''}_ => None } } + fn ${r.name}_post(op: &str) -> Option { match op { ${postArms}${postArms ? 
', ' : ''}_ => None } } fn ${r.name}_atom(kind: &str) -> bool { matches!(kind, ${atomArm || '""'}) } fn ${r.name}_bp(&mut self, min_bp: i64) -> Option { let mut left = self.${r.name}_nud()?; + let mut tail_closed = false; loop { let t = match self.peek() { Some(t) => t, None => break }; -${r.leds.map(ledArm).join('\n')} +${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i])).join('\n')} ${r.postfixToks.map(postfixArm).join('\n')} + if let Some(plbp) = Parser::${r.name}_post(t.text) { if !tail_closed && plbp > min_bp { self.pos += 1; let op_leaf = Cst::leaf("$operator", t.off, t.end); left = node(${J(r.name)}, vec![left, op_leaf]); tail_closed = true; continue; } } let (lbp, rbp) = match Parser::${r.name}_bin(t.text) { Some(x) => x, None => break }; if lbp <= min_bp { break; } let led_save = self.pos; diff --git a/src/target-ts.ts b/src/target-ts.ts index f394f54..ffb12e2 100644 --- a/src/target-ts.ts +++ b/src/target-ts.ts @@ -188,29 +188,35 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { if (${b.steps.map(stepCond).join(' && ')}) return node(${J(r.name)}, kids); pos = save; return null; }`; - const ledArm = (b: Bracket) => ` if (t.text === ${J(b.first)}) { + // Access-tail leds (member/call/index) are disabled once a postfix has closed the operand. + const ledArm = (b: Bracket, accessTail: boolean) => ` if (${accessTail ? '!tailClosed && ' : ''}t.text === ${J(b.first)}) { const ledSave = pos; const kids: Cst[] = [left]; if (${b.steps.map(stepCond).join(' && ')}) { left = node(${J(r.name)}, kids); continue; } pos = ledSave; break; }`; - // A postfix token (e.g. a tagged template) binds like a mixfix led: `left X` → node(left, X). + // A postfix token (e.g. a tagged template) binds like a mixfix led: `left X` → node(left, X). Also an access tail. const postfixArm = (tok: string) => { const tplPart = tpl && tok === tpl.token ? 
` - if (t.kind === '$templateHead') { const node = matchTemplate(); if (node !== null) { left = { rule: ${J(r.name)}, children: [left, node], offset: left.offset, end: node.end }; continue; } }` : ''; - return ` if (t.kind === ${J(tok)}) { const leaf: Leaf = { tokenType: t.kind, offset: t.off, end: t.end }; pos++; left = { rule: ${J(r.name)}, children: [left, leaf], offset: left.offset, end: leaf.end }; continue; }${tplPart}`; + if (!tailClosed && t.kind === '$templateHead') { const node = matchTemplate(); if (node !== null) { left = { rule: ${J(r.name)}, children: [left, node], offset: left.offset, end: node.end }; continue; } }` : ''; + return ` if (!tailClosed && t.kind === ${J(tok)}) { const leaf: Leaf = { tokenType: t.kind, offset: t.off, end: t.end }; pos++; left = { rule: ${J(r.name)}, children: [left, leaf], offset: left.offset, end: leaf.end }; continue; }${tplPart}`; }; + const POST = `{ ${r.postfix.map((p) => `${J(p.op)}: ${p.lbp}`).join(', ')} }`; return `const ${r.name}_BIN: Record = ${BIN}; const ${r.name}_PRE: Record = ${PRE}; +const ${r.name}_POST: Record = ${POST}; const ${r.name}_ATOM = ${atom}; function parse${r.name}(): Node | null { return ${r.name}_bp(0); } function ${r.name}_bp(minBp: number): Node | null { let left = ${r.name}_nud(); if (left === null) return null; + let tailClosed = false; for (;;) { const t = peek(); if (t === null) break; -${r.leds.map(ledArm).join('\n')} +${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i])).join('\n')} ${r.postfixToks.map(postfixArm).join('\n')} + const post = ${r.name}_POST[t.text]; + if (!tailClosed && post !== undefined && post > minBp) { pos++; const opLeaf: Leaf = { tokenType: '$operator', offset: t.off, end: t.end }; left = { rule: ${J(r.name)}, children: [left, opLeaf], offset: left.offset, end: t.end }; tailClosed = true; continue; } const info = ${r.name}_BIN[t.text]; if (info === undefined || info.lbp <= minBp) break; const ledSave = pos; diff --git a/test/portable-targets.ts 
b/test/portable-targets.ts index 429bd53..c26f9a0 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -110,6 +110,17 @@ const CASES: Case[] = [ ], reject: ['if;', 'class;', 'new;', 'return + 1;'], // reserved words can't be bare identifiers }, + { + // Postfix-operator LED (`x++`/`x--`) + the access-tail closure: once a postfix binds, the + // operand is an update expression, so a further postfix or an access tail (`.`/`[`/`(`) + // can't attach (`a++--`, `a++.b` are ill-formed; `(a++).b` is fine). + grammar: 'postjs', path: '../examples/postjs.ts', + accept: [ + 'x++;', 'x--;', 'a + b++;', '++x;', 'x++ + y;', 'a.b++;', '(x)++;', '--a.b;', + 'x++ * 2;', '(a++).b;', 'x.y.z++;', + ], + reject: ['a++--;', 'a++.b;', 'a++ ++;', '++;'], + }, ]; const sortKeys = (o: unknown): unknown => From ab022a775ca571c3b4905c3a664414770ba72025 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 03:31:14 +0800 Subject: [PATCH 15/27] emit-portable: grouped sub-sequence `seq` step MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The next javascript.ts construct: a `seq` reaching stepOf — a star/sep body that is itself a sequence, e.g. a comma list written `star([',', $])` (`many(',', $)`), the shape javascript.ts uses for array/argument/sequence lists. stepOf/stepOfPratt now compile a sequence into a `seq` step, rendered as the conjunction of its sub-steps (the enclosing star/opt/sep handles backtracking). examples/seqjs.ts verifies it across ts/go/rust — 10/10 accept, 4/4 reject per target, byte-identical to createParser. Full suite 42/42. javascript.ts now advances to the deferred construct it has been heading toward: arrow functions (group{capBelow, ctxMode} — assignment-level precedence + the await/yield context fork). 
--- examples/seqjs.ts | 33 +++++++++++++++++++++++++++++++++ src/emit-portable.ts | 5 ++++- src/target-go.ts | 1 + src/target-rust.ts | 2 ++ src/target-ts.ts | 1 + test/portable-targets.ts | 10 ++++++++++ 6 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 examples/seqjs.ts diff --git a/examples/seqjs.ts b/examples/seqjs.ts new file mode 100644 index 0000000..d0e40fe --- /dev/null +++ b/examples/seqjs.ts @@ -0,0 +1,33 @@ +// Exercises a grouped sub-sequence `seq` step: comma lists written as `star([',', $])` (a +// star whose body is the two-element sequence `, Expr`) rather than `sep(...)`, the shape +// javascript.ts uses for argument/array/sequence lists. +import { + token, rule, defineGrammar, left, op, + seq, oneOf, range, star, opt, many, +} from '../src/api.ts'; +// `many(',', $)` is the rule-level `(',' Expr)*` — a star whose body is the sequence +// `, Expr`, exactly the shape javascript.ts uses for comma lists. + +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(range('0', '9'), star(range('0', '9'))), { scope: 'constant.numeric' }); + +const jsPrec = [left('+', '-'), left('*', '/')]; +const Expr = rule(($) => [ + Number_, Ident, + ['(', $, ')'], + ['[', opt($, many(',', $)), ']'], // array literal via star(seq) + [$, op, $], + [$, '(', opt($, many(',', $)), ')'], // call args via star(seq) +]); +const Stmt = rule(($) => [[Expr, ';']]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'seqjs', + scopeName: 'source.seqjs', + tokens: { Ident, Number: Number_ }, + prec: jsPrec, + rules: { Expr, Stmt, Program }, +}); diff --git a/src/emit-portable.ts b/src/emit-portable.ts index 1b642c8..89bfb00 100644 --- a/src/emit-portable.ts +++ b/src/emit-portable.ts @@ -51,7 +51,8 @@ export type Step = 
| { t: 'sep'; elem: Step; delim: string } // elem (delim elem)* | { t: 'altlit'; opts: Lit[] } // inline alternation of literals (fast path) | { t: 'alt'; branches: Step[][] } // inline alternation of sub-sequences (backtracking) - | { t: 'not'; steps: Step[] }; // zero-width negative lookahead (consumes nothing) + | { t: 'not'; steps: Step[] } // zero-width negative lookahead (consumes nothing) + | { t: 'seq'; steps: Step[] }; // a grouped sub-sequence (e.g. a star body `(',' Expr)`) export type Alt = Step[]; export type RdRule = { kind: 'rd'; name: string; alts: Alt[] }; @@ -141,6 +142,7 @@ function buildIR(grammar: CstGrammar): ParserIR { case 'ref': return tokenNames.has(e.name) ? { t: 'tok', name: e.name } : { t: 'rule', name: e.name }; case 'group': { const ss = altSteps(e.body); if (ss.length !== 1) throw new Error('portable: group must reduce to a single step'); return ss[0]; } case 'not': return { t: 'not', steps: altSteps(e.body) }; // zero-width negative lookahead + case 'seq': return { t: 'seq', steps: e.items.map(stepOf) }; // grouped sub-sequence (star/sep body) case 'sep': return { t: 'sep', elem: stepOf(e.element), delim: e.delimiter }; case 'quantifier': if (e.kind === '*') return { t: 'star', step: stepOf(e.body) }; @@ -298,6 +300,7 @@ function buildPratt( // a self-ref inside a NUD/LED sub-sequence is a fresh parse of this rule function stepOfPratt(e: RuleExpr): Step { if (e.type === 'ref' && e.name === name) return { t: 'rule', name }; + if (e.type === 'seq') return { t: 'seq', steps: e.items.map(stepOfPratt) }; if (e.type === 'not') return { t: 'not', steps: (e.body.type === 'seq' ? 
e.body.items : [e.body]).map(stepOfPratt) }; if (e.type === 'group' && !e.capBelow && !e.ctxMode && !e.suppress && e.body.type !== 'seq') return stepOfPratt(e.body); if (e.type === 'sep') return { t: 'sep', elem: stepOfPratt(e.element), delim: e.delimiter }; diff --git a/src/target-go.ts b/src/target-go.ts index 6c04e9e..8a936d5 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -171,6 +171,7 @@ function stepCond(s: Step): string { case 'altlit': return `altLit([][2]string{${s.opts.map((o) => `{${J(o.value)}, ${J(o.ttype)}}`).join(', ')}})`; case 'alt': return `func() bool { ${s.branches.map((br) => `{ save := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if ${br.length ? br.map(stepCond).join(' && ') : 'true'} { return true }; pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] }`).join('; ')}; return false }()`; case 'not': return `func() bool { save := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); m := ${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'}; pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return !m }()`; + case 'seq': return `(${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'})`; } } diff --git a/src/target-rust.ts b/src/target-rust.ts index 9f14ab8..20f540d 100644 --- a/src/target-rust.ts +++ b/src/target-rust.ts @@ -180,6 +180,7 @@ function stepCond(s: Step): string { case 'altlit': return `self.alt_lit(&[${s.opts.map((o) => `(${J(o.value)}, ${J(o.ttype)})`).join(', ')}], &mut kids)`; case 'alt': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${altBody(s.branches)} })(self, &mut kids)`; case 'not': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${notBody(s.steps)} })(self, &mut kids)`; + case 'seq': return `(${s.steps.length ? 
s.steps.map(stepCond).join(' && ') : 'true'})`; } } // A backtracking inline alternation rendered as an immediately-applied closure over (p, k), @@ -203,6 +204,7 @@ function stepCondP(s: Step): string { case 'altlit': return `p.alt_lit(&[${s.opts.map((o) => `(${J(o.value)}, ${J(o.ttype)})`).join(', ')}], k)`; case 'alt': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${altBody(s.branches)} })(p, k)`; case 'not': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${notBody(s.steps)} })(p, k)`; + case 'seq': return `(${s.steps.length ? s.steps.map(stepCondP).join(' && ') : 'true'})`; } } diff --git a/src/target-ts.ts b/src/target-ts.ts index ffb12e2..41ad3f3 100644 --- a/src/target-ts.ts +++ b/src/target-ts.ts @@ -163,6 +163,7 @@ function stepCond(s: Step): string { case 'altlit': return `altLit([${s.opts.map((o) => `[${J(o.value)}, ${J(o.ttype)}]`).join(', ')}], kids)`; case 'alt': return `(() => { ${s.branches.map((br) => `{ const sp = pos; const bk = kids.length; if (${br.length ? br.map(stepCond).join(' && ') : 'true'}) return true; pos = sp; kids.length = bk; }`).join(' ')} return false; })()`; case 'not': return `(() => { const sp = pos; const bk = kids.length; const m = ${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'}; pos = sp; kids.length = bk; return !m; })()`; + case 'seq': return `(${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'})`; } } diff --git a/test/portable-targets.ts b/test/portable-targets.ts index c26f9a0..a433a1d 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -121,6 +121,16 @@ const CASES: Case[] = [ ], reject: ['a++--;', 'a++.b;', 'a++ ++;', '++;'], }, + { + // A grouped sub-sequence `seq` step: comma lists as `star([',', $])` (e.g. `many(',', $)`), + // the array/argument-list shape javascript.ts uses. 
+ grammar: 'seqjs', path: '../examples/seqjs.ts', + accept: [ + '[1, 2, 3];', '[];', '[1];', 'f(1, 2);', 'f();', '[a + b, c];', + 'f(g(1, 2), 3);', '(x);', 'f(a)(b, c);', '[[1,2],[3,4]];', + ], + reject: ['[1 2];', 'f(1,);', '[, 1];', 'f(1 2);'], + }, ]; const sortKeys = (o: unknown): unknown => From 9624d4fb06ccbfc16560ac420ee9176263600582 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 03:44:01 +0800 Subject: [PATCH 16/27] emit-portable: `sameLine` zero-width assertion + lexer newline tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit typescript.ts's first parser-algebra blocker (and a piece of async arrows): the `sameLine` restricted-production assertion — matches, consuming nothing, iff the next token has no preceding line terminator. The lexer now tracks newline-before per token (a `nl` flag on Tok), set when the skipped whitespace contains a newline OR a skipped comment spans one, so a block comment across a newline counts. In the stateful lexer the flag lives on LexState; otherwise a local threaded through the plain push. examples/sljs.ts (a `return` that takes a value only on the same line) verifies it across ts/go/rust: `return 1;` keeps the value; `return\n1;`, `return /*\n*/ 1;` (block comment spanning a newline) and `return // c\n 1;` correctly reject — 7/7 accept, 4/4 reject per target, byte-identical to createParser. Full suite 42/42. typescript.ts now clears sameLine and advances to `notLeftLeaf`; javascript.ts remains at arrow functions (capBelow/ctxMode). 
--- examples/sljs.ts | 29 +++++++++++++++++++++++++++++ src/emit-portable.ts | 5 ++++- src/target-go.ts | 16 +++++++++++----- src/target-rust.ts | 21 +++++++++++++-------- src/target-ts.ts | 16 ++++++++++------ test/portable-targets.ts | 11 +++++++++++ 6 files changed, 78 insertions(+), 20 deletions(-) create mode 100644 examples/sljs.ts diff --git a/examples/sljs.ts b/examples/sljs.ts new file mode 100644 index 0000000..68421a5 --- /dev/null +++ b/examples/sljs.ts @@ -0,0 +1,29 @@ +// Exercises the `sameLine` zero-width assertion (no line terminator before the next token). +// A `return` takes a value only when it is on the SAME line (ASI-style restricted production): +// `return 1;` keeps the value, `return\n1;` does not. Verifies the lexer's newline-before +// tracking — including a block comment that spans a newline. +import { + token, rule, defineGrammar, left, op, + seq, oneOf, range, star, opt, many, altPattern, noneOf, sameLine, +} from '../src/api.ts'; + +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(range('0', '9'), star(range('0', '9'))), { scope: 'constant.numeric' }); +const LineComment = token(seq('//', star(noneOf('\n'))), { skip: true, scope: 'comment.line' }); +const BlockComment = token(seq('/*', star(altPattern(noneOf('*'), seq('*', noneOf('/')))), '*/'), { skip: true, scope: 'comment.block' }); + +const jsPrec = [left('+', '-'), left('*', '/')]; +const Expr = rule(($) => [Number_, Ident, ['(', $, ')'], [$, op, $]]); +const Ret = rule(($) => [['return', opt(sameLine, Expr), ';']]); // `return` + a SAME-LINE value +const Stmt = rule(($) => [Ret, [Expr, ';']]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'sljs', + scopeName: 'source.sljs', + tokens: { Ident, Number: Number_, LineComment, 
BlockComment }, + prec: jsPrec, + rules: { Expr, Ret, Stmt, Program }, +}); diff --git a/src/emit-portable.ts b/src/emit-portable.ts index 89bfb00..079bdee 100644 --- a/src/emit-portable.ts +++ b/src/emit-portable.ts @@ -52,7 +52,8 @@ export type Step = | { t: 'altlit'; opts: Lit[] } // inline alternation of literals (fast path) | { t: 'alt'; branches: Step[][] } // inline alternation of sub-sequences (backtracking) | { t: 'not'; steps: Step[] } // zero-width negative lookahead (consumes nothing) - | { t: 'seq'; steps: Step[] }; // a grouped sub-sequence (e.g. a star body `(',' Expr)`) + | { t: 'seq'; steps: Step[] } // a grouped sub-sequence (e.g. a star body `(',' Expr)`) + | { t: 'sameLine' }; // zero-width: the next token is on the same line (no preceding newline) export type Alt = Step[]; export type RdRule = { kind: 'rd'; name: string; alts: Alt[] }; @@ -142,6 +143,7 @@ function buildIR(grammar: CstGrammar): ParserIR { case 'ref': return tokenNames.has(e.name) ? { t: 'tok', name: e.name } : { t: 'rule', name: e.name }; case 'group': { const ss = altSteps(e.body); if (ss.length !== 1) throw new Error('portable: group must reduce to a single step'); return ss[0]; } case 'not': return { t: 'not', steps: altSteps(e.body) }; // zero-width negative lookahead + case 'sameLine': return { t: 'sameLine' }; // zero-width no-newline assertion case 'seq': return { t: 'seq', steps: e.items.map(stepOf) }; // grouped sub-sequence (star/sep body) case 'sep': return { t: 'sep', elem: stepOf(e.element), delim: e.delimiter }; case 'quantifier': @@ -301,6 +303,7 @@ function buildPratt( function stepOfPratt(e: RuleExpr): Step { if (e.type === 'ref' && e.name === name) return { t: 'rule', name }; if (e.type === 'seq') return { t: 'seq', steps: e.items.map(stepOfPratt) }; + if (e.type === 'sameLine') return { t: 'sameLine' }; if (e.type === 'not') return { t: 'not', steps: (e.body.type === 'seq' ? 
e.body.items : [e.body]).map(stepOfPratt) }; if (e.type === 'group' && !e.capBelow && !e.ctxMode && !e.suppress && e.body.type !== 'seq') return stepOfPratt(e.body); if (e.type === 'sep') return { t: 'sep', elem: stepOfPratt(e.element), delim: e.delimiter }; diff --git a/src/target-go.ts b/src/target-go.ts index 8a936d5..82b70c5 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -48,7 +48,7 @@ function scanTok(t: LexTok, defs: string[], rxTok?: string, tplTok?: string): st const name = (t as { name: string }).name; const stateful = rxTok !== undefined || tplTok !== undefined; if (tplTok !== undefined && name === tplTok) return ''; // template token scanned by the state machine - const push = (endE: string) => (t.skip ? '' : stateful ? `emit(${J(name)}, src[pos:${endE}], pos, ${endE}); ` : `toks = append(toks, Tok{${J(name)}, src[pos:${endE}], pos, ${endE}}); `); + const push = (endE: string) => (t.skip ? `if strings.Contains(src[pos:${endE}], "\\n") { pendingNl = true }; ` : stateful ? `emit(${J(name)}, src[pos:${endE}], pos, ${endE}); ` : `pushTok(${J(name)}, src[pos:${endE}], pos, ${endE}); `); const gate = rxTok !== undefined && name === rxTok ? '!prevIsValue() && ' : ''; if (t.kind === 'run') return `\t\tif ${gate}${rangeCond('c', t.first)} { \t\t\te := pos + 1 @@ -81,7 +81,7 @@ function lexer(ir: ParserIR): string { const tpl = ir.tpl; const stateful = !!(rx || tpl); const toks = ir.tokens.map((t) => scanTok(t, defs, rx?.regexToken, tpl?.token)).join('\n'); - const pushPunct = stateful ? (p: string) => `emit("", ${J(p)}, pos, pos + ${p.length})` : (p: string) => `toks = append(toks, Tok{"", ${J(p)}, pos, pos + ${p.length}})`; + const pushPunct = stateful ? 
(p: string) => `emit("", ${J(p)}, pos, pos + ${p.length})` : (p: string) => `pushTok("", ${J(p)}, pos, pos + ${p.length})`; const puncts = ir.puncts.map((p) => `\t\tif strings.HasPrefix(src[pos:], ${J(p)}) { ${pushPunct(p)}; pos += ${p.length}; continue }`).join('\n'); const goMap = (a: string[]) => `map[string]bool{${a.map((x) => `${J(x)}: true`).join(', ')}}`; @@ -129,7 +129,7 @@ function lexer(ir: ParserIR): string { const emitTail = rx ? `\n\t\tbpText = prevText; hasPrev2 = hasPrev; prevKind = kind; prevText = text; hasPrev = true` : ''; const emitFn = stateful ? `\temit := func(kind, text string, off, end int) { ${emitHooks} -\t\ttoks = append(toks, Tok{kind, text, off, end})${emitTail} +\t\ttoks = append(toks, Tok{kind, text, off, end, pendingNl}); pendingNl = false${emitTail} \t} \t_ = emit ` : ''; @@ -145,13 +145,17 @@ ${emitHooks} \t\t\tpos = e; continue \t\t} ` : ''; + const pushTokFn = stateful ? '' : `\tpushTok := func(kind, text string, off, end int) { toks = append(toks, Tok{kind, text, off, end, pendingNl}); pendingNl = false }\n\t_ = pushTok\n`; return `${defs.length ? 'var _s string\n' + defs.join('\n') + '\n' : ''}func lex(src string) []Tok { \ttoks := toks[:0] \tn := len(src) \tpos := 0 -${rxState}${tplState}${emitFn}${defs.length ? '\t_s = src\n' : ''}\tfor pos < n { +\tpendingNl := false +\t_ = pendingNl +${rxState}${tplState}${emitFn}${pushTokFn}${defs.length ? '\t_s = src\n' : ''}\tfor pos < n { \t\tc := int(src[pos]) -\t\tif c == 32 || c == 9 || c == 10 || c == 13 { pos++; continue } +\t\tif c == 32 || c == 9 { pos++; continue } +\t\tif c == 10 || c == 13 { pendingNl = true; pos++; continue } ${tplDispatch}${toks} ${puncts} \t\tpanic(fmt.Sprintf("lex error at %d", pos)) @@ -172,6 +176,7 @@ function stepCond(s: Step): string { case 'alt': return `func() bool { ${s.branches.map((br) => `{ save := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if ${br.length ? 
br.map(stepCond).join(' && ') : 'true'} { return true }; pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] }`).join('; ')}; return false }()`; case 'not': return `func() bool { save := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); m := ${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'}; pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return !m }()`; case 'seq': return `(${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'})`; + case 'sameLine': return `func() bool { t := peek(); return t != nil && !t.Nl }()`; } } @@ -310,6 +315,7 @@ import ( type Tok struct { \tKind, Text string \tOff, End int +\tNl bool } // Arena node: an int32 index into nodes; children are a flat range in kids. type Node struct { diff --git a/src/target-rust.ts b/src/target-rust.ts index 20f540d..4b032fb 100644 --- a/src/target-rust.ts +++ b/src/target-rust.ts @@ -51,7 +51,8 @@ function scanTok(t: LexTok, defs: string[], rxTok?: string, tplTok?: string): st const name = (t as { name: string }).name; const stateful = rxTok !== undefined || tplTok !== undefined; if (tplTok !== undefined && name === tplTok) return ''; // template token scanned by the state machine - const push = (endE: string) => (t.skip ? '' : stateful ? `st.emit(${J(name)}, &src[pos..${endE}], pos, ${endE}); ` : `toks.push(Tok { kind: ${J(name)}, text: &src[pos..${endE}], off: pos, end: ${endE} }); `); + const nlVar = stateful ? 'st.pending_nl' : 'pending_nl'; + const push = (endE: string) => (t.skip ? `if src[pos..${endE}].contains('\\n') { ${nlVar} = true; } ` : stateful ? `st.emit(${J(name)}, &src[pos..${endE}], pos, ${endE}); ` : `toks.push(Tok { kind: ${J(name)}, text: &src[pos..${endE}], off: pos, end: ${endE}, nl: pending_nl }); pending_nl = false; `); const gate = rxTok !== undefined && name === rxTok ? 
'!st.prev_is_value() && ' : ''; if (t.kind === 'run') return ` if ${gate}${rangeCond('c', t.first)} { let mut e = pos + 1; @@ -85,7 +86,7 @@ function lexer(ir: ParserIR): string { const stateful = !!(rx || tpl); const toks = ir.tokens.map((t) => scanTok(t, defs, rx?.regexToken, tpl?.token)).join('\n'); const puncts = ir.puncts.map((p) => - ` if src[pos..].starts_with(${J(p)}) { ${stateful ? `st.emit("", &src[pos..pos + ${p.length}], pos, pos + ${p.length});` : `toks.push(Tok { kind: "", text: &src[pos..pos + ${p.length}], off: pos, end: pos + ${p.length} });`} pos += ${p.length}; continue; }`).join('\n'); + ` if src[pos..].starts_with(${J(p)}) { ${stateful ? `st.emit("", &src[pos..pos + ${p.length}], pos, pos + ${p.length});` : `toks.push(Tok { kind: "", text: &src[pos..pos + ${p.length}], off: pos, end: pos + ${p.length}, nl: pending_nl }); pending_nl = false;`} pos += ${p.length}; continue; }`).join('\n'); const rsArr = (a: string[]) => `&[${a.map(J).join(', ')}]`; // Struct fields / emit hooks / init are assembled per-feature so a grammar can have regex, // templates, or both share one LexState. @@ -109,7 +110,7 @@ fn _in(set: &[&str], x: &str) -> bool { set.iter().any(|s| *s == x) } (false, p) } ` : ''; - const fields = ['toks: Vec>', + const fields = ['toks: Vec>', 'pending_nl: bool', rx ? 'prev_text: &\'a str, prev_kind: &\'static str, bp_text: &\'a str, has_prev: bool, has_prev2: bool, paren_head: Vec, last_close: bool, last_bang: bool' : '', tpl ? 'template_stack: Vec' : ''].filter(Boolean).join(', '); const prevIsValue = rx ? 
` fn prev_is_value(&self) -> bool { @@ -132,14 +133,15 @@ fn _in(set: &[&str], x: &str) -> bool { set.iter().any(|s| *s == x) } impl<'a> LexState<'a> { ${prevIsValue} fn emit(&mut self, kind: &'static str, text: &'a str, off: usize, end: usize) { ${emitHooks} - self.toks.push(Tok { kind, text, off, end });${emitTail} + self.toks.push(Tok { kind, text, off, end, nl: self.pending_nl }); self.pending_nl = false;${emitTail} } } ` : ''; - const initFields = ['toks: Vec::new()', + const initFields = ['toks: Vec::new()', 'pending_nl: false', rx ? 'prev_text: "", prev_kind: "", bp_text: "", has_prev: false, has_prev2: false, paren_head: Vec::new(), last_close: false, last_bang: false' : '', tpl ? 'template_stack: Vec::new()' : ''].filter(Boolean).join(', '); - const open = stateful ? ` let mut st = LexState { ${initFields} };` : ` let mut toks: Vec = Vec::new();`; + const open = stateful ? ` let mut st = LexState { ${initFields} };` : ` let mut toks: Vec = Vec::new();\n let mut pending_nl = false;`; + const nlVar = stateful ? 'st.pending_nl' : 'pending_nl'; const tplDispatch = tpl ? ` if !st.template_stack.is_empty() && src[pos..].starts_with(${J(tpl.interpClose)}) && *st.template_stack.last().unwrap() == 0 { st.template_stack.pop(); let (interp, e) = _scan_tpl_span(src, pos + ${tpl.interpClose.length}); @@ -159,7 +161,8 @@ ${open} let mut pos = 0usize; while pos < n { let c = b[pos] as u32; - if c == 32 || c == 9 || c == 10 || c == 13 { pos += 1; continue; } + if c == 32 || c == 9 { pos += 1; continue; } + if c == 10 || c == 13 { ${nlVar} = true; pos += 1; continue; } ${tplDispatch}${toks} ${puncts} panic!("lex error at {}", pos); @@ -181,6 +184,7 @@ function stepCond(s: Step): string { case 'alt': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${altBody(s.branches)} })(self, &mut kids)`; case 'not': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${notBody(s.steps)} })(self, &mut kids)`; case 'seq': return `(${s.steps.length ? 
s.steps.map(stepCond).join(' && ') : 'true'})`; + case 'sameLine': return `matches!(self.peek(), Some(t) if !t.nl)`; } } // A backtracking inline alternation rendered as an immediately-applied closure over (p, k), @@ -205,6 +209,7 @@ function stepCondP(s: Step): string { case 'alt': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${altBody(s.branches)} })(p, k)`; case 'not': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${notBody(s.steps)} })(p, k)`; case 'seq': return `(${s.steps.length ? s.steps.map(stepCondP).join(' && ') : 'true'})`; + case 'sameLine': return `matches!(p.peek(), Some(t) if !t.nl)`; } } @@ -318,7 +323,7 @@ use std::io::Read; // Zero-alloc tokens: kind is a known grammar name (&'static str), text is a slice of the // source. Tok is Copy, so peek() copies pointers — no per-peek heap work. #[derive(Clone, Copy)] -struct Tok<'a> { kind: &'static str, text: &'a str, off: usize, end: usize } +struct Tok<'a> { kind: &'static str, text: &'a str, off: usize, end: usize, nl: bool } // CST nodes hold only &'static str labels (rule names / token-type tags are all literals) // + usize spans — no per-node String allocation. diff --git a/src/target-ts.ts b/src/target-ts.ts index 41ad3f3..fcfb818 100644 --- a/src/target-ts.ts +++ b/src/target-ts.ts @@ -45,8 +45,9 @@ function scanTok(t: LexTok, defs: string[], rxTok?: string, tplTok?: string): st const name = (t as { name: string }).name; const stateful = rxTok !== undefined || tplTok !== undefined; if (tplTok !== undefined && name === tplTok) return ''; // template token is scanned by the state machine - // `emit(...)` threads the lexer state in stateful mode; a plain push otherwise. - const push = (endExpr: string) => (t.skip ? '' : `${stateful ? 'emit' : 'push'}(${J(name)}, src.slice(pos, ${endExpr}), pos, ${endExpr}); `); + // `emit(...)` threads the lexer state in stateful mode; a plain push otherwise. A skipped + // token (comment) still records a newline it spans, so `sameLine` sees it. 
+ const push = (endExpr: string) => (t.skip ? `if (src.slice(pos, ${endExpr}).indexOf('\\n') >= 0) pendingNl = true; ` : `${stateful ? 'emit' : 'push'}(${J(name)}, src.slice(pos, ${endExpr}), pos, ${endExpr}); `); const gate = rxTok !== undefined && name === rxTok ? '!prevIsValue() && ' : ''; if (t.kind === 'run') return ` if (${gate}${rangeCond('c', t.first)}) { let e = pos + 1; @@ -118,7 +119,7 @@ function lexer(ir: ParserIR): string { const emitTail = rx ? `\n bpText = prevText; hasPrev2 = hasPrev; prevKind = kind; prevText = text; hasPrev = true;` : ''; const emitFn = stateful ? ` function emit(kind: string, text: string, off: number, end: number): void { ${emitHooks} - toks.push({ kind, text, off, end });${emitTail} + toks.push({ kind, text, off, end, nl: pendingNl }); pendingNl = false;${emitTail} } ` : ''; // Template dispatch runs at the top of the loop, before token/punct scanning. @@ -140,9 +141,11 @@ ${emitHooks} const toks: Tok[] = []; const n = src.length; let pos = 0; -${defs.length ? ' _s = src;\n' : ''}${rxState}${tplState}${stateful ? emitFn : ' const push = (kind: string, text: string, off: number, end: number) => { toks.push({ kind, text, off, end }); };\n'} while (pos < n) { + let pendingNl = false; +${defs.length ? ' _s = src;\n' : ''}${rxState}${tplState}${stateful ? emitFn : ' const push = (kind: string, text: string, off: number, end: number) => { toks.push({ kind, text, off, end, nl: pendingNl }); pendingNl = false; };\n'} while (pos < n) { const c = src.charCodeAt(pos); - if (c === 32 || c === 9 || c === 10 || c === 13) { pos++; continue; } + if (c === 32 || c === 9) { pos++; continue; } + if (c === 10 || c === 13) { pendingNl = true; pos++; continue; } ${tplDispatch}${toks} ${puncts} throw new Error('lex error at ' + pos + ': ' + JSON.stringify(src[pos])); @@ -164,6 +167,7 @@ function stepCond(s: Step): string { case 'alt': return `(() => { ${s.branches.map((br) => `{ const sp = pos; const bk = kids.length; if (${br.length ? 
br.map(stepCond).join(' && ') : 'true'}) return true; pos = sp; kids.length = bk; }`).join(' ')} return false; })()`; case 'not': return `(() => { const sp = pos; const bk = kids.length; const m = ${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'}; pos = sp; kids.length = bk; return !m; })()`; case 'seq': return `(${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'})`; + case 'sameLine': return `(() => { const t = peek(); return t !== null && !t.nl; })()`; } } @@ -274,7 +278,7 @@ export const tsTarget: Target = { return `// GENERATED by emit-portable.ts (tsTarget) — parser for grammar "${ir.grammarName}". import { readFileSync } from 'node:fs'; -type Tok = { kind: string; text: string; off: number; end: number }; +type Tok = { kind: string; text: string; off: number; end: number; nl: boolean }; type Leaf = { tokenType: string; offset: number; end: number }; type Node = { rule: string; children: Cst[]; offset: number; end: number }; type Cst = Node | Leaf; diff --git a/test/portable-targets.ts b/test/portable-targets.ts index a433a1d..e9cfc59 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -131,6 +131,17 @@ const CASES: Case[] = [ ], reject: ['[1 2];', 'f(1,);', '[, 1];', 'f(1 2);'], }, + { + // The `sameLine` zero-width assertion (no line terminator before the next token): + // `return` takes a value only on the same line. Also verifies the lexer's newline-before + // tracking across a block comment that spans a newline. 
+ grammar: 'sljs', path: '../examples/sljs.ts', + accept: [ + 'return 1;', 'return;', 'return 1 + 2;', '1 + 2;', 'return /* c */ 1;', + '(a);', 'return (1);', + ], + reject: ['return\n1;', 'return\nx;', 'return /*\n*/ 1;', 'return // c\n 1;'], + }, ]; const sortKeys = (o: unknown): unknown => From f807c6b5d72fc52cb8041c1764bac304348c20ef Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 04:00:32 +0800 Subject: [PATCH 17/27] emit-portable: capBelow arrow functions + fix sep trailing-delimiter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hardest parser construct, the wall javascript.ts has been heading toward: assignment-level (capBelow) NUDs — arrow functions. A capExpr NUD carries the binding power of its connector; it is parsed only when the enclosing minBp is LOOSER than that (so `1 + () => x` needs parens), and once parsed it is "capped" — the led loop is skipped entirely (`() => {} || a` rejects). The nud now takes minBp, tries the capped sequences FIRST (so the `(x) => y` vs `(x)` ambiguity resolves by longest-match — the arrow is attempted, then falls back to grouping), and signals the cap via `_capped`. The `=>` body's ctxMode (await/yield) is treated as transparent: the context fork is not modelled, so this covers basic arrows, not async/await bodies. Also fixes a latent `sep` bug surfaced by `(a,) => b`: gen-parser's sep allows a trailing delimiter, the portable sepBy did not. Now matched in all three targets — earlier grammars simply had no trailing-delimiter test, so the aggregate passed for the wrong reason. examples/arrowjs.ts verifies it across ts/go/rust — 14/14 accept (incl. trailing commas and curried `x => y => x`), 4/4 reject, byte-identical to createParser. Full suite 42/42. javascript.ts clears the arrow wall and advances to the next group case. 
--- examples/arrowjs.ts | 36 ++++++++++++++++++++++++++++++++++++ src/emit-portable.ts | 24 ++++++++++++++++++------ src/target-go.ts | 14 +++++++++++--- src/target-rust.ts | 21 ++++++++++++++------- src/target-ts.ts | 14 +++++++++++--- test/portable-targets.ts | 12 ++++++++++++ 6 files changed, 102 insertions(+), 19 deletions(-) create mode 100644 examples/arrowjs.ts diff --git a/examples/arrowjs.ts b/examples/arrowjs.ts new file mode 100644 index 0000000..486cdaa --- /dev/null +++ b/examples/arrowjs.ts @@ -0,0 +1,36 @@ +// Exercises the capBelow (assignment-level) Pratt construct — arrow functions. A `capExpr` +// NUD is parsed only when the enclosing minBp is LOOSER than its connector's binding power +// (so `1 + (() => x)` needs the parens) and, once parsed, admits NO led (it is "capped"). +// The `=>` body's ctxMode (await/yield) is treated as transparent here — the context fork +// is NOT modelled, so this covers basic arrows, not async/await bodies. +import { + token, rule, defineGrammar, left, right, op, capExpr, alt, + seq, oneOf, range, star, sep, opt, many, +} from '../src/api.ts'; + +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(range('0', '9'), star(range('0', '9'))), { scope: 'constant.numeric' }); + +const jsPrec = [right('='), left('||'), left('+', '-'), left('*', '/')]; + +const Block = rule(($) => [['{', many(Stmt), '}']]); +const Expr = rule(($) => [ + Number_, Ident, + ['(', $, ')'], + capExpr('=', '(', opt(sep(Ident, ',')), ')', '=>', alt(Block, $)), // (params) => body + capExpr('=', Ident, '=>', alt(Block, $)), // x => body + [$, op, $], + [$, '(', opt(sep($, ',')), ')'], // call +]); +const Stmt = rule(($) => [Block, [Expr, ';']]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 
'arrowjs', + scopeName: 'source.arrowjs', + tokens: { Ident, Number: Number_ }, + prec: jsPrec, + rules: { Expr, Block, Stmt, Program }, +}); diff --git a/src/emit-portable.ts b/src/emit-portable.ts index 079bdee..12751b0 100644 --- a/src/emit-portable.ts +++ b/src/emit-portable.ts @@ -64,6 +64,7 @@ export type PrattRule = { nudToks: string[]; // NUD: a bare token wrapped in a node nudBrackets: Bracket[]; // NUD: '(' … ')' / '[' … ']' nudSeqs: Step[][]; // NUD: a general sequence (guarded ident, class expr), tried with backtracking + nudCapped: Array<{ steps: Step[]; capBp: number }>; // NUD: an assignment-level capped sequence (arrow function) — parsed only when minBp < capBp, admits no led prefix: Array<{ op: string; rbp: number }>; // NUD: prefix op then operand at rbp binary: Array<{ op: string; lbp: number; rbp: number }>; // LED: infix op, bind iff lbp > minBp, rhs at rbp leds: Bracket[]; // LED: mixfix continuation (call/member/index), tried before operators @@ -259,6 +260,7 @@ function buildPratt( const nudToks: string[] = []; const nudBrackets: Bracket[] = []; const nudSeqs: Step[][] = []; + const nudCapped: Array<{ steps: Step[]; capBp: number }> = []; let sawPrefix = false, sawBinary = false, sawPostfix = false; const leds: Bracket[] = []; const ledAccessTail: boolean[] = []; @@ -270,15 +272,24 @@ function buildPratt( // NUD if (items.length === 1 && items[0].type === 'ref' && a.tokenNames.has(items[0].name)) { nudToks.push(items[0].name); continue; } if (items[0].type === 'prefix') { sawPrefix = true; continue; } + // A capExpr (arrow function): an assignment-level group{capBelow}. ctxMode in its body + // is treated as transparent (the await/yield fork is not modelled in the portable parser). 
+ if (items.length === 1 && items[0].type === 'group' && items[0].capBelow !== undefined) { + const capBp = a.nudCapOf(items[0]); + if (capBp === null) throw new Error(`portable: capBelow connector '${items[0].capBelow}' has no binding power (rule ${name})`); + const b = items[0].body; + nudCapped.push({ steps: (b.type === 'seq' ? b.items : [b]).map((it) => stepOfPratt(it)), capBp }); + continue; + } if (items[0].type === 'literal') { nudBrackets.push({ first: items[0].value, steps: items.map((it) => stepOfPratt(it)) }); continue; } // A single transparent group unwraps to its body (an explicit grouping of the NUD sequence). let nudItems = items; - if (items.length === 1 && items[0].type === 'group' && !items[0].capBelow && !items[0].ctxMode && !items[0].suppress) { + if (items.length === 1 && items[0].type === 'group' && !items[0].suppress) { nudItems = items[0].body.type === 'seq' ? items[0].body.items : [items[0].body]; } - // capBelow / ctxMode (arrow functions, await/yield context) are a deeper construct — defer. - if (nudItems.some((it) => it.type === 'group' && (it.capBelow || it.ctxMode || it.suppress))) { - throw new Error(`portable: Pratt NUD with capBelow/ctxMode/suppress not yet in scope (rule ${name}) — arrow functions etc.`); + // A no-`in`/suppress group is a deeper construct — defer. + if (nudItems.some((it) => it.type === 'group' && it.suppress)) { + throw new Error(`portable: Pratt NUD with suppress (no-in context) not yet in scope (rule ${name})`); } nudSeqs.push(nudItems.map((it) => stepOfPratt(it))); // general NUD sequence (guarded ident, class expr) continue; @@ -305,7 +316,8 @@ function buildPratt( if (e.type === 'seq') return { t: 'seq', steps: e.items.map(stepOfPratt) }; if (e.type === 'sameLine') return { t: 'sameLine' }; if (e.type === 'not') return { t: 'not', steps: (e.body.type === 'seq' ? 
e.body.items : [e.body]).map(stepOfPratt) }; - if (e.type === 'group' && !e.capBelow && !e.ctxMode && !e.suppress && e.body.type !== 'seq') return stepOfPratt(e.body); + // ctxMode (await/yield) is transparent to the portable parser (no fork); unwrap a non-seq group. + if (e.type === 'group' && !e.capBelow && !e.suppress && e.body.type !== 'seq') return stepOfPratt(e.body); if (e.type === 'sep') return { t: 'sep', elem: stepOfPratt(e.element), delim: e.delimiter }; if (e.type === 'quantifier' && e.kind === '?') return { t: 'opt', steps: (e.body.type === 'seq' ? e.body.items : [e.body]).map(stepOfPratt) }; if (e.type === 'quantifier' && e.kind === '*') return { t: 'star', step: stepOfPratt(e.body) }; @@ -319,5 +331,5 @@ function buildPratt( const postfix = sawPostfix ? [...a.opTable.entries()].filter(([, info]) => info.position === 'postfix').map(([op, info]) => ({ op, lbp: info.lbp })) : []; - return { kind: 'pratt', name, nudToks, nudBrackets, nudSeqs, prefix, binary, leds, ledAccessTail, postfixToks, postfix }; + return { kind: 'pratt', name, nudToks, nudBrackets, nudSeqs, nudCapped, prefix, binary, leds, ledAccessTail, postfixToks, postfix }; } diff --git a/src/target-go.ts b/src/target-go.ts index 82b70c5..6729ec3 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -232,8 +232,9 @@ var ${r.name}POST = map[string]int{${post}} var ${r.name}ATOM = map[string]bool{${atoms}} func parse${r.name}() int32 { return ${r.name}bp(0) } func ${r.name}bp(minBp int) int32 { -\tleft := ${r.name}nud() +\tleft := ${r.name}nud(minBp) \tif left < 0 { return -1 } +\tif _capped { return left } \ttailClosed := false \tfor { \t\tt := peek() @@ -256,9 +257,11 @@ ${r.postfixToks.map(postfixArm).join('\n')} \t} \treturn left } -func ${r.name}nud() int32 { +func ${r.name}nud(minBp int) int32 { +\t_capped = false \tt := peek() \tif t == nil { return -1 } +${r.nudCapped.map((c) => `\tif minBp < ${c.capBp} { save := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if 
${c.steps.length ? c.steps.map(stepCond).join(' && ') : 'true'} { _capped = true; return finish(${J(r.name)}, sb, offAt(save)) }; pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] }`).join('\n')} ${tplNud}\tif ${r.name}ATOM[t.Kind] { \t\tsb := len(scratch); scratch = append(scratch, mkLeaf(t.Kind, t.Off, t.End)); pos++ \t\treturn finish(${J(r.name)}, sb, t.Off) @@ -327,6 +330,7 @@ type bp struct{ lbp, rbp int } var toks []Tok var pos int +var _capped bool var nodes []Node var kids []int32 var scratch []int32 @@ -375,7 +379,11 @@ func opt(body func() bool) bool { } func sepBy(elem func() bool, delim string) bool { \tif !elem() { return false } -\tfor { sp := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if matchLit(delim, "$punct") && elem() { continue }; pos = sp; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; break } +\tfor { +\t\tsp := pos; sb := len(scratch); nb := len(nodes); kb := len(kids) +\t\tif !matchLit(delim, "$punct") { pos = sp; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; break } +\t\tif !elem() { break } // a trailing delimiter is allowed — keep the pushed delim and stop +\t} \treturn true } func altLit(opts [][2]string) bool { diff --git a/src/target-rust.ts b/src/target-rust.ts index 4b032fb..a81c3f5 100644 --- a/src/target-rust.ts +++ b/src/target-rust.ts @@ -257,7 +257,8 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { fn ${r.name}_post(op: &str) -> Option { match op { ${postArms}${postArms ? 
', ' : ''}_ => None } } fn ${r.name}_atom(kind: &str) -> bool { matches!(kind, ${atomArm || '""'}) } fn ${r.name}_bp(&mut self, min_bp: i64) -> Option { - let mut left = self.${r.name}_nud()?; + let mut left = self.${r.name}_nud(min_bp)?; + if self.capped { return Some(left); } let mut tail_closed = false; loop { let t = match self.peek() { Some(t) => t, None => break }; @@ -274,8 +275,10 @@ ${r.postfixToks.map(postfixArm).join('\n')} } Some(left) } - fn ${r.name}_nud(&mut self) -> Option { + fn ${r.name}_nud(&mut self, min_bp: i64) -> Option { + self.capped = false; let t = self.peek()?; +${r.nudCapped.map((c) => ` if min_bp < ${c.capBp} { let save = self.pos; let mut kids: Vec = Vec::new(); if ${c.steps.length ? c.steps.map(stepCond).join(' && ') : 'true'} { self.capped = true; return Some(self.branch(${J(r.name)}, kids, save)); } self.pos = save; }`).join('\n')} ${tplNud} if Parser::${r.name}_atom(t.kind) { self.pos += 1; return Some(Cst::node(${J(r.name)}, vec![Cst::leaf(t.kind, t.off, t.end)], t.off, t.end)); @@ -337,7 +340,7 @@ fn node(rule: &'static str, kids: Vec) -> Cst { let o = kids[0].offset; let ${lexer(ir)} -struct Parser<'a> { toks: Vec>, pos: usize } +struct Parser<'a> { toks: Vec>, pos: usize, capped: bool } impl<'a> Parser<'a> { fn peek(&self) -> Option> { if self.pos < self.toks.len() { Some(self.toks[self.pos]) } else { None } } fn branch(&self, rule: &'static str, kids: Vec, save: usize) -> Cst { @@ -363,7 +366,11 @@ impl<'a> Parser<'a> { } fn sep_by(&mut self, elem: fn(&mut Parser<'a>, &mut Vec) -> bool, delim: &str, kids: &mut Vec) -> bool { if !elem(self, kids) { return false; } - loop { let sp = self.pos; let before = kids.len(); if self.match_lit(delim, "$punct", kids) && elem(self, kids) { continue; } self.pos = sp; kids.truncate(before); break; } + loop { + let sp = self.pos; let before = kids.len(); + if !self.match_lit(delim, "$punct", kids) { self.pos = sp; kids.truncate(before); break; } + if !elem(self, kids) { break; } // a 
trailing delimiter is allowed — keep the pushed delim and stop + } true } fn alt_lit(&mut self, opts: &[(&str, &'static str)], kids: &mut Vec) -> bool { @@ -390,15 +397,15 @@ fn main() { // Self-bench: a numeric arg N times the lex+parse loop and prints ms/iteration. if let Some(iters) = std::env::args().nth(1).and_then(|a| a.parse::().ok()) { // black_box on the input + result so the optimizer can't elide the lex/parse. - for _ in 0..3 { let toks = lex(std::hint::black_box(&src)); let mut p = Parser { toks, pos: 0 }; std::hint::black_box(p.parse_${ir.entry}()); } + for _ in 0..3 { let toks = lex(std::hint::black_box(&src)); let mut p = Parser { toks, pos: 0, capped: false }; std::hint::black_box(p.parse_${ir.entry}()); } let t = std::time::Instant::now(); - for _ in 0..iters { let toks = lex(std::hint::black_box(&src)); let mut p = Parser { toks, pos: 0 }; std::hint::black_box(p.parse_${ir.entry}()); } + for _ in 0..iters { let toks = lex(std::hint::black_box(&src)); let mut p = Parser { toks, pos: 0, capped: false }; std::hint::black_box(p.parse_${ir.entry}()); } println!("{:.4}", t.elapsed().as_secs_f64() * 1000.0 / iters as f64); return; } let toks = lex(&src); let n = toks.len(); - let mut p = Parser { toks, pos: 0 }; + let mut p = Parser { toks, pos: 0, capped: false }; match p.parse_${ir.entry}() { Some(root) if p.pos == n => { let mut out = String::new(); write_json(&root, &mut out); print!("{}", out); } _ => { eprintln!("parse error (pos {}/{})", p.pos, n); std::process::exit(1); } diff --git a/src/target-ts.ts b/src/target-ts.ts index fcfb818..0be1cb8 100644 --- a/src/target-ts.ts +++ b/src/target-ts.ts @@ -212,8 +212,9 @@ const ${r.name}_POST: Record = ${POST}; const ${r.name}_ATOM = ${atom}; function parse${r.name}(): Node | null { return ${r.name}_bp(0); } function ${r.name}_bp(minBp: number): Node | null { - let left = ${r.name}_nud(); + let left = ${r.name}_nud(minBp); if (left === null) return null; + if (_capped) return left; // an assignment-level 
arrow admits no led let tailClosed = false; for (;;) { const t = peek(); @@ -233,9 +234,11 @@ ${r.postfixToks.map(postfixArm).join('\n')} } return left; } -function ${r.name}_nud(): Node | null { +function ${r.name}_nud(minBp: number): Node | null { + _capped = false; const t = peek(); if (t === null) return null; +${r.nudCapped.map((c) => ` if (minBp < ${c.capBp}) { const save = pos; const kids: Cst[] = []; if (${c.steps.length ? c.steps.map(stepCond).join(' && ') : 'true'}) { _capped = true; return branch(${J(r.name)}, kids, save); } pos = save; }`).join('\n')} ${tplNud} if (${r.name}_ATOM.has(t.kind)) { pos++; return { rule: ${J(r.name)}, children: [{ tokenType: t.kind, offset: t.off, end: t.end }], offset: t.off, end: t.end }; } ${r.nudBrackets.map(bracketNud).join('\n')} const pbp = ${r.name}_PRE[t.text]; @@ -287,6 +290,7 @@ ${lexer(ir)} let toks: Tok[] = []; let pos = 0; +let _capped = false; function peek(): Tok | null { return pos < toks.length ? toks[pos] : null; } function branch(rule: string, kids: Cst[], save: number): Node { const offset = kids.length > 0 ? kids[0].offset : (save < toks.length ? 
toks[save].off : 0); @@ -320,7 +324,11 @@ function opt(body: () => boolean, kids: Cst[]): boolean { } function sepBy(elem: () => boolean, delim: string, kids: Cst[]): boolean { if (!elem()) return false; - for (;;) { const sp = pos; const before = kids.length; if (matchLit(delim, '$punct', kids) && elem()) continue; pos = sp; kids.length = before; break; } + for (;;) { + const sp = pos; const before = kids.length; + if (!matchLit(delim, '$punct', kids)) { pos = sp; kids.length = before; break; } + if (!elem()) break; // a trailing delimiter is allowed — keep the pushed delim and stop + } return true; } function altLit(opts: [string, string][], kids: Cst[]): boolean { diff --git a/test/portable-targets.ts b/test/portable-targets.ts index e9cfc59..71a28fd 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -142,6 +142,18 @@ const CASES: Case[] = [ ], reject: ['return\n1;', 'return\nx;', 'return /*\n*/ 1;', 'return // c\n 1;'], }, + { + // capBelow (assignment-level) arrow functions: a NUD parsed only when minBp < the + // connector's bp, admitting NO led once parsed; the `(x) => y` vs `(x)` ambiguity is + // resolved by longest-match ordering (the arrow is tried first, falls back to grouping). 
+ grammar: 'arrowjs', path: '../examples/arrowjs.ts', + accept: [ + 'x => x;', '(a, b) => a + b;', '() => {};', 'x = (() => 1);', 'f(() => 1, 2);', + '(x);', 'a + b;', 'x => y => x;', '(() => 2);', '(a) => a;', 'x = y => y;', 'foo();', + '(a,) => b;', '(a, b,) => a;', // trailing comma in params (sep allows a trailing delimiter) + ], + reject: ['=> x;', 'x => ;', '1 + () => 2;', '(,) => b;'], + }, ]; const sortKeys = (o: unknown): unknown => From 395ba5113d1f56668f96d9a376d9f511c90967a9 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 04:14:55 +0800 Subject: [PATCH 18/27] emit-portable: a transparent group degrades to a `seq` step MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A `group` whose body is a multi-item sequence (e.g. a ctxMode group wrapping a sequence) previously threw "group must reduce to a single step". Since ctxMode is transparent to the portable parser and a `seq` step already exists, a transparent group now degrades to a single `seq` step (or its sole step when the body is one); only a no-`in` `suppress` group is still deferred. Both stepOf and stepOfPratt. No new behaviour to verify beyond the existing seq step (seqjs) — full suite 42/42, no regression. javascript.ts clears the multi-step group and advances to the next construct, the no-`in` `suppress` context. --- src/emit-portable.ts | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/emit-portable.ts b/src/emit-portable.ts index 12751b0..91afac1 100644 --- a/src/emit-portable.ts +++ b/src/emit-portable.ts @@ -142,7 +142,11 @@ function buildIR(grammar: CstGrammar): ParserIR { switch (e.type) { case 'literal': return { t: 'lit', value: e.value, ttype: litTtype(e.value) }; case 'ref': return tokenNames.has(e.name) ? 
{ t: 'tok', name: e.name } : { t: 'rule', name: e.name }; - case 'group': { const ss = altSteps(e.body); if (ss.length !== 1) throw new Error('portable: group must reduce to a single step'); return ss[0]; } + case 'group': { // transparent (ctxMode is invisible to the portable parser); only no-in `suppress` is deferred + if (e.suppress) throw new Error('portable: group with suppress (no-in context) not yet in scope'); + const ss = altSteps(e.body); + return ss.length === 1 ? ss[0] : { t: 'seq', steps: ss }; + } case 'not': return { t: 'not', steps: altSteps(e.body) }; // zero-width negative lookahead case 'sameLine': return { t: 'sameLine' }; // zero-width no-newline assertion case 'seq': return { t: 'seq', steps: e.items.map(stepOf) }; // grouped sub-sequence (star/sep body) @@ -316,8 +320,10 @@ function buildPratt( if (e.type === 'seq') return { t: 'seq', steps: e.items.map(stepOfPratt) }; if (e.type === 'sameLine') return { t: 'sameLine' }; if (e.type === 'not') return { t: 'not', steps: (e.body.type === 'seq' ? e.body.items : [e.body]).map(stepOfPratt) }; - // ctxMode (await/yield) is transparent to the portable parser (no fork); unwrap a non-seq group. - if (e.type === 'group' && !e.capBelow && !e.suppress && e.body.type !== 'seq') return stepOfPratt(e.body); + // ctxMode (await/yield) is transparent to the portable parser (no fork); unwrap the group. + if (e.type === 'group' && !e.capBelow && !e.suppress) { + return e.body.type === 'seq' ? { t: 'seq', steps: e.body.items.map(stepOfPratt) } : stepOfPratt(e.body); + } if (e.type === 'sep') return { t: 'sep', elem: stepOfPratt(e.element), delim: e.delimiter }; if (e.type === 'quantifier' && e.kind === '?') return { t: 'opt', steps: (e.body.type === 'seq' ? 
e.body.items : [e.body]).map(stepOfPratt) }; if (e.type === 'quantifier' && e.kind === '*') return { t: 'star', step: stepOfPratt(e.body) }; From 544e277f716b8222a150f1b4f6ad252ff71ea40f Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 04:23:42 +0800 Subject: [PATCH 19/27] emit-portable: precedence-gated mixfix LEDs (ternary + chain-rhs in/instanceof) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The portable parser's mixfix leds bound maximally tight — fine for access tails (`.`/`(`/`[`) but wrong for a precedence-carrying led like the ternary `? :` (`a == b ? c : d` must group as `(a == b) ? c : d`). The led loop now gates such a led by its lbp (from the grammar's ledPrec): bind only when lbp > minBp. And a chain-rhs led (`in`/`instanceof`) parses its trailing self-operand at the level's bp via a new `ruleBp` step, so `a in b in c` left-chains as `(a in b) in c`. Both derive from analyzeGrammar's ledPrecByConnector — single-sourced with the interpreter. examples/ledjs.ts verifies it across ts/go/rust — 11/11 accept (ternary below the operators, right-associative `a ? b : c ? d : e`, chain-rhs `in`), 4/4 reject, byte-identical to createParser. Full suite 42/42. This is the precedence foundation the no-`in` (suppress) context builds on next. --- examples/ledjs.ts | 46 ++++++++++++++++++++++++++++++++++++++++ src/emit-portable.ts | 14 +++++++++--- src/target-go.ts | 5 +++-- src/target-rust.ts | 6 ++++-- src/target-ts.ts | 8 ++++--- test/portable-targets.ts | 11 ++++++++++ 6 files changed, 80 insertions(+), 10 deletions(-) create mode 100644 examples/ledjs.ts diff --git a/examples/ledjs.ts b/examples/ledjs.ts new file mode 100644 index 0000000..f13831a --- /dev/null +++ b/examples/ledjs.ts @@ -0,0 +1,46 @@ +// Exercises precedence-gated mixfix LEDs: the ternary `? :` (a led that binds LOOSER than the +// operators, so `a == b ? c : d` groups as `(a == b) ? 
c : d`) and `in`/`instanceof` (chain-rhs +// leds at the relational level — `a in b in c` left-chains as `(a in b) in c`). Both need the +// led-precedence gate the portable parser previously lacked (its mixfix leds bound maximally tight). +import { + token, rule, defineGrammar, left, right, op, + seq, oneOf, range, star, many, +} from '../src/api.ts'; + +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(range('0', '9'), star(range('0', '9'))), { scope: 'constant.numeric' }); + +const jsPrec = [ + right('='), + left('||'), + left('==', '!='), + left('<', '>'), + left('+', '-'), + left('*', '/'), +]; + +const Expr = rule(($) => [ + Number_, Ident, + ['(', $, ')'], + [$, op, $], + [$, '?', $, ':', $], // ternary (binds below `||`) + [$, 'in', $], // relational chain-rhs + [$, 'instanceof', $], +]); +const Stmt = rule(($) => [[Expr, ';']]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'ledjs', + scopeName: 'source.ledjs', + tokens: { Ident, Number: Number_ }, + prec: jsPrec, + ledPrec: [ + { connector: '?', below: '||' }, + { connector: 'in', sameAs: '<', chainRhs: true }, + { connector: 'instanceof', sameAs: '<', chainRhs: true }, + ], + rules: { Expr, Stmt, Program }, +}); diff --git a/src/emit-portable.ts b/src/emit-portable.ts index 91afac1..3e9fce4 100644 --- a/src/emit-portable.ts +++ b/src/emit-portable.ts @@ -46,6 +46,7 @@ export type Step = | { t: 'lit'; value: string; ttype: '$keyword' | '$punct' } // match a literal by text | { t: 'tok'; name: string } // match a token kind | { t: 'rule'; name: string } // call a rule, append its node + | { t: 'ruleBp'; name: string; bp: number } // call a Pratt rule at a given binding power (chain-rhs led trailing operand) | { t: 'star'; step: Step } // repeat inner 
0+ | { t: 'opt'; steps: Step[] } // optional sub-sequence | { t: 'sep'; elem: Step; delim: string } // elem (delim elem)* @@ -69,6 +70,7 @@ export type PrattRule = { binary: Array<{ op: string; lbp: number; rbp: number }>; // LED: infix op, bind iff lbp > minBp, rhs at rbp leds: Bracket[]; // LED: mixfix continuation (call/member/index), tried before operators ledAccessTail: boolean[]; // parallel to leds: a "closed punct-connector" tail (member/call/index) — disabled once a postfix binds + ledLbp: Array<number | null>; // parallel to leds: precedence gate (ternary/in/instanceof) — bind only when lbp > minBp; null = bind maximally tight postfixToks: string[]; // LED: a postfix token `$ X` (e.g. a tagged template), tried like a mixfix led (also an access tail) postfix: Array<{ op: string; lbp: number }>; // LED: a postfix operator `$ ++` — binds iff lbp > minBp + !tailClosed, no rhs, closes the tail }; @@ -268,6 +270,7 @@ function buildPratt( let sawPrefix = false, sawBinary = false, sawPostfix = false; const leds: Bracket[] = []; const ledAccessTail: boolean[] = []; + const ledLbp: Array<number | null> = []; const postfixToks: string[] = []; for (const alt of alts) { const items = alt.type === 'seq' ? 
alt.items : [alt]; @@ -303,12 +306,17 @@ function buildPratt( if (rest[0].type === 'op') { sawBinary = true; continue; } if (rest[0].type === 'postfix') { sawPostfix = true; continue; } // postfix operator (`x++`) if (rest[0].type === 'literal') { + const conn = rest[0].value; + const prec = a.ledPrecByConnector.get(conn); // { lbp, rhsBp } for ternary/in/instanceof const steps = rest.map((it) => stepOfPratt(it)); const last = steps[steps.length - 1]; const lastIsOperand = last !== undefined && last.t === 'rule' && last.name === name; // open binary/ternary operand - const wordConnector = /^[A-Za-z]/.test(rest[0].value); // `in`/`instanceof`/`as` — not a tail - leds.push({ first: rest[0].value, steps }); + // chain-rhs (`in`/`instanceof`): the trailing self-operand parses at the level's bp (left-chain). + if (prec && prec.rhsBp !== null && lastIsOperand) steps[steps.length - 1] = { t: 'ruleBp', name, bp: prec.rhsBp }; + const wordConnector = /^[A-Za-z]/.test(conn); // `in`/`instanceof`/`as` — not a tail + leds.push({ first: conn, steps }); ledAccessTail.push(!lastIsOperand && !wordConnector); + ledLbp.push(prec ? prec.lbp : null); continue; } if (rest.length === 1 && rest[0].type === 'ref' && a.tokenNames.has(rest[0].name)) { postfixToks.push(rest[0].name); continue; } // postfix token (tagged template) @@ -337,5 +345,5 @@ function buildPratt( const postfix = sawPostfix ? 
[...a.opTable.entries()].filter(([, info]) => info.position === 'postfix').map(([op, info]) => ({ op, lbp: info.lbp })) : []; - return { kind: 'pratt', name, nudToks, nudBrackets, nudSeqs, nudCapped, prefix, binary, leds, ledAccessTail, postfixToks, postfix }; + return { kind: 'pratt', name, nudToks, nudBrackets, nudSeqs, nudCapped, prefix, binary, leds, ledAccessTail, ledLbp, postfixToks, postfix }; } diff --git a/src/target-go.ts b/src/target-go.ts index 6729ec3..428bea6 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -169,6 +169,7 @@ function stepCond(s: Step): string { case 'lit': return `matchLit(${J(s.value)}, ${J(s.ttype)})`; case 'tok': return `matchTok(${J(s.name)})`; case 'rule': return `callRule(parse${s.name})`; + case 'ruleBp': return `callRule(func() int32 { return ${s.name}bp(${s.bp}) })`; case 'star': return `star(func() bool { return ${stepCond(s.step)} })`; case 'opt': return `opt(func() bool { return ${s.steps.map(stepCond).join(' && ')} })`; case 'sep': return `sepBy(func() bool { return ${stepCond(s.elem)} }, ${J(s.delim)})`; @@ -208,7 +209,7 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { \t\tif ${b.steps.map(stepCond).join(' && ')} { return finish(${J(r.name)}, sb, t.Off) } \t\tpos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return -1 \t}`; - const ledArm = (b: Bracket, accessTail: boolean) => `\t\tif ${accessTail ? '!tailClosed && ' : ''}t.Text == ${J(b.first)} { + const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null) => `\t\tif ${accessTail ? '!tailClosed && ' : ''}${lbp !== null ? 
`${lbp} > minBp && ` : ''}t.Text == ${J(b.first)} { \t\t\tledSave := pos; sb := len(scratch); nb := len(nodes); kb := len(kids) \t\t\tscratch = append(scratch, left) \t\t\tif ${b.steps.map(stepCond).join(' && ')} { left = finish(${J(r.name)}, sb, nodes[left].Offset); continue } @@ -239,7 +240,7 @@ func ${r.name}bp(minBp int) int32 { \tfor { \t\tt := peek() \t\tif t == nil { break } -${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i])).join('\n')} +${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i], r.ledLbp[i])).join('\n')} ${r.postfixToks.map(postfixArm).join('\n')} \t\tif post, ok := ${r.name}POST[t.Text]; ok && !tailClosed && post > minBp { \t\t\tsb := len(scratch); scratch = append(scratch, left, mkLeaf("$operator", t.Off, t.End)); pos++; tailClosed = true diff --git a/src/target-rust.ts b/src/target-rust.ts index a81c3f5..adbc530 100644 --- a/src/target-rust.ts +++ b/src/target-rust.ts @@ -177,6 +177,7 @@ function stepCond(s: Step): string { case 'lit': return `self.match_lit(${J(s.value)}, ${J(s.ttype)}, &mut kids)`; case 'tok': return `self.match_tok(${J(s.name)}, &mut kids)`; case 'rule': return `self.call_rule(Parser::parse_${s.name}, &mut kids)`; + case 'ruleBp': return `self.call_rule(|p| p.${s.name}_bp(${s.bp}), &mut kids)`; case 'star': return `self.star(|p, k| ${stepCondP(s.step)}, &mut kids)`; case 'opt': return `self.opt(|p, k| ${s.steps.map(stepCondP).join(' && ')}, &mut kids)`; case 'sep': return `self.sep_by(|p, k| ${stepCondP(s.elem)}, ${J(s.delim)}, &mut kids)`; @@ -202,6 +203,7 @@ function stepCondP(s: Step): string { case 'lit': return `p.match_lit(${J(s.value)}, ${J(s.ttype)}, k)`; case 'tok': return `p.match_tok(${J(s.name)}, k)`; case 'rule': return `p.call_rule(Parser::parse_${s.name}, k)`; + case 'ruleBp': return `p.call_rule(|p| p.${s.name}_bp(${s.bp}), k)`; case 'star': return `p.star(|p, k| ${stepCondP(s.step)}, k)`; case 'opt': return `p.opt(|p, k| ${s.steps.map(stepCondP).join(' && ')}, k)`; case 'sep': return `p.sep_by(|p, k| 
${stepCondP(s.elem)}, ${J(s.delim)}, k)`; @@ -237,7 +239,7 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { if ${b.steps.map(stepCond).join(' && ')} { return Some(node(${J(r.name)}, kids)); } self.pos = save; return None; }`; - const ledArm = (b: Bracket, accessTail: boolean) => ` if ${accessTail ? '!tail_closed && ' : ''}t.text == ${J(b.first)} { + const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null) => ` if ${accessTail ? '!tail_closed && ' : ''}${lbp !== null ? `${lbp} > min_bp && ` : ''}t.text == ${J(b.first)} { let led_save = self.pos; let mut kids: Vec = Vec::new(); if ${b.steps.map(stepCond).join(' && ')} { let mut full = vec![left]; full.append(&mut kids); @@ -262,7 +264,7 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { let mut tail_closed = false; loop { let t = match self.peek() { Some(t) => t, None => break }; -${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i])).join('\n')} +${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i], r.ledLbp[i])).join('\n')} ${r.postfixToks.map(postfixArm).join('\n')} if let Some(plbp) = Parser::${r.name}_post(t.text) { if !tail_closed && plbp > min_bp { self.pos += 1; let op_leaf = Cst::leaf("$operator", t.off, t.end); left = node(${J(r.name)}, vec![left, op_leaf]); tail_closed = true; continue; } } let (lbp, rbp) = match Parser::${r.name}_bin(t.text) { Some(x) => x, None => break }; diff --git a/src/target-ts.ts b/src/target-ts.ts index 0be1cb8..216a0c8 100644 --- a/src/target-ts.ts +++ b/src/target-ts.ts @@ -160,6 +160,7 @@ function stepCond(s: Step): string { case 'lit': return `matchLit(${J(s.value)}, ${J(s.ttype)}, kids)`; case 'tok': return `matchTok(${J(s.name)}, kids)`; case 'rule': return `callRule(parse${s.name}, kids)`; + case 'ruleBp': return `callRule(() => ${s.name}_bp(${s.bp}), kids)`; case 'star': return `star(() => ${stepCond(s.step)}, kids)`; case 'opt': return `opt(() => ${s.steps.map(stepCond).join(' && ')}, kids)`; case 'sep': return `sepBy(() => 
${stepCond(s.elem)}, ${J(s.delim)}, kids)`; @@ -193,8 +194,9 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { if (${b.steps.map(stepCond).join(' && ')}) return node(${J(r.name)}, kids); pos = save; return null; }`; - // Access-tail leds (member/call/index) are disabled once a postfix has closed the operand. - const ledArm = (b: Bracket, accessTail: boolean) => ` if (${accessTail ? '!tailClosed && ' : ''}t.text === ${J(b.first)}) { + // Access-tail leds (member/call/index) are disabled once a postfix has closed the operand; + // a precedence-gated led (ternary/in/instanceof) binds only when its lbp > minBp. + const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null) => ` if (${accessTail ? '!tailClosed && ' : ''}${lbp !== null ? `${lbp} > minBp && ` : ''}t.text === ${J(b.first)}) { const ledSave = pos; const kids: Cst[] = [left]; if (${b.steps.map(stepCond).join(' && ')}) { left = node(${J(r.name)}, kids); continue; } pos = ledSave; break; @@ -219,7 +221,7 @@ function ${r.name}_bp(minBp: number): Node | null { for (;;) { const t = peek(); if (t === null) break; -${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i])).join('\n')} +${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i], r.ledLbp[i])).join('\n')} ${r.postfixToks.map(postfixArm).join('\n')} const post = ${r.name}_POST[t.text]; if (!tailClosed && post !== undefined && post > minBp) { pos++; const opLeaf: Leaf = { tokenType: '$operator', offset: t.off, end: t.end }; left = { rule: ${J(r.name)}, children: [left, opLeaf], offset: left.offset, end: t.end }; tailClosed = true; continue; } diff --git a/test/portable-targets.ts b/test/portable-targets.ts index 71a28fd..c41234d 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -154,6 +154,17 @@ const CASES: Case[] = [ ], reject: ['=> x;', 'x => ;', '1 + () => 2;', '(,) => b;'], }, + { + // Precedence-gated mixfix LEDs: ternary `? 
:` (binds below the operators) and the + // chain-rhs relational leds `in`/`instanceof` (`a in b in c` left-chains). + grammar: 'ledjs', path: '../examples/ledjs.ts', + accept: [ + 'a == b ? c : d;', 'a ? b : c ? d : e;', 'a + b ? c : d - e;', 'a in b;', + 'a in b in c;', 'x instanceof Y;', 'a < b in c;', '1 + 2 * 3 ? 4 : 5;', + '(a ? b : c) + d;', 'a in b ? c : d;', 'a = b ? c : d;', + ], + reject: ['a ? b;', 'a ? : c;', 'in b;', 'a instanceof;'], + }, ]; const sortKeys = (o: unknown): unknown => From ba158c0e61bc4fe3180bef95c77fe305848e4655 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 04:50:34 +0800 Subject: [PATCH 20/27] =?UTF-8?q?emit-portable:=20no-in=20suppress,=20+-qu?= =?UTF-8?q?antifier,=20sep/bracket=20fixes=20=E2=80=94=20javascript.ts=20n?= =?UTF-8?q?ow=20EMITS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A run of constructs that together take the real javascript.ts grammar through the whole portable emitter end-to-end: - no-`in` (suppress) context: a `for (binding in iterable)` head parses its binding with the `in` led disabled (examples/noinjs.ts, 9/9+4/4 ×3). Threads a suppressed-connector set consumed per led loop. - one-or-more `+` quantifier (`x+` = `x x*`) — the last buildIR throw; with it, javascript.ts EMITS in all three targets. - Two latent `sep` bugs, both exposed only by the real grammar (earlier grammars wrapped sep in opt or never tested the shapes — the aggregate passed for the wrong reason): gen-parser's sep is `(element (delim element)*)?`, i.e. the WHOLE list is optional (empty `f()` valid) AND a trailing delimiter is allowed. sepBy now matches. - A NUD bracket that fails now FALLS THROUGH to the next same-first-token alternative instead of returning null — javascript has four `new`-led NUDs. 
Result: javascript.ts emits, compiles and runs in ts/go/rust, and is byte-identical to createParser on basic JS (var/function/arrow/ternary/member-call/for-in/while/if/class/ new/template/regex/instanceof/try/switch) — 23/24 in TS, the one miss a `new a.b()` NewTarget member-constructor CST shape. The await/yield fork (async/await) and that new-expression edge remain. Full suite 42/42; existing gate unaffected by the shared sep/bracket fixes. --- examples/noinjs.ts | 35 +++++++++++++++++++++++++++++++++++ src/emit-portable.ts | 19 +++++++++---------- src/target-go.ts | 9 ++++++--- src/target-rust.ts | 18 +++++++++++------- src/target-ts.ts | 9 ++++++--- test/portable-targets.ts | 11 +++++++++++ 6 files changed, 78 insertions(+), 23 deletions(-) create mode 100644 examples/noinjs.ts diff --git a/examples/noinjs.ts b/examples/noinjs.ts new file mode 100644 index 0000000..bc413ad --- /dev/null +++ b/examples/noinjs.ts @@ -0,0 +1,35 @@ +// Exercises the no-`in` (suppress) context. In a `for (binding in iterable)` head, the +// binding is parsed with the `in` LED DISABLED — `exclude('in', Expr)` — so the `in` belongs +// to the for-head, not to a relational expression inside the binding. Outside a for-head, `in` +// binds normally. The portable parser threads a suppressed-connector set into the led loop. 
+import { + token, rule, defineGrammar, left, op, exclude, + seq, oneOf, range, star, many, +} from '../src/api.ts'; + +const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); +const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); +const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' }); +const Number_ = token(seq(range('0', '9'), star(range('0', '9'))), { scope: 'constant.numeric' }); + +const jsPrec = [left('||'), left('<', '>'), left('+', '-')]; + +const Expr = rule(($) => [ + Number_, Ident, + ['(', $, ')'], + [$, op, $], + [$, 'in', $], + [$, '.', Ident], +]); +const ForHead = rule(($) => [['for', '(', exclude('in', Expr), 'in', Expr, ')', Stmt]]); +const Stmt = rule(($) => [ForHead, [Expr, ';']]); +const Program = rule(($) => [many(Stmt)]); + +export default defineGrammar({ + name: 'noinjs', + scopeName: 'source.noinjs', + tokens: { Ident, Number: Number_ }, + prec: jsPrec, + ledPrec: [{ connector: 'in', sameAs: '<', chainRhs: true }], + rules: { Expr, ForHead, Stmt, Program }, +}); diff --git a/src/emit-portable.ts b/src/emit-portable.ts index 3e9fce4..dffa5f5 100644 --- a/src/emit-portable.ts +++ b/src/emit-portable.ts @@ -54,7 +54,8 @@ export type Step = | { t: 'alt'; branches: Step[][] } // inline alternation of sub-sequences (backtracking) | { t: 'not'; steps: Step[] } // zero-width negative lookahead (consumes nothing) | { t: 'seq'; steps: Step[] } // a grouped sub-sequence (e.g. 
a star body `(',' Expr)`) - | { t: 'sameLine' }; // zero-width: the next token is on the same line (no preceding newline) + | { t: 'sameLine' } // zero-width: the next token is on the same line (no preceding newline) + | { t: 'suppress'; connectors: string[]; steps: Step[] }; // parse the body with these LED connectors disabled (no-`in` context) export type Alt = Step[]; export type RdRule = { kind: 'rd'; name: string; alts: Alt[] }; @@ -144,9 +145,9 @@ function buildIR(grammar: CstGrammar): ParserIR { switch (e.type) { case 'literal': return { t: 'lit', value: e.value, ttype: litTtype(e.value) }; case 'ref': return tokenNames.has(e.name) ? { t: 'tok', name: e.name } : { t: 'rule', name: e.name }; - case 'group': { // transparent (ctxMode is invisible to the portable parser); only no-in `suppress` is deferred - if (e.suppress) throw new Error('portable: group with suppress (no-in context) not yet in scope'); + case 'group': { // transparent (ctxMode is invisible to the portable parser) const ss = altSteps(e.body); + if (e.suppress && e.suppress.length) return { t: 'suppress', connectors: e.suppress, steps: ss }; // no-`in` context return ss.length === 1 ? 
ss[0] : { t: 'seq', steps: ss }; } case 'not': return { t: 'not', steps: altSteps(e.body) }; // zero-width negative lookahead @@ -156,7 +157,7 @@ function buildIR(grammar: CstGrammar): ParserIR { case 'quantifier': if (e.kind === '*') return { t: 'star', step: stepOf(e.body) }; if (e.kind === '?') return { t: 'opt', steps: altSteps(e.body) }; - if (e.kind === '+') throw new Error("portable: '+' not yet modeled (use '*')"); + if (e.kind === '+') return { t: 'seq', steps: [stepOf(e.body), { t: 'star', step: stepOf(e.body) }] }; // x+ = x x* break; case 'alt': { if (e.items.every((it) => it.type === 'literal')) { // fast path: all-literal alternation @@ -289,15 +290,11 @@ function buildPratt( continue; } if (items[0].type === 'literal') { nudBrackets.push({ first: items[0].value, steps: items.map((it) => stepOfPratt(it)) }); continue; } - // A single transparent group unwraps to its body (an explicit grouping of the NUD sequence). + // A single transparent (non-suppress) group unwraps to its body (an explicit grouping). let nudItems = items; if (items.length === 1 && items[0].type === 'group' && !items[0].suppress) { nudItems = items[0].body.type === 'seq' ? items[0].body.items : [items[0].body]; } - // A no-`in`/suppress group is a deeper construct — defer. - if (nudItems.some((it) => it.type === 'group' && it.suppress)) { - throw new Error(`portable: Pratt NUD with suppress (no-in context) not yet in scope (rule ${name})`); - } nudSeqs.push(nudItems.map((it) => stepOfPratt(it))); // general NUD sequence (guarded ident, class expr) continue; } @@ -328,13 +325,15 @@ function buildPratt( if (e.type === 'seq') return { t: 'seq', steps: e.items.map(stepOfPratt) }; if (e.type === 'sameLine') return { t: 'sameLine' }; if (e.type === 'not') return { t: 'not', steps: (e.body.type === 'seq' ? e.body.items : [e.body]).map(stepOfPratt) }; + if (e.type === 'group' && e.suppress && e.suppress.length) return { t: 'suppress', connectors: e.suppress, steps: (e.body.type === 'seq' ? 
e.body.items : [e.body]).map(stepOfPratt) }; // ctxMode (await/yield) is transparent to the portable parser (no fork); unwrap the group. - if (e.type === 'group' && !e.capBelow && !e.suppress) { + if (e.type === 'group' && !e.capBelow) { return e.body.type === 'seq' ? { t: 'seq', steps: e.body.items.map(stepOfPratt) } : stepOfPratt(e.body); } if (e.type === 'sep') return { t: 'sep', elem: stepOfPratt(e.element), delim: e.delimiter }; if (e.type === 'quantifier' && e.kind === '?') return { t: 'opt', steps: (e.body.type === 'seq' ? e.body.items : [e.body]).map(stepOfPratt) }; if (e.type === 'quantifier' && e.kind === '*') return { t: 'star', step: stepOfPratt(e.body) }; + if (e.type === 'quantifier' && e.kind === '+') return { t: 'seq', steps: [stepOfPratt(e.body), { t: 'star', step: stepOfPratt(e.body) }] }; if (e.type === 'literal') return { t: 'lit', value: e.value, ttype: litTtype(e.value) }; return stepOf(e); } diff --git a/src/target-go.ts b/src/target-go.ts index 428bea6..2fad500 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -178,6 +178,7 @@ function stepCond(s: Step): string { case 'not': return `func() bool { save := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); m := ${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'}; pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return !m }()`; case 'seq': return `(${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'})`; case 'sameLine': return `func() bool { t := peek(); return t != nil && !t.Nl }()`; + case 'suppress': return `func() bool { _suppressNext = map[string]bool{${s.connectors.map((c) => `${J(c)}: true`).join(', ')}}; _r := (${s.steps.length ? 
s.steps.map(stepCond).join(' && ') : 'true'}); _suppressNext = nil; return _r }()`; } } @@ -207,9 +208,9 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { const bracketNud = (b: Bracket) => `\tif t.Text == ${J(b.first)} { \t\tsave := pos; sb := len(scratch); nb := len(nodes); kb := len(kids) \t\tif ${b.steps.map(stepCond).join(' && ')} { return finish(${J(r.name)}, sb, t.Off) } -\t\tpos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return -1 +\t\tpos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] \t}`; - const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null) => `\t\tif ${accessTail ? '!tailClosed && ' : ''}${lbp !== null ? `${lbp} > minBp && ` : ''}t.Text == ${J(b.first)} { + const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null) => `\t\tif ${accessTail ? '!tailClosed && ' : ''}${lbp !== null ? `${lbp} > minBp && ` : ''}!_mySup[${J(b.first)}] && t.Text == ${J(b.first)} { \t\t\tledSave := pos; sb := len(scratch); nb := len(nodes); kb := len(kids) \t\t\tscratch = append(scratch, left) \t\t\tif ${b.steps.map(stepCond).join(' && ')} { left = finish(${J(r.name)}, sb, nodes[left].Offset); continue } @@ -233,6 +234,7 @@ var ${r.name}POST = map[string]int{${post}} var ${r.name}ATOM = map[string]bool{${atoms}} func parse${r.name}() int32 { return ${r.name}bp(0) } func ${r.name}bp(minBp int) int32 { +\t_mySup := _suppressNext; _suppressNext = nil; _ = _mySup \tleft := ${r.name}nud(minBp) \tif left < 0 { return -1 } \tif _capped { return left } @@ -332,6 +334,7 @@ type bp struct{ lbp, rbp int } var toks []Tok var pos int var _capped bool +var _suppressNext map[string]bool var nodes []Node var kids []int32 var scratch []int32 @@ -379,7 +382,7 @@ func opt(body func() bool) bool { \tsp := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if !body() { pos = sp; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] }; return true } func sepBy(elem func() bool, delim string) 
bool { -\tif !elem() { return false } +\tif !elem() { return true } // the whole separated list is optional — zero elements is valid \tfor { \t\tsp := pos; sb := len(scratch); nb := len(nodes); kb := len(kids) \t\tif !matchLit(delim, "$punct") { pos = sp; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; break } diff --git a/src/target-rust.ts b/src/target-rust.ts index adbc530..a05f71b 100644 --- a/src/target-rust.ts +++ b/src/target-rust.ts @@ -186,6 +186,7 @@ function stepCond(s: Step): string { case 'not': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${notBody(s.steps)} })(self, &mut kids)`; case 'seq': return `(${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'})`; case 'sameLine': return `matches!(self.peek(), Some(t) if !t.nl)`; + case 'suppress': return `{ self.suppress_next = vec![${s.connectors.map(J).join(', ')}]; let _r = (${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'}); self.suppress_next = Vec::new(); _r }`; } } // A backtracking inline alternation rendered as an immediately-applied closure over (p, k), @@ -212,6 +213,7 @@ function stepCondP(s: Step): string { case 'not': return `(|p: &mut Parser<'a>, k: &mut Vec| -> bool { ${notBody(s.steps)} })(p, k)`; case 'seq': return `(${s.steps.length ? s.steps.map(stepCondP).join(' && ') : 'true'})`; case 'sameLine': return `matches!(p.peek(), Some(t) if !t.nl)`; + case 'suppress': return `{ p.suppress_next = vec![${s.connectors.map(J).join(', ')}]; let _r = (${s.steps.length ? 
s.steps.map(stepCondP).join(' && ') : 'true'}); p.suppress_next = Vec::new(); _r }`; } } @@ -237,9 +239,9 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { const bracketNud = (b: Bracket) => ` if t.text == ${J(b.first)} { let save = self.pos; let mut kids: Vec = Vec::new(); if ${b.steps.map(stepCond).join(' && ')} { return Some(node(${J(r.name)}, kids)); } - self.pos = save; return None; + self.pos = save; // fall through to the next NUD alternative }`; - const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null) => ` if ${accessTail ? '!tail_closed && ' : ''}${lbp !== null ? `${lbp} > min_bp && ` : ''}t.text == ${J(b.first)} { + const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null) => ` if ${accessTail ? '!tail_closed && ' : ''}${lbp !== null ? `${lbp} > min_bp && ` : ''}!my_sup.iter().any(|c| *c == ${J(b.first)}) && t.text == ${J(b.first)} { let led_save = self.pos; let mut kids: Vec = Vec::new(); if ${b.steps.map(stepCond).join(' && ')} { let mut full = vec![left]; full.append(&mut kids); @@ -259,6 +261,8 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { fn ${r.name}_post(op: &str) -> Option { match op { ${postArms}${postArms ? 
', ' : ''}_ => None } } fn ${r.name}_atom(kind: &str) -> bool { matches!(kind, ${atomArm || '""'}) } fn ${r.name}_bp(&mut self, min_bp: i64) -> Option { + let my_sup = std::mem::take(&mut self.suppress_next); + let _ = &my_sup; let mut left = self.${r.name}_nud(min_bp)?; if self.capped { return Some(left); } let mut tail_closed = false; @@ -342,7 +346,7 @@ fn node(rule: &'static str, kids: Vec) -> Cst { let o = kids[0].offset; let ${lexer(ir)} -struct Parser<'a> { toks: Vec>, pos: usize, capped: bool } +struct Parser<'a> { toks: Vec>, pos: usize, capped: bool, suppress_next: Vec<&'static str> } impl<'a> Parser<'a> { fn peek(&self) -> Option> { if self.pos < self.toks.len() { Some(self.toks[self.pos]) } else { None } } fn branch(&self, rule: &'static str, kids: Vec, save: usize) -> Cst { @@ -367,7 +371,7 @@ impl<'a> Parser<'a> { let sp = self.pos; let before = kids.len(); if !body(self, kids) { self.pos = sp; kids.truncate(before); } true } fn sep_by(&mut self, elem: fn(&mut Parser<'a>, &mut Vec) -> bool, delim: &str, kids: &mut Vec) -> bool { - if !elem(self, kids) { return false; } + if !elem(self, kids) { return true; } // the whole separated list is optional — zero elements is valid loop { let sp = self.pos; let before = kids.len(); if !self.match_lit(delim, "$punct", kids) { self.pos = sp; kids.truncate(before); break; } @@ -399,15 +403,15 @@ fn main() { // Self-bench: a numeric arg N times the lex+parse loop and prints ms/iteration. if let Some(iters) = std::env::args().nth(1).and_then(|a| a.parse::().ok()) { // black_box on the input + result so the optimizer can't elide the lex/parse. 
- for _ in 0..3 { let toks = lex(std::hint::black_box(&src)); let mut p = Parser { toks, pos: 0, capped: false }; std::hint::black_box(p.parse_${ir.entry}()); } + for _ in 0..3 { let toks = lex(std::hint::black_box(&src)); let mut p = Parser { toks, pos: 0, capped: false, suppress_next: Vec::new() }; std::hint::black_box(p.parse_${ir.entry}()); } let t = std::time::Instant::now(); - for _ in 0..iters { let toks = lex(std::hint::black_box(&src)); let mut p = Parser { toks, pos: 0, capped: false }; std::hint::black_box(p.parse_${ir.entry}()); } + for _ in 0..iters { let toks = lex(std::hint::black_box(&src)); let mut p = Parser { toks, pos: 0, capped: false, suppress_next: Vec::new() }; std::hint::black_box(p.parse_${ir.entry}()); } println!("{:.4}", t.elapsed().as_secs_f64() * 1000.0 / iters as f64); return; } let toks = lex(&src); let n = toks.len(); - let mut p = Parser { toks, pos: 0, capped: false }; + let mut p = Parser { toks, pos: 0, capped: false, suppress_next: Vec::new() }; match p.parse_${ir.entry}() { Some(root) if p.pos == n => { let mut out = String::new(); write_json(&root, &mut out); print!("{}", out); } _ => { eprintln!("parse error (pos {}/{})", p.pos, n); std::process::exit(1); } diff --git a/src/target-ts.ts b/src/target-ts.ts index 216a0c8..c30e81f 100644 --- a/src/target-ts.ts +++ b/src/target-ts.ts @@ -169,6 +169,7 @@ function stepCond(s: Step): string { case 'not': return `(() => { const sp = pos; const bk = kids.length; const m = ${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'}; pos = sp; kids.length = bk; return !m; })()`; case 'seq': return `(${s.steps.length ? s.steps.map(stepCond).join(' && ') : 'true'})`; case 'sameLine': return `(() => { const t = peek(); return t !== null && !t.nl; })()`; + case 'suppress': return `(() => { _suppressNext = new Set([${s.connectors.map(J).join(', ')}]); const _r = (${s.steps.length ? 
s.steps.map(stepCond).join(' && ') : 'true'}); _suppressNext = null; return _r; })()`; } } @@ -192,11 +193,11 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { const bracketNud = (b: Bracket) => ` if (t.text === ${J(b.first)}) { const save = pos; const kids: Cst[] = []; if (${b.steps.map(stepCond).join(' && ')}) return node(${J(r.name)}, kids); - pos = save; return null; + pos = save; // fall through to the next NUD alternative (e.g. another '${b.first}'-led form) }`; // Access-tail leds (member/call/index) are disabled once a postfix has closed the operand; // a precedence-gated led (ternary/in/instanceof) binds only when its lbp > minBp. - const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null) => ` if (${accessTail ? '!tailClosed && ' : ''}${lbp !== null ? `${lbp} > minBp && ` : ''}t.text === ${J(b.first)}) { + const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null) => ` if (${accessTail ? '!tailClosed && ' : ''}${lbp !== null ? `${lbp} > minBp && ` : ''}(_mySup === null || !_mySup.has(${J(b.first)})) && t.text === ${J(b.first)}) { const ledSave = pos; const kids: Cst[] = [left]; if (${b.steps.map(stepCond).join(' && ')}) { left = node(${J(r.name)}, kids); continue; } pos = ledSave; break; @@ -214,6 +215,7 @@ const ${r.name}_POST: Record = ${POST}; const ${r.name}_ATOM = ${atom}; function parse${r.name}(): Node | null { return ${r.name}_bp(0); } function ${r.name}_bp(minBp: number): Node | null { + const _mySup = _suppressNext; _suppressNext = null; // no-in: consume the suppressed-connector set for this led loop let left = ${r.name}_nud(minBp); if (left === null) return null; if (_capped) return left; // an assignment-level arrow admits no led @@ -293,6 +295,7 @@ ${lexer(ir)} let toks: Tok[] = []; let pos = 0; let _capped = false; +let _suppressNext: Set | null = null; function peek(): Tok | null { return pos < toks.length ? 
toks[pos] : null; } function branch(rule: string, kids: Cst[], save: number): Node { const offset = kids.length > 0 ? kids[0].offset : (save < toks.length ? toks[save].off : 0); @@ -325,7 +328,7 @@ function opt(body: () => boolean, kids: Cst[]): boolean { const sp = pos; const before = kids.length; if (!body()) { pos = sp; kids.length = before; } return true; } function sepBy(elem: () => boolean, delim: string, kids: Cst[]): boolean { - if (!elem()) return false; + if (!elem()) return true; // the whole separated list is optional — zero elements is valid for (;;) { const sp = pos; const before = kids.length; if (!matchLit(delim, '$punct', kids)) { pos = sp; kids.length = before; break; } diff --git a/test/portable-targets.ts b/test/portable-targets.ts index c41234d..fe52c60 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -165,6 +165,17 @@ const CASES: Case[] = [ ], reject: ['a ? b;', 'a ? : c;', 'in b;', 'a instanceof;'], }, + { + // The no-`in` (suppress) context: a `for (binding in iterable)` head parses its binding + // with the `in` led disabled, so `in` belongs to the for-head, not the binding. + grammar: 'noinjs', path: '../examples/noinjs.ts', + accept: [ + 'for (x in y) z;', 'x in y;', 'for (a.b in c) d;', 'a in b in c;', + 'for ((x) in y) z;', 'for (x in y) a in b;', 'for (x in a in b) z;', + '(a in b);', 'for (a in b) for (c in d) e;', + ], + reject: ['for (x y) z;', 'for x in y;', 'for (in y) z;', 'for (x in) z;'], + }, ]; const sortKeys = (o: unknown): unknown => From 0b6d7fddf97514e255dfbc8de3e61fae78ddac2a Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 05:26:48 +0800 Subject: [PATCH 21/27] emit-portable: the real javascript.ts grammar emits to ts/go/rust (issue #6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The target-agnostic emitter now handles a full language end-to-end. 
javascript.ts — 89 rules after the [Await]/[Yield] fork — emits, compiles and runs in all three targets, byte-identical to createParser, and is gate-maintained (28/28 accept, 6/6 reject ×3, ASCII corpus). What it took: - Left recursion: a left-recursive non-Pratt rule (NewTarget, TS Type) now routes through buildPratt (atom-then-continuation), fixing the infinite recursion a plain rd rule hit. - The [Await]/[Yield] context fork: emitPortableParser applies `withAwaitYield` exactly as createParser does, so `await`/`yield` are keywords in async/generator bodies and identifiers elsewhere — name-forked into $A/$Y/$AY families. - A forked rule labels its CST node with the CANON base name (cstName), not the $-suffixed family name; and the $ in family names (a valid TS but not Go/Rust identifier) is sanitized to `_` for the emitted parse-fn names. - Full JS whitespace (`\s`: NBSP/LS/PS/…), not just ASCII. - A leaked `_capped` flag: it is a global, but gen-parser's `capped` is local, so a grouping `(arrow)` leaked the cap to the outer expression and dropped a trailing call (`(() => {})()`). Non-capped NUD arms now force it false. - Two more `sep` shapes (empty list `f()`, both surfaced by the real grammar). ts/go/rust all 28/28 on the ASCII corpus (destructuring, generators, classes, optional chaining, async/await, labels). Byte-based go/rust use UTF-8 offsets — identical to the JS oracle for ASCII; non-ASCII offset units differ inherently. Full suite 42/42. --- src/emit-portable.ts | 48 +++++++++++++++++++++++++++++++++------- src/target-go.ts | 32 +++++++++++++++------------ src/target-rust.ts | 30 +++++++++++++++---------- src/target-ts.ts | 34 ++++++++++++++++------------ test/portable-targets.ts | 19 ++++++++++++++++ 5 files changed, 115 insertions(+), 48 deletions(-) diff --git a/src/emit-portable.ts b/src/emit-portable.ts index dffa5f5..a6a24eb 100644 --- a/src/emit-portable.ts +++ b/src/emit-portable.ts @@ -21,6 +21,7 @@ // operators. 
buildIR THROWS on a construct outside this set rather than emit a wrong // parser. This is enough to derive a real JavaScript-subset parser (examples/minijs.ts). import type { CstGrammar, RuleExpr, TokenDecl, TokenPattern } from './types.ts'; +import { withAwaitYield } from './await-yield-fork.ts'; import { analyzeGrammar, findEntryRule } from './grammar-analysis.ts'; import { collectLiterals, isKeywordLiteral } from './grammar-utils.ts'; import { @@ -58,11 +59,12 @@ export type Step = | { t: 'suppress'; connectors: string[]; steps: Step[] }; // parse the body with these LED connectors disabled (no-`in` context) export type Alt = Step[]; -export type RdRule = { kind: 'rd'; name: string; alts: Alt[] }; +export type RdRule = { kind: 'rd'; name: string; cstName: string; alts: Alt[] }; export type Bracket = { first: string; steps: Step[] }; // a literal-led sequence (grouping/array; LED call/index) export type PrattRule = { kind: 'pratt'; - name: string; + name: string; // the (possibly $A/$Y-forked) rule name — used for the parse fn names + cstName: string; // the CANON name — the CST node label (a fork collapses to its base) nudToks: string[]; // NUD: a bare token wrapped in a node nudBrackets: Bracket[]; // NUD: '(' … ')' / '[' … ']' nudSeqs: Step[][]; // NUD: a general sequence (guarded ident, class expr), tried with backtracking @@ -123,7 +125,10 @@ export interface Target { } export function emitPortableParser(grammar: CstGrammar, target: Target): string { - return target.render(buildIR(grammar)); + // Apply the [Await]/[Yield] context fork exactly as createParser does, so `await`/`yield` + // are keywords inside async/generator bodies and identifiers outside — name-forked into + // $A/$Y/$AY rule families. Every other consumer (and the portable parser) sees plain rules. 
+ return target.render(buildIR(withAwaitYield(grammar))); } // ── buildIR: grammar + analysis → the target-agnostic parse plan ── @@ -174,8 +179,12 @@ function buildIR(grammar: CstGrammar): ParserIR { } const rules: RuleIR[] = grammar.rules.map((r) => { - if (a.prattRules.has(r.name)) return buildPratt(r.name, r.body, a, stepOf, altSteps, litTtype); - return { kind: 'rd', name: r.name, alts: r.body.type === 'alt' ? r.body.items.map(altSteps) : [altSteps(r.body)] }; + const cstName = (r as { canon?: string }).canon ?? r.name; // a forked $A/$Y rule labels its CST node with the base name + // Pratt rules AND left-recursive non-Pratt rules (e.g. NewTarget, TS Type) both parse as + // atom-then-continuation: buildPratt detects `startsSelf` and splits accordingly, so routing + // left-recursive rules through it avoids the infinite left-recursion a plain rd rule would hit. + if (a.prattRules.has(r.name) || a.leftRecSet.has(r.name)) return buildPratt(r.name, cstName, r.body, a, stepOf, altSteps, litTtype); + return { kind: 'rd', name: r.name, cstName, alts: r.body.type === 'alt' ? r.body.items.map(altSteps) : [altSteps(r.body)] }; }); // Regex-vs-division context (only if the grammar declares a regex token + config). @@ -212,7 +221,30 @@ function buildIR(grammar: CstGrammar): ParserIR { }; } - return { grammarName: grammar.name ?? 'grammar', entry: findEntryRule(grammar), tokens, puncts, rules, regexCtx, tpl }; + // The [Await]/[Yield] fork names rules `Expr$A`/`Expr$Y` — `$` is a valid TS identifier but + // NOT a Go/Rust one. Sanitize every rule-IDENTIFIER use (`$`→`_`) for the emitted parse-fn + // names; the CST node label (cstName) keeps the canon base name, so the tree is unchanged. 
+ const san = (n: string) => n.replace(/\$/g, '_'); + const sanStep = (s: Step): void => { + if (s.t === 'rule' || s.t === 'ruleBp') s.name = san(s.name); + else if (s.t === 'star') sanStep(s.step); + else if (s.t === 'opt' || s.t === 'not' || s.t === 'seq' || s.t === 'suppress') s.steps.forEach(sanStep); + else if (s.t === 'sep') sanStep(s.elem); + else if (s.t === 'alt') s.branches.forEach((b) => b.forEach(sanStep)); + }; + for (const r of rules) { + r.name = san(r.name); + if (r.kind === 'rd') r.alts.forEach((alt) => alt.forEach(sanStep)); + else { + r.nudBrackets.forEach((b) => b.steps.forEach(sanStep)); + r.nudSeqs.forEach((seq) => seq.forEach(sanStep)); + r.nudCapped.forEach((c) => c.steps.forEach(sanStep)); + r.leds.forEach((b) => b.steps.forEach(sanStep)); + } + } + if (tpl) tpl.interpRule = san(tpl.interpRule); + + return { grammarName: grammar.name ?? 'grammar', entry: san(findEntryRule(grammar)), tokens, puncts, rules, regexCtx, tpl }; } // Classify a token: a fast-path shape (run/string/line/block) when one cleanly matches, @@ -259,7 +291,7 @@ function codesToRanges(codes: number[]): CharRange[] { // A Pratt rule's alternatives → NUD atoms/brackets/prefix + binary + mixfix LEDs. // Binding powers come from the analysis (opTable/prefixOps), single-sourced with the interpreter. function buildPratt( - name: string, body: RuleExpr, a: ReturnType, + name: string, cstName: string, body: RuleExpr, a: ReturnType, stepOf: (e: RuleExpr) => Step, altSteps: (e: RuleExpr) => Step[], litTtype: (v: string) => '$keyword' | '$punct', ): PrattRule { @@ -344,5 +376,5 @@ function buildPratt( const postfix = sawPostfix ? 
[...a.opTable.entries()].filter(([, info]) => info.position === 'postfix').map(([op, info]) => ({ op, lbp: info.lbp })) : []; - return { kind: 'pratt', name, nudToks, nudBrackets, nudSeqs, nudCapped, prefix, binary, leds, ledAccessTail, ledLbp, postfixToks, postfix }; + return { kind: 'pratt', name, cstName, nudToks, nudBrackets, nudSeqs, nudCapped, prefix, binary, leds, ledAccessTail, ledLbp, postfixToks, postfix }; } diff --git a/src/target-go.ts b/src/target-go.ts index 2fad500..0e58d86 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -154,8 +154,8 @@ ${emitHooks} \t_ = pendingNl ${rxState}${tplState}${emitFn}${pushTokFn}${defs.length ? '\t_s = src\n' : ''}\tfor pos < n { \t\tc := int(src[pos]) -\t\tif c == 32 || c == 9 { pos++; continue } -\t\tif c == 10 || c == 13 { pendingNl = true; pos++; continue } +\t\tif c == 10 || c == 13 || c == 8232 || c == 8233 { pendingNl = true; pos++; continue } +\t\tif c == 32 || c == 9 || c == 11 || c == 12 || c == 160 || c == 5760 || (c >= 8192 && c <= 8202) || c == 8239 || c == 8287 || c == 12288 || c == 65279 { pos++; continue } ${tplDispatch}${toks} ${puncts} \t\tpanic(fmt.Sprintf("lex error at %d", pos)) @@ -184,7 +184,7 @@ function stepCond(s: Step): string { function rdRule(r: RdRule): string { const alt = (steps: Step[]) => - `\tif ${steps.map(stepCond).join(' && ')} { return finish(${J(r.name)}, sb, offAt(save)) } + `\tif ${steps.map(stepCond).join(' && ')} { return finish(${J(r.cstName)}, sb, offAt(save)) } \tpos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]`; return `func parse${r.name}() int32 { \tsave := pos; sb := len(scratch); nb := len(nodes); kb := len(kids) @@ -199,7 +199,7 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { \t\tnode := matchTemplate() \t\tif node < 0 { return -1 } \t\tsb := len(scratch); scratch = append(scratch, node) -\t\treturn finish(${J(r.name)}, sb, nodes[node].Offset) +\t\treturn finish(${J(r.cstName)}, sb, nodes[node].Offset) \t}\n` : ''; 
const bin = r.binary.map((b) => `${J(b.op)}: {${b.lbp}, ${b.rbp}}`).join(', '); @@ -207,24 +207,24 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { const atoms = r.nudToks.map((k) => `${J(k)}: true`).join(', '); const bracketNud = (b: Bracket) => `\tif t.Text == ${J(b.first)} { \t\tsave := pos; sb := len(scratch); nb := len(nodes); kb := len(kids) -\t\tif ${b.steps.map(stepCond).join(' && ')} { return finish(${J(r.name)}, sb, t.Off) } +\t\tif ${b.steps.map(stepCond).join(' && ')} { return finish(${J(r.cstName)}, sb, t.Off) } \t\tpos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] \t}`; const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null) => `\t\tif ${accessTail ? '!tailClosed && ' : ''}${lbp !== null ? `${lbp} > minBp && ` : ''}!_mySup[${J(b.first)}] && t.Text == ${J(b.first)} { \t\t\tledSave := pos; sb := len(scratch); nb := len(nodes); kb := len(kids) \t\t\tscratch = append(scratch, left) -\t\t\tif ${b.steps.map(stepCond).join(' && ')} { left = finish(${J(r.name)}, sb, nodes[left].Offset); continue } +\t\t\tif ${b.steps.map(stepCond).join(' && ')} { left = finish(${J(r.cstName)}, sb, nodes[left].Offset); continue } \t\t\tpos = ledSave; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; break \t\t}`; const postfixArm = (tok: string) => { const tplPart = tpl && tok === tpl.token ? 
` \t\tif !tailClosed && t.Kind == "$templateHead" { \t\t\tnode := matchTemplate() -\t\t\tif node >= 0 { sb := len(scratch); scratch = append(scratch, left, node); left = finish(${J(r.name)}, sb, nodes[left].Offset); continue } +\t\t\tif node >= 0 { sb := len(scratch); scratch = append(scratch, left, node); left = finish(${J(r.cstName)}, sb, nodes[left].Offset); continue } \t\t}` : ''; return `\t\tif !tailClosed && t.Kind == ${J(tok)} { \t\t\tsb := len(scratch); scratch = append(scratch, left, mkLeaf(t.Kind, t.Off, t.End)); pos++ -\t\t\tleft = finish(${J(r.name)}, sb, nodes[left].Offset); continue +\t\t\tleft = finish(${J(r.cstName)}, sb, nodes[left].Offset); continue \t\t}${tplPart}`; }; const post = r.postfix.map((p) => `${J(p.op)}: ${p.lbp}`).join(', '); @@ -246,7 +246,7 @@ ${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i], r.ledLbp[i])).join('\n')} ${r.postfixToks.map(postfixArm).join('\n')} \t\tif post, ok := ${r.name}POST[t.Text]; ok && !tailClosed && post > minBp { \t\t\tsb := len(scratch); scratch = append(scratch, left, mkLeaf("$operator", t.Off, t.End)); pos++; tailClosed = true -\t\t\tleft = finish(${J(r.name)}, sb, nodes[left].Offset); continue +\t\t\tleft = finish(${J(r.cstName)}, sb, nodes[left].Offset); continue \t\t} \t\tinfo, ok := ${r.name}BIN[t.Text] \t\tif !ok || info.lbp <= minBp { break } @@ -256,7 +256,7 @@ ${r.postfixToks.map(postfixArm).join('\n')} \t\trhs := ${r.name}bp(info.rbp) \t\tif rhs < 0 { pos = ledSave; scratch = scratch[:sb]; break } \t\tscratch = append(scratch, rhs) -\t\tleft = finish(${J(r.name)}, sb, nodes[left].Offset) +\t\tleft = finish(${J(r.cstName)}, sb, nodes[left].Offset) \t} \treturn left } @@ -264,10 +264,11 @@ func ${r.name}nud(minBp int) int32 { \t_capped = false \tt := peek() \tif t == nil { return -1 } -${r.nudCapped.map((c) => `\tif minBp < ${c.capBp} { save := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if ${c.steps.length ? 
c.steps.map(stepCond).join(' && ') : 'true'} { _capped = true; return finish(${J(r.name)}, sb, offAt(save)) }; pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] }`).join('\n')} +${r.nudCapped.map((c) => `\tif minBp < ${c.capBp} { save := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if ${c.steps.length ? c.steps.map(stepCond).join(' && ') : 'true'} { _capped = true; return finish(${J(r.cstName)}, sb, offAt(save)) }; pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] }`).join('\n')} +\t_r := func() int32 { // non-capped: a sub-parse may leave _capped set; force it false after ${tplNud}\tif ${r.name}ATOM[t.Kind] { \t\tsb := len(scratch); scratch = append(scratch, mkLeaf(t.Kind, t.Off, t.End)); pos++ -\t\treturn finish(${J(r.name)}, sb, t.Off) +\t\treturn finish(${J(r.cstName)}, sb, t.Off) \t} ${r.nudBrackets.map(bracketNud).join('\n')} \tif pbp, ok := ${r.name}PRE[t.Text]; ok { @@ -276,10 +277,13 @@ ${r.nudBrackets.map(bracketNud).join('\n')} \t\toperand := ${r.name}bp(pbp) \t\tif operand < 0 { pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return -1 } \t\tscratch = append(scratch, operand) -\t\treturn finish(${J(r.name)}, sb, t.Off) +\t\treturn finish(${J(r.cstName)}, sb, t.Off) \t} -${r.nudSeqs.map((seq) => `\t{ save := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if ${seq.length ? seq.map(stepCond).join(' && ') : 'true'} { return finish(${J(r.name)}, sb, offAt(save)) }; pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] }`).join('\n')} +${r.nudSeqs.map((seq) => `\t{ save := pos; sb := len(scratch); nb := len(nodes); kb := len(kids); if ${seq.length ? 
seq.map(stepCond).join(' && ') : 'true'} { return finish(${J(r.cstName)}, sb, offAt(save)) }; pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] }`).join('\n')} \treturn -1 +\t}() +\t_capped = false +\treturn _r }`; } diff --git a/src/target-rust.ts b/src/target-rust.ts index a05f71b..72d163b 100644 --- a/src/target-rust.ts +++ b/src/target-rust.ts @@ -219,7 +219,7 @@ function stepCondP(s: Step): string { function rdRule(r: RdRule): string { const alt = (steps: Step[]) => - ` { let mut kids: Vec = Vec::new(); if ${steps.map(stepCond).join(' && ')} { return Some(self.branch(${J(r.name)}, kids, save)); } self.pos = save; }`; + ` { let mut kids: Vec = Vec::new(); if ${steps.map(stepCond).join(' && ')} { return Some(self.branch(${J(r.cstName)}, kids, save)); } self.pos = save; }`; return ` fn parse_${r.name}(&mut self) -> Option { let save = self.pos; ${r.alts.map(alt).join('\n')} @@ -230,7 +230,7 @@ ${r.alts.map(alt).join('\n')} function prattRule(r: PrattRule, tpl: TplCfg | null): string { const tplNud = tpl && r.nudToks.includes(tpl.token) ? 
` if t.kind == "$templateHead" { - return self.match_template().map(|n| { let (o, e) = (n.offset, n.end); Cst::node(${J(r.name)}, vec![n], o, e) }); + return self.match_template().map(|n| { let (o, e) = (n.offset, n.end); Cst::node(${J(r.cstName)}, vec![n], o, e) }); }\n` : ''; const binArms = r.binary.map((b) => `${J(b.op)} => Some((${b.lbp}, ${b.rbp}))`).join(', '); @@ -238,21 +238,21 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { const atomArm = r.nudToks.map(J).join(' | '); const bracketNud = (b: Bracket) => ` if t.text == ${J(b.first)} { let save = self.pos; let mut kids: Vec = Vec::new(); - if ${b.steps.map(stepCond).join(' && ')} { return Some(node(${J(r.name)}, kids)); } + if ${b.steps.map(stepCond).join(' && ')} { return Some(node(${J(r.cstName)}, kids)); } self.pos = save; // fall through to the next NUD alternative }`; const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null) => ` if ${accessTail ? '!tail_closed && ' : ''}${lbp !== null ? `${lbp} > min_bp && ` : ''}!my_sup.iter().any(|c| *c == ${J(b.first)}) && t.text == ${J(b.first)} { let led_save = self.pos; let mut kids: Vec = Vec::new(); if ${b.steps.map(stepCond).join(' && ')} { let mut full = vec![left]; full.append(&mut kids); - left = node(${J(r.name)}, full); continue; + left = node(${J(r.cstName)}, full); continue; } self.pos = led_save; break; }`; const postfixArm = (tok: string) => { const tplPart = tpl && tok === tpl.token ? 
` - if !tail_closed && t.kind == "$templateHead" { if let Some(n) = self.match_template() { left = node(${J(r.name)}, vec![left, n]); continue; } }` : ''; - return ` if !tail_closed && t.kind == ${J(tok)} { self.pos += 1; let leaf = Cst::leaf(t.kind, t.off, t.end); left = node(${J(r.name)}, vec![left, leaf]); continue; }${tplPart}`; + if !tail_closed && t.kind == "$templateHead" { if let Some(n) = self.match_template() { left = node(${J(r.cstName)}, vec![left, n]); continue; } }` : ''; + return ` if !tail_closed && t.kind == ${J(tok)} { self.pos += 1; let leaf = Cst::leaf(t.kind, t.off, t.end); left = node(${J(r.cstName)}, vec![left, leaf]); continue; }${tplPart}`; }; const postArms = r.postfix.map((p) => `${J(p.op)} => Some(${p.lbp})`).join(', '); return ` fn parse_${r.name}(&mut self) -> Option { self.${r.name}_bp(0) } @@ -270,35 +270,41 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { let t = match self.peek() { Some(t) => t, None => break }; ${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i], r.ledLbp[i])).join('\n')} ${r.postfixToks.map(postfixArm).join('\n')} - if let Some(plbp) = Parser::${r.name}_post(t.text) { if !tail_closed && plbp > min_bp { self.pos += 1; let op_leaf = Cst::leaf("$operator", t.off, t.end); left = node(${J(r.name)}, vec![left, op_leaf]); tail_closed = true; continue; } } + if let Some(plbp) = Parser::${r.name}_post(t.text) { if !tail_closed && plbp > min_bp { self.pos += 1; let op_leaf = Cst::leaf("$operator", t.off, t.end); left = node(${J(r.cstName)}, vec![left, op_leaf]); tail_closed = true; continue; } } let (lbp, rbp) = match Parser::${r.name}_bin(t.text) { Some(x) => x, None => break }; if lbp <= min_bp { break; } let led_save = self.pos; self.pos += 1; let op_leaf = Cst::leaf("$operator", t.off, t.end); let rhs = match self.${r.name}_bp(rbp) { Some(r) => r, None => { self.pos = led_save; break; } }; - left = node(${J(r.name)}, vec![left, op_leaf, rhs]); + left = node(${J(r.cstName)}, vec![left, op_leaf, rhs]); } 
Some(left) } fn ${r.name}_nud(&mut self, min_bp: i64) -> Option { self.capped = false; let t = self.peek()?; -${r.nudCapped.map((c) => ` if min_bp < ${c.capBp} { let save = self.pos; let mut kids: Vec = Vec::new(); if ${c.steps.length ? c.steps.map(stepCond).join(' && ') : 'true'} { self.capped = true; return Some(self.branch(${J(r.name)}, kids, save)); } self.pos = save; }`).join('\n')} +${r.nudCapped.map((c) => ` if min_bp < ${c.capBp} { let save = self.pos; let mut kids: Vec = Vec::new(); if ${c.steps.length ? c.steps.map(stepCond).join(' && ') : 'true'} { self.capped = true; return Some(self.branch(${J(r.cstName)}, kids, save)); } self.pos = save; }`).join('\n')} + // non-capped: a sub-parse may leave capped set (grouping a capped arrow); force it false after + let r = self.${r.name}_nud_rest(t); + self.capped = false; + r + } + fn ${r.name}_nud_rest(&mut self, t: Tok<'a>) -> Option { ${tplNud} if Parser::${r.name}_atom(t.kind) { self.pos += 1; - return Some(Cst::node(${J(r.name)}, vec![Cst::leaf(t.kind, t.off, t.end)], t.off, t.end)); + return Some(Cst::node(${J(r.cstName)}, vec![Cst::leaf(t.kind, t.off, t.end)], t.off, t.end)); } ${r.nudBrackets.map(bracketNud).join('\n')} if let Some(pbp) = Parser::${r.name}_pre(t.text) { let save = self.pos; self.pos += 1; let op_leaf = Cst::leaf("$operator", t.off, t.end); match self.${r.name}_bp(pbp) { - Some(operand) => { let (o, e) = (t.off, operand.end); return Some(Cst::node(${J(r.name)}, vec![op_leaf, operand], o, e)); } + Some(operand) => { let (o, e) = (t.off, operand.end); return Some(Cst::node(${J(r.cstName)}, vec![op_leaf, operand], o, e)); } None => { self.pos = save; return None; } } } -${r.nudSeqs.map((seq) => ` { let save = self.pos; let mut kids: Vec = Vec::new(); if ${seq.length ? 
seq.map(stepCond).join(' && ') : 'true'} { return Some(self.branch(${J(r.name)}, kids, save)); } self.pos = save; }`).join('\n')} +${r.nudSeqs.map((seq) => ` { let save = self.pos; let mut kids: Vec = Vec::new(); if ${seq.length ? seq.map(stepCond).join(' && ') : 'true'} { return Some(self.branch(${J(r.cstName)}, kids, save)); } self.pos = save; }`).join('\n')} None }`; } diff --git a/src/target-ts.ts b/src/target-ts.ts index c30e81f..589476e 100644 --- a/src/target-ts.ts +++ b/src/target-ts.ts @@ -144,8 +144,8 @@ ${emitHooks} let pendingNl = false; ${defs.length ? ' _s = src;\n' : ''}${rxState}${tplState}${stateful ? emitFn : ' const push = (kind: string, text: string, off: number, end: number) => { toks.push({ kind, text, off, end, nl: pendingNl }); pendingNl = false; };\n'} while (pos < n) { const c = src.charCodeAt(pos); - if (c === 32 || c === 9) { pos++; continue; } - if (c === 10 || c === 13) { pendingNl = true; pos++; continue; } + if (c === 10 || c === 13 || c === 8232 || c === 8233) { pendingNl = true; pos++; continue; } + if (c === 32 || c === 9 || c === 11 || c === 12 || c === 160 || c === 5760 || (c >= 8192 && c <= 8202) || c === 8239 || c === 8287 || c === 12288 || c === 65279) { pos++; continue; } ${tplDispatch}${toks} ${puncts} throw new Error('lex error at ' + pos + ': ' + JSON.stringify(src[pos])); @@ -175,7 +175,7 @@ function stepCond(s: Step): string { function rdRule(r: RdRule): string { const alt = (steps: Step[]) => - ` { const kids: Cst[] = []; if (${steps.map(stepCond).join(' && ')}) return branch(${J(r.name)}, kids, save); pos = save; }`; + ` { const kids: Cst[] = []; if (${steps.map(stepCond).join(' && ')}) return branch(${J(r.cstName)}, kids, save); pos = save; }`; return `function parse${r.name}(): Node | null { const save = pos; ${r.alts.map(alt).join('\n')} @@ -185,28 +185,28 @@ ${r.alts.map(alt).join('\n')} function prattRule(r: PrattRule, tpl: TplCfg | null): string { const tplNud = tpl && r.nudToks.includes(tpl.token) - ? 
` if (t.kind === '$templateHead') { const node = matchTemplate(); return node === null ? null : { rule: ${J(r.name)}, children: [node], offset: node.offset, end: node.end }; }\n` + ? ` if (t.kind === '$templateHead') { const node = matchTemplate(); return node === null ? null : { rule: ${J(r.cstName)}, children: [node], offset: node.offset, end: node.end }; }\n` : ''; const BIN = `{ ${r.binary.map((b) => `${J(b.op)}: { lbp: ${b.lbp}, rbp: ${b.rbp} }`).join(', ')} }`; const PRE = `{ ${r.prefix.map((p) => `${J(p.op)}: ${p.rbp}`).join(', ')} }`; const atom = `new Set([${r.nudToks.map(J).join(', ')}])`; const bracketNud = (b: Bracket) => ` if (t.text === ${J(b.first)}) { const save = pos; const kids: Cst[] = []; - if (${b.steps.map(stepCond).join(' && ')}) return node(${J(r.name)}, kids); + if (${b.steps.map(stepCond).join(' && ')}) return node(${J(r.cstName)}, kids); pos = save; // fall through to the next NUD alternative (e.g. another '${b.first}'-led form) }`; // Access-tail leds (member/call/index) are disabled once a postfix has closed the operand; // a precedence-gated led (ternary/in/instanceof) binds only when its lbp > minBp. const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null) => ` if (${accessTail ? '!tailClosed && ' : ''}${lbp !== null ? `${lbp} > minBp && ` : ''}(_mySup === null || !_mySup.has(${J(b.first)})) && t.text === ${J(b.first)}) { const ledSave = pos; const kids: Cst[] = [left]; - if (${b.steps.map(stepCond).join(' && ')}) { left = node(${J(r.name)}, kids); continue; } + if (${b.steps.map(stepCond).join(' && ')}) { left = node(${J(r.cstName)}, kids); continue; } pos = ledSave; break; }`; // A postfix token (e.g. a tagged template) binds like a mixfix led: `left X` → node(left, X). Also an access tail. const postfixArm = (tok: string) => { const tplPart = tpl && tok === tpl.token ? 
` - if (!tailClosed && t.kind === '$templateHead') { const node = matchTemplate(); if (node !== null) { left = { rule: ${J(r.name)}, children: [left, node], offset: left.offset, end: node.end }; continue; } }` : ''; - return ` if (!tailClosed && t.kind === ${J(tok)}) { const leaf: Leaf = { tokenType: t.kind, offset: t.off, end: t.end }; pos++; left = { rule: ${J(r.name)}, children: [left, leaf], offset: left.offset, end: leaf.end }; continue; }${tplPart}`; + if (!tailClosed && t.kind === '$templateHead') { const node = matchTemplate(); if (node !== null) { left = { rule: ${J(r.cstName)}, children: [left, node], offset: left.offset, end: node.end }; continue; } }` : ''; + return ` if (!tailClosed && t.kind === ${J(tok)}) { const leaf: Leaf = { tokenType: t.kind, offset: t.off, end: t.end }; pos++; left = { rule: ${J(r.cstName)}, children: [left, leaf], offset: left.offset, end: leaf.end }; continue; }${tplPart}`; }; const POST = `{ ${r.postfix.map((p) => `${J(p.op)}: ${p.lbp}`).join(', ')} }`; return `const ${r.name}_BIN: Record = ${BIN}; @@ -226,7 +226,7 @@ function ${r.name}_bp(minBp: number): Node | null { ${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i], r.ledLbp[i])).join('\n')} ${r.postfixToks.map(postfixArm).join('\n')} const post = ${r.name}_POST[t.text]; - if (!tailClosed && post !== undefined && post > minBp) { pos++; const opLeaf: Leaf = { tokenType: '$operator', offset: t.off, end: t.end }; left = { rule: ${J(r.name)}, children: [left, opLeaf], offset: left.offset, end: t.end }; tailClosed = true; continue; } + if (!tailClosed && post !== undefined && post > minBp) { pos++; const opLeaf: Leaf = { tokenType: '$operator', offset: t.off, end: t.end }; left = { rule: ${J(r.cstName)}, children: [left, opLeaf], offset: left.offset, end: t.end }; tailClosed = true; continue; } const info = ${r.name}_BIN[t.text]; if (info === undefined || info.lbp <= minBp) break; const ledSave = pos; @@ -234,7 +234,7 @@ ${r.postfixToks.map(postfixArm).join('\n')} const 
opLeaf: Leaf = { tokenType: '$operator', offset: t.off, end: t.end }; const rhs = ${r.name}_bp(info.rbp); if (rhs === null) { pos = ledSave; break; } - left = { rule: ${J(r.name)}, children: [left, opLeaf, rhs], offset: left.offset, end: rhs.end }; + left = { rule: ${J(r.cstName)}, children: [left, opLeaf, rhs], offset: left.offset, end: rhs.end }; } return left; } @@ -242,8 +242,11 @@ function ${r.name}_nud(minBp: number): Node | null { _capped = false; const t = peek(); if (t === null) return null; -${r.nudCapped.map((c) => ` if (minBp < ${c.capBp}) { const save = pos; const kids: Cst[] = []; if (${c.steps.length ? c.steps.map(stepCond).join(' && ') : 'true'}) { _capped = true; return branch(${J(r.name)}, kids, save); } pos = save; }`).join('\n')} -${tplNud} if (${r.name}_ATOM.has(t.kind)) { pos++; return { rule: ${J(r.name)}, children: [{ tokenType: t.kind, offset: t.off, end: t.end }], offset: t.off, end: t.end }; } +${r.nudCapped.map((c) => ` if (minBp < ${c.capBp}) { const save = pos; const kids: Cst[] = []; if (${c.steps.length ? c.steps.map(stepCond).join(' && ') : 'true'}) { _capped = true; return branch(${J(r.cstName)}, kids, save); } pos = save; }`).join('\n')} + // Below is non-capped: a sub-parse may leave _capped set (e.g. grouping a capped arrow), + // so force it false after — only the capped arms above produce a capped node. 
+ const _r = ((): Node | null => { +${tplNud} if (${r.name}_ATOM.has(t.kind)) { pos++; return { rule: ${J(r.cstName)}, children: [{ tokenType: t.kind, offset: t.off, end: t.end }], offset: t.off, end: t.end }; } ${r.nudBrackets.map(bracketNud).join('\n')} const pbp = ${r.name}_PRE[t.text]; if (pbp !== undefined) { @@ -251,10 +254,13 @@ ${r.nudBrackets.map(bracketNud).join('\n')} const opLeaf: Leaf = { tokenType: '$operator', offset: t.off, end: t.end }; const operand = ${r.name}_bp(pbp); if (operand === null) { pos = save; return null; } - return { rule: ${J(r.name)}, children: [opLeaf, operand], offset: t.off, end: operand.end }; + return { rule: ${J(r.cstName)}, children: [opLeaf, operand], offset: t.off, end: operand.end }; } -${r.nudSeqs.map((seq) => ` { const save = pos; const kids: Cst[] = []; if (${seq.length ? seq.map(stepCond).join(' && ') : 'true'}) return branch(${J(r.name)}, kids, save); pos = save; }`).join('\n')} +${r.nudSeqs.map((seq) => ` { const save = pos; const kids: Cst[] = []; if (${seq.length ? seq.map(stepCond).join(' && ') : 'true'}) return branch(${J(r.cstName)}, kids, save); pos = save; }`).join('\n')} return null; + })(); + _capped = false; + return _r; }`; } diff --git a/test/portable-targets.ts b/test/portable-targets.ts index fe52c60..2863226 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -176,6 +176,25 @@ const CASES: Case[] = [ ], reject: ['for (x y) z;', 'for x in y;', 'for (in y) z;', 'for (x in) z;'], }, + { + // The REAL javascript.ts grammar (89 rules after the [Await]/[Yield] fork) — the proof + // that the target-agnostic emitter handles a full language end-to-end in ts/go/rust. + // ASCII corpus only (byte-based go/rust use UTF-8 offsets, identical to the JS oracle's + // UTF-16 offsets for ASCII; non-ASCII offset units differ inherently). 
+ grammar: 'javascript', path: '../javascript.ts', + accept: [ + 'var x = 1, y = 2;', 'function f(a, b) { return a + b; }', 'const g = (x) => x * 2;', + 'x => x + 1;', 'a ? b : c;', 'a.b.c();', 'f(g(1, 2), 3);', '[1, 2, 3].map(f);', + 'for (let i = 0; i < n; i++) x();', 'for (const k in obj) { y(); }', 'while (x) { z(); }', + 'if (a) b(); else c();', 'class C extends B { m() {} get p() { return 1; } }', 'a++; b--;', + 'typeof x; void 0;', 'new Foo(1, 2); new.target;', 'a ?? b; a?.b?.c;', + 'try { f(); } catch (e) { g(); } finally { h(); }', 'switch (x) { case 1: f(); break; default: g(); }', + 'a instanceof B; a in obj;', '(function () {})(); (() => {})();', 'x = a && b || c;', + 'do { x(); } while (y);', 'function* gen() { yield* o(); }', 'const { a, b: c, ...r } = o;', + 'const [p, , q, ...z] = arr;', 'label: for (;;) { break label; }', 'async function h() { await x; }', + ], + reject: ['function (', 'a +;', 'if x {}', '{ a: }', 'for (;;', 'a ? b ;'], + }, ]; const sortKeys = (o: unknown): unknown => From cd4ebc84fe3fb45b41ebe32dac5c137f7e7bd6a4 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 05:49:02 +0800 Subject: [PATCH 22/27] =?UTF-8?q?emit-portable:=20typescript.ts=20emits=20?= =?UTF-8?q?too=20=E2=80=94=20both=20real=20grammars=20in=20the=20gate=20(i?= =?UTF-8?q?ssue=20#6)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The second real full language now goes through the agnostic emitter end-to-end. Two type-grammar constructs were the wall: - A LED with a leading `sameLine` guard (`$ sameLine '<' …`) — TS's generic-args / array / non-null type tails that must not cross a newline. The guard is hoisted into the led-arm condition (skip, don't break, so the connector can rebind). - `notLeftLeaf`: a led skipped when the LEFT node's head-leaf text is in a word set (`void`/`null`/`this` can't be `.`-qualified as a type). 
Each target gains a `headLeafText` (the leftmost leaf's source text) and the led arm checks it. typescript.ts (the most complex grammar) emits, compiles and runs in ts/go/rust, and is gate-maintained alongside javascript.ts (13/13 accept, 4/4 reject ×3, ASCII corpus; 83.5% on the broad curated TS corpus in TS). Full suite 42/42. The agnostic emitter now covers both full real languages — the issue-#6 goal, proven in three target languages. --- src/emit-portable.ts | 24 +++++++++++++++++++----- src/target-go.ts | 12 ++++++++++-- src/target-rust.ts | 18 ++++++++++++------ src/target-ts.ts | 11 +++++++++-- test/portable-targets.ts | 14 ++++++++++++++ 5 files changed, 64 insertions(+), 15 deletions(-) diff --git a/src/emit-portable.ts b/src/emit-portable.ts index a6a24eb..d631b12 100644 --- a/src/emit-portable.ts +++ b/src/emit-portable.ts @@ -74,6 +74,8 @@ export type PrattRule = { leds: Bracket[]; // LED: mixfix continuation (call/member/index), tried before operators ledAccessTail: boolean[]; // parallel to leds: a "closed punct-connector" tail (member/call/index) — disabled once a postfix binds ledLbp: Array; // parallel to leds: precedence gate (ternary/in/instanceof) — bind only when lbp > minBp; null = bind maximally tight + ledSameLine: boolean[]; // parallel to leds: a leading `sameLine` guard (TS type tails) — the connector must be on the operand's line + ledNotLeftLeaf: Array; // parallel to leds: skip this led when the left node's head-leaf text is in this set (`void.x` etc.) postfixToks: string[]; // LED: a postfix token `$ X` (e.g. 
a tagged template), tried like a mixfix led (also an access tail) postfix: Array<{ op: string; lbp: number }>; // LED: a postfix operator `$ ++` — binds iff lbp > minBp + !tailClosed, no rhs, closes the tail }; @@ -304,9 +306,17 @@ function buildPratt( const leds: Bracket[] = []; const ledAccessTail: boolean[] = []; const ledLbp: Array = []; + const ledSameLine: boolean[] = []; + const ledNotLeftLeaf: Array = []; const postfixToks: string[] = []; for (const alt of alts) { - const items = alt.type === 'seq' ? alt.items : [alt]; + let items = alt.type === 'seq' ? alt.items : [alt]; + // A left-recursive continuation may carry a leading `notLeftLeaf(words)` head-leaf guard + // before the self `$` — strip it and attach the word set to the led it produces. + let nllWords: string[] | null = null; + if (items[0].type === 'notLeftLeaf' && items[1]?.type === 'ref' && items[1].name === name) { + nllWords = items[0].words; items = items.slice(1); + } const startsSelf = items[0].type === 'ref' && items[0].name === name; if (!startsSelf) { // NUD @@ -331,9 +341,11 @@ function buildPratt( continue; } // LED (starts with self): `$ op $` (binary, op slot + trailing self) or `$ …` (mixfix) - const rest = items.slice(1); - if (rest[0].type === 'op') { sawBinary = true; continue; } - if (rest[0].type === 'postfix') { sawPostfix = true; continue; } // postfix operator (`x++`) + const restAll = items.slice(1); + const hasSameLine = restAll[0]?.type === 'sameLine'; // a TS type tail: `$ sameLine '<' …` + const rest = hasSameLine ? 
restAll.slice(1) : restAll; + if (!hasSameLine && rest[0].type === 'op') { sawBinary = true; continue; } + if (!hasSameLine && rest[0].type === 'postfix') { sawPostfix = true; continue; } // postfix operator (`x++`) if (rest[0].type === 'literal') { const conn = rest[0].value; const prec = a.ledPrecByConnector.get(conn); // { lbp, rhsBp } for ternary/in/instanceof @@ -346,6 +358,8 @@ function buildPratt( leds.push({ first: conn, steps }); ledAccessTail.push(!lastIsOperand && !wordConnector); ledLbp.push(prec ? prec.lbp : null); + ledSameLine.push(hasSameLine); + ledNotLeftLeaf.push(nllWords); continue; } if (rest.length === 1 && rest[0].type === 'ref' && a.tokenNames.has(rest[0].name)) { postfixToks.push(rest[0].name); continue; } // postfix token (tagged template) @@ -376,5 +390,5 @@ function buildPratt( const postfix = sawPostfix ? [...a.opTable.entries()].filter(([, info]) => info.position === 'postfix').map(([op, info]) => ({ op, lbp: info.lbp })) : []; - return { kind: 'pratt', name, cstName, nudToks, nudBrackets, nudSeqs, nudCapped, prefix, binary, leds, ledAccessTail, ledLbp, postfixToks, postfix }; + return { kind: 'pratt', name, cstName, nudToks, nudBrackets, nudSeqs, nudCapped, prefix, binary, leds, ledAccessTail, ledLbp, ledSameLine, ledNotLeftLeaf, postfixToks, postfix }; } diff --git a/src/target-go.ts b/src/target-go.ts index 0e58d86..a654a7b 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -210,7 +210,7 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { \t\tif ${b.steps.map(stepCond).join(' && ')} { return finish(${J(r.cstName)}, sb, t.Off) } \t\tpos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb] \t}`; - const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null) => `\t\tif ${accessTail ? '!tailClosed && ' : ''}${lbp !== null ? 
`${lbp} > minBp && ` : ''}!_mySup[${J(b.first)}] && t.Text == ${J(b.first)} { + const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null, sameLine: boolean, nll: string[] | null) => `\t\tif ${accessTail ? '!tailClosed && ' : ''}${lbp !== null ? `${lbp} > minBp && ` : ''}${sameLine ? '!t.Nl && ' : ''}${nll ? `!_inW([]string{${nll.map(J).join(', ')}}, headLeafText(left)) && ` : ''}!_mySup[${J(b.first)}] && t.Text == ${J(b.first)} { \t\t\tledSave := pos; sb := len(scratch); nb := len(nodes); kb := len(kids) \t\t\tscratch = append(scratch, left) \t\t\tif ${b.steps.map(stepCond).join(' && ')} { left = finish(${J(r.cstName)}, sb, nodes[left].Offset); continue } @@ -242,7 +242,7 @@ func ${r.name}bp(minBp int) int32 { \tfor { \t\tt := peek() \t\tif t == nil { break } -${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i], r.ledLbp[i])).join('\n')} +${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i], r.ledLbp[i], r.ledSameLine[i], r.ledNotLeftLeaf[i])).join('\n')} ${r.postfixToks.map(postfixArm).join('\n')} \t\tif post, ok := ${r.name}POST[t.Text]; ok && !tailClosed && post > minBp { \t\t\tsb := len(scratch); scratch = append(scratch, left, mkLeaf("$operator", t.Off, t.End)); pos++; tailClosed = true @@ -338,6 +338,7 @@ type bp struct{ lbp, rbp int } var toks []Tok var pos int var _capped bool +var _src string var _suppressNext map[string]bool var nodes []Node var kids []int32 @@ -412,7 +413,14 @@ func writeJSON(id int32, b *strings.Builder) { \tfmt.Fprintf(b, "],\\"offset\\":%d,\\"end\\":%d}", nd.Offset, nd.End) } +func headLeafText(id int32) string { +\tfor !nodes[id].IsLeaf && nodes[id].KidCount > 0 { id = kids[nodes[id].KidStart] } +\treturn _src[nodes[id].Offset:nodes[id].End] +} +func _inW(ws []string, s string) bool { for _, w := range ws { if w == s { return true } }; return false } + func parseOnce(src string) int32 { +\t_src = src \ttoks = lex(src) \tpos = 0 \tnodes = nodes[:0]; kids = kids[:0]; scratch = scratch[:0] diff --git a/src/target-rust.ts 
b/src/target-rust.ts index 72d163b..5a43013 100644 --- a/src/target-rust.ts +++ b/src/target-rust.ts @@ -241,7 +241,7 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { if ${b.steps.map(stepCond).join(' && ')} { return Some(node(${J(r.cstName)}, kids)); } self.pos = save; // fall through to the next NUD alternative }`; - const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null) => ` if ${accessTail ? '!tail_closed && ' : ''}${lbp !== null ? `${lbp} > min_bp && ` : ''}!my_sup.iter().any(|c| *c == ${J(b.first)}) && t.text == ${J(b.first)} { + const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null, sameLine: boolean, nll: string[] | null) => ` if ${accessTail ? '!tail_closed && ' : ''}${lbp !== null ? `${lbp} > min_bp && ` : ''}${sameLine ? '!t.nl && ' : ''}${nll ? `!self.nll_blocked(&[${nll.map(J).join(', ')}], &left) && ` : ''}!my_sup.iter().any(|c| *c == ${J(b.first)}) && t.text == ${J(b.first)} { let led_save = self.pos; let mut kids: Vec = Vec::new(); if ${b.steps.map(stepCond).join(' && ')} { let mut full = vec![left]; full.append(&mut kids); @@ -268,7 +268,7 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { let mut tail_closed = false; loop { let t = match self.peek() { Some(t) => t, None => break }; -${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i], r.ledLbp[i])).join('\n')} +${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i], r.ledLbp[i], r.ledSameLine[i], r.ledNotLeftLeaf[i])).join('\n')} ${r.postfixToks.map(postfixArm).join('\n')} if let Some(plbp) = Parser::${r.name}_post(t.text) { if !tail_closed && plbp > min_bp { self.pos += 1; let op_leaf = Cst::leaf("$operator", t.off, t.end); left = node(${J(r.cstName)}, vec![left, op_leaf]); tail_closed = true; continue; } } let (lbp, rbp) = match Parser::${r.name}_bin(t.text) { Some(x) => x, None => break }; @@ -352,9 +352,15 @@ fn node(rule: &'static str, kids: Vec) -> Cst { let o = kids[0].offset; let ${lexer(ir)} -struct Parser<'a> { toks: Vec>, pos: 
usize, capped: bool, suppress_next: Vec<&'static str> } +struct Parser<'a> { toks: Vec>, pos: usize, capped: bool, suppress_next: Vec<&'static str>, src: &'a str } impl<'a> Parser<'a> { fn peek(&self) -> Option> { if self.pos < self.toks.len() { Some(self.toks[self.pos]) } else { None } } + fn head_leaf_text(&self, node: &Cst) -> &'a str { + let mut n = node; + while !n.children.is_empty() { n = &n.children[0]; } + &self.src[n.offset..n.end] + } + fn nll_blocked(&self, words: &[&str], node: &Cst) -> bool { let h = self.head_leaf_text(node); words.iter().any(|w| *w == h) } fn branch(&self, rule: &'static str, kids: Vec, save: usize) -> Cst { let offset = if !kids.is_empty() { kids[0].offset } else if save < self.toks.len() { self.toks[save].off } else { 0 }; let end = if !kids.is_empty() { kids[kids.len() - 1].end } else { offset }; @@ -409,15 +415,15 @@ fn main() { // Self-bench: a numeric arg N times the lex+parse loop and prints ms/iteration. if let Some(iters) = std::env::args().nth(1).and_then(|a| a.parse::().ok()) { // black_box on the input + result so the optimizer can't elide the lex/parse. 
- for _ in 0..3 { let toks = lex(std::hint::black_box(&src)); let mut p = Parser { toks, pos: 0, capped: false, suppress_next: Vec::new() }; std::hint::black_box(p.parse_${ir.entry}()); } + for _ in 0..3 { let toks = lex(std::hint::black_box(&src)); let mut p = Parser { toks, pos: 0, capped: false, suppress_next: Vec::new(), src: &src }; std::hint::black_box(p.parse_${ir.entry}()); } let t = std::time::Instant::now(); - for _ in 0..iters { let toks = lex(std::hint::black_box(&src)); let mut p = Parser { toks, pos: 0, capped: false, suppress_next: Vec::new() }; std::hint::black_box(p.parse_${ir.entry}()); } + for _ in 0..iters { let toks = lex(std::hint::black_box(&src)); let mut p = Parser { toks, pos: 0, capped: false, suppress_next: Vec::new(), src: &src }; std::hint::black_box(p.parse_${ir.entry}()); } println!("{:.4}", t.elapsed().as_secs_f64() * 1000.0 / iters as f64); return; } let toks = lex(&src); let n = toks.len(); - let mut p = Parser { toks, pos: 0, capped: false, suppress_next: Vec::new() }; + let mut p = Parser { toks, pos: 0, capped: false, suppress_next: Vec::new(), src: &src }; match p.parse_${ir.entry}() { Some(root) if p.pos == n => { let mut out = String::new(); write_json(&root, &mut out); print!("{}", out); } _ => { eprintln!("parse error (pos {}/{})", p.pos, n); std::process::exit(1); } diff --git a/src/target-ts.ts b/src/target-ts.ts index 589476e..7984ba9 100644 --- a/src/target-ts.ts +++ b/src/target-ts.ts @@ -197,7 +197,7 @@ function prattRule(r: PrattRule, tpl: TplCfg | null): string { }`; // Access-tail leds (member/call/index) are disabled once a postfix has closed the operand; // a precedence-gated led (ternary/in/instanceof) binds only when its lbp > minBp. - const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null) => ` if (${accessTail ? '!tailClosed && ' : ''}${lbp !== null ? 
`${lbp} > minBp && ` : ''}(_mySup === null || !_mySup.has(${J(b.first)})) && t.text === ${J(b.first)}) { + const ledArm = (b: Bracket, accessTail: boolean, lbp: number | null, sameLine: boolean, nll: string[] | null) => ` if (${accessTail ? '!tailClosed && ' : ''}${lbp !== null ? `${lbp} > minBp && ` : ''}${sameLine ? '!t.nl && ' : ''}${nll ? `!${J(nll)}.includes(headLeafText(left)) && ` : ''}(_mySup === null || !_mySup.has(${J(b.first)})) && t.text === ${J(b.first)}) { const ledSave = pos; const kids: Cst[] = [left]; if (${b.steps.map(stepCond).join(' && ')}) { left = node(${J(r.cstName)}, kids); continue; } pos = ledSave; break; @@ -223,7 +223,7 @@ function ${r.name}_bp(minBp: number): Node | null { for (;;) { const t = peek(); if (t === null) break; -${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i], r.ledLbp[i])).join('\n')} +${r.leds.map((b, i) => ledArm(b, r.ledAccessTail[i], r.ledLbp[i], r.ledSameLine[i], r.ledNotLeftLeaf[i])).join('\n')} ${r.postfixToks.map(postfixArm).join('\n')} const post = ${r.name}_POST[t.text]; if (!tailClosed && post !== undefined && post > minBp) { pos++; const opLeaf: Leaf = { tokenType: '$operator', offset: t.off, end: t.end }; left = { rule: ${J(r.cstName)}, children: [left, opLeaf], offset: left.offset, end: t.end }; tailClosed = true; continue; } @@ -302,7 +302,13 @@ let toks: Tok[] = []; let pos = 0; let _capped = false; let _suppressNext: Set | null = null; +let _src = ''; function peek(): Tok | null { return pos < toks.length ? toks[pos] : null; } +function headLeafText(node: Cst): string { + let n: Cst = node; + while ('children' in n && n.children.length > 0) n = n.children[0]; + return _src.slice(n.offset, n.end); +} function branch(rule: string, kids: Cst[], save: number): Node { const offset = kids.length > 0 ? kids[0].offset : (save < toks.length ? toks[save].off : 0); const end = kids.length > 0 ? 
kids[kids.length - 1].end : offset; @@ -350,6 +356,7 @@ function altLit(opts: [string, string][], kids: Cst[]): boolean { ${matchTemplate}${ruleFns} const src = readFileSync(0, 'utf8'); +_src = src; toks = lex(src); pos = 0; const root = parse${ir.entry}(); diff --git a/test/portable-targets.ts b/test/portable-targets.ts index 2863226..7dd6544 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -195,6 +195,20 @@ const CASES: Case[] = [ ], reject: ['function (', 'a +;', 'if x {}', '{ a: }', 'for (;;', 'a ? b ;'], }, + { + // The real typescript.ts grammar — the second, most complex full language proving the + // agnostic emitter (types, generics, interfaces, enums, assertions, variance). ASCII. + grammar: 'typescript', path: '../typescript.ts', + accept: [ + 'const a: number = 1;', 'let s: string;', 'type Alias = { a: number; b?: string };', + 'type U = "a" | "b" | "c";', 'function gen2(x: T, y: U): T { return x; }', + 'interface I extends A { m(x: T): T; }', 'const c = x as const;', + 'function isStr(x: unknown): x is string { return true; }', 'enum E { A, B, C }', + 'const n = maybe!;', 'let arr: number[];', 'type Fn = (x: number) => string;', + 'class C { value!: T; }', + ], + reject: ['interface {}', 'const x: = 1;', 'enum {}', 'a + ;'], + }, ]; const sortKeys = (o: unknown): unknown => From ca2a56bbd44fffffa58b138686e3d94cc04fffda Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 05:50:27 +0800 Subject: [PATCH 23/27] =?UTF-8?q?docs:=20README=20=E2=80=94=20the=20emitte?= =?UTF-8?q?d=20parser=20need=20not=20be=20JS=20(issue=20#6)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Documents the target-agnostic emitter under "A language-agnostic engine": one analysis → one IR → per-target render (Go/Rust/native, each with its own regex-free lexer), proven by the real javascript.ts and typescript.ts grammars emitting to ts/go/rust byte-identical to the interpreter and gate-maintained, with 
the Rust/Go throughput results and the ASCII-offset boundary noted. --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index a8f69cf..3b2e3a9 100644 --- a/README.md +++ b/README.md @@ -338,6 +338,21 @@ const Regex = token(seq( [`test/agnostic.ts`](test/agnostic.ts) proves it directly — the same engine parses a toy grammar whose identifier token is `Word`, with no templates or regex. The deeper proof is [`html.ts`](html.ts): markup shares *nothing* with TypeScript's token stream, yet the same engine handles it. +### The emitted parser need not be JS — Go, Rust, native + +The grammar also derives a **standalone parser in another language**. [`emitPortableParser(grammar, target)`](src/emit-portable.ts) runs one analysis into one language-agnostic IR, and each `Target` renders it — including its own regex-free lexer, so the output has no dependency on the JS runtime and compiles offline: + +```ts +import { emitPortableParser } from './src/emit-portable.ts'; +import { goTarget } from './src/target-go.ts'; +import { rustTarget } from './src/target-rust.ts'; + +writeFileSync('parser.go', emitPortableParser(grammar, goTarget)); // `go build`, no deps +writeFileSync('parser.rs', emitPortableParser(grammar, rustTarget)); // `rustc`, no crates +``` + +The proof is the full languages: the real [`javascript.ts`](javascript.ts) and [`typescript.ts`](typescript.ts) grammars — including the `[Await]/[Yield]` fork, left recursion, the regex/division and template state machines, arrow functions, and the TS type grammar — emit to **TypeScript, Go, and Rust**, and every CST is byte-identical to the reference interpreter. [`test/portable-targets.ts`](test/portable-targets.ts) compiles and runs all three for sixteen grammars (the two real languages plus focused fixtures) on every CI run. 
The Rust output reaches [oxc](https://github.com/oxc-project/oxc) throughput and the Go output beats [tsgo](https://github.com/microsoft/typescript-go) on the same corpus (an arena keeps both near zero-allocation). Byte-based Go/Rust use UTF-8 offsets — identical to the JS interpreter's for ASCII; non-ASCII offset units differ inherently. + ## Adding a language A new language is **one grammar file** on the unchanged engine: From aeb4736bc45487611f4122e757443f85cd79931a Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 06:47:10 +0800 Subject: [PATCH 24/27] emit: converge to 2 target-parameterized APIs (emitParser reuses emitLexer) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The emit layer had three inconsistent entry points — `emitParser(grammar)` (JS, no target), `emitLexer(grammar, st)` (JS, internal symtab), and `emitPortableParser(grammar, target)` (lexer buried in `target.render`). Collapse them to exactly two, both parameterized by a Target: emitLexer(grammar, target) -> the lexer for that target emitParser(grammar, target) -> the parser, REUSING emitLexer(grammar, target) A Target owns both halves, so a parser reuses the SAME target's lexer — jsTarget's parser embeds jsTarget's SoA-int lexer, goTarget's parser embeds goTarget's Tok-list lexer. No cross-target lexer format is shared, so the optimized JS path keeps its integer-bitmask dispatch and the portable targets keep their clean byte scanner. - src/emit.ts (new): the Target interface + the two public functions; re-exports jsTarget / tsTarget / goTarget / rustTarget. - emit-parser.ts: the optimized emitter split into `emitJsLexer` (derive) + `emitJsParser` (embed a handed-in lexer) behind `jsTarget`. The split is pure refactor — re-deriving the deterministic symtab yields the identical lexer string, so emit-parser-verify stays byte-for-byte. - emit-lexer.ts: `emitLexer` -> `emitSoaLexer` (frees the public name). 
- emit-portable.ts + target-{ts,go,rust}.ts: `render(ir)` split into the target's `emitLexer`/`emitParser`; `emitPortableParser` removed (`portableIR` exported). - ~19 callers updated to `emitParser(g, jsTarget)` / `emitParser(g, <target>)`. emit-parser-verify byte-identical (0 mismatches), portable-targets 16 grammars ×3 ≡ interpreter, emit-tsc-gate clean, full suite 42/42. --- src/emit-lexer.ts | 2 +- src/emit-parser.ts | 31 +++++++++++++++++++++++++------ src/emit-portable.ts | 19 +++++++------------ src/emit.ts | 33 +++++++++++++++++++++++++++++++++ src/target-go.ts | 16 +++++++++++----- src/target-rust.ts | 16 +++++++++++----- src/target-ts.ts | 13 ++++++++++--- test/cst-match-totality.ts | 4 ++-- test/emit-lexer-verify.ts | 4 ++-- test/emit-parser-bench.ts | 4 ++-- test/emit-parser-verify.ts | 4 ++-- test/emit-reject-messages.ts | 4 ++-- test/emit-tsc-gate.ts | 4 ++-- test/exhaustive-edits.ts | 4 ++-- test/head-to-head.ts | 4 ++-- test/incremental-grammars.ts | 4 ++-- test/incremental-verify.ts | 4 ++-- test/multi-doc.ts | 4 ++-- test/portable-targets.ts | 13 +++++-------- test/profile-vs-peers.mjs | 6 +++--- test/profile-vs-tsc.mjs | 4 ++-- test/recovery-conformance.ts | 4 ++-- test/recovery.ts | 4 ++-- 23 files changed, 134 insertions(+), 71 deletions(-) create mode 100644 src/emit.ts diff --git a/src/emit-lexer.ts b/src/emit-lexer.ts index ba09347..28f9420 100644 --- a/src/emit-lexer.ts +++ b/src/emit-lexer.ts @@ -45,7 +45,7 @@ const NON_ASCII_WS_FN = const nonAsciiWsConsume = (v: string, cont: boolean, indent: string): string => `${indent}if (${v} > 127 && lxNonAsciiWs(${v})) { LX_WS.lastIndex = pos; const m = LX_WS.exec(source); if (m !== null) { if (m[0].includes('\\n')) pendingNl = true; pos += m[0].length;${cont ? 
' continue;' : ''} } }`; -export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null { +export function emitSoaLexer(grammar: CstGrammar, st: LexerSymtab): string | null { // Out of scope: the markup / indentation / newline state machines. if (grammar.markup || grammar.indent || grammar.newline) return null; if (grammar.tokens.some(t => tokenBlockPatternSource(t) || t.blockOnly)) return null; diff --git a/src/emit-parser.ts b/src/emit-parser.ts index 6368898..ebbc2f9 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -27,7 +27,8 @@ import type { CstGrammar, RuleExpr, RuleDecl } from './types.ts'; import { isKeywordLiteral, collectLiterals } from './grammar-utils.ts'; import { analyzeGrammar, findEntryRule, type Sec } from './grammar-analysis.ts'; -import { emitLexer } from './emit-lexer.ts'; +import { emitSoaLexer } from './emit-lexer.ts'; +import type { Target } from './emit.ts'; import { withAwaitYield } from './await-yield-fork.ts'; // ── Static analysis ── @@ -1092,7 +1093,28 @@ class Emitter { // ── Top-level emit ── -export function emitParser(grammar: CstGrammar): string { +// The `js` Target: the optimized SoA-int parser/lexer, wrapped behind the same two-method +// Target contract as the portable ts/go/rust targets (see emit.ts). `emitJsLexer` derives the +// standalone lexer; `emitJsParser` embeds whatever lexer source it is handed. Splitting the +// lexer COMPUTATION from its EMBEDDING leaves the emitted bytes identical (both re-derive the +// same deterministic symtab), so `emit-parser-verify` stays byte-for-byte. 
+export const jsTarget: Target = { + name: 'javascript', + ext: 'js', + emitLexer: emitJsLexer, + emitParser: emitJsParser, +}; + +export function emitJsLexer(grammar: CstGrammar): string | null { + grammar = withAwaitYield(grammar); + const st = analyze(grammar).symtab; + return emitSoaLexer(grammar, { + typeKind: st.typeKind, kwLitKind: st.kwLitKind, puLitKind: st.puLitKind, + KIND_PUNCT: st.KIND_PUNCT, KIND_NAMED_FALLBACK: st.KIND_NAMED_FALLBACK, + }); +} + +export function emitJsParser(grammar: CstGrammar, lexSrc: string | null): string { // [Await]/[Yield] context: name-fork the body-reachable rule closure into $A/$Y/$AY // families (see await-yield-fork.ts). No-op for a grammar with no ctx markers. Done // HERE (not at grammar export) so the forks exist ONLY in the parser's rule identity @@ -1127,11 +1149,8 @@ export function emitParser(grammar: CstGrammar): string { // The lexer: EMITTED (specialized, standalone — see emit-lexer.ts) when the grammar // is a plain token stream; the data-driven createLexer runtime otherwise // (markup/indent/newline state machines stay interpreter-only). + // `lexSrc` is handed in by the Target façade (emitParser reuses emitLexer) — see emit.ts. const st = a.symtab; - const lexSrc = emitLexer(grammar, { - typeKind: st.typeKind, kwLitKind: st.kwLitKind, puLitKind: st.puLitKind, - KIND_PUNCT: st.KIND_PUNCT, KIND_NAMED_FALLBACK: st.KIND_NAMED_FALLBACK, - }); e.soa = lexSrc !== null; if (!lexSrc) { e.emit(`import { createLexer } from ${J(resolveLexerImport())};`); diff --git a/src/emit-portable.ts b/src/emit-portable.ts index d631b12..e2ec5d8 100644 --- a/src/emit-portable.ts +++ b/src/emit-portable.ts @@ -1,6 +1,6 @@ // ── emit-portable ── // -// The target-agnostic emitter (issue #6). `emitPortableParser(grammar, target)` derives +// The target-agnostic emitter (issue #6). 
`emitParser(grammar, target)` (see emit.ts) derives // a COMPLETE, self-contained parser in the target's language from the same CstGrammar the // TS engine uses. It is the agnosticism proof: ONE analysis → ONE intermediate form (IR) // → N language renderings, all producing the byte-identical CST the interpreter does. @@ -120,17 +120,12 @@ export type ParserIR = { tpl: TplCfg | null; // null unless the grammar has a template token }; -export interface Target { - name: string; - ext: string; // emitted file extension (no dot) - render(ir: ParserIR): string; // the complete, compilable source -} - -export function emitPortableParser(grammar: CstGrammar, target: Target): string { - // Apply the [Await]/[Yield] context fork exactly as createParser does, so `await`/`yield` - // are keywords inside async/generator bodies and identifiers outside — name-forked into - // $A/$Y/$AY rule families. Every other consumer (and the portable parser) sees plain rules. - return target.render(buildIR(withAwaitYield(grammar))); +// The target-agnostic parse plan for a grammar. Applies the [Await]/[Yield] context fork +// exactly as createParser does (so `await`/`yield` are keywords inside async/generator bodies +// and identifiers outside — name-forked into $A/$Y/$AY rule families), then builds the IR each +// portable Target (ts/go/rust) renders. The `Target` contract itself lives in emit.ts. +export function portableIR(grammar: CstGrammar): ParserIR { + return buildIR(withAwaitYield(grammar)); } // ── buildIR: grammar + analysis → the target-agnostic parse plan ── diff --git a/src/emit.ts b/src/emit.ts new file mode 100644 index 0000000..1513fb5 --- /dev/null +++ b/src/emit.ts @@ -0,0 +1,33 @@ +// The emit layer's public surface: exactly two APIs, both parameterized by a `Target`. 
+// +// emitLexer(grammar, target) → the lexer source for that target +// emitParser(grammar, target) → the parser source for that target, REUSING emitLexer +// +// A `Target` owns BOTH halves, so emitParser(grammar, target) reuses the SAME target's lexer — +// jsTarget's parser embeds jsTarget's SoA-int lexer, goTarget's parser embeds goTarget's +// Tok-list lexer. No cross-target lexer format is shared, so the optimized JS path keeps its +// integer-bitmask token dispatch while the portable targets keep their clean byte scanner. +// +// Targets: `jsTarget` (the optimized SoA parser, emit-parser.ts) and the portable +// `tsTarget`/`goTarget`/`rustTarget` (emit-portable.ts + target-*.ts). +import type { CstGrammar } from './types.ts'; + +export interface Target { + name: string; + ext: string; // emitted file extension (no dot) + emitLexer(grammar: CstGrammar): string | null; // null ⇒ runtime-lexer fallback (jsTarget markup/indent grammars) + emitParser(grammar: CstGrammar, lexerSrc: string | null): string; // the parser, embedding `lexerSrc` +} + +export function emitLexer(grammar: CstGrammar, target: Target): string | null { + return target.emitLexer(grammar); +} + +export function emitParser(grammar: CstGrammar, target: Target): string { + return target.emitParser(grammar, emitLexer(grammar, target)); // ← parser reuses lexer +} + +export { jsTarget } from './emit-parser.ts'; +export { tsTarget } from './target-ts.ts'; +export { goTarget } from './target-go.ts'; +export { rustTarget } from './target-rust.ts'; diff --git a/src/target-go.ts b/src/target-go.ts index a654a7b..8d61807 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -1,7 +1,7 @@ // The Go Target for emit-portable. Renders the same language-agnostic ParserIR as tsTarget // into a self-contained Go program (Go stdlib only — the lexer is regex-free, so it compiles // with no module dependencies). 
Its CST JSON is checked byte-for-byte against the interpreter, -// so `emitPortableParser(grammar, goTarget)` is a real, verified Go parser derived from the +// so `emitParser(grammar, goTarget)` is a real, verified Go parser derived from the // same grammar definition. // // ARENA allocation (to minimise GC pressure, as tsgo does): nodes live in a flat `nodes []Node`, @@ -9,8 +9,10 @@ // stack. A node is an int32 index, never a heap pointer. Backtracking truncates the three // slices to saved lengths; the slices keep their capacity across parses (reset to len 0), so a // warmed parser allocates ~nothing per parse. -import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target, TplCfg } from './emit-portable.ts'; -import type { TokenPattern } from './types.ts'; +import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, TplCfg } from './emit-portable.ts'; +import { portableIR } from './emit-portable.ts'; +import type { Target } from './emit.ts'; +import type { TokenPattern, CstGrammar } from './types.ts'; const J = (v: unknown) => JSON.stringify(v); const rangeCond = (v: string, rs: CharRange[]) => @@ -290,7 +292,11 @@ ${r.nudSeqs.map((seq) => `\t{ save := pos; sb := len(scratch); nb := len(nodes); export const goTarget: Target = { name: 'go', ext: 'go', - render(ir: ParserIR): string { + emitLexer(grammar: CstGrammar): string { + return lexer(portableIR(grammar)); + }, + emitParser(grammar: CstGrammar, lexerSrc: string | null): string { + const ir = portableIR(grammar); const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? prattRule(r, ir.tpl) : rdRule(r))).join('\n\n'); const matchTemplate = ir.tpl ? `func matchTemplate() int32 { \tt := peek() @@ -344,7 +350,7 @@ var nodes []Node var kids []int32 var scratch []int32 -${lexer(ir)} +${lexerSrc ?? 
''} func peek() *Tok { \tif pos < len(toks) { return &toks[pos] } diff --git a/src/target-rust.ts b/src/target-rust.ts index 5a43013..8d995f2 100644 --- a/src/target-rust.ts +++ b/src/target-rust.ts @@ -1,7 +1,7 @@ // The Rust Target for emit-portable. Renders the same language-agnostic ParserIR as // tsTarget/goTarget into a self-contained Rust program (no external crates — the lexer is // regex-free, so it compiles with rustc alone, no Cargo/network). Its CST JSON is checked -// byte-for-byte against the interpreter, so `emitPortableParser(grammar, rustTarget)` is a +// byte-for-byte against the interpreter, so `emitParser(grammar, rustTarget)` is a // real, verified Rust parser derived from the same grammar definition. // // Rust ownership note: a CST node is OWNED (moved), unlike the TS/Go pointer trees. In the @@ -11,8 +11,10 @@ // returns it. Sub-sequence combinators (star/opt/sep) take non-capturing fn pointers // `fn(&mut Parser, &mut Vec) -> bool`, threading the parser + kids as params (so nothing // is captured, sidestepping the borrow checker). 
-import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target, TplCfg } from './emit-portable.ts'; -import type { TokenPattern } from './types.ts'; +import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, TplCfg } from './emit-portable.ts'; +import { portableIR } from './emit-portable.ts'; +import type { Target } from './emit.ts'; +import type { TokenPattern, CstGrammar } from './types.ts'; const J = (v: unknown) => JSON.stringify(v); const rangeCond = (v: string, rs: CharRange[]) => @@ -312,7 +314,11 @@ ${r.nudSeqs.map((seq) => ` { let save = self.pos; let mut kids: Vec export const rustTarget: Target = { name: 'rust', ext: 'rs', - render(ir: ParserIR): string { + emitLexer(grammar: CstGrammar): string { + return lexer(portableIR(grammar)); + }, + emitParser(grammar: CstGrammar, lexerSrc: string | null): string { + const ir = portableIR(grammar); const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? prattRule(r, ir.tpl) : rdRule(r))).join('\n\n'); const matchTemplate = ir.tpl ? ` fn match_template(&mut self) -> Option { let t = self.peek()?; @@ -350,7 +356,7 @@ impl Cst { // offset/end inferred from first/last child (children non-empty). fn node(rule: &'static str, kids: Vec) -> Cst { let o = kids[0].offset; let e = kids[kids.len() - 1].end; Cst::node(rule, kids, o, e) } -${lexer(ir)} +${lexerSrc ?? ''} struct Parser<'a> { toks: Vec>, pos: usize, capped: bool, suppress_next: Vec<&'static str>, src: &'a str } impl<'a> Parser<'a> { diff --git a/src/target-ts.ts b/src/target-ts.ts index 7984ba9..549b302 100644 --- a/src/target-ts.ts +++ b/src/target-ts.ts @@ -4,7 +4,10 @@ // index LEDs), and a CST→JSON printer over stdin. It is the reference rendering — its CST // is checked byte-for-byte against the interpreter (createParser), so a divergence in the // portable logic surfaces here before Go/Rust are compiled. 
-import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target, TplCfg } from './emit-portable.ts'; +import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, TplCfg } from './emit-portable.ts'; +import { portableIR } from './emit-portable.ts'; +import type { Target } from './emit.ts'; +import type { CstGrammar } from './types.ts'; const J = (v: unknown) => JSON.stringify(v); const rangeCond = (v: string, rs: CharRange[]) => @@ -267,7 +270,11 @@ ${r.nudSeqs.map((seq) => ` { const save = pos; const kids: Cst[] = []; if (${se export const tsTarget: Target = { name: 'typescript', ext: 'ts', - render(ir: ParserIR): string { + emitLexer(grammar: CstGrammar): string { + return lexer(portableIR(grammar)); + }, + emitParser(grammar: CstGrammar, lexerSrc: string | null): string { + const ir = portableIR(grammar); const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? prattRule(r, ir.tpl) : rdRule(r))).join('\n\n'); const matchTemplate = ir.tpl ? `function matchTemplate(): Cst | null { const t = peek(); @@ -296,7 +303,7 @@ type Leaf = { tokenType: string; offset: number; end: number }; type Node = { rule: string; children: Cst[]; offset: number; end: number }; type Cst = Node | Leaf; -${lexer(ir)} +${lexerSrc ?? 
''} let toks: Tok[] = []; let pos = 0; diff --git a/test/cst-match-totality.ts b/test/cst-match-totality.ts index 25c0d8b..2aab35f 100644 --- a/test/cst-match-totality.ts +++ b/test/cst-match-totality.ts @@ -13,7 +13,7 @@ // node test/cst-match-totality.ts import { existsSync, readFileSync, readdirSync, statSync, writeFileSync } from 'node:fs'; import { join } from 'node:path'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { generateInputs } from './grammar-gen.ts'; const GRAMMARS = ['typescript', 'javascript', 'typescriptreact', 'javascriptreact', 'yaml', 'html']; @@ -52,7 +52,7 @@ for (const name of GRAMMARS) { const grammar = (await import(`../${name}.ts`)).default; const matchers = (await import(`../${name}.cst-match.ts`)).MATCHERS; const emPath = `/tmp/emitted-totality-${name}.mts`; - writeFileSync(emPath, emitParser(grammar)); + writeFileSync(emPath, emitParser(grammar, jsTarget)); const em = (await import(emPath + '?v=' + process.pid)) as Emitted; let parsed = 0; for (const input of generateInputs(grammar, { depth: 5, nestDepth: 5, cap: 7, fuzzRounds: 250, maxInputs: 1500, seed: 5 })) { diff --git a/test/emit-lexer-verify.ts b/test/emit-lexer-verify.ts index e0ab8a5..900de91 100644 --- a/test/emit-lexer-verify.ts +++ b/test/emit-lexer-verify.ts @@ -9,14 +9,14 @@ // node test/emit-lexer-verify.ts # in-repo corpus (+ /tmp/ts-repo if present) import { readFileSync, writeFileSync } from 'node:fs'; import { createLexer } from '../src/gen-lexer.ts'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { inRepoCorpus, externalTsFiles } from './emit-corpus.ts'; const grammar = (await import('../typescript.ts')).default; // The reference: createLexer with the SAME intern config the emitted parser bakes. 
const EMITTED = '/tmp/emit-lexer-verify-parser.mts'; -writeFileSync(EMITTED, emitParser(grammar)); +writeFileSync(EMITTED, emitParser(grammar, jsTarget)); const emitted = await import(EMITTED + '?v=' + Date.now()); const src = readFileSync(EMITTED, 'utf-8'); if (src.includes('createLexer(')) { diff --git a/test/emit-parser-bench.ts b/test/emit-parser-bench.ts index 5f9a2a3..5af58a2 100644 --- a/test/emit-parser-bench.ts +++ b/test/emit-parser-bench.ts @@ -9,14 +9,14 @@ // node test/emit-parser-bench.ts # the 4 bench files, N=20 // node test/emit-parser-bench.ts # custom timed-run count import { createParser } from '../src/gen-parser.ts'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { readFileSync, writeFileSync } from 'fs'; const grammar = (await import('../typescript.ts')).default; const oracle = createParser(grammar); const EMITTED = '/tmp/emitted-parser.mts'; -writeFileSync(EMITTED, emitParser(grammar)); +writeFileSync(EMITTED, emitParser(grammar, jsTarget)); const emitted = await import(EMITTED + '?v=' + Date.now()); const N = Number(process.argv[2]) || 20; diff --git a/test/emit-parser-verify.ts b/test/emit-parser-verify.ts index 63228c6..b3020bd 100644 --- a/test/emit-parser-verify.ts +++ b/test/emit-parser-verify.ts @@ -13,7 +13,7 @@ // node test/emit-parser-verify.ts # external sweep stride N (default ~400 files) import { objectify } from './emitted-obj.ts'; import { createParser } from '../src/gen-parser.ts'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { inRepoCorpus, externalTsFiles } from './emit-corpus.ts'; import { readFileSync, writeFileSync } from 'fs'; @@ -22,7 +22,7 @@ const oracle = createParser(grammar); // Emit, write to /tmp, import the standalone module. 
const EMITTED = '/tmp/emitted-parser.mts'; -writeFileSync(EMITTED, emitParser(grammar)); +writeFileSync(EMITTED, emitParser(grammar, jsTarget)); const emitted = await import(EMITTED + '?v=' + Date.now()); type Outcome = { ok: true; cst: string } | { ok: false; err: string }; diff --git a/test/emit-reject-messages.ts b/test/emit-reject-messages.ts index 9d549b0..28891e7 100644 --- a/test/emit-reject-messages.ts +++ b/test/emit-reject-messages.ts @@ -16,7 +16,7 @@ // // node test/emit-reject-messages.ts import { createParser } from '../src/gen-parser.ts'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { inRepoCorpus, externalTsFiles } from './emit-corpus.ts'; import { readFileSync, writeFileSync } from 'fs'; @@ -24,7 +24,7 @@ const grammar = (await import('../typescript.ts')).default; const oracle = createParser(grammar); const EMITTED = '/tmp/emitted-parser-msg.mts'; -writeFileSync(EMITTED, emitParser(grammar)); +writeFileSync(EMITTED, emitParser(grammar, jsTarget)); const emitted = await import(EMITTED + '?v=' + Date.now()); function errOf(parse: (s: string) => unknown, code: string): string | null { diff --git a/test/emit-tsc-gate.ts b/test/emit-tsc-gate.ts index e6df929..a923934 100644 --- a/test/emit-tsc-gate.ts +++ b/test/emit-tsc-gate.ts @@ -19,7 +19,7 @@ // and it already paid off: the fallback editCore branch referenced cs/ceOld/ // parenCachePos declared only in the soa branch (unreached at runtime, invisible // until this gate), now hoisted/gated correctly. 
-import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { writeFileSync } from 'node:fs'; import { execFileSync } from 'node:child_process'; import type { CstGrammar } from '../src/types.ts'; @@ -51,7 +51,7 @@ for (const [name, path] of GRAMMARS) { continue; } const out = `/tmp/emit-tsc-gate-${name}.ts`; - writeFileSync(out, emitParser(grammar)); + writeFileSync(out, emitParser(grammar, jsTarget)); try { execFileSync('npx', ['tsc', ...TSC_FLAGS, out], { stdio: 'pipe' }); console.log(` ${name}: ✓ emitted parser type-checks (tsc --strict)`); diff --git a/test/exhaustive-edits.ts b/test/exhaustive-edits.ts index 72a8ca9..9aa404f 100644 --- a/test/exhaustive-edits.ts +++ b/test/exhaustive-edits.ts @@ -9,7 +9,7 @@ // node --max-old-space-size=4096 test/exhaustive-edits.ts import { writeFileSync } from 'node:fs'; import { token, rule, defineGrammar, many, opt, sep, plus, oneOf, range, seq, star, noneOf } from '../src/api.ts'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { objectify } from './emitted-obj.ts'; // A deliberately bracket-and-list-shaped grammar: parens force synthesis and @@ -32,7 +32,7 @@ const g = defineGrammar({ }); const emPath = '/tmp/emitted-exhaustive.mts'; -writeFileSync(emPath, emitParser(g)); +writeFileSync(emPath, emitParser(g, jsTarget)); type Cst = { root: number; errors: object[] }; type Parser = { parse(s: string): Cst; edit(c: Cst, e: object[]): void; visit(c: Cst, fns: object): void; tree: import('./emitted-obj.ts').TreeView }; const em = (await import(emPath + '?v=' + process.pid)) as { createParser(): Parser; __arenaStats(): { inPlaceShrink: number } }; diff --git a/test/head-to-head.ts b/test/head-to-head.ts index 15f913b..1d84a5a 100644 --- a/test/head-to-head.ts +++ b/test/head-to-head.ts @@ -15,7 +15,7 @@ // so it reads through a 16KB chunk callback (its documented large-input path). 
import { readFileSync } from 'node:fs'; import { createRequire } from 'node:module'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { writeFileSync } from 'node:fs'; import ts from 'typescript'; @@ -27,7 +27,7 @@ const TSLang = require(TS_BENCH + '/node_modules/tree-sitter-typescript').typesc const grammar = (await import('../typescript.ts')).default; const emPath = '/tmp/emitted-h2h.mts'; -writeFileSync(emPath, emitParser(grammar)); +writeFileSync(emPath, emitParser(grammar, jsTarget)); const { createParser } = await import(emPath + '?v=' + process.pid); const unit = readFileSync(CORPUS, 'utf-8'); diff --git a/test/incremental-grammars.ts b/test/incremental-grammars.ts index 404272b..9c4a780 100644 --- a/test/incremental-grammars.ts +++ b/test/incremental-grammars.ts @@ -13,7 +13,7 @@ // // node test/incremental-grammars.ts import { writeFileSync } from 'node:fs'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { generateInputs } from './grammar-gen.ts'; import { objectify } from './emitted-obj.ts'; @@ -85,7 +85,7 @@ const failures: string[] = []; for (const name of GRAMMARS) { const grammar = (await import(`../${name}.ts`)).default; const emPath = `/tmp/emitted-incr-${name}.mts`; - writeFileSync(emPath, emitParser(grammar)); + writeFileSync(emPath, emitParser(grammar, jsTarget)); const em = (await import(emPath + '?v=' + process.pid)) as Em; const session = em.createParser(); const fresh = em.createParser(); diff --git a/test/incremental-verify.ts b/test/incremental-verify.ts index cd01c17..85814e1 100644 --- a/test/incremental-verify.ts +++ b/test/incremental-verify.ts @@ -8,11 +8,11 @@ // node test/incremental-verify.ts import { objectify } from './emitted-obj.ts'; import { existsSync, readFileSync, writeFileSync } from 'node:fs'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from 
'../src/emit.ts'; const grammar = (await import('../typescript.ts')).default; const emPath = '/tmp/emitted-incremental.mts'; -writeFileSync(emPath, emitParser(grammar)); +writeFileSync(emPath, emitParser(grammar, jsTarget)); type Edit = { start: number; end: number; text: string }; type Cst = { root: number; errors: { offset: number; end: number; message: string }[] }; type Parser = { diff --git a/test/multi-doc.ts b/test/multi-doc.ts index c3d844d..25af324 100644 --- a/test/multi-doc.ts +++ b/test/multi-doc.ts @@ -9,12 +9,12 @@ // // node test/multi-doc.ts import { writeFileSync } from 'node:fs'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { objectify } from './emitted-obj.ts'; const grammar = (await import('../typescript.ts')).default; const emPath = '/tmp/emitted-multidoc.mts'; -writeFileSync(emPath, emitParser(grammar)); +writeFileSync(emPath, emitParser(grammar, jsTarget)); type Edit = { start: number; end: number; text: string }; type Cst = { root: number; errors: { offset: number; end: number; message: string }[] }; type Parser = { parse(s: string): Cst; edit(cst: Cst, edits: Edit[]): void; visit(cst: Cst, fns: object): void; tree: import('./emitted-obj.ts').TreeView }; diff --git a/test/portable-targets.ts b/test/portable-targets.ts index 7dd6544..5732b64 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -1,4 +1,4 @@ -// Gate: the TARGET-AGNOSTIC emitter (issue #6) — `emitPortableParser(grammar, target)` +// Gate: the TARGET-AGNOSTIC emitter (issue #6) — `emitParser(grammar, target)` // derives a parser in EACH target language that produces the byte-identical CST the // interpreter (createParser) does. 
The agnosticism proof by EXECUTION: every grammar is // rendered to TypeScript, Go, and Rust; the Go/Rust sources are COMPILED and RUN, and each @@ -15,10 +15,7 @@ import { execFileSync } from 'node:child_process'; import { mkdirSync, rmSync, writeFileSync } from 'node:fs'; import { createParser } from '../src/gen-parser.ts'; -import { emitPortableParser } from '../src/emit-portable.ts'; -import { tsTarget } from '../src/target-ts.ts'; -import { goTarget } from '../src/target-go.ts'; -import { rustTarget } from '../src/target-rust.ts'; +import { emitParser, tsTarget, goTarget, rustTarget } from '../src/emit.ts'; import type { CstGrammar } from '../src/types.ts'; type Case = { grammar: string; path: string; accept: string[]; reject: string[]; tsOnly?: boolean }; @@ -243,19 +240,19 @@ for (const c of CASES) { const runners: Array<{ label: string; run: (src: string) => Outcome }> = []; const tsFile = `${dir}/p.ts`; - writeFileSync(tsFile, emitPortableParser(grammar, tsTarget)); + writeFileSync(tsFile, emitParser(grammar, tsTarget)); runners.push({ label: 'typescript', run: (src) => runProc('node', [tsFile], src) }); if (HAS_GO && !c.tsOnly) { const gdir = `${dir}/go`; mkdirSync(gdir, { recursive: true }); - writeFileSync(`${gdir}/main.go`, emitPortableParser(grammar, goTarget)); + writeFileSync(`${gdir}/main.go`, emitParser(grammar, goTarget)); writeFileSync(`${gdir}/go.mod`, 'module p\n\ngo 1.21\n'); execFileSync('go', ['build', '-o', `${gdir}/p`, '.'], { cwd: gdir, stdio: 'pipe' }); runners.push({ label: 'go', run: (src) => runProc(`${gdir}/p`, [], src) }); } if (HAS_RUST && !c.tsOnly) { const rfile = `${dir}/main.rs`; - writeFileSync(rfile, emitPortableParser(grammar, rustTarget)); + writeFileSync(rfile, emitParser(grammar, rustTarget)); execFileSync('rustc', ['-O', '-A', 'warnings', rfile, '-o', `${dir}/pr`], { stdio: 'pipe' }); runners.push({ label: 'rust', run: (src) => runProc(`${dir}/pr`, [], src) }); } diff --git a/test/profile-vs-peers.mjs 
b/test/profile-vs-peers.mjs index 421bc6a..e67a1f7 100644 --- a/test/profile-vs-peers.mjs +++ b/test/profile-vs-peers.mjs @@ -17,10 +17,10 @@ import { fileURLToPath } from 'node:url'; const REPO = resolve(dirname(fileURLToPath(import.meta.url)), '..'); const acorn = await import(REPO + '/node_modules/acorn/dist/acorn.mjs'); const parse5 = await import(REPO + '/node_modules/parse5/dist/index.js'); -const { emitParser } = await import(REPO + '/src/emit-parser.ts'); +const { emitParser, jsTarget } = await import(REPO + '/src/emit.ts'); -writeFileSync('/tmp/emitted-peers-js.mjs', emitParser((await import(REPO + '/javascript.ts')).default)); -writeFileSync('/tmp/emitted-peers-html.mjs', emitParser((await import(REPO + '/html.ts')).default)); +writeFileSync('/tmp/emitted-peers-js.mjs', emitParser((await import(REPO + '/javascript.ts')).default, jsTarget)); +writeFileSync('/tmp/emitted-peers-html.mjs', emitParser((await import(REPO + '/html.ts')).default, jsTarget)); const monoJs = await import('/tmp/emitted-peers-js.mjs?v=' + Date.now()); const monoHtml = await import('/tmp/emitted-peers-html.mjs?v=' + Date.now()); diff --git a/test/profile-vs-tsc.mjs b/test/profile-vs-tsc.mjs index b668fdd..0dc13c3 100644 --- a/test/profile-vs-tsc.mjs +++ b/test/profile-vs-tsc.mjs @@ -11,10 +11,10 @@ import { fileURLToPath } from 'node:url'; const REPO = resolve(dirname(fileURLToPath(import.meta.url)), '..'); const ts = (await import(REPO + '/node_modules/typescript/lib/typescript.js')).default; -const { emitParser } = await import(REPO + '/src/emit-parser.ts'); +const { emitParser, jsTarget } = await import(REPO + '/src/emit.ts'); const grammar = (await import(REPO + '/typescript.ts')).default; -writeFileSync('/tmp/emitted-current.mjs', emitParser(grammar)); +writeFileSync('/tmp/emitted-current.mjs', emitParser(grammar, jsTarget)); const emitted = await import('/tmp/emitted-current.mjs?v=' + Date.now()); const paths = [ diff --git a/test/recovery-conformance.ts 
b/test/recovery-conformance.ts index a2eda45..7c7a7f1 100644 --- a/test/recovery-conformance.ts +++ b/test/recovery-conformance.ts @@ -13,12 +13,12 @@ import { writeFileSync, readFileSync } from 'node:fs'; import { readdir } from 'fs/promises'; import { join } from 'path'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import ts from 'typescript'; const grammar = (await import('../typescript.ts')).default; const emPath = '/tmp/emitted-recovery-conf.mts'; -writeFileSync(emPath, emitParser(grammar)); +writeFileSync(emPath, emitParser(grammar, jsTarget)); type Cst = { root: number; errors: { offset: number; end: number; message: string }[] }; const em = (await import(emPath + '?v=' + process.pid)) as { createParser(): { parse(s: string): Cst } }; const p = em.createParser(); diff --git a/test/recovery.ts b/test/recovery.ts index 9215c46..193cae8 100644 --- a/test/recovery.ts +++ b/test/recovery.ts @@ -13,12 +13,12 @@ // // node test/recovery.ts import { existsSync, readFileSync, writeFileSync } from 'node:fs'; -import { emitParser } from '../src/emit-parser.ts'; +import { emitParser, jsTarget } from '../src/emit.ts'; import { objectify } from './emitted-obj.ts'; const grammar = (await import('../typescript.ts')).default; const emPath = '/tmp/emitted-recovery.mts'; -writeFileSync(emPath, emitParser(grammar)); +writeFileSync(emPath, emitParser(grammar, jsTarget)); type Edit = { start: number; end: number; text: string }; type Diag = { offset: number; end: number; message: string; related?: { offset: number; end: number; message: string } }; type Cst = { root: number; errors: Diag[] }; From 6e0df6c9f5a3f3b684030bf7c2f2810b40da0a89 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 07:14:55 +0800 Subject: [PATCH 25/27] test: move the portable-targets fixtures examples/ -> test/fixtures/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The thirteen grammars 
under examples/ are not user-facing examples — they are the construct-isolation fixtures consumed solely by test/portable-targets.ts (each isolates one emitter construct so a divergence pinpoints which one broke). They belong next to their only consumer, beside test/vendor/, not in a directory whose name promises a learning sample. No real examples were displaced; examples/ held only fixtures and is now removed. Mechanical: git mv to test/fixtures/, fixtures' `../src` imports -> `../../src`, gate paths `../examples/X.ts` -> `./fixtures/X.ts`. Full suite 42/42. --- src/emit-portable.ts | 2 +- {examples => test/fixtures}/altjs.ts | 2 +- {examples => test/fixtures}/arrowjs.ts | 2 +- {examples => test/fixtures}/calc.ts | 2 +- {examples => test/fixtures}/ledjs.ts | 2 +- {examples => test/fixtures}/minijs.ts | 2 +- {examples => test/fixtures}/noinjs.ts | 2 +- {examples => test/fixtures}/nudjs.ts | 2 +- {examples => test/fixtures}/postjs.ts | 2 +- {examples => test/fixtures}/regexjs.ts | 2 +- {examples => test/fixtures}/richtokens.ts | 2 +- {examples => test/fixtures}/seqjs.ts | 2 +- {examples => test/fixtures}/sljs.ts | 2 +- {examples => test/fixtures}/templatejs.ts | 2 +- test/portable-targets.ts | 26 +++++++++++------------ 15 files changed, 27 insertions(+), 27 deletions(-) rename {examples => test/fixtures}/altjs.ts (98%) rename {examples => test/fixtures}/arrowjs.ts (98%) rename {examples => test/fixtures}/calc.ts (98%) rename {examples => test/fixtures}/ledjs.ts (98%) rename {examples => test/fixtures}/minijs.ts (99%) rename {examples => test/fixtures}/noinjs.ts (98%) rename {examples => test/fixtures}/nudjs.ts (98%) rename {examples => test/fixtures}/postjs.ts (97%) rename {examples => test/fixtures}/regexjs.ts (99%) rename {examples => test/fixtures}/richtokens.ts (98%) rename {examples => test/fixtures}/seqjs.ts (98%) rename {examples => test/fixtures}/sljs.ts (98%) rename {examples => test/fixtures}/templatejs.ts (98%) diff --git a/src/emit-portable.ts 
b/src/emit-portable.ts index e2ec5d8..4bbf760 100644 --- a/src/emit-portable.ts +++ b/src/emit-portable.ts @@ -19,7 +19,7 @@ // and a Pratt expression engine with operator precedence/associativity, prefix unary, // bracket NUDs (grouping, array), and mixfix LEDs (call / member / index) tried before // operators. buildIR THROWS on a construct outside this set rather than emit a wrong -// parser. This is enough to derive a real JavaScript-subset parser (examples/minijs.ts). +// parser. This is enough to derive a real JavaScript-subset parser (test/fixtures/minijs.ts). import type { CstGrammar, RuleExpr, TokenDecl, TokenPattern } from './types.ts'; import { withAwaitYield } from './await-yield-fork.ts'; import { analyzeGrammar, findEntryRule } from './grammar-analysis.ts'; diff --git a/examples/altjs.ts b/test/fixtures/altjs.ts similarity index 98% rename from examples/altjs.ts rename to test/fixtures/altjs.ts index d1f117d..fe409a7 100644 --- a/examples/altjs.ts +++ b/test/fixtures/altjs.ts @@ -5,7 +5,7 @@ import { token, rule, defineGrammar, left, op, seq, oneOf, range, star, sep, opt, many, alt, noneOf, -} from '../src/api.ts'; +} from '../../src/api.ts'; const digit = range('0', '9'); const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); diff --git a/examples/arrowjs.ts b/test/fixtures/arrowjs.ts similarity index 98% rename from examples/arrowjs.ts rename to test/fixtures/arrowjs.ts index 486cdaa..b4967c9 100644 --- a/examples/arrowjs.ts +++ b/test/fixtures/arrowjs.ts @@ -6,7 +6,7 @@ import { token, rule, defineGrammar, left, right, op, capExpr, alt, seq, oneOf, range, star, sep, opt, many, -} from '../src/api.ts'; +} from '../../src/api.ts'; const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); diff --git a/examples/calc.ts b/test/fixtures/calc.ts similarity index 98% rename from examples/calc.ts rename to test/fixtures/calc.ts index 2bfcfff..5fb7078 
100644 --- a/examples/calc.ts +++ b/test/fixtures/calc.ts @@ -15,7 +15,7 @@ import { token, rule, defineGrammar, left, right, op, prefix, seq, oneOf, range, star, many, -} from '../src/api.ts'; +} from '../../src/api.ts'; const digit = range('0', '9'); const identStart = oneOf(range('a', 'z'), range('A', 'Z'), '_'); diff --git a/examples/ledjs.ts b/test/fixtures/ledjs.ts similarity index 98% rename from examples/ledjs.ts rename to test/fixtures/ledjs.ts index f13831a..7148851 100644 --- a/examples/ledjs.ts +++ b/test/fixtures/ledjs.ts @@ -5,7 +5,7 @@ import { token, rule, defineGrammar, left, right, op, seq, oneOf, range, star, many, -} from '../src/api.ts'; +} from '../../src/api.ts'; const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); diff --git a/examples/minijs.ts b/test/fixtures/minijs.ts similarity index 99% rename from examples/minijs.ts rename to test/fixtures/minijs.ts index 6de468a..25279e6 100644 --- a/examples/minijs.ts +++ b/test/fixtures/minijs.ts @@ -16,7 +16,7 @@ import { token, rule, defineGrammar, left, right, op, prefix, alt, seq, oneOf, range, star, sep, opt, many, altPattern, noneOf, anyChar, -} from '../src/api.ts'; +} from '../../src/api.ts'; const digit = range('0', '9'); const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); diff --git a/examples/noinjs.ts b/test/fixtures/noinjs.ts similarity index 98% rename from examples/noinjs.ts rename to test/fixtures/noinjs.ts index bc413ad..54d5395 100644 --- a/examples/noinjs.ts +++ b/test/fixtures/noinjs.ts @@ -5,7 +5,7 @@ import { token, rule, defineGrammar, left, op, exclude, seq, oneOf, range, star, many, -} from '../src/api.ts'; +} from '../../src/api.ts'; const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); diff --git a/examples/nudjs.ts b/test/fixtures/nudjs.ts similarity index 98% rename 
from examples/nudjs.ts rename to test/fixtures/nudjs.ts index a443b7b..d9b54c6 100644 --- a/examples/nudjs.ts +++ b/test/fixtures/nudjs.ts @@ -6,7 +6,7 @@ import { token, rule, defineGrammar, left, op, seq, oneOf, range, star, sep, opt, many, alt, not, noneOf, -} from '../src/api.ts'; +} from '../../src/api.ts'; const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); diff --git a/examples/postjs.ts b/test/fixtures/postjs.ts similarity index 97% rename from examples/postjs.ts rename to test/fixtures/postjs.ts index 239fdec..1ac9340 100644 --- a/examples/postjs.ts +++ b/test/fixtures/postjs.ts @@ -4,7 +4,7 @@ import { token, rule, defineGrammar, left, right, op, prefix, postfix, seq, oneOf, range, star, many, -} from '../src/api.ts'; +} from '../../src/api.ts'; const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); diff --git a/examples/regexjs.ts b/test/fixtures/regexjs.ts similarity index 99% rename from examples/regexjs.ts rename to test/fixtures/regexjs.ts index b9ad82d..0f966f9 100644 --- a/examples/regexjs.ts +++ b/test/fixtures/regexjs.ts @@ -7,7 +7,7 @@ import { token, rule, defineGrammar, left, right, op, prefix, alt, seq, oneOf, range, star, sep, opt, many, altPattern, noneOf, anyChar, -} from '../src/api.ts'; +} from '../../src/api.ts'; const digit = range('0', '9'); const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); diff --git a/examples/richtokens.ts b/test/fixtures/richtokens.ts similarity index 98% rename from examples/richtokens.ts rename to test/fixtures/richtokens.ts index ed10aec..3f7bc2d 100644 --- a/examples/richtokens.ts +++ b/test/fixtures/richtokens.ts @@ -9,7 +9,7 @@ import { token, rule, defineGrammar, seq, oneOf, range, star, plus, repeat, optPattern, altPattern, noneOf, anyChar, notFollowedBy, many, -} from '../src/api.ts'; +} from 
'../../src/api.ts'; const digit = range('0', '9'); const hexDigit = oneOf(digit, range('a', 'f'), range('A', 'F')); diff --git a/examples/seqjs.ts b/test/fixtures/seqjs.ts similarity index 98% rename from examples/seqjs.ts rename to test/fixtures/seqjs.ts index d0e40fe..b1facfd 100644 --- a/examples/seqjs.ts +++ b/test/fixtures/seqjs.ts @@ -4,7 +4,7 @@ import { token, rule, defineGrammar, left, op, seq, oneOf, range, star, opt, many, -} from '../src/api.ts'; +} from '../../src/api.ts'; // `many(',', $)` is the rule-level `(',' Expr)*` — a star whose body is the sequence // `, Expr`, exactly the shape javascript.ts uses for comma lists. diff --git a/examples/sljs.ts b/test/fixtures/sljs.ts similarity index 98% rename from examples/sljs.ts rename to test/fixtures/sljs.ts index 68421a5..5c57d9e 100644 --- a/examples/sljs.ts +++ b/test/fixtures/sljs.ts @@ -5,7 +5,7 @@ import { token, rule, defineGrammar, left, op, seq, oneOf, range, star, opt, many, altPattern, noneOf, sameLine, -} from '../src/api.ts'; +} from '../../src/api.ts'; const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$'); diff --git a/examples/templatejs.ts b/test/fixtures/templatejs.ts similarity index 98% rename from examples/templatejs.ts rename to test/fixtures/templatejs.ts index 8cda83d..cf6f523 100644 --- a/examples/templatejs.ts +++ b/test/fixtures/templatejs.ts @@ -6,7 +6,7 @@ import { token, rule, defineGrammar, left, right, op, prefix, alt, seq, oneOf, range, star, sep, opt, many, altPattern, noneOf, notFollowedBy, -} from '../src/api.ts'; +} from '../../src/api.ts'; const digit = range('0', '9'); const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$'); diff --git a/test/portable-targets.ts b/test/portable-targets.ts index 5732b64..732c011 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -21,7 +21,7 @@ import type { CstGrammar } from '../src/types.ts'; type Case = { 
grammar: string; path: string; accept: string[]; reject: string[]; tsOnly?: boolean }; const CASES: Case[] = [ { - grammar: 'calc', path: '../examples/calc.ts', + grammar: 'calc', path: './fixtures/calc.ts', accept: [ '1;', 'a;', '', '1 + 2 * 3;', '1 * 2 + 3;', '1 - 2 - 3;', 'a / b / c;', '1 + 2 + 3 + 4;', '-a;', '-(-a);', '- - a;', '-a * b;', '-a + b * c;', '-(a + b) * c;', @@ -31,7 +31,7 @@ const CASES: Case[] = [ reject: ['1 +;', '(1;', '1 2;', 'let = 1;', ') ;', '* a;', 'let x 1;'], }, { - grammar: 'minijs', path: '../examples/minijs.ts', + grammar: 'minijs', path: './fixtures/minijs.ts', accept: [ '1;', 'a;', '', 'x = 1 + 2 * 3;', '-a * b + 1;', '(1 + 2) * 3;', 'foo(a, b);', 'a.b.c;', 'a[0][1];', 'f()()();', 'a.b(c).d[e];', @@ -50,7 +50,7 @@ const CASES: Case[] = [ // The general token-pattern matcher (stateless real-JS token tier): \u-escaped // identifiers, the decimal/hex number family with a boundary, both-quote strings — // compiled to a backtracking-free matcher in all three targets. - grammar: 'richtokens', path: '../examples/richtokens.ts', + grammar: 'richtokens', path: './fixtures/richtokens.ts', accept: [ '123', '0xFF', '1_000_000', '3.14', 'foo', 'bar_$x9', '"hi"', "'single'", '"esc\\"q\\n"', '123 0xa foo "s" 3.14', '0xDEADbeef 42 _id $x cafe // line\n 7', @@ -62,7 +62,7 @@ const CASES: Case[] = [ // The STATEFUL regex-vs-division lexer: `/` is a regex in expression context, division // after a value. Exercises every branch of prevIsValue — after `=`/keyword/`(`-head // (regex) vs after value/`)`/`]`/member/call (division), plus regex escapes & classes. 
- grammar: 'regexjs', path: '../examples/regexjs.ts', + grammar: 'regexjs', path: './fixtures/regexjs.ts', accept: [ 'a / b;', 'var r = /abc/g;', 'return /re/;', 'if (x) /re/;', '(a + b) / c;', 'a.b / c;', 'foo(x) / y;', '[1, 2] / 3;', 'var x = a / b / c;', @@ -76,7 +76,7 @@ const CASES: Case[] = [ // STATEFUL template literals: the `${…}` interpolation split (head/middle/tail) with a // brace-depth stack — adjacent/multiple holes, exprs in holes, nested templates, and a // nested `{…}` object inside a hole (which must NOT close the hole). - grammar: 'templatejs', path: '../examples/templatejs.ts', + grammar: 'templatejs', path: './fixtures/templatejs.ts', accept: [ 'var a = `hello`;', 'var b = `hi ${name}!`;', 'var c = `${x}${y}`;', 'var d = `a${ x + 1 }b${ y * 2 }c`;', 'var e = `outer ${ `inner ${z}` } end`;', @@ -89,7 +89,7 @@ const CASES: Case[] = [ { // General (non-literal) inline alt: object keys are alt(Ident | Str | Number) — a // backtracking alternation of token refs inside a rule sequence. - grammar: 'altjs', path: '../examples/altjs.ts', + grammar: 'altjs', path: './fixtures/altjs.ts', accept: [ '{a: 1};', '{"k": 2};', '{1: x};', '{a: 1, "b": 2, 3: c};', '{x: 1 + 2 * 3};', '({nested: {inner: 1}});', '{};', 'a + b;', '{k: (1 + 2)};', @@ -99,7 +99,7 @@ const CASES: Case[] = [ { // General Pratt NUD sequences: a reserved-word-guarded identifier (`not(kw)… Ident`, // a zero-width negative lookahead) and a quantifier-first class expression. 
- grammar: 'nudjs', path: '../examples/nudjs.ts', + grammar: 'nudjs', path: './fixtures/nudjs.ts', accept: [ 'x;', 'foo + bar;', 'class C {};', 'class {};', 'class C extends B {};', '@dec class C { m(){} };', 'new Foo;', 'new C();', 'a.b.c;', @@ -111,7 +111,7 @@ const CASES: Case[] = [ // Postfix-operator LED (`x++`/`x--`) + the access-tail closure: once a postfix binds, the // operand is an update expression, so a further postfix or an access tail (`.`/`[`/`(`) // can't attach (`a++--`, `a++.b` are ill-formed; `(a++).b` is fine). - grammar: 'postjs', path: '../examples/postjs.ts', + grammar: 'postjs', path: './fixtures/postjs.ts', accept: [ 'x++;', 'x--;', 'a + b++;', '++x;', 'x++ + y;', 'a.b++;', '(x)++;', '--a.b;', 'x++ * 2;', '(a++).b;', 'x.y.z++;', @@ -121,7 +121,7 @@ const CASES: Case[] = [ { // A grouped sub-sequence `seq` step: comma lists as `star([',', $])` (e.g. `many(',', $)`), // the array/argument-list shape javascript.ts uses. - grammar: 'seqjs', path: '../examples/seqjs.ts', + grammar: 'seqjs', path: './fixtures/seqjs.ts', accept: [ '[1, 2, 3];', '[];', '[1];', 'f(1, 2);', 'f();', '[a + b, c];', 'f(g(1, 2), 3);', '(x);', 'f(a)(b, c);', '[[1,2],[3,4]];', @@ -132,7 +132,7 @@ const CASES: Case[] = [ // The `sameLine` zero-width assertion (no line terminator before the next token): // `return` takes a value only on the same line. Also verifies the lexer's newline-before // tracking across a block comment that spans a newline. - grammar: 'sljs', path: '../examples/sljs.ts', + grammar: 'sljs', path: './fixtures/sljs.ts', accept: [ 'return 1;', 'return;', 'return 1 + 2;', '1 + 2;', 'return /* c */ 1;', '(a);', 'return (1);', @@ -143,7 +143,7 @@ const CASES: Case[] = [ // capBelow (assignment-level) arrow functions: a NUD parsed only when minBp < the // connector's bp, admitting NO led once parsed; the `(x) => y` vs `(x)` ambiguity is // resolved by longest-match ordering (the arrow is tried first, falls back to grouping). 
- grammar: 'arrowjs', path: '../examples/arrowjs.ts', + grammar: 'arrowjs', path: './fixtures/arrowjs.ts', accept: [ 'x => x;', '(a, b) => a + b;', '() => {};', 'x = (() => 1);', 'f(() => 1, 2);', '(x);', 'a + b;', 'x => y => x;', '(() => 2);', '(a) => a;', 'x = y => y;', 'foo();', @@ -154,7 +154,7 @@ const CASES: Case[] = [ { // Precedence-gated mixfix LEDs: ternary `? :` (binds below the operators) and the // chain-rhs relational leds `in`/`instanceof` (`a in b in c` left-chains). - grammar: 'ledjs', path: '../examples/ledjs.ts', + grammar: 'ledjs', path: './fixtures/ledjs.ts', accept: [ 'a == b ? c : d;', 'a ? b : c ? d : e;', 'a + b ? c : d - e;', 'a in b;', 'a in b in c;', 'x instanceof Y;', 'a < b in c;', '1 + 2 * 3 ? 4 : 5;', @@ -165,7 +165,7 @@ const CASES: Case[] = [ { // The no-`in` (suppress) context: a `for (binding in iterable)` head parses its binding // with the `in` led disabled, so `in` belongs to the for-head, not the binding. - grammar: 'noinjs', path: '../examples/noinjs.ts', + grammar: 'noinjs', path: './fixtures/noinjs.ts', accept: [ 'for (x in y) z;', 'x in y;', 'for (a.b in c) d;', 'a in b in c;', 'for ((x) in y) z;', 'for (x in y) a in b;', 'for (x in a in b) z;', From 84895d3cc30e75546160243f80028a21347d8f43 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 07:33:46 +0800 Subject: [PATCH 26/27] Address Copilot review: portable lexer newline parity + stale-API docs + .mts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All ten review comments, verified before fixing: - Portable lexers (ts/go/rust) set newline-before for `\r`/LS/PS, but the interpreter (gen-lexer.ts) sets it only for `\n`. Confirmed by parse: `return\r1;` the oracle ACCEPTS (CR isn't newline-before) while the portable REJECTED. Fixed all three to set pendingNl only for `\n`; `\r`/LS/PS are plain whitespace. Added the `return\r1;` (accept) / `return\r\n1;` (reject) cases to the sljs gate as a guard. 
(go/rust are byte-based, so their `8232`/`8233` checks were already dead; the reachable bug was `\r`.) - README's portable-emitter snippet still imported the removed `emitPortableParser` from src/emit-portable.ts + target-*.ts → rewritten to `emitParser` from src/emit.ts. - calc/minijs fixture header comments referenced `emitPortableParser` → `emitParser`. - profile-vs-tsc/peers write the now-typed emitted parser to `.mjs` and import it; node only strips types from `.ts`/`.mts`, so that would SyntaxError → switched the emitted output to `.mts` (matching the other emit harnesses). emit-parser-verify byte-identical, portable-targets 16 grammars ×3 (incl. the new CR cases), full suite 42/42. Separately noted (not in scope here): the interpreter itself counts only `\n` as a line terminator, not `\r`/LS/PS — a pre-existing JS-ASI conformance gap in the core lexer, on near-extinct inputs. --- README.md | 10 ++++------ src/target-go.ts | 4 ++-- src/target-rust.ts | 4 ++-- src/target-ts.ts | 6 ++++-- test/fixtures/calc.ts | 2 +- test/fixtures/minijs.ts | 2 +- test/portable-targets.ts | 5 ++++- test/profile-vs-peers.mjs | 8 ++++---- test/profile-vs-tsc.mjs | 4 ++-- 9 files changed, 24 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 3b2e3a9..243879e 100644 --- a/README.md +++ b/README.md @@ -340,15 +340,13 @@ const Regex = token(seq( ### The emitted parser need not be JS — Go, Rust, native -The grammar also derives a **standalone parser in another language**. [`emitPortableParser(grammar, target)`](src/emit-portable.ts) runs one analysis into one language-agnostic IR, and each `Target` renders it — including its own regex-free lexer, so the output has no dependency on the JS runtime and compiles offline: +The grammar also derives a **standalone parser in another language**. 
[`emitParser(grammar, target)`](src/emit.ts) runs one analysis into one language-agnostic IR, and each `Target` renders it — including its own regex-free lexer (`emitParser` reuses `emitLexer(grammar, target)`), so the output has no dependency on the JS runtime and compiles offline: ```ts -import { emitPortableParser } from './src/emit-portable.ts'; -import { goTarget } from './src/target-go.ts'; -import { rustTarget } from './src/target-rust.ts'; +import { emitParser, goTarget, rustTarget } from './src/emit.ts'; -writeFileSync('parser.go', emitPortableParser(grammar, goTarget)); // `go build`, no deps -writeFileSync('parser.rs', emitPortableParser(grammar, rustTarget)); // `rustc`, no crates +writeFileSync('parser.go', emitParser(grammar, goTarget)); // `go build`, no deps +writeFileSync('parser.rs', emitParser(grammar, rustTarget)); // `rustc`, no crates ``` The proof is the full languages: the real [`javascript.ts`](javascript.ts) and [`typescript.ts`](typescript.ts) grammars — including the `[Await]/[Yield]` fork, left recursion, the regex/division and template state machines, arrow functions, and the TS type grammar — emit to **TypeScript, Go, and Rust**, and every CST is byte-identical to the reference interpreter. [`test/portable-targets.ts`](test/portable-targets.ts) compiles and runs all three for sixteen grammars (the two real languages plus focused fixtures) on every CI run. The Rust output reaches [oxc](https://github.com/oxc-project/oxc) throughput and the Go output beats [tsgo](https://github.com/microsoft/typescript-go) on the same corpus (an arena keeps both near zero-allocation). Byte-based Go/Rust use UTF-8 offsets — identical to the JS interpreter's for ASCII; non-ASCII offset units differ inherently. diff --git a/src/target-go.ts b/src/target-go.ts index 8d61807..d45ec2c 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -156,8 +156,8 @@ ${emitHooks} \t_ = pendingNl ${rxState}${tplState}${emitFn}${pushTokFn}${defs.length ? 
'\t_s = src\n' : ''}\tfor pos < n { \t\tc := int(src[pos]) -\t\tif c == 10 || c == 13 || c == 8232 || c == 8233 { pendingNl = true; pos++; continue } -\t\tif c == 32 || c == 9 || c == 11 || c == 12 || c == 160 || c == 5760 || (c >= 8192 && c <= 8202) || c == 8239 || c == 8287 || c == 12288 || c == 65279 { pos++; continue } +\t\tif c == 10 { pendingNl = true; pos++; continue } // only LF (10) is newline-before (matches the interpreter); CR is plain whitespace +\t\tif c == 13 || c == 32 || c == 9 || c == 11 || c == 12 || c == 160 || c == 5760 || (c >= 8192 && c <= 8202) || c == 8239 || c == 8287 || c == 12288 || c == 65279 { pos++; continue } ${tplDispatch}${toks} ${puncts} \t\tpanic(fmt.Sprintf("lex error at %d", pos)) diff --git a/src/target-rust.ts b/src/target-rust.ts index 8d995f2..0466ce2 100644 --- a/src/target-rust.ts +++ b/src/target-rust.ts @@ -163,8 +163,8 @@ ${open} let mut pos = 0usize; while pos < n { let c = b[pos] as u32; - if c == 32 || c == 9 { pos += 1; continue; } - if c == 10 || c == 13 { ${nlVar} = true; pos += 1; continue; } + if c == 32 || c == 9 || c == 13 { pos += 1; continue; } // CR is plain whitespace, NOT newline-before + if c == 10 { ${nlVar} = true; pos += 1; continue; } // only LF (10) is newline-before (matches the interpreter) ${tplDispatch}${toks} ${puncts} panic!("lex error at {}", pos); diff --git a/src/target-ts.ts b/src/target-ts.ts index 549b302..f18c046 100644 --- a/src/target-ts.ts +++ b/src/target-ts.ts @@ -147,8 +147,10 @@ ${emitHooks} let pendingNl = false; ${defs.length ? ' _s = src;\n' : ''}${rxState}${tplState}${stateful ? 
emitFn : ' const push = (kind: string, text: string, off: number, end: number) => { toks.push({ kind, text, off, end, nl: pendingNl }); pendingNl = false; };\n'} while (pos < n) { const c = src.charCodeAt(pos); - if (c === 10 || c === 13 || c === 8232 || c === 8233) { pendingNl = true; pos++; continue; } - if (c === 32 || c === 9 || c === 11 || c === 12 || c === 160 || c === 5760 || (c >= 8192 && c <= 8202) || c === 8239 || c === 8287 || c === 12288 || c === 65279) { pos++; continue; } + // Only LF (char 10) sets newline-before, matching the interpreter (gen-lexer.ts: only wc === 10). + // CR/LS/PS are whitespace but NOT newline-before there, so a lone CR must not flip sameLine. + if (c === 10) { pendingNl = true; pos++; continue; } + if (c === 13 || c === 8232 || c === 8233 || c === 32 || c === 9 || c === 11 || c === 12 || c === 160 || c === 5760 || (c >= 8192 && c <= 8202) || c === 8239 || c === 8287 || c === 12288 || c === 65279) { pos++; continue; } ${tplDispatch}${toks} ${puncts} throw new Error('lex error at ' + pos + ': ' + JSON.stringify(src[pos])); diff --git a/test/fixtures/calc.ts b/test/fixtures/calc.ts index 5fb7078..1d04fd1 100644 --- a/test/fixtures/calc.ts +++ b/test/fixtures/calc.ts @@ -5,7 +5,7 @@ // backtracking alternation, quantifiers (opt/many/sep), recursion (grouping), and — // the crux — a Pratt expression engine with operator PRECEDENCE and associativity // (`1 + 2 * 3` must group as `1 + (2 * 3)`), prefix unary, and a left-associative -// call/postfix continuation. emitPortableParser derives a TS, Go, and Rust parser +// call/postfix continuation. emitParser(grammar, target) derives a TS, Go, and Rust parser // from THIS one definition; the cross-language gate proves all three produce the // byte-identical CST the interpreter (createParser) does. 
// diff --git a/test/fixtures/minijs.ts b/test/fixtures/minijs.ts index 25279e6..21f9bfb 100644 --- a/test/fixtures/minijs.ts +++ b/test/fixtures/minijs.ts @@ -4,7 +4,7 @@ // chains, arrays, and the common statement forms), so the emitted Rust parser can // be benchmarked against oxc on the same bytes. // -// Derived from ONE definition by emitPortableParser into TypeScript, Go, and Rust; +// Derived from ONE definition by emitParser(grammar, target) into TypeScript, Go, and Rust; // the cross-language gate proves all three produce the byte-identical CST that the // interpreter (createParser) does. The portable lexer is regex-free (char scanner // driven by token-pattern.ts's structural recognizers), so the Go/Rust output diff --git a/test/portable-targets.ts b/test/portable-targets.ts index 732c011..ca3072a 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -136,8 +136,11 @@ const CASES: Case[] = [ accept: [ 'return 1;', 'return;', 'return 1 + 2;', '1 + 2;', 'return /* c */ 1;', '(a);', 'return (1);', + // Only `\n` is newline-before — a lone `\r` is plain whitespace, so `return` still takes its + // same-line operand (matches the interpreter; was a portable-lexer bug). CRLF still has the `\n`. 
+ 'return\r1;', ], - reject: ['return\n1;', 'return\nx;', 'return /*\n*/ 1;', 'return // c\n 1;'], + reject: ['return\n1;', 'return\nx;', 'return /*\n*/ 1;', 'return // c\n 1;', 'return\r\n1;'], }, { // capBelow (assignment-level) arrow functions: a NUD parsed only when minBp < the diff --git a/test/profile-vs-peers.mjs b/test/profile-vs-peers.mjs index e67a1f7..801ecde 100644 --- a/test/profile-vs-peers.mjs +++ b/test/profile-vs-peers.mjs @@ -19,10 +19,10 @@ const acorn = await import(REPO + '/node_modules/acorn/dist/acorn.mjs'); const parse5 = await import(REPO + '/node_modules/parse5/dist/index.js'); const { emitParser, jsTarget } = await import(REPO + '/src/emit.ts'); -writeFileSync('/tmp/emitted-peers-js.mjs', emitParser((await import(REPO + '/javascript.ts')).default, jsTarget)); -writeFileSync('/tmp/emitted-peers-html.mjs', emitParser((await import(REPO + '/html.ts')).default, jsTarget)); -const monoJs = await import('/tmp/emitted-peers-js.mjs?v=' + Date.now()); -const monoHtml = await import('/tmp/emitted-peers-html.mjs?v=' + Date.now()); +writeFileSync('/tmp/emitted-peers-js.mts', emitParser((await import(REPO + '/javascript.ts')).default, jsTarget)); +writeFileSync('/tmp/emitted-peers-html.mts', emitParser((await import(REPO + '/html.ts')).default, jsTarget)); +const monoJs = await import('/tmp/emitted-peers-js.mts?v=' + Date.now()); +const monoHtml = await import('/tmp/emitted-peers-html.mts?v=' + Date.now()); function time(fn, code, name, n) { const s = process.hrtime.bigint(); diff --git a/test/profile-vs-tsc.mjs b/test/profile-vs-tsc.mjs index 0dc13c3..61d7382 100644 --- a/test/profile-vs-tsc.mjs +++ b/test/profile-vs-tsc.mjs @@ -14,8 +14,8 @@ const ts = (await import(REPO + '/node_modules/typescript/lib/typescript.js')).d const { emitParser, jsTarget } = await import(REPO + '/src/emit.ts'); const grammar = (await import(REPO + '/typescript.ts')).default; -writeFileSync('/tmp/emitted-current.mjs', emitParser(grammar, jsTarget)); -const emitted = await 
import('/tmp/emitted-current.mjs?v=' + Date.now()); +writeFileSync('/tmp/emitted-current.mts', emitParser(grammar, jsTarget)); +const emitted = await import('/tmp/emitted-current.mts?v=' + Date.now()); const paths = [ '/tmp/ts-repo/tests/cases/conformance/parser/ecmascript5/RealWorld/parserharness.ts', From 8cca2bc0f35cf5bac12e91a2d64be2886295cd6d Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 22 Jun 2026 07:54:20 +0800 Subject: [PATCH 27/27] Fix JS line-terminator conformance across all four lexers (CR / LS / PS) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The lexers counted only LF as a line terminator, but ECMAScript also defines CR (U+000D), LS (U+2028), and PS (U+2029) — the set that drives ASI and the "no LineTerminator here" restrictions. So `return\r1` was parsed `return 1` where a conforming JS parser applies ASI (bare `return`, then `1`). Fixed consistently in all four lexer implementations so they stay in lockstep: - gen-lexer.ts (interpreter, the oracle): LF/CR in the ASCII path, LS/PS via the \s-run regex, and the comment-span check. - emit-lexer.ts (emitted SoA/JS lexer): the same, in its codegen. - target-ts.ts (portable, UTF-16): LF/CR/LS/PS. - target-go.ts / target-rust.ts (portable, byte-based): LF/CR (LS/PS are multi-byte and fall under the documented non-ASCII offset boundary). CRLF is unchanged (the LF already set newline-before), so the existing corpus is unaffected — the change only reaches lone-CR and LS/PS inputs. This supersedes the earlier direction (which had made the portable lexers match the LF-only interpreter); now the interpreter is conforming and all four agree on the full set. sljs gate extended: `return\r1;` / `return\r\n1;` / `return /*\r*/ 1;` reject, `return\t1;` accepts (tab is whitespace, not a terminator) — checked across ts/go/rust. emit-parser-verify byte-identical, portable-targets 16 grammars ×3, full suite 42/42. 
--- src/emit-lexer.ts | 6 +++--- src/gen-lexer.ts | 10 ++++++---- src/target-go.ts | 6 +++--- src/target-rust.ts | 6 +++--- src/target-ts.ts | 9 ++++----- test/portable-targets.ts | 10 +++++----- 6 files changed, 24 insertions(+), 23 deletions(-) diff --git a/src/emit-lexer.ts b/src/emit-lexer.ts index 28f9420..c9f2921 100644 --- a/src/emit-lexer.ts +++ b/src/emit-lexer.ts @@ -43,7 +43,7 @@ const NON_ASCII_WS_FN = // The non-ASCII whitespace fallback, emitted at the two sites that need it (after an ASCII run, // and as the lead char). `cont` appends the `continue` the lead-char site needs. const nonAsciiWsConsume = (v: string, cont: boolean, indent: string): string => - `${indent}if (${v} > 127 && lxNonAsciiWs(${v})) { LX_WS.lastIndex = pos; const m = LX_WS.exec(source); if (m !== null) { if (m[0].includes('\\n')) pendingNl = true; pos += m[0].length;${cont ? ' continue;' : ''} } }`; + `${indent}if (${v} > 127 && lxNonAsciiWs(${v})) { LX_WS.lastIndex = pos; const m = LX_WS.exec(source); if (m !== null) { if (/[\\n\\r\\u2028\\u2029]/.test(m[0])) pendingNl = true; pos += m[0].length;${cont ? ' continue;' : ''} } }`; export function emitSoaLexer(grammar: CstGrammar, st: LexerSymtab): string | null { // Out of scope: the markup / indentation / newline state machines. 
@@ -390,7 +390,7 @@ export function emitSoaLexer(grammar: CstGrammar, st: LexerSymtab): string | nul emit(` if (cc === 32 || (cc >= 9 && cc <= 13)) {`); emit(` let wc = cc;`); emit(` do {`); - emit(` if (wc === 10) pendingNl = true;`); + emit(` if (wc === 10 || wc === 13) pendingNl = true;`); // JS line terminators LF/CR (LS/PS via the \\s regex below) emit(` pos++;`); emit(` wc = source.charCodeAt(pos);`); emit(` } while (wc === 32 || (wc >= 9 && wc <= 13));`); @@ -476,7 +476,7 @@ export function emitSoaLexer(grammar: CstGrammar, st: LexerSymtab): string | nul emit(`${ind} }`); } if (m.skip) { - emit(`${ind} if (m[0].includes('\\n')) pendingNl = true;`); + emit(`${ind} if (/[\\n\\r\\u2028\\u2029]/.test(m[0])) pendingNl = true;`); emit(`${ind} pos += m[0].length;`); } else { emit(`${ind} const _e = pos + m[0].length;`); diff --git a/src/gen-lexer.ts b/src/gen-lexer.ts index 7fc06ea..a4ed10c 100644 --- a/src/gen-lexer.ts +++ b/src/gen-lexer.ts @@ -842,21 +842,23 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) { let wc = source.charCodeAt(pos); if (wc === 32 || (wc >= 9 && wc <= 13)) { do { - if (wc === 10) pendingNl = true; + // JS line terminators: LF, CR, LS, PS (the ECMAScript set, driving ASI / "no + // LineTerminator here"). LF/CR are ASCII (here); LS/PS arrive via the \s regex below. 
+ if (wc === 10 || wc === 13) pendingNl = true; pos++; wc = source.charCodeAt(pos); } while (wc === 32 || (wc >= 9 && wc <= 13)); if (wc > 127) { // a Unicode space may continue the run — absorb it like the old regex did wsReY.lastIndex = pos; const wsMatch = wsReY.exec(source); - if (wsMatch) { if (wsMatch[0].includes('\n')) pendingNl = true; pos += wsMatch[0].length; } + if (wsMatch) { if (/[\n\r\u2028\u2029]/.test(wsMatch[0])) pendingNl = true; pos += wsMatch[0].length; } } continue; } if (wc > 127) { wsReY.lastIndex = pos; const wsMatch = wsReY.exec(source); - if (wsMatch) { if (wsMatch[0].includes('\n')) pendingNl = true; pos += wsMatch[0].length; continue; } + if (wsMatch) { if (/[\n\r\u2028\u2029]/.test(wsMatch[0])) pendingNl = true; pos += wsMatch[0].length; continue; } } } @@ -1178,7 +1180,7 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) { if (!tm.skip) { push(mkNamed(tm.name, m[0], pos, tm.k)); } else { - if (m[0].includes('\n')) pendingNl = true; // a skipped comment spanning a newline still terminates the previous line + if (/[\n\r\u2028\u2029]/.test(m[0])) pendingNl = true; // a skipped comment spanning a line terminator still terminates the previous line // An inline comment (indentation grammars) ENDS a plain scalar — flag the next token so a // multi-line fold won't reabsorb a post-comment line (yaml-test-suite 8XDJ / BF9H). if (indent?.comment && m[0].startsWith(indent.comment)) pendingComment = true; diff --git a/src/target-go.ts b/src/target-go.ts index d45ec2c..5d1e9b9 100644 --- a/src/target-go.ts +++ b/src/target-go.ts @@ -50,7 +50,7 @@ function scanTok(t: LexTok, defs: string[], rxTok?: string, tplTok?: string): st const name = (t as { name: string }).name; const stateful = rxTok !== undefined || tplTok !== undefined; if (tplTok !== undefined && name === tplTok) return ''; // template token scanned by the state machine - const push = (endE: string) => (t.skip ? 
`if strings.Contains(src[pos:${endE}], "\\n") { pendingNl = true }; ` : stateful ? `emit(${J(name)}, src[pos:${endE}], pos, ${endE}); ` : `pushTok(${J(name)}, src[pos:${endE}], pos, ${endE}); `); + const push = (endE: string) => (t.skip ? `if strings.ContainsAny(src[pos:${endE}], "\\n\\r") { pendingNl = true }; ` : stateful ? `emit(${J(name)}, src[pos:${endE}], pos, ${endE}); ` : `pushTok(${J(name)}, src[pos:${endE}], pos, ${endE}); `); const gate = rxTok !== undefined && name === rxTok ? '!prevIsValue() && ' : ''; if (t.kind === 'run') return `\t\tif ${gate}${rangeCond('c', t.first)} { \t\t\te := pos + 1 @@ -156,8 +156,8 @@ ${emitHooks} \t_ = pendingNl ${rxState}${tplState}${emitFn}${pushTokFn}${defs.length ? '\t_s = src\n' : ''}\tfor pos < n { \t\tc := int(src[pos]) -\t\tif c == 10 { pendingNl = true; pos++; continue } // only LF (10) is newline-before (matches the interpreter); CR is plain whitespace -\t\tif c == 13 || c == 32 || c == 9 || c == 11 || c == 12 || c == 160 || c == 5760 || (c >= 8192 && c <= 8202) || c == 8239 || c == 8287 || c == 12288 || c == 65279 { pos++; continue } +\t\tif c == 10 || c == 13 { pendingNl = true; pos++; continue } // JS line terminators LF/CR (matches the interpreter; LS/PS are multi-byte: non-ASCII boundary) +\t\tif c == 32 || c == 9 || c == 11 || c == 12 || c == 160 || c == 5760 || (c >= 8192 && c <= 8202) || c == 8239 || c == 8287 || c == 12288 || c == 65279 { pos++; continue } ${tplDispatch}${toks} ${puncts} \t\tpanic(fmt.Sprintf("lex error at %d", pos)) diff --git a/src/target-rust.ts b/src/target-rust.ts index 0466ce2..52051a0 100644 --- a/src/target-rust.ts +++ b/src/target-rust.ts @@ -54,7 +54,7 @@ function scanTok(t: LexTok, defs: string[], rxTok?: string, tplTok?: string): st const stateful = rxTok !== undefined || tplTok !== undefined; if (tplTok !== undefined && name === tplTok) return ''; // template token scanned by the state machine const nlVar = stateful ? 
'st.pending_nl' : 'pending_nl'; - const push = (endE: string) => (t.skip ? `if src[pos..${endE}].contains('\\n') { ${nlVar} = true; } ` : stateful ? `st.emit(${J(name)}, &src[pos..${endE}], pos, ${endE}); ` : `toks.push(Tok { kind: ${J(name)}, text: &src[pos..${endE}], off: pos, end: ${endE}, nl: pending_nl }); pending_nl = false; `); + const push = (endE: string) => (t.skip ? `if src[pos..${endE}].bytes().any(|c| c == 10 || c == 13) { ${nlVar} = true; } ` : stateful ? `st.emit(${J(name)}, &src[pos..${endE}], pos, ${endE}); ` : `toks.push(Tok { kind: ${J(name)}, text: &src[pos..${endE}], off: pos, end: ${endE}, nl: pending_nl }); pending_nl = false; `); const gate = rxTok !== undefined && name === rxTok ? '!st.prev_is_value() && ' : ''; if (t.kind === 'run') return ` if ${gate}${rangeCond('c', t.first)} { let mut e = pos + 1; @@ -163,8 +163,8 @@ ${open} let mut pos = 0usize; while pos < n { let c = b[pos] as u32; - if c == 32 || c == 9 || c == 13 { pos += 1; continue; } // CR is plain whitespace, NOT newline-before - if c == 10 { ${nlVar} = true; pos += 1; continue; } // only LF (10) is newline-before (matches the interpreter) + if c == 32 || c == 9 { pos += 1; continue; } + if c == 10 || c == 13 { ${nlVar} = true; pos += 1; continue; } // JS line terminators LF/CR (matches the interpreter; LS/PS multi-byte: non-ASCII boundary) ${tplDispatch}${toks} ${puncts} panic!("lex error at {}", pos); diff --git a/src/target-ts.ts b/src/target-ts.ts index f18c046..dc45015 100644 --- a/src/target-ts.ts +++ b/src/target-ts.ts @@ -50,7 +50,7 @@ function scanTok(t: LexTok, defs: string[], rxTok?: string, tplTok?: string): st if (tplTok !== undefined && name === tplTok) return ''; // template token is scanned by the state machine // `emit(...)` threads the lexer state in stateful mode; a plain push otherwise. A skipped // token (comment) still records a newline it spans, so `sameLine` sees it. - const push = (endExpr: string) => (t.skip ? 
`if (src.slice(pos, ${endExpr}).indexOf('\\n') >= 0) pendingNl = true; ` : `${stateful ? 'emit' : 'push'}(${J(name)}, src.slice(pos, ${endExpr}), pos, ${endExpr}); `); + const push = (endExpr: string) => (t.skip ? `if (/[\\n\\r\\u2028\\u2029]/.test(src.slice(pos, ${endExpr}))) pendingNl = true; ` : `${stateful ? 'emit' : 'push'}(${J(name)}, src.slice(pos, ${endExpr}), pos, ${endExpr}); `); const gate = rxTok !== undefined && name === rxTok ? '!prevIsValue() && ' : ''; if (t.kind === 'run') return ` if (${gate}${rangeCond('c', t.first)}) { let e = pos + 1; @@ -147,10 +147,9 @@ ${emitHooks} let pendingNl = false; ${defs.length ? ' _s = src;\n' : ''}${rxState}${tplState}${stateful ? emitFn : ' const push = (kind: string, text: string, off: number, end: number) => { toks.push({ kind, text, off, end, nl: pendingNl }); pendingNl = false; };\n'} while (pos < n) { const c = src.charCodeAt(pos); - // Only LF (char 10) sets newline-before, matching the interpreter (gen-lexer.ts: only wc === 10). - // CR/LS/PS are whitespace but NOT newline-before there, so a lone CR must not flip sameLine. - if (c === 10) { pendingNl = true; pos++; continue; } - if (c === 13 || c === 8232 || c === 8233 || c === 32 || c === 9 || c === 11 || c === 12 || c === 160 || c === 5760 || (c >= 8192 && c <= 8202) || c === 8239 || c === 8287 || c === 12288 || c === 65279) { pos++; continue; } + // JS line terminators LF/CR/LS/PS set newline-before, matching the interpreter (gen-lexer.ts). 
+ if (c === 10 || c === 13 || c === 8232 || c === 8233) { pendingNl = true; pos++; continue; } + if (c === 32 || c === 9 || c === 11 || c === 12 || c === 160 || c === 5760 || (c >= 8192 && c <= 8202) || c === 8239 || c === 8287 || c === 12288 || c === 65279) { pos++; continue; } ${tplDispatch}${toks} ${puncts} throw new Error('lex error at ' + pos + ': ' + JSON.stringify(src[pos])); diff --git a/test/portable-targets.ts b/test/portable-targets.ts index ca3072a..58de024 100644 --- a/test/portable-targets.ts +++ b/test/portable-targets.ts @@ -135,12 +135,12 @@ const CASES: Case[] = [ grammar: 'sljs', path: './fixtures/sljs.ts', accept: [ 'return 1;', 'return;', 'return 1 + 2;', '1 + 2;', 'return /* c */ 1;', - '(a);', 'return (1);', - // Only `\n` is newline-before — a lone `\r` is plain whitespace, so `return` still takes its - // same-line operand (matches the interpreter; was a portable-lexer bug). CRLF still has the `\n`. - 'return\r1;', + '(a);', 'return (1);', 'return\t1;', ], - reject: ['return\n1;', 'return\nx;', 'return /*\n*/ 1;', 'return // c\n 1;', 'return\r\n1;'], + // `\r`, LS, PS are JS line terminators just like `\n` (ASI / "no LineTerminator here"), so a + // `return` followed by any of them takes no operand — across all four lexers (interpreter, + // emitted JS, portable ts/go/rust). A `\t` (tab) is whitespace but NOT a terminator → accepted above. + reject: ['return\n1;', 'return\nx;', 'return /*\n*/ 1;', 'return // c\n 1;', 'return\r1;', 'return\r\n1;', 'return /*\r*/ 1;'], }, { // capBelow (assignment-level) arrow functions: a NUD parsed only when minBp < the