From d82461c30a532abd388ffd59085a228837fa2050 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Mon, 20 Apr 2026 22:03:15 -0600 Subject: [PATCH 1/4] fix(embed): resolve source files from DB root, not cwd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When `codegraph embed --db ` is invoked without a positional directory, the positional `rootDir` defaults to the current working directory. The embed loop then joined every relative node file path against cwd, causing every file read to fail and the command to exit 0 with a misleading "Stored 0 embeddings". Fix: - Persist the repo root (`root_dir`) in the `build_meta` table during build (both WASM/JS finalize and the Rust native orchestrator). - In `buildEmbeddings`, resolve relative file paths against `build_meta.root_dir` first, falling back to the DB's parent directory (`/.codegraph/graph.db` → ``) for DBs built before this change, then finally the caller-provided rootDir. - When every file read fails, throw `DbError` with a clear message so the CLI exits non-zero instead of printing "Stored 0 embeddings" and returning success. Closes #983 Impact: 3 functions changed, 5 affected --- crates/codegraph-core/src/build_pipeline.rs | 7 ++ src/domain/graph/builder/stages/finalize.ts | 5 + src/domain/search/generator.ts | 30 ++++- tests/search/embedding-strategy.test.ts | 116 ++++++++++++++++++++ 4 files changed, 156 insertions(+), 2 deletions(-) diff --git a/crates/codegraph-core/src/build_pipeline.rs b/crates/codegraph-core/src/build_pipeline.rs index 42589dda..656500ca 100644 --- a/crates/codegraph-core/src/build_pipeline.rs +++ b/crates/codegraph-core/src/build_pipeline.rs @@ -703,6 +703,13 @@ fn finalize_build(conn: &Connection, root_dir: &str) -> (i64, i64) { let _ = stmt.execute(["node_count", &node_count.to_string()]); let _ = stmt.execute(["edge_count", &edge_count.to_string()]); let _ = stmt.execute(["last_build", &now_ms().to_string()]); + // Persist repo root so downstream commands (e.g. `codegraph embed`) + // can resolve relative file paths regardless of invoking cwd. + let root_canon = std::fs::canonicalize(root_dir) + .ok() + .and_then(|p| p.to_str().map(|s| s.to_string())) + .unwrap_or_else(|| root_dir.to_string()); + let _ = stmt.execute(["root_dir", &root_canon]); } // Write journal header diff --git a/src/domain/graph/builder/stages/finalize.ts b/src/domain/graph/builder/stages/finalize.ts index 7794d742..62849980 100644 --- a/src/domain/graph/builder/stages/finalize.ts +++ b/src/domain/graph/builder/stages/finalize.ts @@ -88,6 +88,9 @@ function persistBuildMetadata( // subsequent build to be a full rebuild. const codeVersionToWrite = ctx.engineName === 'native' && ctx.engineVersion ? ctx.engineVersion : CODEGRAPH_VERSION; + // Persist the repo root so downstream commands (e.g. `codegraph embed`) + // can resolve relative file paths regardless of the invoking cwd. + const rootDirToWrite = path.resolve(ctx.rootDir); try { if (useNativeDb) { ctx.nativeDb!.setBuildMeta( @@ -99,6 +102,7 @@ function persistBuildMetadata( built_at: buildNow.toISOString(), node_count: String(nodeCount), edge_count: String(actualEdgeCount), + root_dir: rootDirToWrite, }).map(([key, value]) => ({ key, value: String(value) })), ); } else { @@ -110,6 +114,7 @@ function persistBuildMetadata( built_at: buildNow.toISOString(), node_count: nodeCount, edge_count: actualEdgeCount, + root_dir: rootDirToWrite, }); } } catch (err) { diff --git a/src/domain/search/generator.ts b/src/domain/search/generator.ts index 69363367..fd3c5317 100644 --- a/src/domain/search/generator.ts +++ b/src/domain/search/generator.ts @@ -1,6 +1,6 @@ import fs from 'node:fs'; import path from 'node:path'; -import { closeDb, findDbPath, openDb } from '../../db/index.js'; +import { closeDb, findDbPath, getBuildMeta, openDb } from '../../db/index.js'; import { warn } from '../../infrastructure/logger.js'; import { DbError } from '../../shared/errors.js'; import type { BetterSqlite3Database, NodeRow } from '../../types.js'; @@ -73,6 +73,15 @@ export async function buildEmbeddings( const db = openDb(dbPath) as BetterSqlite3Database; initEmbeddingsSchema(db); + // Prefer the repo root recorded at build time — embed may be invoked from a + // different cwd (e.g. `codegraph embed --db /abs/path/graph.db`) and the + // positional rootDir will be wrong in that case. Fall back to the DB's + // parent (`/.codegraph/graph.db` → ``) if metadata is missing, + // then finally the caller-provided rootDir for pre-metadata compat. + const metaRoot = getBuildMeta(db, 'root_dir'); + const dbParent = path.dirname(path.dirname(path.resolve(dbPath))); + const resolvedRoot = metaRoot || dbParent || rootDir; + db.exec('DELETE FROM embeddings'); db.exec('DELETE FROM embedding_meta'); db.exec('DELETE FROM fts_index'); @@ -98,13 +107,17 @@ export async function buildEmbeddings( const config = getModelConfig(modelKey); const contextWindow = config.contextWindow; let overflowCount = 0; + let filesRead = 0; + let filesSkipped = 0; for (const [file, fileNodes] of byFile) { - const fullPath = path.isAbsolute(file) ? file : path.join(rootDir, file); + const fullPath = path.isAbsolute(file) ? file : path.join(resolvedRoot, file); let lines: string[]; try { lines = fs.readFileSync(fullPath, 'utf-8').split('\n'); + filesRead++; } catch (err: unknown) { + filesSkipped++; warn(`Cannot read ${file} for embeddings: ${(err as Error).message}`); continue; } @@ -136,6 +149,19 @@ export async function buildEmbeddings( ); } + // If there were symbols to embed but every file failed to read, the DB was + // almost certainly built from a different location than the current cwd. + // Surface this clearly instead of emitting a silent "Stored 0 embeddings". + if (byFile.size > 0 && filesRead === 0) { + closeDb(db); + throw new DbError( + `embed: could not read any of the ${filesSkipped} source files recorded in the graph — the DB may have been built from a different location than the current working directory.\n` + + `Tried resolving against: ${resolvedRoot}\n` + + 'Pass a positional argument pointing at the original repo root, or re-run "codegraph build" from that directory.', + { file: dbPath }, + ); + } + console.log(`Embedding ${texts.length} symbols...`); const { vectors, dim } = await embed(texts, modelKey); diff --git a/tests/search/embedding-strategy.test.ts b/tests/search/embedding-strategy.test.ts index ff0265c6..16c47780 100644 --- a/tests/search/embedding-strategy.test.ts +++ b/tests/search/embedding-strategy.test.ts @@ -335,6 +335,122 @@ describe('absolute file paths in DB (#752)', () => { }); }); +describe('embed resolves source files from DB root, not cwd (#983)', () => { + let repoDir: string, otherDir: string, repoDbPath: string; + let originalCwd: string; + + beforeAll(() => { + // Repo that was built (files live here) + repoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-embed983-repo-')); + // Unrelated directory we'll cd into when running embed + otherDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-embed983-other-')); + + fs.writeFileSync( + path.join(repoDir, 'a.js'), + 'export function alpha() { return 1; }\nexport function beta() { return alpha(); }\n', + ); + + const dbDir = path.join(repoDir, '.codegraph'); + fs.mkdirSync(dbDir, { recursive: true }); + repoDbPath = path.join(dbDir, 'graph.db'); + + const db = new Database(repoDbPath); + db.pragma('journal_mode = WAL'); + initSchema(db); + // DB stores *relative* file paths (typical of WASM-engine builds) + insertNode(db, 'alpha', 'function', 'a.js', 1, 1); + insertNode(db, 'beta', 'function', 'a.js', 2, 2); + // Persist the repo root as the build pipeline would + db.prepare('INSERT OR REPLACE INTO build_meta (key, value) VALUES (?, ?)').run( + 'root_dir', + path.resolve(repoDir), + ); + db.close(); + + originalCwd = process.cwd(); + }); + + afterAll(() => { + try { + process.chdir(originalCwd); + } catch { + /* ignore */ + } + if (repoDir) fs.rmSync(repoDir, { recursive: true, force: true }); + if (otherDir) fs.rmSync(otherDir, { recursive: true, force: true }); + }); + + test('uses root_dir metadata when embed is invoked from unrelated cwd', async () => { + EMBEDDED_TEXTS.length = 0; + process.chdir(otherDir); + + // Simulate the CLI: positional dir defaults to cwd (here: otherDir), DB path is absolute + await buildEmbeddings(process.cwd(), 'minilm', repoDbPath); + + expect(EMBEDDED_TEXTS.length).toBe(2); + + const db = new Database(repoDbPath, { readonly: true }); + const count = db.prepare('SELECT COUNT(*) as c FROM embeddings').get().c; + db.close(); + expect(count).toBe(2); + }); + + test('falls back to /../.. when root_dir metadata is missing', async () => { + // Build a fresh DB without root_dir metadata + const legacyRepo = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-embed983-legacy-')); + fs.writeFileSync(path.join(legacyRepo, 'b.js'), 'export function gamma() { return 42; }\n'); + const legacyDbDir = path.join(legacyRepo, '.codegraph'); + fs.mkdirSync(legacyDbDir, { recursive: true }); + const legacyDb = path.join(legacyDbDir, 'graph.db'); + + const db = new Database(legacyDb); + db.pragma('journal_mode = WAL'); + initSchema(db); + insertNode(db, 'gamma', 'function', 'b.js', 1, 1); + // Deliberately NOT writing root_dir — simulates DB built before #983 fix + db.close(); + + EMBEDDED_TEXTS.length = 0; + process.chdir(otherDir); + await buildEmbeddings(process.cwd(), 'minilm', legacyDb); + + expect(EMBEDDED_TEXTS.length).toBe(1); + + fs.rmSync(legacyRepo, { recursive: true, force: true }); + }); + + test('exits non-zero (throws) when no source files can be read', async () => { + // Build a DB pointing at files that no longer exist + const ghostRepo = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-embed983-ghost-')); + const ghostDbDir = path.join(ghostRepo, '.codegraph'); + fs.mkdirSync(ghostDbDir, { recursive: true }); + const ghostDb = path.join(ghostDbDir, 'graph.db'); + + const db = new Database(ghostDb); + db.pragma('journal_mode = WAL'); + initSchema(db); + insertNode(db, 'missing', 'function', 'does-not-exist.js', 1, 1); + db.prepare('INSERT OR REPLACE INTO build_meta (key, value) VALUES (?, ?)').run( + 'root_dir', + path.resolve(ghostRepo), + ); + db.close(); + + EMBEDDED_TEXTS.length = 0; + await expect(buildEmbeddings(ghostRepo, 'minilm', ghostDb)).rejects.toThrow( + /could not read any of the .* source files/, + ); + + // No embeddings were persisted (they would have been overwritten via DELETE) + const readDb = new Database(ghostDb, { readonly: true }); + const count = readDb.prepare('SELECT COUNT(*) as c FROM embeddings').get().c; + readDb.close(); + expect(count).toBe(0); + + fs.rmSync(ghostRepo, { recursive: true, force: true }); + }); +}); + describe('context window overflow detection', () => { let bigDir: string, bigDbPath: string; From 9f20bdaef3faefe6fe4c5d69284fbf7f30009a26 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Tue, 21 Apr 2026 01:14:55 -0600 Subject: [PATCH 2/4] fix(embed): restore rootDir fallback when DB is outside .codegraph layout (#992) The prior 'metaRoot || dbParent || rootDir' chain was unreachable past dbParent because path.dirname always returns a non-empty string. This silently ignored an explicit positional argument for legacy DBs at non-standard locations. Only use dbParent when the DB actually lives at the conventional /.codegraph/graph.db layout; otherwise fall through to the caller-provided rootDir. Impact: 1 functions changed, 1 affected --- src/domain/search/generator.ts | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/domain/search/generator.ts b/src/domain/search/generator.ts index fd3c5317..ef0ddf35 100644 --- a/src/domain/search/generator.ts +++ b/src/domain/search/generator.ts @@ -75,11 +75,17 @@ export async function buildEmbeddings( // Prefer the repo root recorded at build time — embed may be invoked from a // different cwd (e.g. `codegraph embed --db /abs/path/graph.db`) and the - // positional rootDir will be wrong in that case. Fall back to the DB's - // parent (`/.codegraph/graph.db` → ``) if metadata is missing, - // then finally the caller-provided rootDir for pre-metadata compat. + // positional rootDir will be wrong in that case. For legacy DBs without + // root_dir metadata, fall back to `` only when the DB lives at + // the conventional `/.codegraph/graph.db` layout — otherwise trust + // the caller-provided rootDir (which may be an explicit positional arg). + // `path.dirname(...)` is always non-empty (`'.'` at minimum), so the + // conventional-layout check is required to keep the rootDir path reachable. const metaRoot = getBuildMeta(db, 'root_dir'); - const dbParent = path.dirname(path.dirname(path.resolve(dbPath))); + const resolvedDbPath = path.resolve(dbPath); + const dbDirName = path.basename(path.dirname(resolvedDbPath)); + const dbParent = + dbDirName === '.codegraph' ? path.dirname(path.dirname(resolvedDbPath)) : undefined; const resolvedRoot = metaRoot || dbParent || rootDir; db.exec('DELETE FROM embeddings'); From e48690c0b9ceb62fc7c736bc0639ec99f539fb89 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Tue, 21 Apr 2026 01:15:12 -0600 Subject: [PATCH 3/4] fix(embed): canonicalize rootDir in JS to match Rust finalize_build (#992) For native full builds, Rust's finalize_build writes root_dir via std::fs::canonicalize (symlink-resolving) but the JS persistBuildMetadata then overwrites it with path.resolve(), which does not resolve symlinks. On systems where the project root is behind a symlink the two values diverge and the JS write wins, reintroducing a non-canonical path. Use fs.realpathSync to match the Rust behaviour, with a safe fallback to path.resolve() if realpath throws. Impact: 1 functions changed, 3 affected --- src/domain/graph/builder/stages/finalize.ts | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/domain/graph/builder/stages/finalize.ts b/src/domain/graph/builder/stages/finalize.ts index 62849980..74f1dfbb 100644 --- a/src/domain/graph/builder/stages/finalize.ts +++ b/src/domain/graph/builder/stages/finalize.ts @@ -3,6 +3,7 @@ * * WASM cleanup, stats logging, drift detection, build metadata, registry, journal. */ +import fs from 'node:fs'; import { tmpdir } from 'node:os'; import path from 'node:path'; import { performance } from 'node:perf_hooks'; @@ -90,7 +91,17 @@ function persistBuildMetadata( ctx.engineName === 'native' && ctx.engineVersion ? ctx.engineVersion : CODEGRAPH_VERSION; // Persist the repo root so downstream commands (e.g. `codegraph embed`) // can resolve relative file paths regardless of the invoking cwd. - const rootDirToWrite = path.resolve(ctx.rootDir); + // Use realpathSync (symlink-resolving) to match the Rust engine's + // std::fs::canonicalize — otherwise the JS write here would overwrite the + // canonical path Rust wrote for native full builds and could re-introduce + // a non-canonical path when the project root is behind a symlink. + const resolvedRootDir = path.resolve(ctx.rootDir); + let rootDirToWrite = resolvedRootDir; + try { + rootDirToWrite = fs.realpathSync(resolvedRootDir); + } catch { + /* realpath can fail (e.g. path no longer exists); fall back to resolve() */ + } try { if (useNativeDb) { ctx.nativeDb!.setBuildMeta( From f99f10e55691242cf8026eecd2d5fdc52805039f Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Tue, 21 Apr 2026 01:15:25 -0600 Subject: [PATCH 4/4] test(embed): wrap temp-dir tests in try/finally to avoid leaks (#992) The ghostRepo and legacyRepo temp dirs were cleaned up inline in the test body, so an unexpected throw before fs.rmSync would leak them. Wrap each test in try/finally so cleanup always runs. --- tests/search/embedding-strategy.test.ts | 94 +++++++++++++------------ 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/tests/search/embedding-strategy.test.ts b/tests/search/embedding-strategy.test.ts index 16c47780..0f829bc0 100644 --- a/tests/search/embedding-strategy.test.ts +++ b/tests/search/embedding-strategy.test.ts @@ -398,56 +398,60 @@ describe('embed resolves source files from DB root, not cwd (#983)', () => { test('falls back to /../.. when root_dir metadata is missing', async () => { // Build a fresh DB without root_dir metadata const legacyRepo = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-embed983-legacy-')); - fs.writeFileSync(path.join(legacyRepo, 'b.js'), 'export function gamma() { return 42; }\n'); - const legacyDbDir = path.join(legacyRepo, '.codegraph'); - fs.mkdirSync(legacyDbDir, { recursive: true }); - const legacyDb = path.join(legacyDbDir, 'graph.db'); - - const db = new Database(legacyDb); - db.pragma('journal_mode = WAL'); - initSchema(db); - insertNode(db, 'gamma', 'function', 'b.js', 1, 1); - // Deliberately NOT writing root_dir — simulates DB built before #983 fix - db.close(); - - EMBEDDED_TEXTS.length = 0; - process.chdir(otherDir); - await buildEmbeddings(process.cwd(), 'minilm', legacyDb); - - expect(EMBEDDED_TEXTS.length).toBe(1); - - fs.rmSync(legacyRepo, { recursive: true, force: true }); + try { + fs.writeFileSync(path.join(legacyRepo, 'b.js'), 'export function gamma() { return 42; }\n'); + const legacyDbDir = path.join(legacyRepo, '.codegraph'); + fs.mkdirSync(legacyDbDir, { recursive: true }); + const legacyDb = path.join(legacyDbDir, 'graph.db'); + + const db = new Database(legacyDb); + db.pragma('journal_mode = WAL'); + initSchema(db); + insertNode(db, 'gamma', 'function', 'b.js', 1, 1); + // Deliberately NOT writing root_dir — simulates DB built before #983 fix + db.close(); + + EMBEDDED_TEXTS.length = 0; + process.chdir(otherDir); + await buildEmbeddings(process.cwd(), 'minilm', legacyDb); + + expect(EMBEDDED_TEXTS.length).toBe(1); + } finally { + fs.rmSync(legacyRepo, { recursive: true, force: true }); + } }); test('exits non-zero (throws) when no source files can be read', async () => { // Build a DB pointing at files that no longer exist const ghostRepo = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-embed983-ghost-')); - const ghostDbDir = path.join(ghostRepo, '.codegraph'); - fs.mkdirSync(ghostDbDir, { recursive: true }); - const ghostDb = path.join(ghostDbDir, 'graph.db'); - - const db = new Database(ghostDb); - db.pragma('journal_mode = WAL'); - initSchema(db); - insertNode(db, 'missing', 'function', 'does-not-exist.js', 1, 1); - db.prepare('INSERT OR REPLACE INTO build_meta (key, value) VALUES (?, ?)').run( - 'root_dir', - path.resolve(ghostRepo), - ); - db.close(); - - EMBEDDED_TEXTS.length = 0; - await expect(buildEmbeddings(ghostRepo, 'minilm', ghostDb)).rejects.toThrow( - /could not read any of the .* source files/, - ); - - // No embeddings were persisted (they would have been overwritten via DELETE) - const readDb = new Database(ghostDb, { readonly: true }); - const count = readDb.prepare('SELECT COUNT(*) as c FROM embeddings').get().c; - readDb.close(); - expect(count).toBe(0); - - fs.rmSync(ghostRepo, { recursive: true, force: true }); + try { + const ghostDbDir = path.join(ghostRepo, '.codegraph'); + fs.mkdirSync(ghostDbDir, { recursive: true }); + const ghostDb = path.join(ghostDbDir, 'graph.db'); + + const db = new Database(ghostDb); + db.pragma('journal_mode = WAL'); + initSchema(db); + insertNode(db, 'missing', 'function', 'does-not-exist.js', 1, 1); + db.prepare('INSERT OR REPLACE INTO build_meta (key, value) VALUES (?, ?)').run( + 'root_dir', + path.resolve(ghostRepo), + ); + db.close(); + + EMBEDDED_TEXTS.length = 0; + await expect(buildEmbeddings(ghostRepo, 'minilm', ghostDb)).rejects.toThrow( + /could not read any of the .* source files/, + ); + + // No embeddings were persisted (they would have been overwritten via DELETE) + const readDb = new Database(ghostDb, { readonly: true }); + const count = readDb.prepare('SELECT COUNT(*) as c FROM embeddings').get().c; + readDb.close(); + expect(count).toBe(0); + } finally { + fs.rmSync(ghostRepo, { recursive: true, force: true }); + } }); });