From 8280289d41a64ec166e723e62fb5b46eb599d062 Mon Sep 17 00:00:00 2001 From: Kris Kersey Date: Sun, 14 Jun 2026 15:09:38 +0000 Subject: [PATCH 01/10] fix(cypher): preserve node properties through WITH aggregation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A node group variable carried through a WITH aggregation (e.g. `WITH g, count(*) AS c RETURN g.file_path`) returned blank for every property except its name: the carried virtual binding held only the group key (the node's name) and lacked a store handle, so node_prop() could neither read other fields nor compute degrees. Fix: capture the node id of a bare node group-var in with_agg_find_or_create and tag the carried virtual binding with it; in node_prop(), when such a stub (id set, string fields unpopulated) is asked for a missing property, re-fetch the full node via cbm_store_find_node_by_id and project it. Also propagate the store onto virtual bindings so node_prop can re-fetch and compute degrees. The stub gate is heuristic but never yields a wrong value — worst case is one redundant indexed lookup. Adds regression test cypher_exec_with_node_groupvar_prop. Signed-off-by: Kris Kersey (cherry picked from commit 8b03974f1b018c5990633793e306ea5f8ae8c4d8) --- src/cypher/cypher.c | 67 +++++++++++++++++++++++++++++++++++++++++++-- tests/test_cypher.c | 22 +++++++++++++++ 2 files changed, 86 insertions(+), 3 deletions(-) diff --git a/src/cypher/cypher.c b/src/cypher/cypher.c index af2b319a9..eb0509d8e 100644 --- a/src/cypher/cypher.c +++ b/src/cypher/cypher.c @@ -2061,15 +2061,18 @@ static const char *node_string_field(const cbm_node_t *n, const char *prop) { /* Get node property by name. * store may be NULL; only needed for virtual degree properties. */ static const char *json_extract_prop(const char *json, const char *key, char *buf, size_t buf_sz); +static void node_fields_free(cbm_node_t *n); /* defined below; used by the stub re-fetch */ static const char *node_prop(const cbm_node_t *n, const char *prop, cbm_store_t *store) { if (!n || !prop) { return ""; } const char *str = node_string_field(n, prop); - if (str) { + if (str && str[0]) { return str; } + /* Note: a string field that exists but is empty ("") falls through here so a + * WITH-aggregation node stub (below) can re-fetch it. */ /* Computed and JSON-derived values live in rotating thread-local buffers: * a single row (or an ORDER-BY comparison) reads several of these before any * of them is copied out, so returning one shared static buffer would alias @@ -2107,6 +2110,40 @@ static const char *node_prop(const cbm_node_t *n, const char *prop, cbm_store_t return v; } } + /* WITH aggregation carries a node group var by id + name only (the group key + * is the node name), so every other property is absent on the stub. Detect + * the stub (id set, but the full string fields were never populated) and + * re-fetch the node so RETURN g.file_path / g.label / g. project + * correctly instead of returning blank. The gate is heuristic, not an exact + * stub discriminator: a real bound node with NULL label AND file_path would + * also match, but in that case the worst case is one redundant indexed fetch + * that returns the same value — never a wrong result. */ + if (store && n->id > 0 && !n->file_path && !n->label) { + cbm_node_t full = {0}; + if (cbm_store_find_node_by_id(store, n->id, &full) == CBM_STORE_OK) { + const char *res = NULL; + const char *rv = node_string_field(&full, prop); + if (rv && rv[0]) { + snprintf(out, CBM_SZ_512, "%s", rv); + res = out; + } else if (strcmp(prop, "start_line") == 0) { + snprintf(out, CBM_SZ_512, "%d", full.start_line); + res = out; + } else if (strcmp(prop, "end_line") == 0) { + snprintf(out, CBM_SZ_512, "%d", full.end_line); + res = out; + } else if (full.properties_json && full.properties_json[0] == '{') { + const char *jv = json_extract_prop(full.properties_json, prop, out, CBM_SZ_512); + if (jv && jv[0]) { + res = out; + } + } + node_fields_free(&full); + if (res) { + return res; + } + } + } return ""; } @@ -2550,6 +2587,9 @@ static void rb_add_row(result_builder_t *rb, const char **values) { /* ── Binding virtual variables (for WITH clause) ──────────────── */ static const char *binding_get_virtual(binding_t *b, const char *var, const char *prop) { + if (!var) { + return ""; + } /* Check virtual vars first (from WITH projection) */ char full[CBM_SZ_256]; if (prop) { @@ -3406,8 +3446,9 @@ typedef struct { double *sums; int *counts; double *mins, *maxs; - char ***distinct_lists; /* per-item set of seen values for COUNT(DISTINCT) */ - int *distinct_n; /* per-item distinct count (#239) */ + char ***distinct_lists; /* per-item set of seen values for COUNT(DISTINCT) */ + int *distinct_n; /* per-item distinct count (#239) */ + int64_t *group_node_ids; /* per-item node id when the group var is a node (0 = not) */ } with_agg_t; /* Build a group key from non-aggregate WITH items */ @@ -3447,6 +3488,7 @@ static int with_agg_find_or_create(with_agg_t **aggs, int *agg_cnt, int *agg_cap (*aggs)[found].maxs = calloc(wc->count, sizeof(double)); (*aggs)[found].distinct_lists = calloc(wc->count, sizeof(char **)); (*aggs)[found].distinct_n = calloc(wc->count, sizeof(int)); + (*aggs)[found].group_node_ids = calloc(wc->count, sizeof(int64_t)); for (int ci = 0; ci < wc->count; ci++) { (*aggs)[found].mins[ci] = CYP_DBL_MAX; (*aggs)[found].maxs[ci] = -CYP_DBL_MAX; @@ -3458,6 +3500,15 @@ static int with_agg_find_or_create(with_agg_t **aggs, int *agg_cnt, int *agg_cap } const char *v = binding_get_virtual(b, wc->items[ci].variable, wc->items[ci].property); (*aggs)[found].group_vals[ci] = heap_strdup(v); + /* If this group item is a bare node variable, remember its id so the + * carried virtual var can re-fetch any property (group_vals holds only + * the name). */ + if (!wc->items[ci].property && wc->items[ci].variable) { + cbm_node_t *gn = binding_get(b, wc->items[ci].variable); + if (gn) { + (*aggs)[found].group_node_ids[ci] = gn->id; + } + } } return found; } @@ -3528,6 +3579,7 @@ static void with_agg_free(with_agg_t *aggs, int agg_cnt, int item_count) { free(aggs[a].maxs); free(aggs[a].distinct_lists); free(aggs[a].distinct_n); + free(aggs[a].group_node_ids); } free(aggs); } @@ -3553,6 +3605,9 @@ static void execute_with_aggregate(cbm_return_clause_t *wc, binding_t *bindings, } for (int a = 0; a < agg_cnt; a++) { binding_t vb = {0}; + /* Carry the store so node_prop can re-fetch a carried node's properties + * (and compute in_degree/out_degree) on the projected virtual binding. */ + vb.store = (bind_count > 0) ? bindings[0].store : NULL; for (int ci = 0; ci < wc->count; ci++) { char name_buf[CBM_SZ_256]; const char *alias = resolve_item_alias(&wc->items[ci], name_buf, sizeof(name_buf)); @@ -3566,6 +3621,11 @@ static void execute_with_aggregate(cbm_return_clause_t *wc, binding_t *bindings, with_add_vbinding_var(&vb, alias, vbuf); } else { with_add_vbinding_var(&vb, alias, aggs[a].group_vals[ci]); + /* Tag the carried virtual var with the node id (when the group + * var is a node) so node_prop can re-fetch its full properties. */ + if (aggs[a].group_node_ids[ci] > 0 && vb.var_count > 0) { + vb.var_nodes[vb.var_count - 1].id = aggs[a].group_node_ids[ci]; + } } } (*vbindings)[(*vcount)++] = vb; @@ -3578,6 +3638,7 @@ static void execute_with_simple(cbm_return_clause_t *wc, binding_t *bindings, in binding_t *vbindings, int *vcount) { for (int bi = 0; bi < bind_count; bi++) { binding_t vb = {0}; + vb.store = bindings[bi].store; /* so node_prop can re-fetch / compute on the projection */ for (int ci = 0; ci < wc->count; ci++) { char name_buf[CBM_SZ_256]; const char *alias = resolve_item_alias(&wc->items[ci], name_buf, sizeof(name_buf)); diff --git a/tests/test_cypher.c b/tests/test_cypher.c index 610c905e4..2e79c8b4f 100644 --- a/tests/test_cypher.c +++ b/tests/test_cypher.c @@ -2183,6 +2183,27 @@ TEST(cypher_exec_with_count) { PASS(); } +/* Regression: a bare node group-var carried through WITH aggregation must project + * its real properties (not blank). Pre-fix, the carried var held only the node + * name, so RETURN g.file_path returned "". */ +TEST(cypher_exec_with_node_groupvar_prop) { + cbm_store_t *s = setup_cypher_store(); + cbm_cypher_result_t r = {0}; + int rc = cbm_cypher_execute(s, + "MATCH (f:Function)-[:CALLS]->(g:Function) " + "WHERE g.name = \"ValidateOrder\" " + "WITH g, COUNT(*) AS c " + "RETURN g.file_path, g.name, c", + "test", 0, &r); + ASSERT_EQ(rc, 0); + ASSERT_EQ(r.row_count, 1); + ASSERT_STR_EQ(r.rows[0][0], "validate.go"); /* was "" before the fix */ + ASSERT_STR_EQ(r.rows[0][1], "ValidateOrder"); + cbm_cypher_result_free(&r); + cbm_store_close(s); + PASS(); +} + TEST(cypher_exec_with_where) { cbm_store_t *s = setup_cypher_store(); cbm_cypher_result_t r = {0}; @@ -2642,6 +2663,7 @@ SUITE(cypher) { /* Phase 6: WITH clause */ RUN_TEST(cypher_exec_with_rename); RUN_TEST(cypher_exec_with_count); + RUN_TEST(cypher_exec_with_node_groupvar_prop); RUN_TEST(cypher_exec_with_where); RUN_TEST(cypher_exec_with_orderby_limit); RUN_TEST(cypher_parse_with); From 6b3ab140ce4df232cada3d02fd8e9babb558832b Mon Sep 17 00:00:00 2001 From: Thomas Dyar Date: Mon, 1 Jun 2026 21:49:44 -0400 Subject: [PATCH 02/10] fix(cypher): label-filtered edge traversal silently truncates at 10 results MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MATCH (c:Class)-[:DEFINES_METHOD]->(m:Method) returned at most 10 results for any class, regardless of how many methods it actually has. Root cause: bind_cap was set to scan_count (the number of nodes matched in the initial pattern — typically 1 when querying a single class by name). max_new = bind_cap * 10 = 10, so the edge expansion loop exited after collecting 10 results. No error, no warning, no truncation indicator. This is language-agnostic: any class with more than 10 methods in any language was silently truncated. The fix is two characters: bind_cap = scan_count > max_rows ? scan_count : max_rows Regression test: a Python class with 15 methods must return all 15 via MATCH (c:Class)-[:DEFINES_METHOD]->(m:Method) with label filtering. Signed-off-by: Thomas Dyar (cherry picked from commit c43fc8df0e3deaacc91a33c98c2cd0dcc7adbe2d) --- src/cypher/cypher.c | 2 +- tests/test_incremental.c | 47 ++++++++++++++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/cypher/cypher.c b/src/cypher/cypher.c index eb0509d8e..11cbcf4d1 100644 --- a/src/cypher/cypher.c +++ b/src/cypher/cypher.c @@ -4262,7 +4262,7 @@ static int execute_single(cbm_store_t *store, cbm_query_t *q, const char *projec scan_pattern_nodes(store, project, max_rows, &pat0->nodes[0], &scanned, &scan_count); /* Build initial bindings with early WHERE */ - int bind_cap = scan_count > 0 ? scan_count : SKIP_ONE; + int bind_cap = scan_count > max_rows ? scan_count : (max_rows > 0 ? max_rows : SKIP_ONE); binding_t *bindings = malloc((bind_cap + SKIP_ONE) * sizeof(binding_t)); int bind_count = 0; const char *var_name = pat0->nodes[0].variable ? pat0->nodes[0].variable : "_n0"; diff --git a/tests/test_incremental.c b/tests/test_incremental.c index c210d5433..e44b8cfff 100644 --- a/tests/test_incremental.c +++ b/tests/test_incremental.c @@ -400,8 +400,8 @@ TEST(incr_modify_file) { /* Single-file incremental should be faster than full */ if ((int)ms > (int)(g_full_index_ms * 1.5)) { - printf(" [PERF WARNING] incremental slower than 1.5x full: %.0fms vs %.0fms\n", - ms, g_full_index_ms); + printf(" [PERF WARNING] incremental slower than 1.5x full: %.0fms vs %.0fms\n", ms, + g_full_index_ms); } printf(" [perf] modify 1 file: %.0fms (full was %.0fms)\n", ms, g_full_index_ms); @@ -910,12 +910,12 @@ static int resp_lacks_key(const char *resp, const char *key) { } /* Helper: assert tool call succeeds, warn if slow */ -#define TOOL_OK(resp, ms) \ - do { \ - ASSERT((resp) != NULL); \ - if ((int)(ms) > PERF_WARN_MS) { \ +#define TOOL_OK(resp, ms) \ + do { \ + ASSERT((resp) != NULL); \ + if ((int)(ms) > PERF_WARN_MS) { \ printf(" [PERF WARNING] tool call: %.0fms (>%dms)\n", (ms), PERF_WARN_MS); \ - } \ + } \ } while (0) /* Helper: assert response is not an error */ @@ -932,6 +932,38 @@ TEST(tool_list_projects_basic) { PASS(); } +TEST(tool_qg_defines_method_more_than_10) { + write_file_at("fastapi/big_class.py", "class BigClass:\n" + " def m1(self): pass\n" + " def m2(self): pass\n" + " def m3(self): pass\n" + " def m4(self): pass\n" + " def m5(self): pass\n" + " def m6(self): pass\n" + " def m7(self): pass\n" + " def m8(self): pass\n" + " def m9(self): pass\n" + " def m10(self): pass\n" + " def m11(self): pass\n" + " def m12(self): pass\n" + " def m13(self): pass\n" + " def m14(self): pass\n" + " def m15(self): pass\n"); + char *idx = index_repo(); + ASSERT(idx != NULL); + free(idx); + double ms; + char *r = call_tool_timed("query_graph", &ms, + "{\"project\":\"%s\"," + "\"query\":\"MATCH (c:Class)-[:DEFINES_METHOD]->(m:Method)" + " WHERE c.name = 'BigClass' RETURN count(m) AS n\"}", + g_project); + TOOL_OK(r, ms); + ASSERT(strstr(r, "\"15\"") != NULL || strstr(r, "\\\"15\\\"") != NULL); + free(r); + PASS(); +} + TEST(tool_list_projects_has_current) { double ms; char *r = call_tool_timed("list_projects", &ms, "{}"); @@ -3042,6 +3074,7 @@ SUITE(incremental) { RUN_TEST(tool_qg_configures); RUN_TEST(tool_qg_handles); RUN_TEST(tool_qg_defines_method); + RUN_TEST(tool_qg_defines_method_more_than_10); RUN_TEST(tool_qg_no_limit); RUN_TEST(tool_qg_empty_result); From b22230105925ebb2f8d5eec0ff3fcb65dbfbbe3a Mon Sep 17 00:00:00 2001 From: Andrius Skerla <1492322+rainder@users.noreply.github.com> Date: Tue, 16 Jun 2026 13:26:25 +0300 Subject: [PATCH 03/10] fix(pipeline): prevent stack-buffer-overflow in append_args_json A call carrying enough long arguments drove append_args_json()'s running position past the fixed CBM_SZ_2K `props` stack buffer in emit_normal_calls_edge(): format_call_arg() returns snprintf's *untruncated* length, so `pos += (size_t)n` could exceed `bufsize`, after which the trailing `buf[pos] = '\0'` (and `buf[pos++] = ']'`) wrote out of bounds. The stack canary caught it as SIGABRT, so full-repo indexing of large TypeScript codebases crashed the server in the parallel resolve pass (emit_service_edge -> emit_normal_calls_edge -> finalize_and_emit -> append_args_json). Confirmed with AddressSanitizer: stack-buffer-overflow WRITE at pass_parallel.c:1124, 'props' (2048 B). Fix: when an argument does not fully fit, roll back to before its separator and stop appending (atomic field, matching append_json_string's behaviour), so `pos` can never advance past the buffer. Add regression test parallel_args_json_no_overflow: indexes a fixture whose single call carries 60 long string args (args JSON well past 2 KB); under the ASan test build it aborts without this fix and passes with it. Signed-off-by: Andrius Skerla <1492322+rainder@users.noreply.github.com> Co-Authored-By: Claude Opus 4.8 (1M context) (cherry picked from commit 74d15a60646a23960cb584b0750ca7d0dbe4bafd) --- src/pipeline/pass_parallel.c | 11 ++++++-- tests/test_parallel.c | 49 ++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/src/pipeline/pass_parallel.c b/src/pipeline/pass_parallel.c index 180ee85f7..12f0aa312 100644 --- a/src/pipeline/pass_parallel.c +++ b/src/pipeline/pass_parallel.c @@ -1108,15 +1108,22 @@ static size_t append_args_json(char *buf, size_t bufsize, size_t pos, const CBMC pos += (size_t)n; for (int i = 0; i < call->arg_count && pos < bufsize - CBM_ARG_JSON_GUARD; i++) { const CBMCallArg *a = &call->args[i]; + size_t mark = pos; /* rollback point (before the separator) */ if (i > 0 && pos < bufsize - SKIP_ONE) { buf[pos++] = ','; } char expr_buf[CBM_SZ_128]; sanitize_expr(expr_buf, a->expr); n = format_call_arg(buf + pos, bufsize - pos, a, expr_buf); - if (n > 0) { - pos += (size_t)n; + /* snprintf returns the UNtruncated length: if the arg did not fully + * fit, advancing pos by n would push it past buf and the buf[pos] + * writes below would overflow. Drop the arg whole (atomic field — + * keeps the array valid) and stop appending. */ + if (n <= 0 || (size_t)n >= bufsize - pos) { + pos = mark; + break; } + pos += (size_t)n; } if (pos < bufsize - SKIP_ONE) { buf[pos++] = ']'; diff --git a/tests/test_parallel.c b/tests/test_parallel.c index 1c4d3d9b6..746e1f2c3 100644 --- a/tests/test_parallel.c +++ b/tests/test_parallel.c @@ -341,6 +341,54 @@ TEST(parallel_empty_files) { PASS(); } +/* ── Regression: args JSON must not overflow the props buffer ──────── */ + +/* A call with many long string arguments makes append_args_json()'s running + * position exceed the fixed CBM_SZ_2K `props` stack buffer in + * emit_normal_calls_edge(): format_call_arg() returns snprintf's UNtruncated + * length, so pos += n could run past the buffer and the trailing + * buf[pos]='\0' wrote out of bounds (stack-buffer-overflow; caught by the + * stack canary as a SIGABRT on real repos). This indexes a fixture whose + * single call carries enough long args to drive pos past 2 KB; under the + * ASan test build a regression aborts here. */ +TEST(parallel_args_json_no_overflow) { + char dir[256]; + snprintf(dir, sizeof(dir), "/tmp/cbm_argov_XXXXXX"); + ASSERT_TRUE(cbm_mkdtemp(dir) != NULL); + + char path[512]; + snprintf(path, sizeof(path), "%s/app.ts", dir); + FILE *f = fopen(path, "w"); + ASSERT_TRUE(f != NULL); + fputs("function sink(...xs: string[]) { return xs; }\n", f); + fputs("function caller() {\n sink(\n", f); + for (int i = 0; i < 60; i++) { + /* 100-char string literal per arg; 60 args => args JSON well past the + * 2 KB props buffer, forcing the pre-fix overshoot. */ + fputs(" \"", f); + for (int j = 0; j < 100; j++) + fputc('a' + (i % 26), f); + fputs(i < 59 ? "\",\n" : "\"\n", f); + } + fputs(" );\n}\n", f); + fclose(f); + + cbm_discover_opts_t opts = {.mode = CBM_MODE_FULL}; + cbm_file_info_t *files = NULL; + int file_count = 0; + ASSERT_EQ(cbm_discover(dir, &opts, &files, &file_count), 0); + ASSERT_GT(file_count, 0); + + cbm_gbuf_t *gbuf = run_parallel("argov-test", dir, files, file_count, 4); + ASSERT_TRUE(gbuf != NULL); + ASSERT_GT(cbm_gbuf_edge_count(gbuf), 0); + + cbm_gbuf_free(gbuf); + cbm_discover_free(files, file_count); + th_rmtree(dir); + PASS(); +} + /* ── Graph buffer merge tests ─────────────────────────────────────── */ TEST(gbuf_shared_ids_unique) { @@ -680,6 +728,7 @@ SUITE(parallel) { RUN_TEST(parallel_implements_parity); RUN_TEST(parallel_total_edges); RUN_TEST(parallel_empty_files); + RUN_TEST(parallel_args_json_no_overflow); /* Cleanup shared state */ parity_teardown(); From e9f16288a82de4db9cb0d97233e0d987036e8708 Mon Sep 17 00:00:00 2001 From: Saurav Kumar Date: Sat, 20 Jun 2026 01:35:37 +0530 Subject: [PATCH 04/10] fix(foundation): properly escape JSON control characters as \u00XX Signed-off-by: Saurav Kumar (cherry picked from commit c3a1a79bda7379588f53a00b7a8226b3d1ff74d9) --- src/foundation/str_util.c | 8 ++++++-- src/git/git_context.c | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/foundation/str_util.c b/src/foundation/str_util.c index 6275ab592..26542e927 100644 --- a/src/foundation/str_util.c +++ b/src/foundation/str_util.c @@ -6,6 +6,7 @@ #include "foundation/constants.h" #include #include +#include enum { JSON_ESC_LEN = 2, /* escaped char takes 2 bytes (backslash + char) */ @@ -328,8 +329,11 @@ int cbm_json_escape(char *buf, int bufsize, const char *src) { buf[pos++] = '\\'; buf[pos++] = 't'; } else if (c < JSON_CTRL_LIMIT) { - /* Other control chars: skip */ - continue; + /* Other control chars: escape as \u00XX */ + if (pos + 6 > bufsize - JSON_NUL_RESERVE) { + break; + } + pos += snprintf(buf + pos, 7, "\\u%04x", c); } else { buf[pos++] = (char)c; } diff --git a/src/git/git_context.c b/src/git/git_context.c index 5f27b9f20..99ae17cf3 100644 --- a/src/git/git_context.c +++ b/src/git/git_context.c @@ -316,7 +316,7 @@ static int json_escaped_len(const char *src) { if (c == '"' || c == '\\' || c == '\n' || c == '\r' || c == '\t') { len += 2; } else if (c < 0x20) { - continue; + len += 6; /* \u00XX */ } else { len++; } From 8edbff37299856cb5447b9a46ca5ac45c3659198 Mon Sep 17 00:00:00 2001 From: Eric DeWitt Date: Fri, 19 Jun 2026 01:17:03 +0100 Subject: [PATCH 05/10] fix: include git2/sys/alloc.h for git_allocator on libgit2 1.8+ git_allocator moved out of the top-level git2.h into git2/sys/alloc.h in libgit2 1.8.0. Add an explicit include so the mimalloc binding compiles against libgit2 >= 1.8 (e.g. MacPorts libgit2 1.9.4). (cherry picked from commit 586fc8a8fd640da60c6864cce1709ff26b38bdea) --- internal/cbm/cbm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/internal/cbm/cbm.c b/internal/cbm/cbm.c index d611f186f..217040015 100644 --- a/internal/cbm/cbm.c +++ b/internal/cbm/cbm.c @@ -20,7 +20,9 @@ #if defined(CBM_BIND_TS_ALLOCATOR) && CBM_BIND_TS_ALLOCATOR #include "sqlite3.h" // sqlite3_mem_methods, sqlite3_config, SQLITE_CONFIG_MALLOC — bind sqlite to mimalloc #if defined(HAVE_LIBGIT2) -#include // git_allocator, git_libgit2_opts, GIT_OPT_SET_ALLOCATOR — bind libgit2 to mimalloc +#include // git_libgit2_opts, GIT_OPT_SET_ALLOCATOR — bind libgit2 to mimalloc +/* git_allocator moved to sys/alloc.h in libgit2 1.8+; no longer in git2.h */ +#include #endif #endif #include // uint32_t, uint64_t, int64_t From 0860da84679b26d4fc7cc4d85931586f1efdcd60 Mon Sep 17 00:00:00 2001 From: RithvikReddy0-0 Date: Sat, 20 Jun 2026 17:25:25 +0000 Subject: [PATCH 06/10] fix(pipeline): preserve ADR across full re-index (#516) manage_adr stores ADRs in project_summaries, but a full re-index (triggered by file changes or new files) deletes the DB in try_incremental_or_delete_db and rebuilds it from the graph buffer, which writes an empty project_summaries table. file_hashes were re-persisted after the rebuild but project_summaries were not, so the ADR was silently lost. Fix: capture the ADR before the DB is unlinked, stash it on the pipeline struct, and restore it after the rebuilt DB is reopened in dump_and_persist_hashes. The incremental path is unaffected (it never rewrites the DB). Verified: ADR now survives a full re-index. Signed-off-by: RithvikReddy0-0 (cherry picked from commit 7b6c063203509f93bacca2efc7ea34078bd2e246) --- src/pipeline/pipeline.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/pipeline/pipeline.c b/src/pipeline/pipeline.c index 499c916a5..d084882ad 100644 --- a/src/pipeline/pipeline.c +++ b/src/pipeline/pipeline.c @@ -93,6 +93,10 @@ struct cbm_pipeline { /* User-defined extension overrides (loaded once per run) */ cbm_userconfig_t *userconfig; + + /* ADR (project_summaries) captured before a full-reindex DB delete, so it + * can be restored after the rebuild. NULL when no ADR existed. Issue #516. */ + char *saved_adr; }; /* ── Global pkgmap (one active pipeline at a time) ─────────────── */ @@ -788,6 +792,22 @@ static int try_incremental_or_delete_db(cbm_pipeline_t *p, cbm_file_info_t *file cbm_store_close(check_store); } cbm_log_info("pipeline.route", "path", "reindex", "action", "deleting old db"); + /* Capture any ADR before deleting the DB so the full-reindex rebuild can + * restore it (project_summaries is otherwise lost). Issue #516. */ + { + cbm_store_t *adr_store = cbm_store_open_path(db_path); + if (adr_store) { + cbm_adr_t existing; + if (cbm_store_adr_get(adr_store, p->project_name, &existing) == CBM_STORE_OK) { + if (existing.content) { + free(p->saved_adr); + p->saved_adr = strdup(existing.content); + } + cbm_store_adr_free(&existing); + } + cbm_store_close(adr_store); + } + } cbm_unlink(db_path); char wal[PL_WAL_BUF]; char shm[PL_WAL_BUF]; @@ -841,6 +861,11 @@ static int dump_and_persist_hashes(cbm_pipeline_t *p, const cbm_file_info_t *fil cbm_store_t *hash_store = cbm_store_open_path(db_path); if (hash_store) { cbm_store_delete_file_hashes(hash_store, p->project_name); + + /* Restore the ADR captured before the dump. Issue #516. */ + if (p->saved_adr) { + cbm_store_adr_store(hash_store, p->project_name, p->saved_adr); + } for (int i = 0; i < file_count; i++) { struct stat fst; if (stat(files[i].path, &fst) == 0) { @@ -867,6 +892,8 @@ static int dump_and_persist_hashes(cbm_pipeline_t *p, const cbm_file_info_t *fil cbm_store_close(hash_store); cbm_log_info("pass.timing", "pass", "persist_hashes", "files", itoa_buf(file_count)); } + free(p->saved_adr); + p->saved_adr = NULL; /* Export persistent artifact if enabled */ if (p->persistence) { From ec9648423ed8fdb1fdc7040d7e8b14f14d8a3b78 Mon Sep 17 00:00:00 2001 From: Kris Kersey Date: Sun, 14 Jun 2026 15:08:16 +0000 Subject: [PATCH 07/10] fix(mcp): honor detect_changes 'since' parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit detect_changes advertised a `since` parameter in its inputSchema but the handler never read it — it always diffed against base_branch (default "main"), so detect_changes(since="HEAD~10") silently returned the wrong or empty result when HEAD was on the default branch. Fix: read `since` and, when present, route it through base_branch so the existing shell-arg validation (cbm_validate_shell_arg) and the `...HEAD` diff apply unchanged; `since` takes precedence over base_branch. Also narrows the schema description — the prior "date" form (e.g. 2026-01-01) is not a revision and never worked through this path — and documents the inherited three-dot semantics. Adds regression tests tool_detect_changes_since and tool_detect_changes_since_precedence. Refs #371 Signed-off-by: Kris Kersey (cherry picked from commit 53501b0d40d3660a5f2ebef5b1cd85498e09a36f) --- src/mcp/mcp.c | 16 +++++++++++++++- tests/test_incremental.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c index 8102b1e77..eff5f523b 100644 --- a/src/mcp/mcp.c +++ b/src/mcp/mcp.c @@ -431,7 +431,8 @@ static const tool_def_t TOOLS[] = { "{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"},\"scope\":{\"type\":" "\"string\"},\"depth\":{\"type\":\"integer\",\"default\":2},\"base_branch\":{\"type\":" "\"string\",\"default\":\"main\"},\"since\":{\"type\":\"string\",\"description\":" - "\"Git ref or date to compare from (e.g. HEAD~5, v0.5.0, 2026-01-01)\"}},\"required\":" + "\"Git ref or tag to compare from (e.g. HEAD~5, v0.5.0). Diffs ...HEAD.\"}}," + "\"required\":" "[\"project\"]}"}, {"manage_adr", "Create or update Architecture Decision Records", @@ -3907,12 +3908,25 @@ static void detect_add_impacted_symbols(cbm_store_t *store, const char *project, static char *handle_detect_changes(cbm_mcp_server_t *srv, const char *args) { char *project = cbm_mcp_get_string_arg(args, "project"); char *base_branch = cbm_mcp_get_string_arg(args, "base_branch"); + char *since = cbm_mcp_get_string_arg(args, "since"); char *scope = cbm_mcp_get_string_arg(args, "scope"); int depth = cbm_mcp_get_int_arg(args, "depth", MCP_DEFAULT_BFS_DEPTH); /* scope: "files" = just changed files, "symbols" = files + symbols (default) */ bool want_symbols = !scope || strcmp(scope, "symbols") == 0 || strcmp(scope, "impact") == 0; + /* `since` (e.g. "HEAD~10", "v0.5.0") is the documented diff base but was + * previously parsed and never used: it takes precedence over base_branch. + * Route it through base_branch so the shared shell-arg validation and the + * existing `...HEAD` (three-dot) diff apply unchanged — `since` thus + * adopts the same merge-base semantics base_branch already uses. */ + if (since && since[0]) { + free(base_branch); + base_branch = since; /* transfer ownership */ + since = NULL; + } + free(since); /* no-op after the swap (since is NULL); frees it otherwise */ + if (!base_branch) { base_branch = heap_strdup("main"); } diff --git a/tests/test_incremental.c b/tests/test_incremental.c index e44b8cfff..82aa289c1 100644 --- a/tests/test_incremental.c +++ b/tests/test_incremental.c @@ -1795,6 +1795,34 @@ TEST(tool_detect_changes_custom_branch) { PASS(); } +/* Regression: `since` was advertised in the schema but ignored by the handler; + * it must be honored as the diff base. Fixture is a --depth=1 shallow clone, so + * HEAD~N won't resolve — use HEAD for a valid (empty) diff. */ +TEST(tool_detect_changes_since) { + double ms; + char *r = call_tool_timed("detect_changes", &ms, "{\"project\":\"%s\",\"since\":\"HEAD\"}", + g_project); + TOOL_OK(r, ms); + ASSERT(resp_has_key(r, "changed_files")); + free(r); + PASS(); +} + +/* Regression: `since` must take precedence over base_branch. A valid since plus a + * bogus base_branch must still succeed (proving since won) and must not reference + * the bogus branch. */ +TEST(tool_detect_changes_since_precedence) { + double ms; + char *r = call_tool_timed( + "detect_changes", &ms, + "{\"project\":\"%s\",\"since\":\"HEAD\",\"base_branch\":\"no-such-branch-xyz\"}", + g_project); + TOOL_OK(r, ms); + ASSERT(strstr(r, "no-such-branch-xyz") == NULL); + free(r); + PASS(); +} + TEST(tool_detect_changes_depth) { double ms; char *r = call_tool_timed("detect_changes", &ms, "{\"project\":\"%s\",\"depth\":5}", g_project); @@ -2988,6 +3016,8 @@ SUITE(incremental) { /* Phase 15: detect_changes */ RUN_TEST(tool_detect_changes_default); RUN_TEST(tool_detect_changes_custom_branch); + RUN_TEST(tool_detect_changes_since); + RUN_TEST(tool_detect_changes_since_precedence); RUN_TEST(tool_detect_changes_depth); /* Phase 16: manage_adr */ From d62458c113878133fd8e4cfbb46c37032f417667 Mon Sep 17 00:00:00 2001 From: Kris Kersey Date: Sun, 14 Jun 2026 15:10:43 +0000 Subject: [PATCH 08/10] fix(mcp): prefer definitions and report ambiguity in name resolution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit trace_path resolved a function_name from the first row of an unordered name query with no ambiguity check, so a same-named entity (e.g. a shell script's main()) could silently shadow the intended C main(). get_code_snippet reported "ambiguous" for a short name even when one match was the obvious definition (the .c body vs a .h declaration). Fix: add a deterministic resolution ranking — a callable label outranks a module, then the larger definition by line span wins, preferring a real definition without hardcoding file extensions — and a picker that flags a genuine tie. trace_path now traces the preferred node and returns the existing ambiguous-suggestions response on a true tie instead of silently taking nodes[0]; get_code_snippet resolves directly to the preferred match, reporting ambiguity only for real ties. Adds regression tests tool_trace_call_path_ambiguous and tool_trace_call_path_prefers_definition. Signed-off-by: Kris Kersey (cherry picked from commit 382dc24d178e90064c52eb292fff702b1b234b34) --- src/mcp/mcp.c | 95 +++++++++++++++++++++++++++++++++++++++++++++- tests/test_mcp.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 192 insertions(+), 2 deletions(-) diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c index eff5f523b..6fefc6a71 100644 --- a/src/mcp/mcp.c +++ b/src/mcp/mcp.c @@ -2245,6 +2245,66 @@ static yyjson_mut_val *bfs_to_json_array(yyjson_mut_doc *doc, cbm_traverse_resul return arr; } +static char *snippet_suggestions(const char *input, cbm_node_t *nodes, int count); + +/* Rank a candidate for name resolution. The label tier (callable > class-like > + * module/file) is the primary key; WITHIN a tier the larger definition by line + * span wins. In practice the .c-over-.h and C-main-over-shell-main preferences + * come primarily from span (the real definition has the larger body), since the + * competing matches usually share a tier — no file extension is hardcoded. + * Consequence: two same-tier candidates with equal span tie and are reported + * ambiguous (see pick_resolved_node) rather than guessed. */ +enum { + RES_RANK_CALLABLE = 2, /* Function / Method */ + RES_RANK_OTHER = 1, /* Class / Struct / etc. */ + RES_RANK_MODULE = 0, /* Module / File */ + RES_LABEL_WEIGHT = 1000000 /* label tier dominates span */ +}; +static long node_resolution_score(const cbm_node_t *n) { + long label_rank = RES_RANK_MODULE; + if (n->label) { + if (strcmp(n->label, "Function") == 0 || strcmp(n->label, "Method") == 0) { + label_rank = RES_RANK_CALLABLE; + } else if (strcmp(n->label, "Module") != 0 && strcmp(n->label, "File") != 0) { + label_rank = RES_RANK_OTHER; + } + } + long span = (long)n->end_line - (long)n->start_line; + if (span < 0) { + span = 0; + } + return label_rank * (long)RES_LABEL_WEIGHT + span; +} + +/* Pick the best-resolving node among name matches. Sets *ambiguous when the top + * score is shared by more than one candidate (a genuine tie the caller must + * disambiguate) so resolution never silently traces the wrong same-named node. */ +static int pick_resolved_node(const cbm_node_t *nodes, int count, bool *ambiguous) { + *ambiguous = false; + if (count <= 1) { + return 0; + } + int best = 0; + long best_score = node_resolution_score(&nodes[0]); + for (int i = 1; i < count; i++) { + long s = node_resolution_score(&nodes[i]); + if (s > best_score) { + best_score = s; + best = i; + } + } + int top_count = 0; + for (int i = 0; i < count; i++) { + if (node_resolution_score(&nodes[i]) == best_score) { + top_count++; + } + } + if (top_count > 1) { + *ambiguous = true; + } + return best; +} + static char *handle_trace_call_path(cbm_mcp_server_t *srv, const char *args) { char *func_name = cbm_mcp_get_string_arg(args, "function_name"); char *project = cbm_mcp_get_string_arg(args, "project"); @@ -2329,6 +2389,22 @@ static char *handle_trace_call_path(cbm_mcp_server_t *srv, const char *args) { return cbm_mcp_text_result(hint, true); } + /* Disambiguate same-named matches: prefer the real definition, and report + * ambiguity (rather than silently tracing nodes[0]) on a genuine tie — e.g. + * a C main() vs a same-named shell-script main(). */ + bool trace_ambiguous = false; + int sel = pick_resolved_node(nodes, node_count, &trace_ambiguous); + if (trace_ambiguous) { + char *result = snippet_suggestions(func_name, nodes, node_count); + free(func_name); + free(project); + free(direction); + free(mode); + free(param_name); + cbm_store_free_nodes(nodes, node_count); + return result; + } + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); yyjson_mut_val *root = yyjson_mut_obj(doc); yyjson_mut_doc_set_root(doc, root); @@ -2354,14 +2430,14 @@ static char *handle_trace_call_path(cbm_mcp_server_t *srv, const char *args) { cbm_traverse_result_t tr_in = {0}; if (do_outbound) { - cbm_store_bfs(store, nodes[0].id, "outbound", edge_types, edge_type_count, depth, + cbm_store_bfs(store, nodes[sel].id, "outbound", edge_types, edge_type_count, depth, MCP_BFS_LIMIT, &tr_out); yyjson_mut_obj_add_val(doc, root, "callees", bfs_to_json_array(doc, &tr_out, risk_labels, include_tests)); } if (do_inbound) { - cbm_store_bfs(store, nodes[0].id, "inbound", edge_types, edge_type_count, depth, + cbm_store_bfs(store, nodes[sel].id, "inbound", edge_types, edge_type_count, depth, MCP_BFS_LIMIT, &tr_in); yyjson_mut_obj_add_val(doc, root, "callers", bfs_to_json_array(doc, &tr_in, risk_labels, include_tests)); @@ -3003,6 +3079,21 @@ static char *handle_get_code_snippet(cbm_mcp_server_t *srv, const char *args) { } if (suffix_count > SKIP_ONE) { + /* Prefer the real definition (a .c body over a .h declaration, a Function + * over a Module) so an unambiguous-by-preference match resolves directly + * instead of forcing a disambiguation round trip; only a genuine tie still + * returns suggestions. */ + bool snip_ambiguous = false; + int ssel = pick_resolved_node(suffix_nodes, suffix_count, &snip_ambiguous); + if (!snip_ambiguous) { + copy_node(&suffix_nodes[ssel], &node); + cbm_store_free_nodes(suffix_nodes, suffix_count); + char *result = build_snippet_response(srv, &node, "suffix", include_neighbors, NULL, 0); + free_node_contents(&node); + free(qn); + free(project); + return result; + } char *result = snippet_suggestions(qn, suffix_nodes, suffix_count); cbm_store_free_nodes(suffix_nodes, suffix_count); free(qn); diff --git a/tests/test_mcp.c b/tests/test_mcp.c index 152a700cf..585baa79b 100644 --- a/tests/test_mcp.c +++ b/tests/test_mcp.c @@ -556,6 +556,103 @@ TEST(tool_trace_missing_function_name) { PASS(); } +/* Regression: two same-named definitions with equal rank must be reported + * ambiguous, not silently traced (trace_path previously took nodes[0]). */ +TEST(tool_trace_call_path_ambiguous) { + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + cbm_store_t *st = cbm_mcp_server_store(srv); + const char *proj = "amb-proj"; + cbm_mcp_server_set_project(srv, proj); + cbm_store_upsert_project(st, proj, "/tmp/amb"); + cbm_node_t a = {.project = proj, + .label = "Function", + .name = "amb", + .qualified_name = "amb-proj.a.amb", + .file_path = "a.c", + .start_line = 10, + .end_line = 20}; + cbm_node_t b = {.project = proj, + .label = "Function", + .name = "amb", + .qualified_name = "amb-proj.b.amb", + .file_path = "b.c", + .start_line = 10, + .end_line = 20}; /* equal span -> genuine tie */ + ASSERT_GT(cbm_store_upsert_node(st, &a), 0); + ASSERT_GT(cbm_store_upsert_node(st, &b), 0); + + char *resp = cbm_mcp_server_handle( + srv, "{\"jsonrpc\":\"2.0\",\"id\":61,\"method\":\"tools/call\"," + "\"params\":{\"name\":\"trace_call_path\"," + "\"arguments\":{\"function_name\":\"amb\",\"project\":\"amb-proj\"}}}"); + ASSERT_NOT_NULL(resp); + char *inner = extract_text_content(resp); + ASSERT_NOT_NULL(inner); + ASSERT_NOT_NULL(strstr(inner, "ambiguous")); + ASSERT_NOT_NULL(strstr(inner, "suggestions")); + ASSERT_NULL(strstr(inner, "\"callees\"")); + free(inner); + free(resp); + cbm_mcp_server_free(srv); + PASS(); +} + +/* Regression: when same-named nodes differ in rank, trace must pick the real + * definition (callable, larger body) — NOT nodes[0]. The Module is inserted + * first; if trace took nodes[0] the outbound trace would be empty. */ +TEST(tool_trace_call_path_prefers_definition) { + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + cbm_store_t *st = cbm_mcp_server_store(srv); + const char *proj = "pref-proj"; + cbm_mcp_server_set_project(srv, proj); + cbm_store_upsert_project(st, proj, "/tmp/pref"); + /* nodes[0]: the WRONG match (a Module, tiny span), inserted first. */ + cbm_node_t wrong = {.project = proj, + .label = "Module", + .name = "dup", + .qualified_name = "pref-proj.dup", + .file_path = "dup.x", + .start_line = 1, + .end_line = 1}; + /* the real definition: a Function with a body. */ + cbm_node_t def = {.project = proj, + .label = "Function", + .name = "dup", + .qualified_name = "pref-proj.src.dup", + .file_path = "src/dup.c", + .start_line = 10, + .end_line = 50}; + cbm_node_t callee = {.project = proj, + .label = "Function", + .name = "callee", + .qualified_name = "pref-proj.src.callee", + .file_path = "src/dup.c", + .start_line = 60, + .end_line = 70}; + ASSERT_GT(cbm_store_upsert_node(st, &wrong), 0); + int64_t id_def = cbm_store_upsert_node(st, &def); + int64_t id_callee = cbm_store_upsert_node(st, &callee); + ASSERT_GT(id_def, 0); + ASSERT_GT(id_callee, 0); + cbm_edge_t e = {.project = proj, .source_id = id_def, .target_id = id_callee, .type = "CALLS"}; + cbm_store_insert_edge(st, &e); + + char *resp = cbm_mcp_server_handle( + srv, "{\"jsonrpc\":\"2.0\",\"id\":62,\"method\":\"tools/call\"," + "\"params\":{\"name\":\"trace_call_path\",\"arguments\":{\"function_name\":\"dup\"," + "\"project\":\"pref-proj\",\"direction\":\"outbound\"}}}"); + ASSERT_NOT_NULL(resp); + char *inner = extract_text_content(resp); + ASSERT_NOT_NULL(inner); + ASSERT_NULL(strstr(inner, "ambiguous")); + /* picked the Function definition -> its outbound CALLS edge to "callee" shows */ + ASSERT_NOT_NULL(strstr(inner, "callee")); + free(inner); + free(resp); + cbm_mcp_server_free(srv); + PASS(); +} + TEST(tool_delete_project_not_found) { cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); @@ -2069,6 +2166,8 @@ SUITE(mcp) { /* Tool handlers with validation */ RUN_TEST(tool_trace_call_path_not_found); RUN_TEST(tool_trace_missing_function_name); + RUN_TEST(tool_trace_call_path_ambiguous); + RUN_TEST(tool_trace_call_path_prefers_definition); RUN_TEST(tool_delete_project_not_found); RUN_TEST(tool_get_architecture_empty); RUN_TEST(tool_get_architecture_emits_populated_sections); From eceeb40f9b700e66132bcd3b6b8f6fd48d093cc1 Mon Sep 17 00:00:00 2001 From: King Star Date: Sat, 20 Jun 2026 02:18:22 +0800 Subject: [PATCH 09/10] fix(mcp): return valid UTF-8 snippets Signed-off-by: King Star (cherry picked from commit 935027acdcfd809838dddbc2b52f225e61ded6ab) --- src/mcp/mcp.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++- tests/test_mcp.c | 67 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 1 deletion(-) diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c index 6fefc6a71..ffd891288 100644 --- a/src/mcp/mcp.c +++ b/src/mcp/mcp.c @@ -2910,6 +2910,75 @@ static char *resolve_snippet_source(const char *root_path, const char *file_path return NULL; } +static bool utf8_is_cont(unsigned char c) { + return (c & 0xC0) == 0x80; +} + +static char *sanitize_utf8_lossy(const char *s) { + enum { + UTF8_REPLACEMENT_LEN = 3, + UTF8_THREE_BYTE_LEN = 3, + UTF8_FOUR_BYTE_LEN = 4, + UTF8_FOURTH_BYTE = 3, + }; + if (!s) { + return NULL; + } + size_t len = strlen(s); + if (len > (((size_t)-1) - SKIP_ONE) / UTF8_REPLACEMENT_LEN) { + return NULL; + } + char *out = malloc(len * UTF8_REPLACEMENT_LEN + SKIP_ONE); + if (!out) { + return NULL; + } + + const unsigned char *p = (const unsigned char *)s; + const unsigned char *end = p + len; + unsigned char *dst = (unsigned char *)out; + while (p < end) { + unsigned char c = *p; + size_t n = 0; + if (c < 0x80) { + n = 1; + } else if (c >= 0xC2 && c <= 0xDF && p + 1 < end && utf8_is_cont(p[1])) { + n = 2; + } else if (c == 0xE0 && p + 2 < end && p[1] >= 0xA0 && p[1] <= 0xBF && utf8_is_cont(p[2])) { + n = UTF8_THREE_BYTE_LEN; + } else if (c >= 0xE1 && c <= 0xEC && p + 2 < end && utf8_is_cont(p[1]) && + utf8_is_cont(p[2])) { + n = UTF8_THREE_BYTE_LEN; + } else if (c == 0xED && p + 2 < end && p[1] >= 0x80 && p[1] <= 0x9F && utf8_is_cont(p[2])) { + n = UTF8_THREE_BYTE_LEN; + } else if (c >= 0xEE && c <= 0xEF && p + 2 < end && utf8_is_cont(p[1]) && + utf8_is_cont(p[2])) { + n = UTF8_THREE_BYTE_LEN; + } else if (c == 0xF0 && p + UTF8_FOURTH_BYTE < end && p[1] >= 0x90 && p[1] <= 0xBF && + utf8_is_cont(p[2]) && utf8_is_cont(p[UTF8_FOURTH_BYTE])) { + n = UTF8_FOUR_BYTE_LEN; + } else if (c >= 0xF1 && c <= 0xF3 && p + UTF8_FOURTH_BYTE < end && utf8_is_cont(p[1]) && + utf8_is_cont(p[2]) && utf8_is_cont(p[UTF8_FOURTH_BYTE])) { + n = UTF8_FOUR_BYTE_LEN; + } else if (c == 0xF4 && p + UTF8_FOURTH_BYTE < end && p[1] >= 0x80 && p[1] <= 0x8F && + utf8_is_cont(p[2]) && utf8_is_cont(p[UTF8_FOURTH_BYTE])) { + n = UTF8_FOUR_BYTE_LEN; + } + + if (n > 0) { + memcpy(dst, p, n); + dst += n; + p += n; + } else { + *dst++ = 0xEF; + *dst++ = 0xBF; + *dst++ = 0xBD; + p++; + } + } + *dst = '\0'; + return out; +} + /* Build an enriched snippet response for a resolved node. */ /* Add a string array to a JSON object (no-op if count == 0). */ static void add_string_array(yyjson_mut_doc *doc, yyjson_mut_val *obj, const char *key, @@ -2954,7 +3023,13 @@ static char *build_snippet_response(cbm_mcp_server_t *srv, cbm_node_t *node, yyjson_mut_obj_add_int(doc, root_obj, "end_line", end); if (source) { - yyjson_mut_obj_add_str(doc, root_obj, "source", source); + char *safe_source = sanitize_utf8_lossy(source); + if (safe_source) { + yyjson_mut_obj_add_strcpy(doc, root_obj, "source", safe_source); + free(safe_source); + } else { + yyjson_mut_obj_add_str(doc, root_obj, "source", "(source not available)"); + } } else { yyjson_mut_obj_add_str(doc, root_obj, "source", "(source not available)"); } diff --git a/tests/test_mcp.c b/tests/test_mcp.c index 585baa79b..b7710d92f 100644 --- a/tests/test_mcp.c +++ b/tests/test_mcp.c @@ -11,6 +11,7 @@ #include #include #include +#include /* ══════════════════════════════════════════════════════════════════ * JSON-RPC PARSING @@ -1388,6 +1389,31 @@ static char *call_snippet(cbm_mcp_server_t *srv, const char *args_json) { return text; } +static bool is_valid_json_response(const char *json) { + if (!json) { + return false; + } + yyjson_doc *doc = yyjson_read(json, strlen(json), 0); + if (!doc) { + return false; + } + yyjson_doc_free(doc); + return true; +} + +static bool snippet_source_has_replacement(const char *json) { + yyjson_doc *doc = yyjson_read(json, strlen(json), 0); + if (!doc) { + return false; + } + yyjson_val *root = yyjson_doc_get_root(doc); + yyjson_val *source = yyjson_obj_get(root, "source"); + const char *source_str = yyjson_get_str(source); + bool found = source_str && strstr(source_str, "\xEF\xBF\xBD"); + yyjson_doc_free(doc); + return found; +} + /* ── TestSnippet_ExactQN ──────────────────────────────────────── */ TEST(snippet_exact_qn) { @@ -1674,6 +1700,46 @@ TEST(snippet_include_neighbors_enabled) { PASS(); } +/* ── TestSnippet_SourceInvalidUtf8 ────────────────────────────── */ + +TEST(snippet_source_invalid_utf8) { + char tmp[256]; + cbm_mcp_server_t *srv = setup_snippet_server(tmp, sizeof(tmp)); + ASSERT_NOT_NULL(srv); + + char src_path[512]; + snprintf(src_path, sizeof(src_path), "%s/project/main.go", tmp); + FILE *fp = fopen(src_path, "wb"); + ASSERT_NOT_NULL(fp); + const unsigned char source[] = { + 'p', 'a', 'c', 'k', 'a', 'g', 'e', ' ', 'm', 'a', 'i', 'n', '\n', '\n', + 'f', 'u', 'n', 'c', ' ', 'H', 'a', 'n', 'd', 'l', 'e', 'R', 'e', 'q', + 'u', 'e', 's', 't', '(', ')', ' ', 'e', 'r', 'r', 'o', 'r', ' ', '{', + '\n', '\t', '/', '/', ' ', 0xC0, 0xD4, 0xB7, 0xC2, '\n', '\t', 'r', 'e', 't', + 'u', 'r', 'n', ' ', 'n', 'i', 'l', '\n', '}', '\n'}; + ASSERT_EQ(fwrite(source, 1, sizeof(source), fp), sizeof(source)); + ASSERT_EQ(fclose(fp), 0); + + char *raw = + cbm_mcp_handle_tool(srv, "get_code_snippet", + "{\"qualified_name\":\"test-project.cmd.server.main.HandleRequest\"," + "\"project\":\"test-project\"}"); + ASSERT_TRUE(is_valid_json_response(raw)); + char *resp = extract_text_content(raw); + ASSERT_NOT_NULL(resp); + ASSERT_TRUE(is_valid_json_response(resp)); + ASSERT_NULL(strstr(resp, "\xC0\xD4")); + ASSERT_NOT_NULL(strstr(resp, "HandleRequest")); + ASSERT_NOT_NULL(strstr(resp, "return nil")); + ASSERT_TRUE(snippet_source_has_replacement(resp)); + + free(resp); + free(raw); + cbm_mcp_server_free(srv); + cleanup_snippet_dir(tmp); + PASS(); +} + /* ══════════════════════════════════════════════════════════════════ * JSON-RPC PARSING — EDGE CASES * ══════════════════════════════════════════════════════════════════ */ @@ -2228,5 +2294,6 @@ SUITE(mcp) { RUN_TEST(snippet_auto_resolve_enabled); RUN_TEST(snippet_include_neighbors_default); RUN_TEST(snippet_include_neighbors_enabled); + RUN_TEST(snippet_source_invalid_utf8); RUN_TEST(tool_bad_project_name_no_overflow_issue235); } From 37f0adbdd62d4032122af411a131a74e0d93954c Mon Sep 17 00:00:00 2001 From: win4r Date: Sat, 20 Jun 2026 23:36:09 -0700 Subject: [PATCH 10/10] docs(readme): add fork-attribution notice for codebase-memory-mcp-pro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark this as a community fork of DeusData/codebase-memory-mcp (MIT, © 2025 DeusData) and list the integrated incremental-reindex fix (#528) plus the 9 cherry-picked upstream PRs (#465 #412 #475 #527 #512 #539 #464 #466 #526). Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: win4r --- README.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b48a297f0..b6b8657a2 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,13 @@ -# codebase-memory-mcp +# codebase-memory-mcp-pro + +> **🔱 Fork notice** — `codebase-memory-mcp-pro` is a community fork of [**DeusData/codebase-memory-mcp**](https://github.com/DeusData/codebase-memory-mcp) (MIT License, © 2025 DeusData), maintained by [@win4r](https://github.com/win4r). It tracks upstream and integrates the following fixes ahead of their upstream merge: +> +> - **Incremental-reindex correctness** ([#528](https://github.com/DeusData/codebase-memory-mcp/pull/528)) — preserve inbound cross-file `CALLS` edges on incremental re-index; editing a file no longer orphans calls into its symbols. +> - **Cypher / `query_graph`** — populate node properties carried through `WITH` aggregation ([#465](https://github.com/DeusData/codebase-memory-mcp/pull/465)); fix label-filtered traversal silently truncating at 10 rows ([#412](https://github.com/DeusData/codebase-memory-mcp/pull/412)). +> - **MCP tools** — `detect_changes` honors `since` ([#464](https://github.com/DeusData/codebase-memory-mcp/pull/464)); definition-preferred name resolution with ambiguity reporting ([#466](https://github.com/DeusData/codebase-memory-mcp/pull/466)); valid UTF-8 in `get_code_snippet` ([#526](https://github.com/DeusData/codebase-memory-mcp/pull/526)). +> - **Robustness / build** — stack-buffer-overflow fix in `append_args_json` ([#475](https://github.com/DeusData/codebase-memory-mcp/pull/475)); JSON control-character escaping ([#527](https://github.com/DeusData/codebase-memory-mcp/pull/527)); preserve ADRs across a full re-index ([#539](https://github.com/DeusData/codebase-memory-mcp/pull/539)); libgit2 ≥ 1.8 build fix ([#512](https://github.com/DeusData/codebase-memory-mcp/pull/512)). +> +> All credit for the original engine belongs to DeusData. License unchanged — see [LICENSE](LICENSE). The upstream README follows verbatim. [![GitHub Release](https://img.shields.io/github/v/release/DeusData/codebase-memory-mcp?style=flat&color=blue)](https://github.com/DeusData/codebase-memory-mcp/releases/latest) [![License](https://img.shields.io/badge/license-MIT-green)](LICENSE)