diff --git a/scripts/benchmark-search-graph.sh b/scripts/benchmark-search-graph.sh
new file mode 100755
index 00000000..3da58492
--- /dev/null
+++ b/scripts/benchmark-search-graph.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+# benchmark-search-graph.sh — Time search_graph name_pattern= queries against a
+# codebase-memory-mcp binary to measure the regex / LIKE pre-filter performance.
+#
+# Usage:
+#   scripts/benchmark-search-graph.sh <binary> <project>
+#
+# Example:
+#   scripts/benchmark-search-graph.sh ./build/c/codebase-memory-mcp my-project
+#
+# Requires GNU date (%N is unsupported by BSD/macOS date) and python3.
+
+set -euo pipefail
+
+BINARY="${1:?Usage: $0 <binary> <project>}"
+PROJECT="${2:?Usage: $0 <binary> <project>}"
+
+echo "Binary: $BINARY"
+echo "Project: $PROJECT"
+echo ""
+
+run_case() {
+    local label="$1"
+    local request="$2"
+    local start end elapsed_ms result
+
+    start=$(date +%s%3N)
+    result=$(echo "$request" | "$BINARY" 2>/dev/null || true)
+    end=$(date +%s%3N)
+    elapsed_ms=$(( end - start ))
+
+    local count
+    count=$(echo "$result" | python3 -c "
+import sys, json
+try:
+    d = json.load(sys.stdin)
+    content = d.get('result', {}).get('content', [{}])[0].get('text', '{}')
+    obj = json.loads(content)
+    print(obj.get('total', obj.get('count', '?')))
+except Exception:
+    print('?')
+" 2>/dev/null || echo "?")
+
+    printf " %-55s %5dms (total=%s)\n" "$label" "$elapsed_ms" "$count"
+}
+
+sg() {
+    local project="$1"
+    local args="$2"
+    printf '{"jsonrpc":"2.0","id":1,"method":"tools/call","params":{"name":"search_graph","arguments":{"project":"%s",%s}}}' \
+        "$project" "$args"
+}
+
+echo "=== search_graph name_pattern= benchmarks ==="
+run_case "name_pattern=.*Controller.*" "$(sg "$PROJECT" '"name_pattern":".*Controller.*","limit":20')"
+run_case "name_pattern=.*Service.*" "$(sg "$PROJECT" '"name_pattern":".*Service.*","limit":20')"
+run_case "name_pattern=.*Repository.*" "$(sg "$PROJECT" '"name_pattern":".*Repository.*","limit":20')"
+run_case "name_pattern=specificFunctionName" "$(sg "$PROJECT" '"name_pattern":"specificFunctionName","limit":20')"
+run_case "label=Method + name_pattern=.*get.*" "$(sg "$PROJECT" '"label":"Method","name_pattern":".*get.*","limit":20')"
+
+echo ""
+echo "=== search_graph query= benchmarks (BM25 path) ==="
+run_case "query=controller service handler" "$(sg "$PROJECT" '"query":"controller service handler","limit":20')"
+run_case "query=user authentication permission role" "$(sg "$PROJECT" '"query":"user authentication permission role","limit":20')"
+run_case "query=create update delete manage list view admin" "$(sg "$PROJECT" '"query":"create update delete manage list view admin","limit":20')"
diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c
index ae5b5c31..2b77dfda 100644
--- a/src/mcp/mcp.c
+++ b/src/mcp/mcp.c
@@ -1111,7 +1111,15 @@ enum {
     BM25_BIND_PROJECT = 2,
     BM25_BIND_LIMIT = 3,
     BM25_BIND_OFFSET = 4,
+    BM25_BIND_INNER = 5,
     BM25_SQL_AUTO_LEN = -1,
+    /* Inner FTS5 candidate cap. SQLite can early-terminate a plain FTS5 query
+     * (no JOIN/WHERE on outer table) of the form:
+     *   SELECT rowid, bm25() FROM nodes_fts WHERE MATCH ? ORDER BY bm25() LIMIT N
+     * By fetching only the top BM25_INNER_LIMIT candidates from the FTS5 index
+     * and then joining/filtering/re-ranking those, we bound all work to O(N) where
+     * N = BM25_INNER_LIMIT rather than the full match set size. */
+    BM25_INNER_LIMIT = 2000,
 };
 
 /* Module-local SQLITE_TRANSIENT wrapper to dodge performance-no-int-to-ptr.
@@ -1178,21 +1186,38 @@ static char *bm25_search(cbm_store_t *store, const char *project, const char *qu
         return NULL;
     }
 
-    /* BM25 ranked query with structural label boosting. bm25() returns a
-     * NEGATIVE score (lower = more relevant), so we subtract the boost to
-     * make high-value labels sort first. File/Folder/Module/Variable are
-     * excluded entirely — agents rarely want those as discovery results. */
+    /* BM25 ranked query using a two-step approach to enable FTS5 early termination.
+     *
+     * Flat queries of the form:
+     *   SELECT ... FROM nodes_fts JOIN nodes WHERE MATCH ? AND n.project=? ORDER BY rank LIMIT N
+     * block FTS5's WAND/MaxScore early-exit because the outer JOIN+WHERE conditions
+     * are invisible to the FTS5 planner — it must score every matching document before
+     * the project/label filter can discard any of them. On a large codebase with 100K+
+     * matches, this causes multi-minute queries.
+     *
+     * The fix: let FTS5 drive the inner subquery alone. SQLite CAN early-terminate
+     *   SELECT rowid, bm25(nodes_fts) FROM nodes_fts WHERE MATCH ? ORDER BY bm25() LIMIT N
+     * because no outer predicate blocks it. We fetch BM25_INNER_LIMIT top candidates
+     * from the FTS5 index, then join/filter/boost only those rows. bm25() returns a
+     * NEGATIVE score (lower = more relevant).
+     *
+     * NOTE(review): the inner LIMIT is applied before the project/label filter, so
+     * rows of this project ranked below the top BM25_INNER_LIMIT matches of the whole
+     * FTS index are never returned — confirm this is acceptable for shared stores. */
     const char *sql =
         "SELECT n.id, n.label, n.name, n.qualified_name, n.file_path, n.start_line, n.end_line, "
-        " (bm25(nodes_fts) "
+        " (fts.base_rank "
         " - CASE WHEN n.label IN ('Function','Method') THEN 10.0 "
         " WHEN n.label = 'Route' THEN 8.0 "
         " WHEN n.label IN ('Class','Interface','Type','Enum') THEN 5.0 "
         " ELSE 0.0 END) AS rank "
-        "FROM nodes_fts "
-        "JOIN nodes n ON n.id = nodes_fts.rowid "
-        "WHERE nodes_fts MATCH ?1 "
-        " AND n.project = ?2 "
+        "FROM ("
+        " SELECT rowid, bm25(nodes_fts) AS base_rank"
+        " FROM nodes_fts WHERE nodes_fts MATCH ?1"
+        " ORDER BY base_rank LIMIT ?5"
+        ") fts "
+        "JOIN nodes n ON n.id = fts.rowid "
+        "WHERE n.project = ?2 "
         " AND n.label NOT IN ('File','Folder','Module','Section','Variable','Project') "
         "ORDER BY rank "
         "LIMIT ?3 OFFSET ?4";
@@ -1201,24 +1226,33 @@ static char *bm25_search(cbm_store_t *store, const char *project, const char *qu
     if (sqlite3_prepare_v2(db, sql, BM25_SQL_AUTO_LEN, &stmt, NULL) != SQLITE_OK) {
         return NULL;
     }
     sqlite3_bind_text(stmt, BM25_BIND_QUERY, fts_query, BM25_SQL_AUTO_LEN, MCP_SQLITE_TRANSIENT);
     sqlite3_bind_text(stmt, BM25_BIND_PROJECT, project, BM25_SQL_AUTO_LEN, MCP_SQLITE_TRANSIENT);
     sqlite3_bind_int(stmt, BM25_BIND_LIMIT, limit > 0 ? limit : BM25_DEFAULT_LIMIT);
     sqlite3_bind_int(stmt, BM25_BIND_OFFSET, offset > 0 ? offset : 0);
+    sqlite3_bind_int(stmt, BM25_BIND_INNER, BM25_INNER_LIMIT);
 
-    /* Count total hits (for pagination) in a separate cheap query. */
+    /* Count hits within the same inner-limit window — capped at BM25_INNER_LIMIT.
+     * Uses the identical subquery structure and the same ?5 slot for the inner cap. */
     int total = 0;
     {
         const char *count_sql =
-            "SELECT COUNT(*) FROM nodes_fts JOIN nodes n ON n.id = nodes_fts.rowid "
-            "WHERE nodes_fts MATCH ?1 AND n.project = ?2 "
-            " AND n.label NOT IN ('File','Folder','Module','Section','Variable','Project')";
+            "SELECT COUNT(*) FROM ("
+            " SELECT fts.rowid FROM ("
+            " SELECT rowid FROM nodes_fts WHERE nodes_fts MATCH ?1"
+            " ORDER BY bm25(nodes_fts) LIMIT ?5"
+            " ) fts "
+            " JOIN nodes n ON n.id = fts.rowid "
+            " WHERE n.project = ?2 "
+            " AND n.label NOT IN ('File','Folder','Module','Section','Variable','Project')"
+            ")";
         sqlite3_stmt *cs = NULL;
         if (sqlite3_prepare_v2(db, count_sql, BM25_SQL_AUTO_LEN, &cs, NULL) == SQLITE_OK) {
             sqlite3_bind_text(cs, BM25_BIND_QUERY, fts_query, BM25_SQL_AUTO_LEN,
                               MCP_SQLITE_TRANSIENT);
             sqlite3_bind_text(cs, BM25_BIND_PROJECT, project, BM25_SQL_AUTO_LEN,
                               MCP_SQLITE_TRANSIENT);
+            sqlite3_bind_int(cs, BM25_BIND_INNER, BM25_INNER_LIMIT);
             if (sqlite3_step(cs) == SQLITE_ROW) {
                 total = sqlite3_column_int(cs, 0);
             }
diff --git a/src/store/store.c b/src/store/store.c
index 5d73dcce..2a26b0dd 100644
--- a/src/store/store.c
+++ b/src/store/store.c
@@ -40,7 +40,9 @@ enum {
     ST_GLOB_MIN_LEN = 3,
     ST_GLOB_SKIP = 2,
     ST_MAX_LANG = 10,
-    ST_SEARCH_MAX_BINDS = 16,
+    ST_SEARCH_MAX_BINDS = 32, /* increased: LIKE pre-filter adds binds per pattern */
+    ST_LIKE_POOL_MAX = 12,    /* max malloc'd LIKE strings alive during one search */
+    ST_LIKE_HINT_MAX = 2,     /* max LIKE hints extracted per regex pattern */
     ST_MAX_PKGS = 64,
     ST_INIT_CAP_4 = 4,
     ST_HEADER_PREFIX = 3,
@@ -409,47 +411,61 @@ static void sqlite_camel_split(sqlite3_context *ctx, int argc, sqlite3_value **a
 
 /* ── REGEXP function for SQLite ──────────────────────────────────── */
 
+/* Destructor passed to sqlite3_set_auxdata — frees the cached compiled regex. */
+static void regex_free_cb(void *p) {
+    cbm_regex_t *re = (cbm_regex_t *)p;
+    cbm_regfree(re);
+    free(re);
+}
+
+/* Shared body of REGEXP / IREGEXP: match argv[1] against the pattern in argv[0].
+ *
+ * The compiled regex is cached as auxdata on argument slot 0, so a pattern that
+ * SQLite keeps constant for the statement is compiled once instead of once per
+ * row. SQLite may discard auxdata at any time — including from inside
+ * sqlite3_set_auxdata itself — so the regex is used first and handed over last. */
+static void regexp_match_cached(sqlite3_context *ctx, sqlite3_value **argv, int cflags) {
+    const char *pattern = (const char *)sqlite3_value_text(argv[0]);
+    const char *text = (const char *)sqlite3_value_text(argv[SKIP_ONE]);
+    if (!pattern || !text) {
+        sqlite3_result_int(ctx, 0);
+        return;
+    }
+
+    cbm_regex_t *re = (cbm_regex_t *)sqlite3_get_auxdata(ctx, 0);
+    bool compiled_here = false;
+    if (!re) {
+        re = malloc(sizeof(*re));
+        if (!re) {
+            sqlite3_result_error_nomem(ctx);
+            return;
+        }
+        if (cbm_regcomp(re, pattern, cflags) != 0) {
+            free(re);
+            sqlite3_result_error(ctx, "invalid regex", CBM_NOT_FOUND);
+            return;
+        }
+        compiled_here = true;
+    }
+
+    sqlite3_result_int(ctx, cbm_regexec(re, text, 0, NULL, 0) == 0 ? SKIP_ONE : 0);
+
+    if (compiled_here) {
+        /* Ownership moves to SQLite; `re` must not be touched after this call. */
+        sqlite3_set_auxdata(ctx, 0, re, regex_free_cb);
+    }
+}
+
+/* Case-sensitive REGEXP(pattern, text). */
 static void sqlite_regexp(sqlite3_context *ctx, int argc, sqlite3_value **argv) {
     (void)argc;
-    const char *pattern = (const char *)sqlite3_value_text(argv[0]);
-    const char *text = (const char *)sqlite3_value_text(argv[SKIP_ONE]);
-    if (!pattern || !text) {
-        sqlite3_result_int(ctx, 0);
-        return;
-    }
-
-    cbm_regex_t re;
-    int rc = cbm_regcomp(&re, pattern, CBM_REG_EXTENDED | CBM_REG_NOSUB);
-    if (rc != 0) {
-        sqlite3_result_error(ctx, "invalid regex", CBM_NOT_FOUND);
-        return;
-    }
-
-    rc = cbm_regexec(&re, text, 0, NULL, 0);
-    cbm_regfree(&re);
-    sqlite3_result_int(ctx, rc == 0 ? SKIP_ONE : 0);
+    regexp_match_cached(ctx, argv, CBM_REG_EXTENDED | CBM_REG_NOSUB);
 }
 
 /* Case-insensitive REGEXP variant */
 static void sqlite_iregexp(sqlite3_context *ctx, int argc, sqlite3_value **argv) {
     (void)argc;
-    const char *pattern = (const char *)sqlite3_value_text(argv[0]);
-    const char *text = (const char *)sqlite3_value_text(argv[SKIP_ONE]);
-    if (!pattern || !text) {
-        sqlite3_result_int(ctx, 0);
-        return;
-    }
-
-    cbm_regex_t re;
-    int rc = cbm_regcomp(&re, pattern, CBM_REG_EXTENDED | CBM_REG_NOSUB | CBM_REG_ICASE);
-    if (rc != 0) {
-        sqlite3_result_error(ctx, "invalid regex", CBM_NOT_FOUND);
-        return;
-    }
-
-    rc = cbm_regexec(&re, text, 0, NULL, 0);
-    cbm_regfree(&re);
-    sqlite3_result_int(ctx, rc == 0 ? SKIP_ONE : 0);
+    regexp_match_cached(ctx, argv, CBM_REG_EXTENDED | CBM_REG_NOSUB | CBM_REG_ICASE);
 }
 
 /* Cosine similarity between two int8 BLOB vectors.
@@ -2121,6 +2137,51 @@ typedef struct {
     const char *text;
 } search_bind_t;
 
+/* Pool of malloc'd strings that must outlive statement execution.
+ * Freed in one shot after both statements are finalized. */
+typedef struct {
+    char *ptrs[ST_LIKE_POOL_MAX];
+    int count;
+} search_like_pool_t;
+
+/* Hand ownership of a malloc'd string to the pool.
+ * Returns true when the pool stored ptr; false when ptr was NULL or the pool was
+ * full, in which case ptr has been freed and must not be used by the caller. */
+static bool like_pool_add(search_like_pool_t *pool, char *ptr) {
+    if (!ptr) {
+        return false;
+    }
+    if (pool->count >= ST_LIKE_POOL_MAX) {
+        free(ptr); /* pool full — don't leak */
+        return false;
+    }
+    pool->ptrs[pool->count++] = ptr;
+    return true;
+}
+
+static void like_pool_free(search_like_pool_t *pool) {
+    for (int i = 0; i < pool->count; i++) {
+        free(pool->ptrs[i]);
+    }
+    pool->count = 0;
+}
+
+/* Wrap a literal string in % for use as a LIKE pattern (%literal%).
+ * LIKE metacharacters inside the literal ('%', '_') are left unescaped: they can
+ * only widen what the pattern accepts, never exclude text containing the literal.
+ * Returns a malloc'd string owned by the caller, or NULL on allocation failure. */
+static char *make_like_hint(const char *literal) {
+    size_t len = strlen(literal);
+    char *buf = malloc(len + 3); /* % + literal + % + NUL */
+    if (buf) {
+        buf[0] = '%';
+        memcpy(buf + SKIP_ONE, literal, len);
+        buf[len + SKIP_ONE] = '%';
+        buf[len + 2] = '\0';
+    }
+    return buf;
+}
+
 static void search_apply_degree_filter(char *sql, size_t sql_sz, const cbm_search_params_t *p) {
     bool has_degree_filter = (p->min_degree >= 0 || p->max_degree >= 0);
     if (!has_degree_filter) {
@@ -2191,10 +2252,36 @@ static void where_add_regex(char *where, int where_sz, int *wlen, int *nparams,
     where_bind_text(binds, bind_idx, pattern);
 }
 
+/* Prepend LIKE pre-filter conditions for literal segments of a regex pattern.
+ * A leading-wildcard LIKE cannot use an index range seek; the intended gain is
+ * that LIKE is cheaper per row than the (more expensive) iregexp call.
+ * cbm_extract_like_hints bails on alternation, so no false negatives.
+ * NOTE(review): assumes SQLite's default case-insensitive LIKE; with
+ * PRAGMA case_sensitive_like=ON a hint could wrongly reject iregexp matches. */
+static void where_add_like_hints(const char *column, const char *pattern, char *where,
+                                 int where_sz, int *wlen, int *nparams, search_bind_t *binds,
+                                 int *bind_idx, search_like_pool_t *pool) {
+    char *hints[ST_LIKE_HINT_MAX];
+    int nhints = cbm_extract_like_hints(pattern, hints, ST_LIKE_HINT_MAX);
+    char bind_buf[CBM_SZ_64];
+    for (int i = 0; i < nhints; i++) {
+        char *lp = make_like_hint(hints[i]);
+        free(hints[i]);
+        /* Skip the hint when allocation failed or the pool rejected (and freed) lp:
+         * the pre-filter is optional, so dropping it only costs speed. */
+        if (!like_pool_add(pool, lp)) {
+            continue;
+        }
+        snprintf(bind_buf, sizeof(bind_buf), "%s LIKE ?%d", column, *bind_idx + SKIP_ONE);
+        *wlen = where_append(where, where_sz, *wlen, nparams, bind_buf);
+        where_bind_text(binds, bind_idx, lp);
+    }
+}
+
 /* Build basic WHERE clauses: project, label, name, file, qn patterns. */
 static int search_where_basic(const cbm_search_params_t *params, char *where, int where_sz,
                               int *wlen, int *nparams, search_bind_t *binds, int *bind_idx,
-                              char **like_pattern_out) {
+                              search_like_pool_t *pool) {
     char bind_buf[CBM_SZ_64];
 
     if (params->project) {
@@ -2208,18 +2295,26 @@ static int search_where_basic(const cbm_search_params_t *params, char *where, in
         where_bind_text(binds, bind_idx, params->label);
     }
     if (params->name_pattern) {
+        where_add_like_hints("n.name", params->name_pattern, where, where_sz, wlen, nparams,
+                             binds, bind_idx, pool);
         where_add_regex(where, where_sz, wlen, nparams, binds, bind_idx, "n.name",
                         params->name_pattern, params->case_sensitive);
     }
     if (params->qn_pattern) {
+        where_add_like_hints("n.qualified_name", params->qn_pattern, where, where_sz, wlen,
+                             nparams, binds, bind_idx, pool);
         where_add_regex(where, where_sz, wlen, nparams, binds, bind_idx, "n.qualified_name",
                         params->qn_pattern, params->case_sensitive);
     }
     if (params->file_pattern) {
-        *like_pattern_out = cbm_glob_to_like(params->file_pattern);
+        char *lp = cbm_glob_to_like(params->file_pattern);
+        /* A rejected pointer has already been freed; bind NULL instead of it. */
+        if (!like_pool_add(pool, lp)) {
+            lp = NULL;
+        }
         snprintf(bind_buf, sizeof(bind_buf), "n.file_path LIKE ?%d", *bind_idx + SKIP_ONE);
         *wlen = where_append(where, where_sz, *wlen, nparams, bind_buf);
-        where_bind_text(binds, bind_idx, *like_pattern_out);
+        where_bind_text(binds, bind_idx, lp);
     }
     return *nparams;
 }
@@ -2254,12 +2349,11 @@ static void search_where_advanced(const cbm_search_params_t *params, char *where
 }
 
 static int search_build_where(const cbm_search_params_t *params, char *where, int where_sz,
-                              search_bind_t *binds, int *bind_idx, char **like_pattern_out) {
+                              search_bind_t *binds, int *bind_idx, search_like_pool_t *pool) {
     int wlen = 0;
     int nparams = 0;
-    *like_pattern_out = NULL;
 
-    search_where_basic(params, where, where_sz, &wlen, &nparams, binds, bind_idx, like_pattern_out);
+    search_where_basic(params, where, where_sz, &wlen, &nparams, binds, bind_idx, pool);
     search_where_advanced(params, where, where_sz, &wlen, &nparams, binds, bind_idx);
 
     return nparams;
@@ -2284,10 +2378,10 @@ int cbm_store_search(cbm_store_t *s, const cbm_search_params_t *params, cbm_sear
 
     char where[CBM_SZ_2K] = "";
     search_bind_t binds[ST_SEARCH_MAX_BINDS];
-    char *like_pattern = NULL;
+    search_like_pool_t like_pool = {0};
 
     int nparams =
-        search_build_where(params, where, (int)sizeof(where), binds, &bind_idx, &like_pattern);
+        search_build_where(params, where, (int)sizeof(where), binds, &bind_idx, &like_pool);
 
     /* Build full SQL */
     if (nparams > 0) {
@@ -2300,8 +2394,16 @@ int cbm_store_search(cbm_store_t *s, const cbm_search_params_t *params, cbm_sear
     bool has_degree_filter = (params->min_degree >= 0 || params->max_degree >= 0);
     search_apply_degree_filter(sql, sizeof(sql), params);
 
-    /* Count query (wrap the full query) */
-    snprintf(count_sql, sizeof(count_sql), "SELECT COUNT(*) FROM (%s)", sql);
+    /* Count query — stripped of per-row edge subqueries for the common (no-degree-filter)
+     * case, since we only need the row count, not in_deg/out_deg. The degree-filter
+     * case must wrap the full query because the filter references those columns. */
+    if (has_degree_filter) {
+        snprintf(count_sql, sizeof(count_sql), "SELECT COUNT(*) FROM (%s)", sql);
+    } else if (nparams > 0) {
+        snprintf(count_sql, sizeof(count_sql), "SELECT COUNT(*) FROM nodes n WHERE %s", where);
+    } else {
+        snprintf(count_sql, sizeof(count_sql), "SELECT COUNT(*) FROM nodes n");
+    }
 
     /* Add ORDER BY + LIMIT */
     int limit = params->limit > 0 ? params->limit : ST_HALF_SEC;
@@ -2330,7 +2432,7 @@ int cbm_store_search(cbm_store_t *s, const cbm_search_params_t *params, cbm_sear
     rc = sqlite3_prepare_v2(s->db, sql, CBM_NOT_FOUND, &main_stmt, NULL);
     if (rc != SQLITE_OK) {
         store_set_error_sqlite(s, "search prepare");
-        free(like_pattern);
+        like_pool_free(&like_pool);
         return CBM_STORE_ERR;
     }
 
@@ -2355,7 +2457,7 @@ int cbm_store_search(cbm_store_t *s, const cbm_search_params_t *params, cbm_sear
     }
 
     sqlite3_finalize(main_stmt);
-    free(like_pattern);
+    like_pool_free(&like_pool);
 
     out->results = results;
     out->count = n;