From 935027acdcfd809838dddbc2b52f225e61ded6ab Mon Sep 17 00:00:00 2001
From: King Star
Date: Sat, 20 Jun 2026 02:18:22 +0800
Subject: [PATCH] fix(mcp): return valid UTF-8 snippets

Signed-off-by: King Star
---
NOTE(review): the #include lines in the first tests/test_mcp.c hunk carry no
header names in this copy of the patch; restore them before applying.

 src/mcp/mcp.c    | 91 +++++++++++++++++++++++++++++++++++++++++++++++-
 tests/test_mcp.c | 75 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 165 insertions(+), 1 deletion(-)

diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c
index 8102b1e77..48823b7d9 100644
--- a/src/mcp/mcp.c
+++ b/src/mcp/mcp.c
@@ -2833,6 +2833,87 @@ static char *resolve_snippet_source(const char *root_path, const char *file_path
     return NULL;
 }
 
+/* True when c is a UTF-8 continuation byte (bit pattern 10xxxxxx). */
+static bool utf8_is_cont(unsigned char c) {
+    return (c & 0xC0) == 0x80;
+}
+
+/* Return a malloc'd copy of the NUL-terminated string s that is valid UTF-8.
+ * Well-formed sequences are copied unchanged; every other byte is replaced by
+ * U+FFFD (EF BF BD). The range checks reject overlong forms, surrogates and
+ * code points above U+10FFFF. Returns NULL if s is NULL, if the worst-case
+ * output size would overflow size_t, or if malloc fails. Caller frees the
+ * result. */
+static char *sanitize_utf8_lossy(const char *s) {
+    enum {
+        UTF8_REPLACEMENT_LEN = 3,
+        UTF8_THREE_BYTE_LEN = 3,
+        UTF8_FOUR_BYTE_LEN = 4,
+        UTF8_FOURTH_BYTE = 3,
+    };
+    if (!s) {
+        return NULL;
+    }
+    size_t len = strlen(s);
+    /* Worst case: every input byte expands to a 3-byte U+FFFD. */
+    if (len > (((size_t)-1) - SKIP_ONE) / UTF8_REPLACEMENT_LEN) {
+        return NULL;
+    }
+    char *out = malloc(len * UTF8_REPLACEMENT_LEN + SKIP_ONE);
+    if (!out) {
+        return NULL;
+    }
+
+    const unsigned char *p = (const unsigned char *)s;
+    const unsigned char *end = p + len;
+    unsigned char *dst = (unsigned char *)out;
+    while (p < end) {
+        unsigned char c = *p;
+        /* Compare byte counts rather than forming p + k, which could point
+         * beyond one past the end of the buffer (undefined behavior). */
+        size_t avail = (size_t)(end - p);
+        size_t n = 0;
+        if (c < 0x80) {
+            n = 1;
+        } else if (c >= 0xC2 && c <= 0xDF && avail > 1 && utf8_is_cont(p[1])) {
+            n = 2;
+        } else if (c == 0xE0 && avail > 2 && p[1] >= 0xA0 && p[1] <= 0xBF && utf8_is_cont(p[2])) {
+            n = UTF8_THREE_BYTE_LEN;
+        } else if (c >= 0xE1 && c <= 0xEC && avail > 2 && utf8_is_cont(p[1]) &&
+                   utf8_is_cont(p[2])) {
+            n = UTF8_THREE_BYTE_LEN;
+        } else if (c == 0xED && avail > 2 && p[1] >= 0x80 && p[1] <= 0x9F && utf8_is_cont(p[2])) {
+            n = UTF8_THREE_BYTE_LEN;
+        } else if (c >= 0xEE && c <= 0xEF && avail > 2 && utf8_is_cont(p[1]) &&
+                   utf8_is_cont(p[2])) {
+            n = UTF8_THREE_BYTE_LEN;
+        } else if (c == 0xF0 && avail > UTF8_FOURTH_BYTE && p[1] >= 0x90 && p[1] <= 0xBF &&
+                   utf8_is_cont(p[2]) && utf8_is_cont(p[UTF8_FOURTH_BYTE])) {
+            n = UTF8_FOUR_BYTE_LEN;
+        } else if (c >= 0xF1 && c <= 0xF3 && avail > UTF8_FOURTH_BYTE && utf8_is_cont(p[1]) &&
+                   utf8_is_cont(p[2]) && utf8_is_cont(p[UTF8_FOURTH_BYTE])) {
+            n = UTF8_FOUR_BYTE_LEN;
+        } else if (c == 0xF4 && avail > UTF8_FOURTH_BYTE && p[1] >= 0x80 && p[1] <= 0x8F &&
+                   utf8_is_cont(p[2]) && utf8_is_cont(p[UTF8_FOURTH_BYTE])) {
+            n = UTF8_FOUR_BYTE_LEN;
+        }
+
+        if (n > 0) {
+            memcpy(dst, p, n);
+            dst += n;
+            p += n;
+        } else {
+            /* Invalid or truncated sequence: emit U+FFFD and resync at the next byte. */
+            *dst++ = 0xEF;
+            *dst++ = 0xBF;
+            *dst++ = 0xBD;
+            p++;
+        }
+    }
+    *dst = '\0';
+    return out;
+}
+
 /* Build an enriched snippet response for a resolved node. */
 /* Add a string array to a JSON object (no-op if count == 0). */
 static void add_string_array(yyjson_mut_doc *doc, yyjson_mut_val *obj, const char *key,
@@ -2877,7 +2958,15 @@ static char *build_snippet_response(cbm_mcp_server_t *srv, cbm_node_t *node,
     yyjson_mut_obj_add_int(doc, root_obj, "end_line", end);
 
     if (source) {
-        yyjson_mut_obj_add_str(doc, root_obj, "source", source);
+        /* File contents are not guaranteed to be UTF-8; sanitize so the JSON
+         * response stays valid, and copy because safe_source is freed here. */
+        char *safe_source = sanitize_utf8_lossy(source);
+        if (safe_source) {
+            yyjson_mut_obj_add_strcpy(doc, root_obj, "source", safe_source);
+            free(safe_source);
+        } else {
+            yyjson_mut_obj_add_str(doc, root_obj, "source", "(source not available)");
+        }
     } else {
         yyjson_mut_obj_add_str(doc, root_obj, "source", "(source not available)");
     }
diff --git a/tests/test_mcp.c b/tests/test_mcp.c
index 152a700cf..42aeb32dc 100644
--- a/tests/test_mcp.c
+++ b/tests/test_mcp.c
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 
 /* ══════════════════════════════════════════════════════════════════
  * JSON-RPC PARSING
@@ -1291,6 +1292,37 @@ static char *call_snippet(cbm_mcp_server_t *srv, const char *args_json) {
     return text;
 }
 
+/* True when json is non-NULL and parses as a JSON document. */
+static bool is_valid_json_response(const char *json) {
+    if (!json) {
+        return false;
+    }
+    yyjson_doc *doc = yyjson_read(json, strlen(json), 0);
+    if (!doc) {
+        return false;
+    }
+    yyjson_doc_free(doc);
+    return true;
+}
+
+/* True when json parses and its top-level "source" member is a string
+ * containing U+FFFD (EF BF BD). NULL or unparsable input yields false. */
+static bool snippet_source_has_replacement(const char *json) {
+    if (!json) {
+        return false;
+    }
+    yyjson_doc *doc = yyjson_read(json, strlen(json), 0);
+    if (!doc) {
+        return false;
+    }
+    yyjson_val *root = yyjson_doc_get_root(doc);
+    yyjson_val *source = yyjson_obj_get(root, "source");
+    const char *source_str = yyjson_get_str(source);
+    bool found = source_str && strstr(source_str, "\xEF\xBF\xBD");
+    yyjson_doc_free(doc);
+    return found;
+}
+
 /* ── TestSnippet_ExactQN ──────────────────────────────────────── */
 
 TEST(snippet_exact_qn) {
@@ -1577,6 +1609,48 @@ TEST(snippet_include_neighbors_enabled) {
     PASS();
 }
 
+/* ── TestSnippet_SourceInvalidUtf8 ────────────────────────────── */
+
+TEST(snippet_source_invalid_utf8) {
+    char tmp[256];
+    cbm_mcp_server_t *srv = setup_snippet_server(tmp, sizeof(tmp));
+    ASSERT_NOT_NULL(srv);
+
+    char src_path[512];
+    snprintf(src_path, sizeof(src_path), "%s/project/main.go", tmp);
+    FILE *fp = fopen(src_path, "wb");
+    ASSERT_NOT_NULL(fp);
+    /* Go source with a comment containing non-UTF-8 bytes: 0xC0 is never a valid
+     * lead byte and the trailing 0xC2 is a lead byte without its continuation. */
+    const unsigned char source[] = {
+        'p', 'a', 'c', 'k', 'a', 'g', 'e', ' ', 'm', 'a', 'i', 'n', '\n', '\n',
+        'f', 'u', 'n', 'c', ' ', 'H', 'a', 'n', 'd', 'l', 'e', 'R', 'e', 'q',
+        'u', 'e', 's', 't', '(', ')', ' ', 'e', 'r', 'r', 'o', 'r', ' ', '{',
+        '\n', '\t', '/', '/', ' ', 0xC0, 0xD4, 0xB7, 0xC2, '\n', '\t', 'r', 'e', 't',
+        'u', 'r', 'n', ' ', 'n', 'i', 'l', '\n', '}', '\n'};
+    ASSERT_EQ(fwrite(source, 1, sizeof(source), fp), sizeof(source));
+    ASSERT_EQ(fclose(fp), 0);
+
+    char *raw =
+        cbm_mcp_handle_tool(srv, "get_code_snippet",
+                            "{\"qualified_name\":\"test-project.cmd.server.main.HandleRequest\","
+                            "\"project\":\"test-project\"}");
+    ASSERT_TRUE(is_valid_json_response(raw));
+    char *resp = extract_text_content(raw);
+    ASSERT_NOT_NULL(resp);
+    ASSERT_TRUE(is_valid_json_response(resp));
+    ASSERT_NULL(strstr(resp, "\xC0\xD4"));
+    ASSERT_NOT_NULL(strstr(resp, "HandleRequest"));
+    ASSERT_NOT_NULL(strstr(resp, "return nil"));
+    ASSERT_TRUE(snippet_source_has_replacement(resp));
+
+    free(resp);
+    free(raw);
+    cbm_mcp_server_free(srv);
+    cleanup_snippet_dir(tmp);
+    PASS();
+}
+
 /* ══════════════════════════════════════════════════════════════════
  * JSON-RPC PARSING — EDGE CASES
  * ══════════════════════════════════════════════════════════════════ */
@@ -2129,5 +2203,6 @@ SUITE(mcp) {
     RUN_TEST(snippet_auto_resolve_enabled);
     RUN_TEST(snippet_include_neighbors_default);
     RUN_TEST(snippet_include_neighbors_enabled);
+    RUN_TEST(snippet_source_invalid_utf8);
     RUN_TEST(tool_bad_project_name_no_overflow_issue235);
 }