diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c index 8102b1e7..5c91c62b 100644 --- a/src/mcp/mcp.c +++ b/src/mcp/mcp.c @@ -2833,6 +2833,85 @@ static char *resolve_snippet_source(const char *root_path, const char *file_path return NULL; } +/* ── UTF-8 sanitization ──────────────────────────────────────── */ + +/* Replace invalid UTF-8 byte sequences with U+FFFD (EF BF BD). + * Valid multi-byte UTF-8 (CJK, emoji, etc.) is preserved. + * Returns a newly allocated string; caller frees. + * Returns NULL only if src is NULL or malloc fails. + * + * Reference: RFC 3629 §3, Unicode §3.9 (U+FFFD Substitution of + * Maximal Subparts). Each invalid byte is individually replaced + * so that surrounding valid bytes are not consumed. */ +static char *sanitize_utf8(const char *src) { + if (!src) { + return NULL; + } + + const unsigned char *s = (const unsigned char *)src; + size_t src_len = strlen(src); + + /* Worst case: every byte is invalid → each becomes 3 bytes (U+FFFD). + * Guard against size_t overflow on src_len * 3 + 1 (UB if + * src_len > (SIZE_MAX - 1) / 3). Practically unreachable for snippet + * sources, but defensive — the same bound is enforced in #526. */ + if (src_len > (SIZE_MAX - 1) / 3) { + return NULL; + } + char *out = malloc(src_len * 3 + 1); + if (!out) { + return NULL; + } + size_t olen = 0; + + for (size_t i = 0; i < src_len;) { + unsigned char c = s[i]; + int seq_len = 0; /* 0 = invalid */ + + if (c <= 0x7F) { + /* ASCII */ + seq_len = 1; + } else if (c >= 0xC2 && c <= 0xDF) { + /* 2-byte: 110xxxxx 10xxxxxx (0xC0/0xC1 are overlong) */ + if (i + 1 < src_len && (s[i + 1] & 0xC0) == 0x80) { + seq_len = 2; + } + } else if ((c & 0xF0) == 0xE0) { + /* 3-byte: 1110xxxx 10xxxxxx 10xxxxxx */ + if (i + 2 < src_len && (s[i + 1] & 0xC0) == 0x80 && (s[i + 2] & 0xC0) == 0x80) { + /* Reject overlong (E0 + < A0) and surrogates (ED + >= A0). */ + if (!(c == 0xE0 && s[i + 1] < 0xA0) && !(c == 0xED && s[i + 1] >= 0xA0)) { + seq_len = 3; + } + } + } else if (c >= 0xF0 && c <= 0xF4) { + /* 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + if (i + 3 < src_len && (s[i + 1] & 0xC0) == 0x80 && (s[i + 2] & 0xC0) == 0x80 && + (s[i + 3] & 0xC0) == 0x80) { + /* Reject overlong (F0 + < 90) and > U+10FFFF (F4 + > 8F). */ + if (!(c == 0xF0 && s[i + 1] < 0x90) && !(c == 0xF4 && s[i + 1] > 0x8F)) { + seq_len = 4; + } + } + } + + if (seq_len > 0) { + memcpy(out + olen, s + i, (size_t)seq_len); + olen += (size_t)seq_len; + i += (size_t)seq_len; + } else { + /* Invalid byte → U+FFFD replacement character */ + out[olen++] = '\xEF'; + out[olen++] = '\xBF'; + out[olen++] = '\xBD'; + i++; + } + } + + out[olen] = '\0'; + return out; +} + /* Build an enriched snippet response for a resolved node. */ /* Add a string array to a JSON object (no-op if count == 0). */ static void add_string_array(yyjson_mut_doc *doc, yyjson_mut_val *obj, const char *key, @@ -2857,6 +2936,15 @@ static char *build_snippet_response(cbm_mcp_server_t *srv, cbm_node_t *node, char *abs_path = NULL; char *source = resolve_snippet_source(root_path, node->file_path, start, end, &abs_path); + /* Guarantee valid UTF-8 for JSON-RPC / MCP stdio transport (#511). + * Non-UTF-8 source files (CP949, EUC-KR, Shift_JIS, GBK, …) would + * otherwise produce invalid JSON that hangs MCP clients. */ + if (source) { + char *safe = sanitize_utf8(source); + free(source); + source = safe; + } + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); yyjson_mut_val *root_obj = yyjson_mut_obj(doc); yyjson_mut_doc_set_root(doc, root_obj); diff --git a/tests/test_mcp.c b/tests/test_mcp.c index 152a700c..b39a283c 100644 --- a/tests/test_mcp.c +++ b/tests/test_mcp.c @@ -11,6 +11,7 @@ #include #include #include +#include /* ══════════════════════════════════════════════════════════════════ * JSON-RPC PARSING @@ -1577,6 +1578,93 @@ TEST(snippet_include_neighbors_enabled) { PASS(); } +/* ── TestSnippet_SourceInvalidUtf8 ───────────────────────────── + * Regression for #511: non-UTF-8 source bytes (e.g. CP949 in a + * legacy CJK Windows codebase) used to emit invalid JSON on the + * MCP stdio channel, hanging clients indefinitely. After the + * sanitize_utf8() fix, the response must: + * 1. parse as strict JSON (envelope + embedded snippet), + * 2. not contain the original invalid byte sequence, + * 3. preserve surrounding valid bytes (function name, body), + * 4. include U+FFFD (EF BF BD) where invalid bytes were. + * + * Test borrowed from #526 (jstar0) — pairs that contributor's + * reproduction with this PR's sanitizer. */ + +static bool snippet_response_is_valid_json(const char *json) { + if (!json) + return false; + yyjson_doc *doc = yyjson_read(json, strlen(json), 0); + if (!doc) + return false; + yyjson_doc_free(doc); + return true; +} + +static bool snippet_source_field_has_replacement(const char *json) { + if (!json) + return false; + yyjson_doc *doc = yyjson_read(json, strlen(json), 0); + if (!doc) + return false; + yyjson_val *root = yyjson_doc_get_root(doc); + yyjson_val *source = yyjson_obj_get(root, "source"); + const char *source_str = yyjson_get_str(source); + bool found = source_str && strstr(source_str, "\xEF\xBF\xBD") != NULL; + yyjson_doc_free(doc); + return found; +} + +TEST(snippet_source_invalid_utf8) { + char tmp[256]; + cbm_mcp_server_t *srv = setup_snippet_server(tmp, sizeof(tmp)); + ASSERT_NOT_NULL(srv); + + /* Overwrite main.go with bytes that are valid CP949 but invalid + * UTF-8 (0xC0 0xD4 0xB7 0xC2 = "입력" in CP949). Keep the function + * signature so HandleRequest still resolves to lines 3–5. */ + char src_path[512]; + snprintf(src_path, sizeof(src_path), "%s/project/main.go", tmp); + FILE *fp = fopen(src_path, "wb"); + ASSERT_NOT_NULL(fp); + const unsigned char source[] = { + 'p', 'a', 'c', 'k', 'a', 'g', 'e', ' ', 'm', 'a', 'i', 'n', '\n', '\n', + 'f', 'u', 'n', 'c', ' ', 'H', 'a', 'n', 'd', 'l', 'e', 'R', 'e', 'q', + 'u', 'e', 's', 't', '(', ')', ' ', 'e', 'r', 'r', 'o', 'r', ' ', '{', + '\n', '\t', '/', '/', ' ', 0xC0, 0xD4, 0xB7, 0xC2, '\n', '\t', 'r', 'e', 't', + 'u', 'r', 'n', ' ', 'n', 'i', 'l', '\n', '}', '\n'}; + ASSERT_EQ(fwrite(source, 1, sizeof(source), fp), sizeof(source)); + ASSERT_EQ(fclose(fp), 0); + + char *raw = + cbm_mcp_handle_tool(srv, "get_code_snippet", + "{\"qualified_name\":\"test-project.cmd.server.main.HandleRequest\"," + "\"project\":\"test-project\"}"); + ASSERT_NOT_NULL(raw); + /* (1) Outer MCP envelope must be valid JSON — this is what the + * stdio JSON-RPC client decodes. Pre-fix, the raw bytes inside + * "source" broke the stream. */ + ASSERT_TRUE(snippet_response_is_valid_json(raw)); + + char *inner = extract_text_content(raw); + ASSERT_NOT_NULL(inner); + /* (2) Embedded snippet JSON must also be strictly parseable. */ + ASSERT_TRUE(snippet_response_is_valid_json(inner)); + /* (3) The raw invalid CP949 sequence must not appear verbatim. */ + ASSERT_NULL(strstr(inner, "\xC0\xD4")); + /* (4) Surrounding valid bytes must be preserved. */ + ASSERT_NOT_NULL(strstr(inner, "HandleRequest")); + ASSERT_NOT_NULL(strstr(inner, "return nil")); + /* (5) Replacement character must mark where invalid bytes were. */ + ASSERT_TRUE(snippet_source_field_has_replacement(inner)); + + free(inner); + free(raw); + cbm_mcp_server_free(srv); + cleanup_snippet_dir(tmp); + PASS(); +} + /* ══════════════════════════════════════════════════════════════════ * JSON-RPC PARSING — EDGE CASES * ══════════════════════════════════════════════════════════════════ */ @@ -2129,5 +2217,6 @@ SUITE(mcp) { RUN_TEST(snippet_auto_resolve_enabled); RUN_TEST(snippet_include_neighbors_default); RUN_TEST(snippet_include_neighbors_enabled); + RUN_TEST(snippet_source_invalid_utf8); RUN_TEST(tool_bad_project_name_no_overflow_issue235); }