From 93e33d71c9a4ac0cfc783a192f1dc79d03613604 Mon Sep 17 00:00:00 2001 From: piiiico Date: Sun, 21 Jun 2026 01:13:00 +0000 Subject: [PATCH 1/3] fix: sanitize non-UTF-8 source in get_code_snippet (#511) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit get_code_snippet reads file bytes via read_file_lines and passes them directly to yyjson without validating the encoding. When a source file uses a non-UTF-8 encoding (CP949, EUC-KR, Shift_JIS, GBK — common in legacy CJK codebases), the emitted JSON contains raw bytes that are invalid UTF-8. Over the MCP stdio JSON-RPC transport this hangs clients indefinitely because the stream decoder stalls on a malformed multi-byte lead byte. Add sanitize_utf8() — a single-pass UTF-8 validator that replaces each invalid byte with U+FFFD (EF BF BD) per Unicode §3.9 (Maximal Subpart substitution). Unlike the existing sanitize_ascii() used in search_code, this preserves valid multi-byte UTF-8 (CJK, emoji, accented Latin, etc.) while only replacing genuinely invalid sequences. Applied in build_snippet_response before JSON serialization so that both MCP and CLI paths emit valid UTF-8. Handles: overlong encodings, truncated sequences, unexpected continuation bytes, surrogates (U+D800..U+DFFF), and codepoints above U+10FFFF. Fixes #511. Signed-off-by: piiiico --- src/mcp/mcp.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c index 8102b1e7..af424f9e 100644 --- a/src/mcp/mcp.c +++ b/src/mcp/mcp.c @@ -2833,6 +2833,82 @@ static char *resolve_snippet_source(const char *root_path, const char *file_path return NULL; } +/* ── UTF-8 sanitization ──────────────────────────────────────── */ + +/* Replace invalid UTF-8 byte sequences with U+FFFD (EF BF BD). + * Valid multi-byte UTF-8 (CJK, emoji, etc.) is preserved. + * Returns a newly allocated string; caller frees. + * Returns NULL only if src is NULL or malloc fails. + * + * Reference: RFC 3629 §3, Unicode §3.9 (U+FFFD Substitution of + * Maximal Subparts). Each invalid byte is individually replaced + * so that surrounding valid bytes are not consumed. */ +static char *sanitize_utf8(const char *src) { + if (!src) { + return NULL; + } + + const unsigned char *s = (const unsigned char *)src; + size_t src_len = strlen(src); + + /* Worst case: every byte is invalid → each becomes 3 bytes (U+FFFD). */ + char *out = malloc(src_len * 3 + 1); + if (!out) { + return NULL; + } + size_t olen = 0; + + for (size_t i = 0; i < src_len;) { + unsigned char c = s[i]; + int seq_len = 0; /* 0 = invalid */ + + if (c <= 0x7F) { + /* ASCII */ + seq_len = 1; + } else if (c >= 0xC2 && c <= 0xDF) { + /* 2-byte: 110xxxxx 10xxxxxx (0xC0/0xC1 are overlong) */ + if (i + 1 < src_len && (s[i + 1] & 0xC0) == 0x80) { + seq_len = 2; + } + } else if ((c & 0xF0) == 0xE0) { + /* 3-byte: 1110xxxx 10xxxxxx 10xxxxxx */ + if (i + 2 < src_len && (s[i + 1] & 0xC0) == 0x80 && + (s[i + 2] & 0xC0) == 0x80) { + /* Reject overlong (E0 + < A0) and surrogates (ED + >= A0). */ + if (!(c == 0xE0 && s[i + 1] < 0xA0) && + !(c == 0xED && s[i + 1] >= 0xA0)) { + seq_len = 3; + } + } + } else if (c >= 0xF0 && c <= 0xF4) { + /* 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + if (i + 3 < src_len && (s[i + 1] & 0xC0) == 0x80 && + (s[i + 2] & 0xC0) == 0x80 && (s[i + 3] & 0xC0) == 0x80) { + /* Reject overlong (F0 + < 90) and > U+10FFFF (F4 + > 8F). */ + if (!(c == 0xF0 && s[i + 1] < 0x90) && + !(c == 0xF4 && s[i + 1] > 0x8F)) { + seq_len = 4; + } + } + } + + if (seq_len > 0) { + memcpy(out + olen, s + i, (size_t)seq_len); + olen += (size_t)seq_len; + i += (size_t)seq_len; + } else { + /* Invalid byte → U+FFFD replacement character */ + out[olen++] = '\xEF'; + out[olen++] = '\xBF'; + out[olen++] = '\xBD'; + i++; + } + } + + out[olen] = '\0'; + return out; +} + /* Build an enriched snippet response for a resolved node. */ /* Add a string array to a JSON object (no-op if count == 0). */ static void add_string_array(yyjson_mut_doc *doc, yyjson_mut_val *obj, const char *key, @@ -2857,6 +2933,15 @@ static char *build_snippet_response(cbm_mcp_server_t *srv, cbm_node_t *node, char *abs_path = NULL; char *source = resolve_snippet_source(root_path, node->file_path, start, end, &abs_path); + /* Guarantee valid UTF-8 for JSON-RPC / MCP stdio transport (#511). + * Non-UTF-8 source files (CP949, EUC-KR, Shift_JIS, GBK, …) would + * otherwise produce invalid JSON that hangs MCP clients. */ + if (source) { + char *safe = sanitize_utf8(source); + free(source); + source = safe; + } + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); yyjson_mut_val *root_obj = yyjson_mut_obj(doc); yyjson_mut_doc_set_root(doc, root_obj); From ba5e8762f0dbd29000cea2768aa60767f0ef3741 Mon Sep 17 00:00:00 2001 From: piiiico Date: Sun, 21 Jun 2026 02:50:45 +0000 Subject: [PATCH 2/3] style: apply clang-format to sanitize_utf8 (CI lint fix) Compact multi-condition if-checks that fit within the 100-char column limit onto single lines, and repack the 4-byte sequence check so three conditions share the first line (95 chars) with the fourth wrapping to the continuation, as clang-format-20 requires. Signed-off-by: piiiico --- src/mcp/mcp.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c index af424f9e..835a5c05 100644 --- a/src/mcp/mcp.c +++ b/src/mcp/mcp.c @@ -2872,21 +2872,18 @@ static char *sanitize_utf8(const char *src) { } } else if ((c & 0xF0) == 0xE0) { /* 3-byte: 1110xxxx 10xxxxxx 10xxxxxx */ - if (i + 2 < src_len && (s[i + 1] & 0xC0) == 0x80 && - (s[i + 2] & 0xC0) == 0x80) { + if (i + 2 < src_len && (s[i + 1] & 0xC0) == 0x80 && (s[i + 2] & 0xC0) == 0x80) { /* Reject overlong (E0 + < A0) and surrogates (ED + >= A0). */ - if (!(c == 0xE0 && s[i + 1] < 0xA0) && - !(c == 0xED && s[i + 1] >= 0xA0)) { + if (!(c == 0xE0 && s[i + 1] < 0xA0) && !(c == 0xED && s[i + 1] >= 0xA0)) { seq_len = 3; } } } else if (c >= 0xF0 && c <= 0xF4) { /* 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ - if (i + 3 < src_len && (s[i + 1] & 0xC0) == 0x80 && - (s[i + 2] & 0xC0) == 0x80 && (s[i + 3] & 0xC0) == 0x80) { + if (i + 3 < src_len && (s[i + 1] & 0xC0) == 0x80 && (s[i + 2] & 0xC0) == 0x80 && + (s[i + 3] & 0xC0) == 0x80) { /* Reject overlong (F0 + < 90) and > U+10FFFF (F4 + > 8F). */ - if (!(c == 0xF0 && s[i + 1] < 0x90) && - !(c == 0xF4 && s[i + 1] > 0x8F)) { + if (!(c == 0xF0 && s[i + 1] < 0x90) && !(c == 0xF4 && s[i + 1] > 0x8F)) { seq_len = 4; } } From c0ae93553f10e690ae13b7cd2b383b955b3b3313 Mon Sep 17 00:00:00 2001 From: piiiico Date: Mon, 22 Jun 2026 21:07:04 +0000 Subject: [PATCH 3/3] fix(mcp): add reproduce-first test and overflow guard for sanitize_utf8 (#511) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per maintainer review on #541, two additions to the snippet UTF-8 sanitizer: 1. **Reproduce-first test** (`snippet_source_invalid_utf8`) Writes a Go source file containing the CP949 sequence 0xC0 0xD4 0xB7 0xC2 inside a comment, then calls `get_code_snippet` and asserts: - the outer MCP envelope parses as strict JSON, - the embedded snippet JSON parses as strict JSON, - the original invalid bytes (0xC0 0xD4) do not appear verbatim, - surrounding valid bytes (HandleRequest, return nil) survive, - U+FFFD (EF BF BD) marks where invalid bytes were. Without `sanitize_utf8`, the response contains raw non-UTF-8 bytes and step 1 fails (yyjson rejects it) — i.e. the test fails on pre-fix `main`. Byte pattern and structure borrowed from #526 (jstar0) so both contributions converge on the same regression. 2. **Allocation overflow guard** `malloc(src_len * 3 + 1)` is technically UB if `src_len > (SIZE_MAX - 1) / 3`. Snippet sources can't reach that in practice, but the guard makes the safety property local to the function rather than relying on call-site bounds. The same bound is enforced in #526. Refs #511. Co-authored-by: King Star <54024410+jstar0@users.noreply.github.com> Signed-off-by: piiiico --- src/mcp/mcp.c | 8 ++++- tests/test_mcp.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 1 deletion(-) diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c index 835a5c05..5c91c62b 100644 --- a/src/mcp/mcp.c +++ b/src/mcp/mcp.c @@ -2851,7 +2851,13 @@ static char *sanitize_utf8(const char *src) { const unsigned char *s = (const unsigned char *)src; size_t src_len = strlen(src); - /* Worst case: every byte is invalid → each becomes 3 bytes (U+FFFD). */ + /* Worst case: every byte is invalid → each becomes 3 bytes (U+FFFD). + * Guard against size_t overflow on src_len * 3 + 1 (UB if + * src_len > (SIZE_MAX - 1) / 3). Practically unreachable for snippet + * sources, but defensive — the same bound is enforced in #526. */ + if (src_len > (SIZE_MAX - 1) / 3) { + return NULL; + } char *out = malloc(src_len * 3 + 1); if (!out) { return NULL; diff --git a/tests/test_mcp.c b/tests/test_mcp.c index 152a700c..b39a283c 100644 --- a/tests/test_mcp.c +++ b/tests/test_mcp.c @@ -11,6 +11,7 @@ #include #include #include +#include /* ══════════════════════════════════════════════════════════════════ * JSON-RPC PARSING @@ -1577,6 +1578,93 @@ TEST(snippet_include_neighbors_enabled) { PASS(); } +/* ── TestSnippet_SourceInvalidUtf8 ───────────────────────────── + * Regression for #511: non-UTF-8 source bytes (e.g. CP949 in a + * legacy CJK Windows codebase) used to emit invalid JSON on the + * MCP stdio channel, hanging clients indefinitely. After the + * sanitize_utf8() fix, the response must: + * 1. parse as strict JSON (envelope + embedded snippet), + * 2. not contain the original invalid byte sequence, + * 3. preserve surrounding valid bytes (function name, body), + * 4. include U+FFFD (EF BF BD) where invalid bytes were. + * + * Test borrowed from #526 (jstar0) — pairs that contributor's + * reproduction with this PR's sanitizer. */ + +static bool snippet_response_is_valid_json(const char *json) { + if (!json) + return false; + yyjson_doc *doc = yyjson_read(json, strlen(json), 0); + if (!doc) + return false; + yyjson_doc_free(doc); + return true; +} + +static bool snippet_source_field_has_replacement(const char *json) { + if (!json) + return false; + yyjson_doc *doc = yyjson_read(json, strlen(json), 0); + if (!doc) + return false; + yyjson_val *root = yyjson_doc_get_root(doc); + yyjson_val *source = yyjson_obj_get(root, "source"); + const char *source_str = yyjson_get_str(source); + bool found = source_str && strstr(source_str, "\xEF\xBF\xBD") != NULL; + yyjson_doc_free(doc); + return found; +} + +TEST(snippet_source_invalid_utf8) { + char tmp[256]; + cbm_mcp_server_t *srv = setup_snippet_server(tmp, sizeof(tmp)); + ASSERT_NOT_NULL(srv); + + /* Overwrite main.go with bytes that are valid CP949 but invalid + * UTF-8 (0xC0 0xD4 0xB7 0xC2 = "입력" in CP949). Keep the function + * signature so HandleRequest still resolves to lines 3–5. */ + char src_path[512]; + snprintf(src_path, sizeof(src_path), "%s/project/main.go", tmp); + FILE *fp = fopen(src_path, "wb"); + ASSERT_NOT_NULL(fp); + const unsigned char source[] = { + 'p', 'a', 'c', 'k', 'a', 'g', 'e', ' ', 'm', 'a', 'i', 'n', '\n', '\n', + 'f', 'u', 'n', 'c', ' ', 'H', 'a', 'n', 'd', 'l', 'e', 'R', 'e', 'q', + 'u', 'e', 's', 't', '(', ')', ' ', 'e', 'r', 'r', 'o', 'r', ' ', '{', + '\n', '\t', '/', '/', ' ', 0xC0, 0xD4, 0xB7, 0xC2, '\n', '\t', 'r', 'e', 't', + 'u', 'r', 'n', ' ', 'n', 'i', 'l', '\n', '}', '\n'}; + ASSERT_EQ(fwrite(source, 1, sizeof(source), fp), sizeof(source)); + ASSERT_EQ(fclose(fp), 0); + + char *raw = + cbm_mcp_handle_tool(srv, "get_code_snippet", + "{\"qualified_name\":\"test-project.cmd.server.main.HandleRequest\"," + "\"project\":\"test-project\"}"); + ASSERT_NOT_NULL(raw); + /* (1) Outer MCP envelope must be valid JSON — this is what the + * stdio JSON-RPC client decodes. Pre-fix, the raw bytes inside + * "source" broke the stream. */ + ASSERT_TRUE(snippet_response_is_valid_json(raw)); + + char *inner = extract_text_content(raw); + ASSERT_NOT_NULL(inner); + /* (2) Embedded snippet JSON must also be strictly parseable. */ + ASSERT_TRUE(snippet_response_is_valid_json(inner)); + /* (3) The raw invalid CP949 sequence must not appear verbatim. */ + ASSERT_NULL(strstr(inner, "\xC0\xD4")); + /* (4) Surrounding valid bytes must be preserved. */ + ASSERT_NOT_NULL(strstr(inner, "HandleRequest")); + ASSERT_NOT_NULL(strstr(inner, "return nil")); + /* (5) Replacement character must mark where invalid bytes were. */ + ASSERT_TRUE(snippet_source_field_has_replacement(inner)); + + free(inner); + free(raw); + cbm_mcp_server_free(srv); + cleanup_snippet_dir(tmp); + PASS(); +} + /* ══════════════════════════════════════════════════════════════════ * JSON-RPC PARSING — EDGE CASES * ══════════════════════════════════════════════════════════════════ */ @@ -2129,5 +2217,6 @@ SUITE(mcp) { RUN_TEST(snippet_auto_resolve_enabled); RUN_TEST(snippet_include_neighbors_default); RUN_TEST(snippet_include_neighbors_enabled); + RUN_TEST(snippet_source_invalid_utf8); RUN_TEST(tool_bad_project_name_no_overflow_issue235); }