From 93e33d71c9a4ac0cfc783a192f1dc79d03613604 Mon Sep 17 00:00:00 2001
From: piiiico <pico@aamdal.dev>
Date: Sun, 21 Jun 2026 01:13:00 +0000
Subject: [PATCH 1/3] fix: sanitize non-UTF-8 source in get_code_snippet (#511)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

get_code_snippet reads file bytes via read_file_lines and passes them
directly to yyjson without validating the encoding. When a source file
uses a non-UTF-8 encoding (CP949, EUC-KR, Shift_JIS, GBK — common in
legacy CJK codebases), the emitted JSON contains raw bytes that are
invalid UTF-8. Over the MCP stdio JSON-RPC transport this hangs clients
indefinitely because the stream decoder stalls on a malformed multi-byte
lead byte.

Add sanitize_utf8() — a single-pass UTF-8 validator that replaces each
invalid byte with U+FFFD (EF BF BD) per Unicode §3.9 (Maximal Subpart
substitution). Unlike the existing sanitize_ascii() used in search_code,
this preserves valid multi-byte UTF-8 (CJK, emoji, accented Latin, etc.)
while only replacing genuinely invalid sequences.

Applied in build_snippet_response before JSON serialization so that both
MCP and CLI paths emit valid UTF-8.

Handles: overlong encodings, truncated sequences, unexpected continuation
bytes, surrogates (U+D800..U+DFFF), and codepoints above U+10FFFF.

Fixes #511.

Signed-off-by: piiiico <pico@aamdal.dev>
---
 src/mcp/mcp.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)

diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c
index 8102b1e7..af424f9e 100644
--- a/src/mcp/mcp.c
+++ b/src/mcp/mcp.c
@@ -2833,6 +2833,82 @@ static char *resolve_snippet_source(const char *root_path, const char *file_path
     return NULL;
 }
 
+/* ── UTF-8 sanitization ──────────────────────────────────────── */
+
+/* Replace invalid UTF-8 byte sequences with U+FFFD (EF BF BD).
+ * Valid multi-byte UTF-8 (CJK, emoji, etc.) is preserved.
+ * Returns a newly allocated string; caller frees.
+ * Returns NULL only if src is NULL or malloc fails.
+ *
+ * Reference: RFC 3629 §3, Unicode §3.9 (U+FFFD Substitution of
+ * Maximal Subparts). Each invalid byte is individually replaced
+ * so that surrounding valid bytes are not consumed. */
+static char *sanitize_utf8(const char *src) {
+    if (!src) {
+        return NULL;
+    }
+
+    const unsigned char *s = (const unsigned char *)src;
+    size_t src_len = strlen(src);
+
+    /* Worst case: every byte is invalid → each becomes 3 bytes (U+FFFD). */
+    char *out = malloc(src_len * 3 + 1);
+    if (!out) {
+        return NULL;
+    }
+    size_t olen = 0;
+
+    for (size_t i = 0; i < src_len;) {
+        unsigned char c = s[i];
+        int seq_len = 0; /* 0 = invalid */
+
+        if (c <= 0x7F) {
+            /* ASCII */
+            seq_len = 1;
+        } else if (c >= 0xC2 && c <= 0xDF) {
+            /* 2-byte: 110xxxxx 10xxxxxx  (0xC0/0xC1 are overlong) */
+            if (i + 1 < src_len && (s[i + 1] & 0xC0) == 0x80) {
+                seq_len = 2;
+            }
+        } else if ((c & 0xF0) == 0xE0) {
+            /* 3-byte: 1110xxxx 10xxxxxx 10xxxxxx */
+            if (i + 2 < src_len && (s[i + 1] & 0xC0) == 0x80 &&
+                (s[i + 2] & 0xC0) == 0x80) {
+                /* Reject overlong (E0 + < A0) and surrogates (ED + >= A0). */
+                if (!(c == 0xE0 && s[i + 1] < 0xA0) &&
+                    !(c == 0xED && s[i + 1] >= 0xA0)) {
+                    seq_len = 3;
+                }
+            }
+        } else if (c >= 0xF0 && c <= 0xF4) {
+            /* 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+            if (i + 3 < src_len && (s[i + 1] & 0xC0) == 0x80 &&
+                (s[i + 2] & 0xC0) == 0x80 && (s[i + 3] & 0xC0) == 0x80) {
+                /* Reject overlong (F0 + < 90) and > U+10FFFF (F4 + > 8F). */
+                if (!(c == 0xF0 && s[i + 1] < 0x90) &&
+                    !(c == 0xF4 && s[i + 1] > 0x8F)) {
+                    seq_len = 4;
+                }
+            }
+        }
+
+        if (seq_len > 0) {
+            memcpy(out + olen, s + i, (size_t)seq_len);
+            olen += (size_t)seq_len;
+            i += (size_t)seq_len;
+        } else {
+            /* Invalid byte → U+FFFD replacement character */
+            out[olen++] = '\xEF';
+            out[olen++] = '\xBF';
+            out[olen++] = '\xBD';
+            i++;
+        }
+    }
+
+    out[olen] = '\0';
+    return out;
+}
+
 /* Build an enriched snippet response for a resolved node. */
 /* Add a string array to a JSON object (no-op if count == 0). */
 static void add_string_array(yyjson_mut_doc *doc, yyjson_mut_val *obj, const char *key,
@@ -2857,6 +2933,15 @@ static char *build_snippet_response(cbm_mcp_server_t *srv, cbm_node_t *node,
     char *abs_path = NULL;
     char *source = resolve_snippet_source(root_path, node->file_path, start, end, &abs_path);
 
+    /* Guarantee valid UTF-8 for JSON-RPC / MCP stdio transport (#511).
+     * Non-UTF-8 source files (CP949, EUC-KR, Shift_JIS, GBK, …) would
+     * otherwise produce invalid JSON that hangs MCP clients. */
+    if (source) {
+        char *safe = sanitize_utf8(source);
+        free(source);
+        source = safe;
+    }
+
     yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL);
     yyjson_mut_val *root_obj = yyjson_mut_obj(doc);
     yyjson_mut_doc_set_root(doc, root_obj);

From ba5e8762f0dbd29000cea2768aa60767f0ef3741 Mon Sep 17 00:00:00 2001
From: piiiico <pico@amdal.dev>
Date: Sun, 21 Jun 2026 02:50:45 +0000
Subject: [PATCH 2/3] style: apply clang-format to sanitize_utf8 (CI lint fix)

Compact multi-condition if-checks that fit within the 100-char column
limit onto single lines, and repack the 4-byte sequence check so three
conditions share the first line (95 chars) with the fourth wrapping to
the continuation, as clang-format-20 requires.

Signed-off-by: piiiico <pico@amdal.dev>
---
 src/mcp/mcp.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c
index af424f9e..835a5c05 100644
--- a/src/mcp/mcp.c
+++ b/src/mcp/mcp.c
@@ -2872,21 +2872,18 @@ static char *sanitize_utf8(const char *src) {
             }
         } else if ((c & 0xF0) == 0xE0) {
             /* 3-byte: 1110xxxx 10xxxxxx 10xxxxxx */
-            if (i + 2 < src_len && (s[i + 1] & 0xC0) == 0x80 &&
-                (s[i + 2] & 0xC0) == 0x80) {
+            if (i + 2 < src_len && (s[i + 1] & 0xC0) == 0x80 && (s[i + 2] & 0xC0) == 0x80) {
                 /* Reject overlong (E0 + < A0) and surrogates (ED + >= A0). */
-                if (!(c == 0xE0 && s[i + 1] < 0xA0) &&
-                    !(c == 0xED && s[i + 1] >= 0xA0)) {
+                if (!(c == 0xE0 && s[i + 1] < 0xA0) && !(c == 0xED && s[i + 1] >= 0xA0)) {
                     seq_len = 3;
                 }
             }
         } else if (c >= 0xF0 && c <= 0xF4) {
             /* 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
-            if (i + 3 < src_len && (s[i + 1] & 0xC0) == 0x80 &&
-                (s[i + 2] & 0xC0) == 0x80 && (s[i + 3] & 0xC0) == 0x80) {
+            if (i + 3 < src_len && (s[i + 1] & 0xC0) == 0x80 && (s[i + 2] & 0xC0) == 0x80 &&
+                (s[i + 3] & 0xC0) == 0x80) {
                 /* Reject overlong (F0 + < 90) and > U+10FFFF (F4 + > 8F). */
-                if (!(c == 0xF0 && s[i + 1] < 0x90) &&
-                    !(c == 0xF4 && s[i + 1] > 0x8F)) {
+                if (!(c == 0xF0 && s[i + 1] < 0x90) && !(c == 0xF4 && s[i + 1] > 0x8F)) {
                     seq_len = 4;
                 }
             }

From c0ae93553f10e690ae13b7cd2b383b955b3b3313 Mon Sep 17 00:00:00 2001
From: piiiico <pico@amdal.dev>
Date: Mon, 22 Jun 2026 21:07:04 +0000
Subject: [PATCH 3/3] fix(mcp): add reproduce-first test and overflow guard for
 sanitize_utf8 (#511)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per maintainer review on #541, two additions to the snippet UTF-8
sanitizer:

1. **Reproduce-first test** (`snippet_source_invalid_utf8`)
   Writes a Go source file containing the CP949 sequence 0xC0 0xD4
   0xB7 0xC2 inside a comment, then calls `get_code_snippet` and
   asserts:
   - the outer MCP envelope parses as strict JSON,
   - the embedded snippet JSON parses as strict JSON,
   - the original invalid bytes (0xC0 0xD4) do not appear verbatim,
   - surrounding valid bytes (HandleRequest, return nil) survive,
   - U+FFFD (EF BF BD) marks where invalid bytes were.

   Without `sanitize_utf8`, the response contains raw non-UTF-8 bytes
   and step 1 fails (yyjson rejects it) — i.e. the test fails on
   pre-fix `main`. Byte pattern and structure borrowed from #526
   (jstar0) so both contributions converge on the same regression.

2. **Allocation overflow guard**
   `malloc(src_len * 3 + 1)` is technically UB if
   `src_len > (SIZE_MAX - 1) / 3`. Snippet sources can't reach that
   in practice, but the guard makes the safety property local to
   the function rather than relying on call-site bounds. The same
   bound is enforced in #526.

Refs #511.

Co-authored-by: King Star <54024410+jstar0@users.noreply.github.com>
Signed-off-by: piiiico <pico@amdal.dev>
---
 src/mcp/mcp.c    |  8 ++++-
 tests/test_mcp.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c
index 835a5c05..5c91c62b 100644
--- a/src/mcp/mcp.c
+++ b/src/mcp/mcp.c
@@ -2851,7 +2851,13 @@ static char *sanitize_utf8(const char *src) {
     const unsigned char *s = (const unsigned char *)src;
     size_t src_len = strlen(src);
 
-    /* Worst case: every byte is invalid → each becomes 3 bytes (U+FFFD). */
+    /* Worst case: every byte is invalid → each becomes 3 bytes (U+FFFD).
+     * Guard against size_t overflow on src_len * 3 + 1 (UB if
+     * src_len > (SIZE_MAX - 1) / 3). Practically unreachable for snippet
+     * sources, but defensive — the same bound is enforced in #526. */
+    if (src_len > (SIZE_MAX - 1) / 3) {
+        return NULL;
+    }
     char *out = malloc(src_len * 3 + 1);
     if (!out) {
         return NULL;
diff --git a/tests/test_mcp.c b/tests/test_mcp.c
index 152a700c..b39a283c 100644
--- a/tests/test_mcp.c
+++ b/tests/test_mcp.c
@@ -11,6 +11,7 @@
 #include <yyjson/yyjson.h>
 #include <string.h>
 #include <stdlib.h>
+#include <stdbool.h>
 
 /* ══════════════════════════════════════════════════════════════════
  *  JSON-RPC PARSING
@@ -1577,6 +1578,93 @@ TEST(snippet_include_neighbors_enabled) {
     PASS();
 }
 
+/* ── TestSnippet_SourceInvalidUtf8 ─────────────────────────────
+ * Regression for #511: non-UTF-8 source bytes (e.g. CP949 in a
+ * legacy CJK Windows codebase) used to emit invalid JSON on the
+ * MCP stdio channel, hanging clients indefinitely. After the
+ * sanitize_utf8() fix, the response must:
+ *   1. parse as strict JSON (envelope + embedded snippet),
+ *   2. not contain the original invalid byte sequence,
+ *   3. preserve surrounding valid bytes (function name, body),
+ *   4. include U+FFFD (EF BF BD) where invalid bytes were.
+ *
+ * Test borrowed from #526 (jstar0) — pairs that contributor's
+ * reproduction with this PR's sanitizer. */
+
+static bool snippet_response_is_valid_json(const char *json) {
+    if (!json)
+        return false;
+    yyjson_doc *doc = yyjson_read(json, strlen(json), 0);
+    if (!doc)
+        return false;
+    yyjson_doc_free(doc);
+    return true;
+}
+
+static bool snippet_source_field_has_replacement(const char *json) {
+    if (!json)
+        return false;
+    yyjson_doc *doc = yyjson_read(json, strlen(json), 0);
+    if (!doc)
+        return false;
+    yyjson_val *root = yyjson_doc_get_root(doc);
+    yyjson_val *source = yyjson_obj_get(root, "source");
+    const char *source_str = yyjson_get_str(source);
+    bool found = source_str && strstr(source_str, "\xEF\xBF\xBD") != NULL;
+    yyjson_doc_free(doc);
+    return found;
+}
+
+TEST(snippet_source_invalid_utf8) {
+    char tmp[256];
+    cbm_mcp_server_t *srv = setup_snippet_server(tmp, sizeof(tmp));
+    ASSERT_NOT_NULL(srv);
+
+    /* Overwrite main.go with bytes that are valid CP949 but invalid
+     * UTF-8 (0xC0 0xD4 0xB7 0xC2 = "입력" in CP949). Keep the function
+     * signature so HandleRequest still resolves to lines 3–5. */
+    char src_path[512];
+    snprintf(src_path, sizeof(src_path), "%s/project/main.go", tmp);
+    FILE *fp = fopen(src_path, "wb");
+    ASSERT_NOT_NULL(fp);
+    const unsigned char source[] = {
+        'p',  'a',  'c', 'k', 'a', 'g',  'e',  ' ',  'm',  'a',  'i',  'n', '\n', '\n',
+        'f',  'u',  'n', 'c', ' ', 'H',  'a',  'n',  'd',  'l',  'e',  'R', 'e',  'q',
+        'u',  'e',  's', 't', '(', ')',  ' ',  'e',  'r',  'r',  'o',  'r', ' ',  '{',
+        '\n', '\t', '/', '/', ' ', 0xC0, 0xD4, 0xB7, 0xC2, '\n', '\t', 'r', 'e',  't',
+        'u',  'r',  'n', ' ', 'n', 'i',  'l',  '\n', '}',  '\n'};
+    ASSERT_EQ(fwrite(source, 1, sizeof(source), fp), sizeof(source));
+    ASSERT_EQ(fclose(fp), 0);
+
+    char *raw =
+        cbm_mcp_handle_tool(srv, "get_code_snippet",
+                            "{\"qualified_name\":\"test-project.cmd.server.main.HandleRequest\","
+                            "\"project\":\"test-project\"}");
+    ASSERT_NOT_NULL(raw);
+    /* (1) Outer MCP envelope must be valid JSON — this is what the
+     * stdio JSON-RPC client decodes. Pre-fix, the raw bytes inside
+     * "source" broke the stream. */
+    ASSERT_TRUE(snippet_response_is_valid_json(raw));
+
+    char *inner = extract_text_content(raw);
+    ASSERT_NOT_NULL(inner);
+    /* (2) Embedded snippet JSON must also be strictly parseable. */
+    ASSERT_TRUE(snippet_response_is_valid_json(inner));
+    /* (3) The raw invalid CP949 sequence must not appear verbatim. */
+    ASSERT_NULL(strstr(inner, "\xC0\xD4"));
+    /* (4) Surrounding valid bytes must be preserved. */
+    ASSERT_NOT_NULL(strstr(inner, "HandleRequest"));
+    ASSERT_NOT_NULL(strstr(inner, "return nil"));
+    /* (5) Replacement character must mark where invalid bytes were. */
+    ASSERT_TRUE(snippet_source_field_has_replacement(inner));
+
+    free(inner);
+    free(raw);
+    cbm_mcp_server_free(srv);
+    cleanup_snippet_dir(tmp);
+    PASS();
+}
+
 /* ══════════════════════════════════════════════════════════════════
  *  JSON-RPC PARSING — EDGE CASES
  * ══════════════════════════════════════════════════════════════════ */
@@ -2129,5 +2217,6 @@ SUITE(mcp) {
     RUN_TEST(snippet_auto_resolve_enabled);
     RUN_TEST(snippet_include_neighbors_default);
     RUN_TEST(snippet_include_neighbors_enabled);
+    RUN_TEST(snippet_source_invalid_utf8);
     RUN_TEST(tool_bad_project_name_no_overflow_issue235);
 }