Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions src/mcp/mcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -2833,6 +2833,85 @@ static char *resolve_snippet_source(const char *root_path, const char *file_path
return NULL;
}

/* ── UTF-8 sanitization ──────────────────────────────────────── */

/* Return a freshly allocated copy of `src` in which every byte that does
 * not belong to a well-formed UTF-8 sequence is replaced by U+FFFD
 * (bytes EF BF BD). Well-formed sequences of 1-4 bytes are copied through
 * unchanged, so CJK text, emoji, etc. survive intact.
 *
 * Well-formedness follows RFC 3629 / Unicode Table 3-7: overlong encodings
 * (C0, C1, E0 80..9F, F0 80..8F), UTF-16 surrogates (ED A0..BF) and code
 * points above U+10FFFF (F4 90.., F5..FF) are rejected.
 *
 * Replacement is per byte: when a sequence is rejected, only its first
 * byte is replaced and scanning resumes at the very next byte, so no
 * following valid byte is ever swallowed. A truncated multi-byte sequence
 * therefore yields one U+FFFD for each of its bytes.
 *
 * Returns NULL when `src` is NULL, when the worst-case output size would
 * overflow size_t, or when malloc fails. The caller owns (and frees) the
 * returned string. */
static char *sanitize_utf8(const char *src) {
    static const char replacement[3] = {'\xEF', '\xBF', '\xBD'};

    if (src == NULL) {
        return NULL;
    }

    /* Worst case every input byte expands to the 3-byte replacement; refuse
     * inputs for which in_len * 3 + 1 cannot be represented in size_t. */
    const size_t in_len = strlen(src);
    if (in_len > (SIZE_MAX - 1) / 3) {
        return NULL;
    }

    char *result = malloc(in_len * 3 + 1);
    if (result == NULL) {
        return NULL;
    }

    const unsigned char *cur = (const unsigned char *)src;
    const unsigned char *const end = cur + in_len;
    char *dst = result;

    while (cur < end) {
        const size_t avail = (size_t)(end - cur);
        const unsigned char lead = cur[0];

        /* `need` is the sequence length implied by the lead byte (0 means
         * the byte can never start a sequence). The second byte has a
         * lead-dependent permitted range; later bytes are plain 10xxxxxx. */
        size_t need = 0;
        unsigned char second_min = 0x80;
        unsigned char second_max = 0xBF;

        if (lead < 0x80) {
            need = 1; /* ASCII */
        } else if (lead >= 0xC2 && lead <= 0xDF) {
            need = 2; /* C0/C1 would be overlong, so they stay at need == 0 */
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            need = 3;
            if (lead == 0xE0) {
                second_min = 0xA0; /* below A0 is an overlong encoding */
            } else if (lead == 0xED) {
                second_max = 0x9F; /* A0 and above encodes a surrogate */
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            need = 4;
            if (lead == 0xF0) {
                second_min = 0x90; /* below 90 is an overlong encoding */
            } else if (lead == 0xF4) {
                second_max = 0x8F; /* above 8F exceeds U+10FFFF */
            }
        }

        int well_formed = (need != 0 && need <= avail);
        if (well_formed && need > 1) {
            well_formed = (cur[1] >= second_min && cur[1] <= second_max);
            for (size_t k = 2; well_formed && k < need; k++) {
                well_formed = ((cur[k] & 0xC0) == 0x80);
            }
        }

        if (well_formed) {
            memcpy(dst, cur, need);
            dst += need;
            cur += need;
        } else {
            /* Replace just this one byte and re-examine the next one. */
            memcpy(dst, replacement, sizeof replacement);
            dst += sizeof replacement;
            cur += 1;
        }
    }

    *dst = '\0';
    return result;
}

/* Build an enriched snippet response for a resolved node. */
/* Add a string array to a JSON object (no-op if count == 0). */
static void add_string_array(yyjson_mut_doc *doc, yyjson_mut_val *obj, const char *key,
Expand All @@ -2857,6 +2936,15 @@ static char *build_snippet_response(cbm_mcp_server_t *srv, cbm_node_t *node,
char *abs_path = NULL;
char *source = resolve_snippet_source(root_path, node->file_path, start, end, &abs_path);

/* Guarantee valid UTF-8 for JSON-RPC / MCP stdio transport (#511).
* Non-UTF-8 source files (CP949, EUC-KR, Shift_JIS, GBK, …) would
* otherwise produce invalid JSON that hangs MCP clients. */
if (source) {
char *safe = sanitize_utf8(source);
free(source);
source = safe;
}

yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL);
yyjson_mut_val *root_obj = yyjson_mut_obj(doc);
yyjson_mut_doc_set_root(doc, root_obj);
Expand Down
89 changes: 89 additions & 0 deletions tests/test_mcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <yyjson/yyjson.h>
#include <string.h>
#include <stdlib.h>
#include <stdbool.h>

/* ══════════════════════════════════════════════════════════════════
* JSON-RPC PARSING
Expand Down Expand Up @@ -1577,6 +1578,93 @@ TEST(snippet_include_neighbors_enabled) {
PASS();
}

/* ── TestSnippet_SourceInvalidUtf8 ─────────────────────────────
* Regression for #511: non-UTF-8 source bytes (e.g. CP949 in a
* legacy CJK Windows codebase) used to emit invalid JSON on the
* MCP stdio channel, hanging clients indefinitely. After the
* sanitize_utf8() fix, the response must:
* 1. parse as strict JSON (envelope + embedded snippet),
* 2. not contain the original invalid byte sequence,
* 3. preserve surrounding valid bytes (function name, body),
* 4. include U+FFFD (EF BF BD) where invalid bytes were.
*
* Test borrowed from #526 (jstar0) — pairs that contributor's
* reproduction with this PR's sanitizer. */

/* Report whether `json` parses with yyjson_read() using flags 0.
 * A NULL pointer counts as "not valid". The parsed document is released
 * before returning; only the yes/no answer is kept. */
static bool snippet_response_is_valid_json(const char *json) {
    yyjson_doc *parsed = NULL;

    if (json != NULL) {
        parsed = yyjson_read(json, strlen(json), 0);
    }

    if (parsed == NULL) {
        return false;
    }

    yyjson_doc_free(parsed);
    return true;
}

/* Report whether the top-level "source" string of the JSON document `json`
 * contains U+FFFD, i.e. the UTF-8 bytes EF BF BD.
 * Returns false when `json` is NULL, does not parse, or yields no string
 * value for "source". */
static bool snippet_source_field_has_replacement(const char *json) {
    if (json == NULL) {
        return false;
    }

    yyjson_doc *parsed = yyjson_read(json, strlen(json), 0);
    if (parsed == NULL) {
        return false;
    }

    bool has_replacement = false;
    const char *text = yyjson_get_str(yyjson_obj_get(yyjson_doc_get_root(parsed), "source"));
    if (text != NULL) {
        /* `text` points into `parsed`, so search before freeing the doc. */
        has_replacement = (strstr(text, "\xEF\xBF\xBD") != NULL);
    }

    yyjson_doc_free(parsed);
    return has_replacement;
}

TEST(snippet_source_invalid_utf8) {
    /* Go fixture whose comment holds the bytes C0 D4 B7 C2: valid CP949
     * ("입력") but not valid UTF-8. The function signature is kept so that
     * HandleRequest still resolves to lines 3-5 of the file. */
    static const char go_source[] = "package main\n"
                                    "\n"
                                    "func HandleRequest() error {\n"
                                    "\t// \xC0\xD4\xB7\xC2\n"
                                    "\treturn nil\n"
                                    "}\n";
    const size_t go_source_len = sizeof(go_source) - 1; /* exclude the NUL */

    char tmp[256];
    cbm_mcp_server_t *srv = setup_snippet_server(tmp, sizeof(tmp));
    ASSERT_NOT_NULL(srv);

    /* Overwrite the project's main.go with the fixture, in binary mode so
     * the bytes land on disk exactly as written. */
    char go_path[512];
    snprintf(go_path, sizeof(go_path), "%s/project/main.go", tmp);
    FILE *out = fopen(go_path, "wb");
    ASSERT_NOT_NULL(out);
    ASSERT_EQ(fwrite(go_source, 1, go_source_len, out), go_source_len);
    ASSERT_EQ(fclose(out), 0);

    char *response =
        cbm_mcp_handle_tool(srv, "get_code_snippet",
                            "{\"qualified_name\":\"test-project.cmd.server.main.HandleRequest\","
                            "\"project\":\"test-project\"}");
    ASSERT_NOT_NULL(response);

    /* (1) The outer MCP envelope is what the stdio JSON-RPC client decodes,
     * so it has to parse as JSON. */
    ASSERT_TRUE(snippet_response_is_valid_json(response));

    char *snippet_json = extract_text_content(response);
    ASSERT_NOT_NULL(snippet_json);

    /* (2) The embedded snippet document has to parse as well. */
    ASSERT_TRUE(snippet_response_is_valid_json(snippet_json));
    /* (3) The leading invalid byte pair must not survive verbatim. */
    ASSERT_NULL(strstr(snippet_json, "\xC0\xD4"));
    /* (4) Valid text around the bad bytes must still be present. */
    ASSERT_NOT_NULL(strstr(snippet_json, "HandleRequest"));
    ASSERT_NOT_NULL(strstr(snippet_json, "return nil"));
    /* (5) U+FFFD must appear in the "source" field. */
    ASSERT_TRUE(snippet_source_field_has_replacement(snippet_json));

    free(snippet_json);
    free(response);
    cbm_mcp_server_free(srv);
    cleanup_snippet_dir(tmp);
    PASS();
}

/* ══════════════════════════════════════════════════════════════════
* JSON-RPC PARSING — EDGE CASES
* ══════════════════════════════════════════════════════════════════ */
Expand Down Expand Up @@ -2129,5 +2217,6 @@ SUITE(mcp) {
RUN_TEST(snippet_auto_resolve_enabled);
RUN_TEST(snippet_include_neighbors_default);
RUN_TEST(snippet_include_neighbors_enabled);
RUN_TEST(snippet_source_invalid_utf8);
RUN_TEST(tool_bad_project_name_no_overflow_issue235);
}
Loading