diff --git a/datamodel/spec_info.go b/datamodel/spec_info.go index f5200f04..8a2e7e0b 100644 --- a/datamodel/spec_info.go +++ b/datamodel/spec_info.go @@ -10,6 +10,8 @@ import ( "fmt" "strings" "time" + "unicode/utf16" + "unicode/utf8" "github.com/pb33f/libopenapi/utils" "go.yaml.in/yaml/v4" @@ -97,12 +99,11 @@ func extractSpecInfoInternal(spec []byte, bypass bool, skipJSON bool) (*SpecInfo specInfo.NumLines = bytes.Count(spec, []byte{'\n'}) + 1 - // Pre-process JSON to handle \/ escape sequences that YAML parser doesn't recognize. - // JSON (RFC 8259) allows \/ as an optional escape for forward slash, but YAML does not. - // See: https://github.com/pb33f/libopenapi/issues/479 + // Pre-process JSON escapes that YAML parsers do not accept even though + // they are valid JSON, while preserving the existing YAML-node parse path. parseBytes := spec if specInfo.SpecFileType == JSONFileType { - parseBytes = unescapeJSONSlashes(spec) + parseBytes = normalizeJSONForYAMLParser(spec) } err := yaml.Unmarshal(parseBytes, &parsedSpec) @@ -324,39 +325,126 @@ func parseVersionTypeData(d interface{}) (string, int, error) { return string(r), int(r[0]) - '0', nil } -// unescapeJSONSlashes replaces the optional \/ escape sequence in JSON with / -// JSON (RFC 8259) allows \/ as an optional escape for forward slash, but YAML -// parsers (including go.yaml.in/yaml/v4) do not recognize it. -// This handles escaped backslashes correctly: \\/ becomes \/ not // -// Returns the original slice if no transformation is needed (zero allocation). -func unescapeJSONSlashes(jsonBytes []byte) []byte { - // fast path: check if transformation is needed - if !bytes.Contains(jsonBytes, []byte(`\/`)) { +// normalizeJSONForYAMLParser rewrites the small set of JSON escapes accepted by +// RFC 8259 but rejected by go.yaml.in/yaml/v4. It returns the original slice +// without allocation unless a rewrite is required. +func normalizeJSONForYAMLParser(jsonBytes []byte) []byte { + if bytes.IndexByte(jsonBytes, '\\') < 0 { return jsonBytes } - result := make([]byte, 0, len(jsonBytes)) - i := 0 - for i < len(jsonBytes) { - if jsonBytes[i] == '\\' && i+1 < len(jsonBytes) { - switch jsonBytes[i+1] { - case '/': - // \/ -> / (json optional escape for solidus) - result = append(result, '/') - i += 2 - case '\\': - // preserve escaped backslash to prevent \\/ becoming // - result = append(result, '\\', '\\') - i += 2 - default: - // preserve other escape sequences (\n, \t, \", etc.) - result = append(result, jsonBytes[i]) - i++ - } - } else { - result = append(result, jsonBytes[i]) - i++ + var result []byte + var runeBytes [utf8.UTFMax]byte + last := 0 + scan := 0 + + for scan < len(jsonBytes) { + rel := bytes.IndexByte(jsonBytes[scan:], '\\') + if rel < 0 { + break + } + + escape := scan + rel + replacement, consumed, ok := jsonEscapeReplacement(jsonBytes, escape, &runeBytes) + if !ok { + scan = nextJSONEscapeScanOffset(jsonBytes, escape) + continue } + + if result == nil { + result = make([]byte, 0, len(jsonBytes)) + } + result = append(result, jsonBytes[last:escape]...) + result = append(result, replacement...) + scan = escape + consumed + last = scan + } + + if result == nil { + return jsonBytes } + + result = append(result, jsonBytes[last:]...) return result } + +func jsonEscapeReplacement(jsonBytes []byte, escape int, runeBytes *[utf8.UTFMax]byte) ([]byte, int, bool) { + if escape+1 >= len(jsonBytes) { + return nil, 0, false + } + + switch jsonBytes[escape+1] { + case '/': + runeBytes[0] = '/' + return runeBytes[:1], 2, true + case 'u': + if escape+12 > len(jsonBytes) { + return nil, 0, false + } + + high, ok := decodeJSONUnicodeEscape(jsonBytes[escape+2 : escape+6]) + if !ok || !isHighSurrogate(high) { + return nil, 0, false + } + + lowEscape := escape + 6 + if jsonBytes[lowEscape] != '\\' || jsonBytes[lowEscape+1] != 'u' { + return nil, 0, false + } + + low, ok := decodeJSONUnicodeEscape(jsonBytes[lowEscape+2 : lowEscape+6]) + if !ok || !isLowSurrogate(low) { + return nil, 0, false + } + + r := utf16.DecodeRune(rune(high), rune(low)) + n := utf8.EncodeRune(runeBytes[:], r) + return runeBytes[:n], 12, true + default: + return nil, 0, false + } +} + +func nextJSONEscapeScanOffset(jsonBytes []byte, escape int) int { + if escape+1 >= len(jsonBytes) { + return escape + 1 + } + return escape + 2 +} + +func decodeJSONUnicodeEscape(hexBytes []byte) (uint16, bool) { + if len(hexBytes) != 4 { + return 0, false + } + + var value uint16 + for _, b := range hexBytes { + hex, ok := jsonHexValue(b) + if !ok { + return 0, false + } + value = value<<4 | uint16(hex) + } + return value, true +} + +func jsonHexValue(b byte) (byte, bool) { + switch { + case b >= '0' && b <= '9': + return b - '0', true + case b >= 'a' && b <= 'f': + return b - 'a' + 10, true + case b >= 'A' && b <= 'F': + return b - 'A' + 10, true + default: + return 0, false + } +} + +func isHighSurrogate(value uint16) bool { + return value >= 0xD800 && value <= 0xDBFF +} + +func isLowSurrogate(value uint16) bool { + return value >= 0xDC00 && value <= 0xDFFF +} diff --git a/datamodel/spec_info_test.go b/datamodel/spec_info_test.go index 18816804..a2ba2d36 100644 --- a/datamodel/spec_info_test.go +++ b/datamodel/spec_info_test.go @@ -454,9 +454,12 @@ $self: something` assert.Equal(t, "something", r.Self) } -// TestUnescapeJSONSlashes tests the unescapeJSONSlashes helper function -// This addresses issue #479 where JSON files with \/ escape sequences fail to parse -func TestUnescapeJSONSlashes(t *testing.T) { +// TestNormalizeJSONForYAMLParser tests JSON escapes that are valid JSON but +// rejected by the YAML parser used for YAML-node construction. +func TestNormalizeJSONForYAMLParser(t *testing.T) { + thumbsUp := string(rune(0x1F44D)) + rocket := string(rune(0x1F680)) + tests := []struct { name string input string @@ -475,22 +478,61 @@ func TestUnescapeJSONSlashes(t *testing.T) { {"other escapes preserved", `\n\t\/`, `\n\t/`}, {"multiple escaped slashes", `\/one\/two\/three`, `/one/two/three`}, {"mixed content", `{"path":"\/test","url":"https:\/\/example.com"}`, `{"path":"/test","url":"https://example.com"}`}, + {"valid surrogate pair", `\ud83d\udc4d`, thumbsUp}, + {"valid uppercase surrogate pair", `\uD83D\uDC4D`, thumbsUp}, + {"multiple surrogate pairs", `\ud83d\udc4d \ud83d\ude80`, thumbsUp + " " + rocket}, + {"surrogate pair with escaped slash", `https:\/\/example.com\/\ud83d\udc4d`, `https://example.com/` + thumbsUp}, + {"double escaped surrogate pair", `\\ud83d\\udc4d`, `\\ud83d\\udc4d`}, + {"trailing backslash", `\`, `\`}, + {"lone high surrogate", `\ud83d`, `\ud83d`}, + {"high surrogate without low escape", `\ud83dxxxxxx`, `\ud83dxxxxxx`}, + {"high surrogate followed by non-low surrogate", `\ud83d\u0041`, `\ud83d\u0041`}, + {"lone low surrogate", `\udc4d`, `\udc4d`}, + {"invalid high surrogate hex", `\ud83x\udc4d`, `\ud83x\udc4d`}, + {"invalid low surrogate hex", `\ud83d\udc4x`, `\ud83d\udc4x`}, + {"truncated unicode escape", `\u12`, `\u12`}, + {"non surrogate unicode escape", `\u003c`, `\u003c`}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - result := unescapeJSONSlashes([]byte(tt.input)) + result := normalizeJSONForYAMLParser([]byte(tt.input)) assert.Equal(t, tt.expected, string(result)) }) } } -// TestUnescapeJSONSlashes_NoAllocation tests that the fast path returns original slice -func TestUnescapeJSONSlashes_NoAllocation(t *testing.T) { - input := []byte(`{"path":"/test"}`) - result := unescapeJSONSlashes(input) - // Should return same slice when no \/ present - assert.Equal(t, &input[0], &result[0], "Should return original slice when no \\/ present") +func TestDecodeJSONUnicodeEscape(t *testing.T) { + value, ok := decodeJSONUnicodeEscape([]byte("D83D")) + assert.True(t, ok) + assert.Equal(t, uint16(0xD83D), value) + + _, ok = decodeJSONUnicodeEscape([]byte("123")) + assert.False(t, ok) + + _, ok = decodeJSONUnicodeEscape([]byte("12xz")) + assert.False(t, ok) +} + +// TestNormalizeJSONForYAMLParser_NoAllocation tests that unchanged inputs +// return the original slice. +func TestNormalizeJSONForYAMLParser_NoAllocation(t *testing.T) { + tests := []string{ + `{"path":"/test"}`, + `{"text":"line\nquoted\"tab\tunicode\u003c"}`, + `{"text":"\\ud83d\\udc4d"}`, + `{"text":"\ud83d"}`, + `{"text":"\udc4d"}`, + } + + for _, tt := range tests { + t.Run(tt, func(t *testing.T) { + input := []byte(tt) + result := normalizeJSONForYAMLParser(input) + assert.Equal(t, tt, string(result)) + assert.Equal(t, &input[0], &result[0], "Should return original slice when no rewrite is needed") + }) + } } // TestExtractSpecInfo_JSON_EscapedSlashes tests issue #479 @@ -506,6 +548,49 @@ func TestExtractSpecInfo_JSON_EscapedSlashes(t *testing.T) { assert.Equal(t, utils.OpenApi3, r.SpecType) } +func TestExtractSpecInfo_JSON_SurrogatePairInExample(t *testing.T) { + jsonWithSurrogatePair := `{ + "openapi": "3.0.1", + "info": {"title": "r", "version": "1"}, + "paths": { + "/t": { + "post": { + "operationId": "t", + "responses": { + "201": { + "description": "ok", + "content": { + "application/json": { + "schema": {"type": "object", "properties": {"x": {"type": "string"}}}, + "examples": { + "e": {"value": {"x": "Hello \ud83d\udc4d"}} + } + } + } + } + } + } + } + } +}` + + r, e := ExtractSpecInfo([]byte(jsonWithSurrogatePair)) + assert.NoError(t, e) + assert.Equal(t, "3.0.1", r.Version) + assert.Equal(t, JSONFileType, r.SpecFileType) + assert.Equal(t, utils.OpenApi3, r.SpecType) +} + +func TestExtractSpecInfo_JSON_SurrogatePairInDescription(t *testing.T) { + jsonWithSurrogatePair := `{"openapi":"3.0.1","info":{"title":"r","version":"1","description":"Hello \ud83d\udc4d"},"paths":{}}` + + r, e := ExtractSpecInfo([]byte(jsonWithSurrogatePair)) + assert.NoError(t, e) + assert.Equal(t, "3.0.1", r.Version) + assert.Equal(t, JSONFileType, r.SpecFileType) + assert.Equal(t, utils.OpenApi3, r.SpecType) +} + // TestExtractSpecInfo_JSON_EscapedSlashes_URL tests URL paths with escaped slashes func TestExtractSpecInfo_JSON_EscapedSlashes_URL(t *testing.T) { jsonWithURL := `{"openapi":"3.0.0","info":{"title":"Test","version":"1.0.0"},"servers":[{"url":"https:\/\/api.example.com\/v1"}],"paths":{}}` @@ -619,6 +704,48 @@ func TestSpecInfo_Release(t *testing.T) { assert.Equal(t, "3.1.0", s.Version) } +var normalizeJSONForYAMLParserSink []byte + +func BenchmarkNormalizeJSONForYAMLParser_NoEscapes(b *testing.B) { + input := []byte(`{"openapi":"3.0.1","info":{"title":"r","version":"1"},"paths":{}}`) + b.ReportAllocs() + b.SetBytes(int64(len(input))) + + for i := 0; i < b.N; i++ { + normalizeJSONForYAMLParserSink = normalizeJSONForYAMLParser(input) + } +} + +func BenchmarkNormalizeJSONForYAMLParser_CommonEscapesNoRewrite(b *testing.B) { + input := []byte(`{"openapi":"3.0.1","info":{"title":"line\nquoted\"tab\tunicode\u003c","version":"1"},"paths":{}}`) + b.ReportAllocs() + b.SetBytes(int64(len(input))) + + for i := 0; i < b.N; i++ { + normalizeJSONForYAMLParserSink = normalizeJSONForYAMLParser(input) + } +} + +func BenchmarkNormalizeJSONForYAMLParser_EscapedSlashes(b *testing.B) { + input := []byte(`{"openapi":"3.0.1","info":{"title":"r","version":"1"},"servers":[{"url":"https:\/\/api.example.com\/v1"}],"paths":{"\/test":{}}}`) + b.ReportAllocs() + b.SetBytes(int64(len(input))) + + for i := 0; i < b.N; i++ { + normalizeJSONForYAMLParserSink = normalizeJSONForYAMLParser(input) + } +} + +func BenchmarkNormalizeJSONForYAMLParser_SurrogatePair(b *testing.B) { + input := []byte(`{"openapi":"3.0.1","info":{"title":"r","version":"1","description":"Hello \ud83d\udc4d"},"paths":{}}`) + b.ReportAllocs() + b.SetBytes(int64(len(input))) + + for i := 0; i < b.N; i++ { + normalizeJSONForYAMLParserSink = normalizeJSONForYAMLParser(input) + } +} + func TestSpecInfo_Release_Nil(t *testing.T) { var s *SpecInfo s.Release() // must not panic diff --git a/document_test.go b/document_test.go index 7a2cc751..933cfbad 100644 --- a/document_test.go +++ b/document_test.go @@ -643,6 +643,40 @@ func TestDocument_Serialize_JSON_Modified(t *testing.T) { assert.Equal(t, jsonModified, string(serial)) } +func TestNewDocument_JSONSurrogatePairExample(t *testing.T) { + spec := []byte(`{ + "openapi": "3.0.1", + "info": {"title": "r", "version": "1"}, + "paths": { + "/t": { + "post": { + "operationId": "t", + "responses": { + "201": { + "description": "ok", + "content": { + "application/json": { + "schema": {"type": "object", "properties": {"x": {"type": "string"}}}, + "examples": { + "e": {"value": {"x": "Hello \ud83d\udc4d"}} + } + } + } + } + } + } + } + } +}`) + + doc, err := NewDocument(spec) + require.NoError(t, err) + + model, buildErr := doc.BuildV3Model() + require.NoError(t, buildErr) + require.NotNil(t, model) +} + func TestExtractReference(t *testing.T) { data := ` openapi: "3.1"