From ccdb11186cbb10f1d253c58770cb9b2fbf4b0c97 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 23:40:58 +0200 Subject: [PATCH 1/3] HTML API: Preserve XMP raw text serialization --- .../html-api/class-wp-html-processor.php | 1 + .../html-api/wpHtmlProcessor-serialize.php | 25 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 35d91fad3129c..b05da8a80e99d 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1498,6 +1498,7 @@ public function serialize_token(): string { case 'SCRIPT': case 'STYLE': + case 'XMP': break; default: diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php index e516addb6c314..1aa367fe7283a 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php @@ -134,6 +134,30 @@ public function test_style_contents_are_not_escaped() { ); } + /** + * Ensures that XMP contents are not escaped, as they are not parsed like text nodes are. + * + * XMP contents are parsed as raw text: character references are never decoded. + * Escaping the contents would change the document, e.g. a "<" would be replaced + * by the literal text "<" after serializing and re-parsing. + * + * @ticket 65372 + */ + public function test_xmp_contents_are_not_escaped() { + $normalized = WP_HTML_Processor::normalize( "1 < 2 &amp; apples > or\x00anges" ); + + $this->assertSame( + "1 < 2 &amp; apples > or\u{FFFD}anges", + $normalized, + 'Should have preserved text inside an XMP element, except for replacing NULL bytes.' + ); + $this->assertSame( + $normalized, + WP_HTML_Processor::normalize( $normalized ), + 'Normalizing already-normalized XMP should not escape the raw text again.' + ); + } + public function test_unexpected_closing_tags_are_removed() { $this->assertSame( WP_HTML_Processor::normalize( 'onetwothree' ), @@ -281,6 +305,7 @@ public static function data_tokens_with_null_bytes() { 'Foreign content text' => array( "one\x00two", "one\u{FFFD}two" ), 'SCRIPT content' => array( "", "" ), 'STYLE content' => array( "", "" ), + 'XMP content' => array( "a\x00b", "a\u{FFFD}b" ), 'Comment text' => array( "", "" ), ); } From cdc6f6e4867e231579e39278c98a0341e4c307ed Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 16 Jun 2026 19:52:17 +0200 Subject: [PATCH 2/3] Improve tests --- .../tests/html-api/wpHtmlProcessor-serialize.php | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php index c22c7612aead3..e6c5cd7da1497 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php @@ -267,18 +267,13 @@ public function test_style_contents_are_not_escaped() { * @ticket 65372 */ public function test_xmp_contents_are_not_escaped() { - $normalized = WP_HTML_Processor::normalize( "1 < 2 &amp; apples > or\x00anges" ); + $normalized = WP_HTML_Processor::normalize( " < > & \" ' \x00 " ); $this->assertSame( - "1 < 2 &amp; apples > or\u{FFFD}anges", + " < > & \" ' \u{FFFD} ", $normalized, 'Should have preserved text inside an XMP element, except for replacing NULL bytes.' ); - $this->assertSame( - $normalized, - WP_HTML_Processor::normalize( $normalized ), - 'Normalizing already-normalized XMP should not escape the raw text again.' - ); } public function test_unexpected_closing_tags_are_removed() { @@ -654,6 +649,7 @@ public static function data_provider_normalized_fuzzer_cases_that_should_be_idem 'Duplicate ALT boundary' => array( '' ), 'NULL byte in SVG child tag' => array( "" ), 'NULL byte before slash in SVG child tag' => array( "" ), + 'XMP generic raw text' => array( " < > & \" ' \x00 " ), ); } From fa7a94a117610bedd1d477a30544e5fb4a2099fe Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 16 Jun 2026 19:58:29 +0200 Subject: [PATCH 3/3] Adjust comment --- tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php index efc60d10d28e0..d9d7d7c13394a 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php @@ -258,11 +258,8 @@ public function test_style_contents_are_not_escaped() { } /** - * Ensures that XMP contents are not escaped, as they are not parsed like text nodes are. - * - * XMP contents are parsed as raw text: character references are never decoded. - * Escaping the contents would change the document, e.g. a "<" would be replaced - * by the literal text "<" after serializing and re-parsing. + * XMP contents are parsed using the generic raw text element parsing algorithm. + * Their contents should not be escaped with HTML character references on normalization. * * @ticket 65372 */