From 153cef615e53185233d01425c7aa401ae221248e Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 15 May 2026 11:49:54 +0200 Subject: [PATCH 1/3] Move a couple of `src/core/` string helper functions into their own file Given that the various utility-files naturally increase in size over time, it shouldn't hurt to shorten `src/core/core_utils.js` a little bit by moving a few of its string helper functions to their own file. --- src/core/annotation.js | 3 +- src/core/core_utils.js | 43 -------------------- src/core/default_appearance.js | 2 +- src/core/editor/pdf_editor.js | 2 +- src/core/string_utils.js | 62 +++++++++++++++++++++++++++++ src/core/struct_tree.js | 3 +- test/unit/clitests.json | 1 + test/unit/core_utils_spec.js | 53 ------------------------- test/unit/jasmine-boot.js | 1 + test/unit/string_utils_spec.js | 72 ++++++++++++++++++++++++++++++++++ 10 files changed, 141 insertions(+), 101 deletions(-) create mode 100644 src/core/string_utils.js create mode 100644 test/unit/string_utils_spec.js diff --git a/src/core/annotation.js b/src/core/annotation.js index b1b922af65625..208d26a6b4ba4 100644 --- a/src/core/annotation.js +++ b/src/core/annotation.js @@ -53,8 +53,6 @@ import { numberToString, RESOURCES_KEYS_OPERATOR_LIST, RESOURCES_KEYS_TEXT_CONTENT, - stringToAsciiOrUTF16BE, - stringToUTF16String, } from "./core_utils.js"; import { createDefaultAppearance, @@ -66,6 +64,7 @@ import { import { DateFormats, TimeFormats } from "../shared/scripting_utils.js"; import { Dict, isName, isRefsEqual, Name, Ref, RefSet } from "./primitives.js"; import { Stream, StringStream } from "./stream.js"; +import { stringToAsciiOrUTF16BE, stringToUTF16String } from "./string_utils.js"; import { BaseStream } from "./base_stream.js"; import { bidi } from "./bidi.js"; import { Catalog } from "./catalog.js"; diff --git a/src/core/core_utils.js b/src/core/core_utils.js index 009cc8c7986f9..e1f6bacfe0bc7 100644 --- a/src/core/core_utils.js +++ b/src/core/core_utils.js @@ -684,45 +684,6 @@ function getNewAnnotationsMap(annotationStorage) { return newAnnotationsByPage.size > 0 ? newAnnotationsByPage : null; } -// If the string is null or undefined then it is returned as is. -function stringToAsciiOrUTF16BE(str) { - if (str === null || str === undefined) { - return str; - } - return isAscii(str) ? str : stringToUTF16String(str, /* bigEndian = */ true); -} - -function isAscii(str) { - if (typeof str !== "string") { - return false; - } - return !str || /^[\x00-\x7F]*$/.test(str); -} - -function stringToUTF16HexString(str) { - const buf = []; - for (let i = 0, ii = str.length; i < ii; i++) { - const char = str.charCodeAt(i); - buf.push(Util.hexNums[(char >> 8) & 0xff], Util.hexNums[char & 0xff]); - } - return buf.join(""); -} - -function stringToUTF16String(str, bigEndian = false) { - const buf = []; - if (bigEndian) { - buf.push("\xFE\xFF"); - } - for (let i = 0, ii = str.length; i < ii; i++) { - const char = str.charCodeAt(i); - buf.push( - String.fromCharCode((char >> 8) & 0xff), - String.fromCharCode(char & 0xff) - ); - } - return buf.join(""); -} - function getModificationDate(date = new Date()) { if (!(date instanceof Date)) { date = new Date(date); @@ -782,7 +743,6 @@ export { getRotationMatrix, getSizeInBytes, IDENTITY_MATRIX, - isAscii, isBooleanArray, isNumberArray, isWhiteSpace, @@ -798,9 +758,6 @@ export { recoverJsURL, RESOURCES_KEYS_OPERATOR_LIST, RESOURCES_KEYS_TEXT_CONTENT, - stringToAsciiOrUTF16BE, - stringToUTF16HexString, - stringToUTF16String, toRomanNumerals, validateCSSFont, validateFontName, diff --git a/src/core/default_appearance.js b/src/core/default_appearance.js index 5756a388c8daa..01114baeb7a2e 100644 --- a/src/core/default_appearance.js +++ b/src/core/default_appearance.js @@ -18,7 +18,6 @@ import { escapePDFName, getRotationMatrix, numberToString, - stringToUTF16HexString, } from "./core_utils.js"; import { Dict, Name } from "./primitives.js"; import { @@ -33,6 +32,7 @@ import { EvaluatorPreprocessor } from "./evaluator.js"; import { LocalColorSpaceCache } from "./image_utils.js"; import { PDFFunctionFactory } from "./function.js"; import { StringStream } from "./stream.js"; +import { stringToUTF16HexString } from "./string_utils.js"; class DefaultAppearanceEvaluator extends EvaluatorPreprocessor { constructor(str) { diff --git a/src/core/editor/pdf_editor.js b/src/core/editor/pdf_editor.js index 4e3b3b0349a8b..0cb9b7de779ac 100644 --- a/src/core/editor/pdf_editor.js +++ b/src/core/editor/pdf_editor.js @@ -25,7 +25,6 @@ import { getInheritableProperty, getModificationDate, getNewAnnotationsMap, - stringToAsciiOrUTF16BE, } from "../core_utils.js"; import { Dict, isName, Name, Ref, RefSet, RefSetCache } from "../primitives.js"; import { incrementalUpdate, writeValue } from "../writer.js"; @@ -34,6 +33,7 @@ import { stringToBytes, stringToPDFString } from "../../shared/util.js"; import { AnnotationFactory } from "../annotation.js"; import { BaseStream } from "../base_stream.js"; import { StringStream } from "../stream.js"; +import { stringToAsciiOrUTF16BE } from "../string_utils.js"; const MAX_LEAVES_PER_PAGES_NODE = 16; const MAX_IN_NAME_TREE_NODE = 64; diff --git a/src/core/string_utils.js b/src/core/string_utils.js new file mode 100644 index 0000000000000..1ef84ec622d85 --- /dev/null +++ b/src/core/string_utils.js @@ -0,0 +1,62 @@ +/* Copyright 2019 Mozilla Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import { Util } from "../shared/util.js"; + +function isAscii(str) { + if (typeof str !== "string") { + return false; + } + return !str || /^[\x00-\x7F]*$/.test(str); +} + +// If the string is null or undefined then it is returned as is. +function stringToAsciiOrUTF16BE(str) { + if (str === null || str === undefined) { + return str; + } + return isAscii(str) ? str : stringToUTF16String(str, /* bigEndian = */ true); +} + +function stringToUTF16HexString(str) { + const buf = []; + for (let i = 0, ii = str.length; i < ii; i++) { + const char = str.charCodeAt(i); + buf.push(Util.hexNums[(char >> 8) & 0xff], Util.hexNums[char & 0xff]); + } + return buf.join(""); +} + +function stringToUTF16String(str, bigEndian = false) { + const buf = []; + if (bigEndian) { + buf.push("\xFE\xFF"); + } + for (let i = 0, ii = str.length; i < ii; i++) { + const char = str.charCodeAt(i); + buf.push( + String.fromCharCode((char >> 8) & 0xff), + String.fromCharCode(char & 0xff) + ); + } + return buf.join(""); +} + +export { + isAscii, + stringToAsciiOrUTF16BE, + stringToUTF16HexString, + stringToUTF16String, +}; diff --git a/src/core/struct_tree.js b/src/core/struct_tree.js index f6e6a842230e5..3fab022474bdc 100644 --- a/src/core/struct_tree.js +++ b/src/core/struct_tree.js @@ -21,9 +21,10 @@ import { warn, } from "../shared/util.js"; import { Dict, isName, Name, Ref, RefSetCache } from "./primitives.js"; -import { lookupNormalRect, stringToAsciiOrUTF16BE } from "./core_utils.js"; import { BaseStream } from "./base_stream.js"; +import { lookupNormalRect } from "./core_utils.js"; import { NumberTree } from "./name_number_tree.js"; +import { stringToAsciiOrUTF16BE } from "./string_utils.js"; const MAX_DEPTH = 40; diff --git a/test/unit/clitests.json b/test/unit/clitests.json index 3ab4dc148fbe5..ce261c24957b2 100644 --- a/test/unit/clitests.json +++ b/test/unit/clitests.json @@ -49,6 +49,7 @@ "postscript_spec.js", "primitives_spec.js", "stream_spec.js", + "string_utils_spec.js", "struct_tree_spec.js", "svg_factory_spec.js", "text_layer_spec.js", diff --git a/test/unit/core_utils_spec.js b/test/unit/core_utils_spec.js index 12f0a4d913743..a9c6bd8f040f8 100644 --- a/test/unit/core_utils_spec.js +++ b/test/unit/core_utils_spec.js @@ -22,13 +22,10 @@ import { getInheritableProperty, getModificationDate, getSizeInBytes, - isAscii, isWhiteSpace, numberToString, parseXFAPath, recoverJsURL, - stringToUTF16HexString, - stringToUTF16String, toRomanNumerals, validateCSSFont, } from "../../src/core/core_utils.js"; @@ -416,56 +413,6 @@ describe("core_utils", function () { }); }); - describe("isAscii", function () { - it("handles ascii/non-ascii strings", function () { - expect(isAscii("hello world")).toEqual(true); - expect(isAscii("こんにちは世界の")).toEqual(false); - expect(isAscii("hello world in Japanese is こんにちは世界の")).toEqual( - false - ); - expect(isAscii("")).toEqual(true); - expect(isAscii(123)).toEqual(false); - expect(isAscii(null)).toEqual(false); - expect(isAscii(undefined)).toEqual(false); - }); - }); - - describe("stringToUTF16HexString", function () { - it("should encode a string in UTF16 hexadecimal format", function () { - expect(stringToUTF16HexString("hello world")).toEqual( - "00680065006c006c006f00200077006f0072006c0064" - ); - - expect(stringToUTF16HexString("こんにちは世界の")).toEqual( - "30533093306b3061306f4e16754c306e" - ); - }); - }); - - describe("stringToUTF16String", function () { - it("should encode a string in UTF16", function () { - expect(stringToUTF16String("hello world")).toEqual( - "\0h\0e\0l\0l\0o\0 \0w\0o\0r\0l\0d" - ); - - expect(stringToUTF16String("こんにちは世界の")).toEqual( - "\x30\x53\x30\x93\x30\x6b\x30\x61\x30\x6f\x4e\x16\x75\x4c\x30\x6e" - ); - }); - - it("should encode a string in UTF16BE with a BOM", function () { - expect( - stringToUTF16String("hello world", /* bigEndian = */ true) - ).toEqual("\xfe\xff\0h\0e\0l\0l\0o\0 \0w\0o\0r\0l\0d"); - - expect( - stringToUTF16String("こんにちは世界の", /* bigEndian = */ true) - ).toEqual( - "\xfe\xff\x30\x53\x30\x93\x30\x6b\x30\x61\x30\x6f\x4e\x16\x75\x4c\x30\x6e" - ); - }); - }); - describe("deepCompare", function () { it("should return true for the same reference", function () { const dict = new Dict(); diff --git a/test/unit/jasmine-boot.js b/test/unit/jasmine-boot.js index 6ffb29784335f..fbbc7730ecff3 100644 --- a/test/unit/jasmine-boot.js +++ b/test/unit/jasmine-boot.js @@ -96,6 +96,7 @@ async function initializePDFJS(callback) { "pdfjs-test/unit/primitives_spec.js", "pdfjs-test/unit/scripting_spec.js", "pdfjs-test/unit/stream_spec.js", + "pdfjs-test/unit/string_utils_spec.js", "pdfjs-test/unit/struct_tree_spec.js", "pdfjs-test/unit/svg_factory_spec.js", "pdfjs-test/unit/text_layer_spec.js", diff --git a/test/unit/string_utils_spec.js b/test/unit/string_utils_spec.js new file mode 100644 index 0000000000000..3b919acb592b4 --- /dev/null +++ b/test/unit/string_utils_spec.js @@ -0,0 +1,72 @@ +/* Copyright 2019 Mozilla Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import { + isAscii, + stringToUTF16HexString, + stringToUTF16String, +} from "../../src/core/string_utils.js"; + +describe("string_utils", function () { + describe("isAscii", function () { + it("handles ascii/non-ascii strings", function () { + expect(isAscii("hello world")).toEqual(true); + expect(isAscii("こんにちは世界の")).toEqual(false); + expect(isAscii("hello world in Japanese is こんにちは世界の")).toEqual( + false + ); + expect(isAscii("")).toEqual(true); + expect(isAscii(123)).toEqual(false); + expect(isAscii(null)).toEqual(false); + expect(isAscii(undefined)).toEqual(false); + }); + }); + + describe("stringToUTF16HexString", function () { + it("should encode a string in UTF16 hexadecimal format", function () { + expect(stringToUTF16HexString("hello world")).toEqual( + "00680065006c006c006f00200077006f0072006c0064" + ); + + expect(stringToUTF16HexString("こんにちは世界の")).toEqual( + "30533093306b3061306f4e16754c306e" + ); + }); + }); + + describe("stringToUTF16String", function () { + it("should encode a string in UTF16", function () { + expect(stringToUTF16String("hello world")).toEqual( + "\0h\0e\0l\0l\0o\0 \0w\0o\0r\0l\0d" + ); + + expect(stringToUTF16String("こんにちは世界の")).toEqual( + "\x30\x53\x30\x93\x30\x6b\x30\x61\x30\x6f\x4e\x16\x75\x4c\x30\x6e" + ); + }); + + it("should encode a string in UTF16BE with a BOM", function () { + expect( + stringToUTF16String("hello world", /* bigEndian = */ true) + ).toEqual("\xfe\xff\0h\0e\0l\0l\0o\0 \0w\0o\0r\0l\0d"); + + expect( + stringToUTF16String("こんにちは世界の", /* bigEndian = */ true) + ).toEqual( + "\xfe\xff\x30\x53\x30\x93\x30\x6b\x30\x61\x30\x6f\x4e\x16\x75\x4c\x30\x6e" + ); + }); + }); +}); From 7a7e7049c179c87903a0d3a15d323da0ffcc6ed4 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 15 May 2026 11:56:33 +0200 Subject: [PATCH 2/3] Shorten the `isAscii` helper function a tiny bit --- src/core/string_utils.js | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/core/string_utils.js b/src/core/string_utils.js index 1ef84ec622d85..4d10e8d3df7e6 100644 --- a/src/core/string_utils.js +++ b/src/core/string_utils.js @@ -16,10 +16,7 @@ import { Util } from "../shared/util.js"; function isAscii(str) { - if (typeof str !== "string") { - return false; - } - return !str || /^[\x00-\x7F]*$/.test(str); + return typeof str === "string" && (!str || /^[\x00-\x7F]*$/.test(str)); } // If the string is null or undefined then it is returned as is. From e5330f06facdf355316d71b484c2ff75fc5a6be3 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 15 May 2026 11:59:50 +0200 Subject: [PATCH 3/3] Move the `stringToPDFString` helper function into the `src/core/string_utils.js` file Given that this function is only ever used during *parsing* of the PDF document, which happens in the worker-thread, this has always added (a little bit of) dead code in the built `pdf.mjs` file. --- src/core/annotation.js | 7 +++- src/core/catalog.js | 2 +- src/core/core_utils.js | 2 +- src/core/document.js | 2 +- src/core/editor/pdf_editor.js | 4 +- src/core/evaluator.js | 2 +- src/core/file_spec.js | 3 +- src/core/string_utils.js | 64 ++++++++++++++++++++++++++++- src/core/struct_tree.js | 3 +- src/core/worker.js | 2 +- src/shared/util.js | 62 ---------------------------- test/unit/string_utils_spec.js | 75 ++++++++++++++++++++++++++++++++++ test/unit/util_spec.js | 75 ---------------------------------- 13 files changed, 153 insertions(+), 150 deletions(-) diff --git a/src/core/annotation.js b/src/core/annotation.js index 208d26a6b4ba4..2befa96e9940f 100644 --- a/src/core/annotation.js +++ b/src/core/annotation.js @@ -33,7 +33,6 @@ import { OPS, RenderingIntentFlag, shadow, - stringToPDFString, unreachable, Util, warn, @@ -64,7 +63,11 @@ import { import { DateFormats, TimeFormats } from "../shared/scripting_utils.js"; import { Dict, isName, isRefsEqual, Name, Ref, RefSet } from "./primitives.js"; import { Stream, StringStream } from "./stream.js"; -import { stringToAsciiOrUTF16BE, stringToUTF16String } from "./string_utils.js"; +import { + stringToAsciiOrUTF16BE, + stringToPDFString, + stringToUTF16String, +} from "./string_utils.js"; import { BaseStream } from "./base_stream.js"; import { bidi } from "./bidi.js"; import { Catalog } from "./catalog.js"; diff --git a/src/core/catalog.js b/src/core/catalog.js index 3709958266605..ee367e8235ae3 100644 --- a/src/core/catalog.js +++ b/src/core/catalog.js @@ -22,7 +22,6 @@ import { objectSize, PermissionFlag, shadow, - stringToPDFString, stringToUTF8String, warn, } from "../shared/util.js"; @@ -53,6 +52,7 @@ import { clearGlobalCaches } from "./cleanup_helper.js"; import { ColorSpaceUtils } from "./colorspace_utils.js"; import { FileSpec } from "./file_spec.js"; import { MetadataParser } from "./metadata_parser.js"; +import { stringToPDFString } from "./string_utils.js"; import { StructTreeRoot } from "./struct_tree.js"; const isRef = v => v instanceof Ref; diff --git a/src/core/core_utils.js b/src/core/core_utils.js index e1f6bacfe0bc7..7397781813a5c 100644 --- a/src/core/core_utils.js +++ b/src/core/core_utils.js @@ -19,12 +19,12 @@ import { BaseException, makeArr, objectSize, - stringToPDFString, Util, warn, } from "../shared/util.js"; import { Dict, isName, isRefsEqual, Name, Ref, RefSet } from "./primitives.js"; import { BaseStream } from "./base_stream.js"; +import { stringToPDFString } from "./string_utils.js"; const PDF_VERSION_REGEXP = /^[1-9]\.\d$/; const MAX_INT_32 = 2 ** 31 - 1; diff --git a/src/core/document.js b/src/core/document.js index e6d61acfb544f..20fdb012fc148 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -26,7 +26,6 @@ import { RenderingIntentFlag, shadow, stringToBytes, - stringToPDFString, stringToUTF8String, unreachable, Util, @@ -76,6 +75,7 @@ import { OperatorList } from "./operator_list.js"; import { PartialEvaluator } from "./evaluator.js"; import { PDFImage } from "./image.js"; import { StreamsSequenceStream } from "./decode_stream.js"; +import { stringToPDFString } from "./string_utils.js"; import { StructTreePage } from "./struct_tree.js"; import { XFAFactory } from "./xfa/factory.js"; import { XRef } from "./xref.js"; diff --git a/src/core/editor/pdf_editor.js b/src/core/editor/pdf_editor.js index 0cb9b7de779ac..9f360b7f20d3b 100644 --- a/src/core/editor/pdf_editor.js +++ b/src/core/editor/pdf_editor.js @@ -29,11 +29,11 @@ import { import { Dict, isName, Name, Ref, RefSet, RefSetCache } from "../primitives.js"; import { incrementalUpdate, writeValue } from "../writer.js"; import { NameTree, NumberTree } from "../name_number_tree.js"; -import { stringToBytes, stringToPDFString } from "../../shared/util.js"; +import { stringToAsciiOrUTF16BE, stringToPDFString } from "../string_utils.js"; import { AnnotationFactory } from "../annotation.js"; import { BaseStream } from "../base_stream.js"; import { StringStream } from "../stream.js"; -import { stringToAsciiOrUTF16BE } from "../string_utils.js"; +import { stringToBytes } from "../../shared/util.js"; const MAX_LEAVES_PER_PAGES_NODE = 16; const MAX_IN_NAME_TREE_NODE = 64; diff --git a/src/core/evaluator.js b/src/core/evaluator.js index efbbcf96915b8..e1badcec3670b 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -26,7 +26,6 @@ import { normalizeUnicode, OPS, shadow, - stringToPDFString, TextRenderingMode, Util, warn, @@ -90,6 +89,7 @@ import { getUnicodeForGlyph } from "./unicode.js"; import { MurmurHash3_64 } from "../shared/murmurhash3.js"; import { PDFImage } from "./image.js"; import { Stream } from "./stream.js"; +import { stringToPDFString } from "./string_utils.js"; const DefaultPartialEvaluatorOptions = Object.freeze({ maxImageSize: -1, diff --git a/src/core/file_spec.js b/src/core/file_spec.js index 890f347490040..06e9826f48d7b 100644 --- a/src/core/file_spec.js +++ b/src/core/file_spec.js @@ -13,9 +13,10 @@ * limitations under the License. */ -import { stringToPDFString, stripPath, warn } from "../shared/util.js"; +import { stripPath, warn } from "../shared/util.js"; import { BaseStream } from "./base_stream.js"; import { Dict } from "./primitives.js"; +import { stringToPDFString } from "./string_utils.js"; function pickPlatformItem(dict) { if (dict instanceof Dict) { diff --git a/src/core/string_utils.js b/src/core/string_utils.js index 4d10e8d3df7e6..a34fdf911531d 100644 --- a/src/core/string_utils.js +++ b/src/core/string_utils.js @@ -13,7 +13,7 @@ * limitations under the License. */ -import { Util } from "../shared/util.js"; +import { stringToBytes, Util, warn } from "../shared/util.js"; function isAscii(str) { return typeof str === "string" && (!str || /^[\x00-\x7F]*$/.test(str)); @@ -51,9 +51,71 @@ function stringToUTF16String(str, bigEndian = false) { return buf.join(""); } +const PDFStringTranslateTable = [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2d8, + 0x2c7, 0x2c6, 0x2d9, 0x2dd, 0x2db, 0x2da, 0x2dc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x192, + 0x2044, 0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018, + 0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x141, 0x152, 0x160, 0x178, 0x17d, + 0x131, 0x142, 0x153, 0x161, 0x17e, 0, 0x20ac, +]; + +function stringToPDFString(str, keepEscapeSequence = false) { + // See section 7.9.2.2 Text String Type. + // The string can contain some language codes bracketed with 0x1b, + // so we must remove them. + if (str[0] >= "\xEF") { + let encoding; + if (str[0] === "\xFE" && str[1] === "\xFF") { + encoding = "utf-16be"; + if (str.length % 2 === 1) { + str = str.slice(0, -1); + } + } else if (str[0] === "\xFF" && str[1] === "\xFE") { + encoding = "utf-16le"; + if (str.length % 2 === 1) { + str = str.slice(0, -1); + } + } else if (str[0] === "\xEF" && str[1] === "\xBB" && str[2] === "\xBF") { + encoding = "utf-8"; + } + + if (encoding) { + try { + const decoder = new TextDecoder(encoding, { fatal: true }); + const buffer = stringToBytes(str); + const decoded = decoder.decode(buffer); + if (keepEscapeSequence || !decoded.includes("\x1b")) { + return decoded; + } + return decoded.replaceAll(/\x1b[^\x1b]*(?:\x1b|$)/g, ""); + } catch (ex) { + warn(`stringToPDFString: "${ex}".`); + } + } + } + // ISO Latin 1 + const strBuf = []; + for (let i = 0, ii = str.length; i < ii; i++) { + const charCode = str.charCodeAt(i); + if (!keepEscapeSequence && charCode === 0x1b) { + // eslint-disable-next-line no-empty + while (++i < ii && str.charCodeAt(i) !== 0x1b) {} + continue; + } + const code = PDFStringTranslateTable[charCode]; + strBuf.push(code ? String.fromCharCode(code) : str.charAt(i)); + } + return strBuf.join(""); +} + export { isAscii, stringToAsciiOrUTF16BE, + stringToPDFString, stringToUTF16HexString, stringToUTF16String, }; diff --git a/src/core/struct_tree.js b/src/core/struct_tree.js index 3fab022474bdc..af57f0a5ddcca 100644 --- a/src/core/struct_tree.js +++ b/src/core/struct_tree.js @@ -16,15 +16,14 @@ import { AnnotationPrefix, makeArr, - stringToPDFString, stringToUTF8String, warn, } from "../shared/util.js"; import { Dict, isName, Name, Ref, RefSetCache } from "./primitives.js"; +import { stringToAsciiOrUTF16BE, stringToPDFString } from "./string_utils.js"; import { BaseStream } from "./base_stream.js"; import { lookupNormalRect } from "./core_utils.js"; import { NumberTree } from "./name_number_tree.js"; -import { stringToAsciiOrUTF16BE } from "./string_utils.js"; const MAX_DEPTH = 40; diff --git a/src/core/worker.js b/src/core/worker.js index e089f47d193a3..1bee15c5fcae1 100644 --- a/src/core/worker.js +++ b/src/core/worker.js @@ -21,7 +21,6 @@ import { isNodeJS, PasswordException, setVerbosityLevel, - stringToPDFString, VerbosityLevel, warn, } from "../shared/util.js"; @@ -38,6 +37,7 @@ import { clearGlobalCaches } from "./cleanup_helper.js"; import { incrementalUpdate } from "./writer.js"; import { PDFEditor } from "./editor/pdf_editor.js"; import { PDFWorkerStream } from "./worker_stream.js"; +import { stringToPDFString } from "./string_utils.js"; import { StructTreeRoot } from "./struct_tree.js"; class WorkerTask { diff --git a/src/shared/util.js b/src/shared/util.js index 064c2bc79e210..b75301b24c7b0 100644 --- a/src/shared/util.js +++ b/src/shared/util.js @@ -1061,67 +1061,6 @@ class Util { } } -const PDFStringTranslateTable = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2d8, - 0x2c7, 0x2c6, 0x2d9, 0x2dd, 0x2db, 0x2da, 0x2dc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x192, - 0x2044, 0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018, - 0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x141, 0x152, 0x160, 0x178, 0x17d, - 0x131, 0x142, 0x153, 0x161, 0x17e, 0, 0x20ac, -]; - -function stringToPDFString(str, keepEscapeSequence = false) { - // See section 7.9.2.2 Text String Type. - // The string can contain some language codes bracketed with 0x1b, - // so we must remove them. - if (str[0] >= "\xEF") { - let encoding; - if (str[0] === "\xFE" && str[1] === "\xFF") { - encoding = "utf-16be"; - if (str.length % 2 === 1) { - str = str.slice(0, -1); - } - } else if (str[0] === "\xFF" && str[1] === "\xFE") { - encoding = "utf-16le"; - if (str.length % 2 === 1) { - str = str.slice(0, -1); - } - } else if (str[0] === "\xEF" && str[1] === "\xBB" && str[2] === "\xBF") { - encoding = "utf-8"; - } - - if (encoding) { - try { - const decoder = new TextDecoder(encoding, { fatal: true }); - const buffer = stringToBytes(str); - const decoded = decoder.decode(buffer); - if (keepEscapeSequence || !decoded.includes("\x1b")) { - return decoded; - } - return decoded.replaceAll(/\x1b[^\x1b]*(?:\x1b|$)/g, ""); - } catch (ex) { - warn(`stringToPDFString: "${ex}".`); - } - } - } - // ISO Latin 1 - const strBuf = []; - for (let i = 0, ii = str.length; i < ii; i++) { - const charCode = str.charCodeAt(i); - if (!keepEscapeSequence && charCode === 0x1b) { - // eslint-disable-next-line no-empty - while (++i < ii && str.charCodeAt(i) !== 0x1b) {} - continue; - } - const code = PDFStringTranslateTable[charCode]; - strBuf.push(code ? String.fromCharCode(code) : str.charAt(i)); - } - return strBuf.join(""); -} - function stringToUTF8String(str) { return decodeURIComponent(escape(str)); } @@ -1300,7 +1239,6 @@ export { setVerbosityLevel, shadow, stringToBytes, - stringToPDFString, stringToUTF8String, stripPath, TextRenderingMode, diff --git a/test/unit/string_utils_spec.js b/test/unit/string_utils_spec.js index 3b919acb592b4..7ef64b2764fc6 100644 --- a/test/unit/string_utils_spec.js +++ b/test/unit/string_utils_spec.js @@ -15,6 +15,7 @@ import { isAscii, + stringToPDFString, stringToUTF16HexString, stringToUTF16String, } from "../../src/core/string_utils.js"; @@ -69,4 +70,78 @@ describe("string_utils", function () { ); }); }); + + describe("stringToPDFString", function () { + it("handles ISO Latin 1 strings", function () { + const str = "\x8Dstring\x8E"; + expect(stringToPDFString(str)).toEqual("\u201Cstring\u201D"); + }); + + it("handles UTF-16 big-endian strings", function () { + const str = "\xFE\xFF\x00\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00\x67"; + expect(stringToPDFString(str)).toEqual("string"); + }); + + it("handles incomplete UTF-16 big-endian strings", function () { + const str = "\xFE\xFF\x00\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00"; + expect(stringToPDFString(str)).toEqual("strin"); + }); + + it("handles UTF-16 little-endian strings", function () { + const str = "\xFF\xFE\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00\x67\x00"; + expect(stringToPDFString(str)).toEqual("string"); + }); + + it("handles incomplete UTF-16 little-endian strings", function () { + const str = "\xFF\xFE\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00\x67"; + expect(stringToPDFString(str)).toEqual("strin"); + }); + + it("handles UTF-8 strings", function () { + const simpleStr = "\xEF\xBB\xBF\x73\x74\x72\x69\x6E\x67"; + expect(stringToPDFString(simpleStr)).toEqual("string"); + + const complexStr = + "\xEF\xBB\xBF\xE8\xA1\xA8\xE3\x83\x9D\xE3\x81\x82\x41\xE9\xB7\x97" + + "\xC5\x92\xC3\xA9\xEF\xBC\xA2\xE9\x80\x8D\xC3\x9C\xC3\x9F\xC2\xAA" + + "\xC4\x85\xC3\xB1\xE4\xB8\x82\xE3\x90\x80\xF0\xA0\x80\x80"; + expect(stringToPDFString(complexStr)).toEqual( + "表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀" + ); + }); + + it("handles empty strings", function () { + // ISO Latin 1 + const str1 = ""; + expect(stringToPDFString(str1)).toEqual(""); + + // UTF-16BE + const str2 = "\xFE\xFF"; + expect(stringToPDFString(str2)).toEqual(""); + + // UTF-16LE + const str3 = "\xFF\xFE"; + expect(stringToPDFString(str3)).toEqual(""); + + // UTF-8 + const str4 = "\xEF\xBB\xBF"; + expect(stringToPDFString(str4)).toEqual(""); + }); + + it("handles strings with language code", function () { + // ISO Latin 1 + const str1 = "hello \x1benUS\x1bworld"; + expect(stringToPDFString(str1)).toEqual("hello world"); + + // UTF-16BE + const str2 = + "\xFE\xFF\x00h\x00e\x00l\x00l\x00o\x00 \x00\x1b\x00e\x00n\x00U\x00S\x00\x1b\x00w\x00o\x00r\x00l\x00d"; + expect(stringToPDFString(str2)).toEqual("hello world"); + + // UTF-16LE + const str3 = + "\xFF\xFEh\x00e\x00l\x00l\x00o\x00 \x00\x1b\x00e\x00n\x00U\x00S\x00\x1b\x00w\x00o\x00r\x00l\x00d\x00"; + expect(stringToPDFString(str3)).toEqual("hello world"); + }); + }); }); diff --git a/test/unit/util_spec.js b/test/unit/util_spec.js index a06ffb5c68797..994bcb588246e 100644 --- a/test/unit/util_spec.js +++ b/test/unit/util_spec.js @@ -19,7 +19,6 @@ import { createValidAbsoluteUrl, getUuid, stringToBytes, - stringToPDFString, } from "../../src/shared/util.js"; describe("util", function () { @@ -83,80 +82,6 @@ describe("util", function () { }); }); - describe("stringToPDFString", function () { - it("handles ISO Latin 1 strings", function () { - const str = "\x8Dstring\x8E"; - expect(stringToPDFString(str)).toEqual("\u201Cstring\u201D"); - }); - - it("handles UTF-16 big-endian strings", function () { - const str = "\xFE\xFF\x00\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00\x67"; - expect(stringToPDFString(str)).toEqual("string"); - }); - - it("handles incomplete UTF-16 big-endian strings", function () { - const str = "\xFE\xFF\x00\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00"; - expect(stringToPDFString(str)).toEqual("strin"); - }); - - it("handles UTF-16 little-endian strings", function () { - const str = "\xFF\xFE\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00\x67\x00"; - expect(stringToPDFString(str)).toEqual("string"); - }); - - it("handles incomplete UTF-16 little-endian strings", function () { - const str = "\xFF\xFE\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00\x67"; - expect(stringToPDFString(str)).toEqual("strin"); - }); - - it("handles UTF-8 strings", function () { - const simpleStr = "\xEF\xBB\xBF\x73\x74\x72\x69\x6E\x67"; - expect(stringToPDFString(simpleStr)).toEqual("string"); - - const complexStr = - "\xEF\xBB\xBF\xE8\xA1\xA8\xE3\x83\x9D\xE3\x81\x82\x41\xE9\xB7\x97" + - "\xC5\x92\xC3\xA9\xEF\xBC\xA2\xE9\x80\x8D\xC3\x9C\xC3\x9F\xC2\xAA" + - "\xC4\x85\xC3\xB1\xE4\xB8\x82\xE3\x90\x80\xF0\xA0\x80\x80"; - expect(stringToPDFString(complexStr)).toEqual( - "表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀" - ); - }); - - it("handles empty strings", function () { - // ISO Latin 1 - const str1 = ""; - expect(stringToPDFString(str1)).toEqual(""); - - // UTF-16BE - const str2 = "\xFE\xFF"; - expect(stringToPDFString(str2)).toEqual(""); - - // UTF-16LE - const str3 = "\xFF\xFE"; - expect(stringToPDFString(str3)).toEqual(""); - - // UTF-8 - const str4 = "\xEF\xBB\xBF"; - expect(stringToPDFString(str4)).toEqual(""); - }); - - it("handles strings with language code", function () { - // ISO Latin 1 - const str1 = "hello \x1benUS\x1bworld"; - expect(stringToPDFString(str1)).toEqual("hello world"); - - // UTF-16BE - const str2 = - "\xFE\xFF\x00h\x00e\x00l\x00l\x00o\x00 \x00\x1b\x00e\x00n\x00U\x00S\x00\x1b\x00w\x00o\x00r\x00l\x00d"; - expect(stringToPDFString(str2)).toEqual("hello world"); - - // UTF-16LE - const str3 = - "\xFF\xFEh\x00e\x00l\x00l\x00o\x00 \x00\x1b\x00e\x00n\x00U\x00S\x00\x1b\x00w\x00o\x00r\x00l\x00d\x00"; - expect(stringToPDFString(str3)).toEqual("hello world"); - }); - }); - describe("createValidAbsoluteUrl", function () { it("handles invalid URLs", function () { expect(createValidAbsoluteUrl(undefined, undefined)).toEqual(null);