diff --git a/pyproject.toml b/pyproject.toml index 9d182607d..c0e4bb63e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ readme = "README.md" requires-python = ">=3.12,<3.13" dependencies = [ "chardet~=5.2", + "defusedxml~=0.7", "ffmpeg-python~=0.2", "humanfriendly==10.*", "mutagen~=1.47", diff --git a/src/tagstudio/core/constants.py b/src/tagstudio/core/constants.py index f332c9ad2..b17484a87 100644 --- a/src/tagstudio/core/constants.py +++ b/src/tagstudio/core/constants.py @@ -28,3 +28,10 @@ RESERVED_TAG_END = 999 RESERVED_NAMESPACE_PREFIX = "tagstudio" + +# Default ceilings for attacker-controlled XML reads in the thumbnail renderer +# and the DupeGuru results XML. Values are in MiB. +DEFAULT_DUPE_RESULTS_MAX_MB: int = 128 +DEFAULT_COMIC_INFO_MAX_MB: int = 1 +DEFAULT_MDP_HEADER_MAX_MB: int = 1 +DEFAULT_PDN_HEADER_MAX_MB: int = 16 diff --git a/src/tagstudio/core/library/alchemy/registries/dupe_files_registry.py b/src/tagstudio/core/library/alchemy/registries/dupe_files_registry.py index 95143fe27..c7435fae8 100644 --- a/src/tagstudio/core/library/alchemy/registries/dupe_files_registry.py +++ b/src/tagstudio/core/library/alchemy/registries/dupe_files_registry.py @@ -1,9 +1,10 @@ -import xml.etree.ElementTree as ET from dataclasses import dataclass, field from pathlib import Path +import defusedxml.ElementTree as ET # noqa: N817 import structlog +from tagstudio.core.constants import DEFAULT_DUPE_RESULTS_MAX_MB from tagstudio.core.library.alchemy.library import Library from tagstudio.core.library.alchemy.models import Entry from tagstudio.core.utils.types import unwrap @@ -22,7 +23,11 @@ class DupeFilesRegistry: def groups_count(self) -> int: return len(self.groups) - def refresh_dupe_files(self, results_filepath: str | Path): + def refresh_dupe_files( + self, + results_filepath: str | Path, + max_bytes: int = DEFAULT_DUPE_RESULTS_MAX_MB * 1024 * 1024, + ): """Refresh the list of duplicate files. A duplicate file is defined as an identical or near-identical file as determined @@ -35,6 +40,13 @@ def refresh_dupe_files(self, results_filepath: str | Path): if not results_filepath.is_file(): raise ValueError("invalid file path") + # Guard ET.parse against a malicious or accidental over-large results file. + if results_filepath.stat().st_size > max_bytes: + raise ValueError( + f"dupe results file exceeds {max_bytes} byte limit " + f"(see Advanced settings to raise the cap)" + ) + self.groups.clear() tree = ET.parse(results_filepath) root = tree.getroot() diff --git a/src/tagstudio/qt/global_settings.py b/src/tagstudio/qt/global_settings.py index 052d815e7..7f9652d2a 100644 --- a/src/tagstudio/qt/global_settings.py +++ b/src/tagstudio/qt/global_settings.py @@ -11,6 +11,12 @@ import toml from pydantic import BaseModel, Field +from tagstudio.core.constants import ( + DEFAULT_COMIC_INFO_MAX_MB, + DEFAULT_DUPE_RESULTS_MAX_MB, + DEFAULT_MDP_HEADER_MAX_MB, + DEFAULT_PDN_HEADER_MAX_MB, +) from tagstudio.core.enums import ShowFilepathOption, TagClickActionOption logger = structlog.get_logger(__name__) @@ -77,6 +83,11 @@ class GlobalSettings(BaseModel): hour_format: bool = Field(default=True) zero_padding: bool = Field(default=True) + dupe_results_max_mb: int = Field(default=DEFAULT_DUPE_RESULTS_MAX_MB) + comic_info_max_mb: int = Field(default=DEFAULT_COMIC_INFO_MAX_MB) + mdp_header_max_mb: int = Field(default=DEFAULT_MDP_HEADER_MAX_MB) + pdn_header_max_mb: int = Field(default=DEFAULT_PDN_HEADER_MAX_MB) + loaded_from: Path = Field(default=DEFAULT_GLOBAL_SETTINGS_PATH, exclude=True) @staticmethod diff --git a/src/tagstudio/qt/mixed/fix_dupe_files.py b/src/tagstudio/qt/mixed/fix_dupe_files.py index c215e862b..2819826b9 100644 --- a/src/tagstudio/qt/mixed/fix_dupe_files.py +++ b/src/tagstudio/qt/mixed/fix_dupe_files.py @@ -114,7 +114,10 @@ def set_filename(self, filename: str): self.mirror_modal.refresh_list() def refresh_dupes(self): - self.tracker.refresh_dupe_files(self.filename) + self.tracker.refresh_dupe_files( + self.filename, + max_bytes=self.driver.settings.dupe_results_max_mb * 1024 * 1024, + ) self.set_dupe_count(self.tracker.groups_count) def set_dupe_count(self, count: int): diff --git a/src/tagstudio/qt/mixed/settings_panel.py b/src/tagstudio/qt/mixed/settings_panel.py index 609ed5b05..6586a7562 100644 --- a/src/tagstudio/qt/mixed/settings_panel.py +++ b/src/tagstudio/qt/mixed/settings_panel.py @@ -20,6 +20,12 @@ QWidget, ) +from tagstudio.core.constants import ( + DEFAULT_COMIC_INFO_MAX_MB, + DEFAULT_DUPE_RESULTS_MAX_MB, + DEFAULT_MDP_HEADER_MAX_MB, + DEFAULT_PDN_HEADER_MAX_MB, +) from tagstudio.core.enums import ShowFilepathOption, TagClickActionOption from tagstudio.qt.global_settings import ( DEFAULT_THUMB_CACHE_SIZE, @@ -102,6 +108,9 @@ def __init__(self, driver: "QtDriver"): self.__build_global_settings() self.tab_widget.addTab(self.global_settings_container, Translations["settings.global"]) + self.__build_advanced_settings() + self.tab_widget.addTab(self.advanced_settings_container, Translations["settings.advanced"]) + # self.__build_library_settings() # self.tab_widget.addTab(self.library_settings_container, Translations["settings.library"]) @@ -282,6 +291,39 @@ def __build_library_settings(self): # pyright: ignore[reportUnusedFunction] todo_label = QLabel("TODO") form_layout.addRow(todo_label) + def __build_advanced_settings(self): + self.advanced_settings_container = QWidget() + form_layout = QFormLayout(self.advanced_settings_container) + form_layout.setContentsMargins(6, 6, 6, 6) + + def add_mib_row(label_key: str, current: int) -> QLineEdit: + container = QWidget() + layout = QHBoxLayout(container) + layout.setContentsMargins(0, 0, 0, 0) + layout.setSpacing(6) + edit = QLineEdit() + edit.setAlignment(Qt.AlignmentFlag.AlignRight) + edit.setValidator(QDoubleValidator(1, 1_000_000_000, 0)) + edit.setText(str(current)) + layout.addWidget(edit) + layout.setStretch(1, 2) + layout.addWidget(QLabel("MiB")) + form_layout.addRow(Translations[label_key], container) + return edit + + self.dupe_results_max_mb_edit = add_mib_row( + "settings.dupe_results_max_mb.label", self.driver.settings.dupe_results_max_mb + ) + self.comic_info_max_mb_edit = add_mib_row( + "settings.comic_info_max_mb.label", self.driver.settings.comic_info_max_mb + ) + self.mdp_header_max_mb_edit = add_mib_row( + "settings.mdp_header_max_mb.label", self.driver.settings.mdp_header_max_mb + ) + self.pdn_header_max_mb_edit = add_mib_row( + "settings.pdn_header_max_mb.label", self.driver.settings.pdn_header_max_mb + ) + def __get_language(self) -> str: return list(LANGUAGES.values())[self.language_combobox.currentIndex()] @@ -305,6 +347,18 @@ def get_settings(self) -> dict[str, Any]: # pyright: ignore[reportExplicitAny] "hour_format": self.hourformat_checkbox.isChecked(), "zero_padding": self.zeropadding_checkbox.isChecked(), "splash": self.splash_combobox.currentData(), + "dupe_results_max_mb": max( + int(self.dupe_results_max_mb_edit.text() or DEFAULT_DUPE_RESULTS_MAX_MB), 1 + ), + "comic_info_max_mb": max( + int(self.comic_info_max_mb_edit.text() or DEFAULT_COMIC_INFO_MAX_MB), 1 + ), + "mdp_header_max_mb": max( + int(self.mdp_header_max_mb_edit.text() or DEFAULT_MDP_HEADER_MAX_MB), 1 + ), + "pdn_header_max_mb": max( + int(self.pdn_header_max_mb_edit.text() or DEFAULT_PDN_HEADER_MAX_MB), 1 + ), } def update_settings(self, driver: "QtDriver"): @@ -325,6 +379,10 @@ def update_settings(self, driver: "QtDriver"): driver.settings.hour_format = settings["hour_format"] driver.settings.zero_padding = settings["zero_padding"] driver.settings.splash = settings["splash"] + driver.settings.dupe_results_max_mb = settings["dupe_results_max_mb"] + driver.settings.comic_info_max_mb = settings["comic_info_max_mb"] + driver.settings.mdp_header_max_mb = settings["mdp_header_max_mb"] + driver.settings.pdn_header_max_mb = settings["pdn_header_max_mb"] driver.settings.save() diff --git a/src/tagstudio/qt/previews/renderer.py b/src/tagstudio/qt/previews/renderer.py index 0e870bf3b..372d050e2 100644 --- a/src/tagstudio/qt/previews/renderer.py +++ b/src/tagstudio/qt/previews/renderer.py @@ -11,7 +11,6 @@ import sqlite3 import struct import tarfile -import xml.etree.ElementTree as ET import zipfile import zlib from copy import deepcopy @@ -22,6 +21,7 @@ from xml.etree.ElementTree import Element import cv2 +import defusedxml.ElementTree as ET # noqa: N817 import numpy as np import py7zr import py7zr.io @@ -894,8 +894,7 @@ def _powerpoint_thumb(filepath: Path) -> Image.Image | None: return im - @staticmethod - def _epub_cover(filepath: Path, ext: str) -> Image.Image | None: + def _epub_cover(self, filepath: Path, ext: str) -> Image.Image | None: """Extracts the cover specified by ComicInfo.xml or first image found in the ePub file. Args: @@ -907,9 +906,15 @@ def _epub_cover(filepath: Path, ext: str) -> Image.Image | None: the first image found in the ePub file, or None by default. """ im: Image.Image | None = None + max_bytes = self.driver.settings.comic_info_max_mb * 1024 * 1024 try: with ThumbRenderer.__open_archive(filepath, ext) as archive: if "ComicInfo.xml" in archive.namelist(): + size = ThumbRenderer.__archive_member_size(archive, "ComicInfo.xml") + if size is not None and size > max_bytes: + raise ValueError( + f"ComicInfo.xml exceeds {max_bytes} byte limit" + ) comic_info = ET.fromstring(archive.read("ComicInfo.xml")) im = ThumbRenderer.__cover_from_comic_info(archive, comic_info, "FrontCover") if not im: @@ -990,6 +995,19 @@ def __open_archive(filepath: Path, ext: str) -> _Archive: archiver = _TarFile return archiver(filepath, "r") + @staticmethod + def __archive_member_size(archive: _Archive, name: str) -> int | None: + """Uncompressed size of an archive member. + + Returns None for archive types that already enforce their own per-member + cap (currently _SevenZipFile via BytesIOFactory(limit=...)). + """ + if isinstance(archive, zipfile.ZipFile | rarfile.RarFile): + return archive.getinfo(name).file_size + if isinstance(archive, _TarFile): + return archive.tar.getmember(name).size + return None + @staticmethod def __first_image(archive: _Archive) -> Image.Image | None: """Find and extract the first renderable image in the archive. @@ -1422,8 +1440,7 @@ def _video_thumb(filepath: Path) -> Image.Image | None: logger.error("Couldn't render thumbnail", filepath=filepath, error=type(e).__name__) return im - @staticmethod - def _mdp_thumb(filepath: Path) -> Image.Image | None: + def _mdp_thumb(self, filepath: Path) -> Image.Image | None: """Extract the thumbnail from a .mdp file. Args: @@ -1433,6 +1450,7 @@ def _mdp_thumb(filepath: Path) -> Image.Image | None: Image: The embedded thumbnail. """ im: Image.Image | None = None + max_bytes = self.driver.settings.mdp_header_max_mb * 1024 * 1024 try: with open(filepath, "rb") as f: magic = struct.unpack("<7sx", f.read(8))[0] @@ -1440,6 +1458,11 @@ def _mdp_thumb(filepath: Path) -> Image.Image | None: return im bin_header = struct.unpack(" max_bytes: + raise ValueError( + f".mdp header exceeds {max_bytes} byte limit " + f"(raise mdp_header_max_mb to render this file)" + ) xml_header = ET.fromstring(f.read(bin_header[1])) mdibin_count = len(xml_header.findall("./*Layer")) + 1 for _ in range(mdibin_count): @@ -1464,8 +1487,7 @@ def _mdp_thumb(filepath: Path) -> Image.Image | None: return im - @staticmethod - def _pdn_thumb(filepath: Path) -> Image.Image | None: + def _pdn_thumb(self, filepath: Path) -> Image.Image | None: """Extract the base64-encoded thumbnail from a .pdn file header. Args: @@ -1475,6 +1497,7 @@ def _pdn_thumb(filepath: Path) -> Image.Image | None: Image: the decoded PNG thumbnail or None by default. """ im: Image.Image | None = None + max_bytes = self.driver.settings.pdn_header_max_mb * 1024 * 1024 with open(filepath, "rb") as f: try: # First 4 bytes are the magic number @@ -1483,6 +1506,11 @@ def _pdn_thumb(filepath: Path) -> Image.Image | None: # Header length is a little-endian 24-bit int header_size = struct.unpack(" max_bytes: + raise ValueError( + f".pdn header exceeds {max_bytes} byte limit " + f"(raise pdn_header_max_mb to render this file)" + ) thumb_element = ET.fromstring(f.read(header_size)).find("./*thumb") if thumb_element is None: return im diff --git a/src/tagstudio/resources/translations/en.json b/src/tagstudio/resources/translations/en.json index cdd46dc11..d54771771 100644 --- a/src/tagstudio/resources/translations/en.json +++ b/src/tagstudio/resources/translations/en.json @@ -255,8 +255,13 @@ "select.all": "Select All", "select.clear": "Clear Selection", "select.inverse": "Invert Selection", + "settings.advanced": "Advanced", "settings.clear_thumb_cache.title": "Clear Thumbnail Cache", + "settings.comic_info_max_mb.label": "ComicInfo.xml Max Size", "settings.dateformat.english": "English", + "settings.dupe_results_max_mb.label": "Dupe Results Max Size", + "settings.mdp_header_max_mb.label": "MDIPACK .mdp Header Max Size", + "settings.pdn_header_max_mb.label": "Paint.NET .pdn Header Max Size", "settings.dateformat.international": "International", "settings.dateformat.label": "Date Format", "settings.dateformat.system": "System", diff --git a/tests/macros/test_dupe_files.py b/tests/macros/test_dupe_files.py index e449bd552..319738444 100644 --- a/tests/macros/test_dupe_files.py +++ b/tests/macros/test_dupe_files.py @@ -4,6 +4,8 @@ from pathlib import Path +import pytest + from tagstudio.core.library.alchemy.library import Library from tagstudio.core.library.alchemy.models import Entry from tagstudio.core.library.alchemy.registries.dupe_files_registry import DupeFilesRegistry @@ -42,3 +44,14 @@ def test_refresh_dupe_files(library: Library): Path("foo.txt"), Path("foo/foo.txt"), ] + + +def test_refresh_dupe_files_rejects_oversized(library: Library, tmp_path: Path): + library.library_dir = Path("/tmp/") + registry = DupeFilesRegistry(library=library) + + too_big = tmp_path / "big.dupeguru" + too_big.write_bytes(b"x" * 200) + + with pytest.raises(ValueError, match="exceeds 100 byte limit"): + registry.refresh_dupe_files(too_big, max_bytes=100) diff --git a/tests/qt/test_global_settings.py b/tests/qt/test_global_settings.py index 95d6b5f6a..b45083dd7 100644 --- a/tests/qt/test_global_settings.py +++ b/tests/qt/test_global_settings.py @@ -5,6 +5,12 @@ from pathlib import Path +from tagstudio.core.constants import ( + DEFAULT_COMIC_INFO_MAX_MB, + DEFAULT_DUPE_RESULTS_MAX_MB, + DEFAULT_MDP_HEADER_MAX_MB, + DEFAULT_PDN_HEADER_MAX_MB, +) from tagstudio.qt.global_settings import GlobalSettings, Theme @@ -35,3 +41,31 @@ def test_read_settings(library_dir: Path): assert settings.date_format == "%x" assert settings.hour_format assert settings.zero_padding + + +def test_security_cap_defaults(): + """Default cap values match the canonical constants in core.constants.""" + settings = GlobalSettings() + assert settings.dupe_results_max_mb == DEFAULT_DUPE_RESULTS_MAX_MB + assert settings.comic_info_max_mb == DEFAULT_COMIC_INFO_MAX_MB + assert settings.mdp_header_max_mb == DEFAULT_MDP_HEADER_MAX_MB + assert settings.pdn_header_max_mb == DEFAULT_PDN_HEADER_MAX_MB + + +def test_security_caps_round_trip(tmp_path: Path): + """The four cap fields persist through save -> read_settings.""" + settings_path = tmp_path / "settings.toml" + settings = GlobalSettings( + loaded_from=settings_path, + dupe_results_max_mb=256, + comic_info_max_mb=4, + mdp_header_max_mb=8, + pdn_header_max_mb=32, + ) + settings.save() + + reloaded = GlobalSettings.read_settings(settings_path) + assert reloaded.dupe_results_max_mb == 256 + assert reloaded.comic_info_max_mb == 4 + assert reloaded.mdp_header_max_mb == 8 + assert reloaded.pdn_header_max_mb == 32 diff --git a/tests/qt/test_renderer_caps.py b/tests/qt/test_renderer_caps.py new file mode 100644 index 000000000..23c7efd8e --- /dev/null +++ b/tests/qt/test_renderer_caps.py @@ -0,0 +1,53 @@ +# Copyright (C) 2025 +# Licensed under the GPL-3.0 License. +# Created for TagStudio: https://github.com/CyanVoxel/TagStudio + +"""Tests for the security caps applied to attacker-controlled XML reads.""" + +import struct +from pathlib import Path +from unittest.mock import patch + +from pytestqt.qtbot import QtBot + +from tagstudio.qt.previews.renderer import ThumbRenderer +from tagstudio.qt.ts_qt import QtDriver + + +def test_mdp_thumb_rejects_oversize_header( + qtbot: QtBot, qt_driver: QtDriver, tmp_path: Path +): + """A .mdp claiming a header larger than the cap is rejected before allocation.""" + qt_driver.settings.mdp_header_max_mb = 1 + renderer = ThumbRenderer(qt_driver) + + # Magic ("mdipack" + null) followed by bin_header where bin_header[1] + # is the declared XML header length. 10 MiB > 1 MiB cap. + mdp = tmp_path / "test.mdp" + declared = 10 * 1024 * 1024 + mdp.write_bytes(b"mdipack\x00" + struct.pack(" 1 MiB cap. + pdn = tmp_path / "test.pdn" + declared = 10 * 1024 * 1024 + pdn.write_bytes(b"PDN3" + declared.to_bytes(3, "little")) + + with patch("tagstudio.qt.previews.renderer.ET.fromstring") as parse_spy: + result = renderer._pdn_thumb(pdn) # pyright: ignore[reportPrivateUsage] + + assert result is None + parse_spy.assert_not_called()