Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ readme = "README.md"
requires-python = ">=3.12,<3.13"
dependencies = [
"chardet~=5.2",
"defusedxml~=0.7",
"ffmpeg-python~=0.2",
"humanfriendly==10.*",
"mutagen~=1.47",
Expand Down
7 changes: 7 additions & 0 deletions src/tagstudio/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,10 @@
RESERVED_TAG_END = 999

RESERVED_NAMESPACE_PREFIX = "tagstudio"

# Default ceilings for attacker-controlled XML reads in the thumbnail renderer
# and the DupeGuru results XML. Values are in MiB.
DEFAULT_DUPE_RESULTS_MAX_MB: int = 128
DEFAULT_COMIC_INFO_MAX_MB: int = 1
DEFAULT_MDP_HEADER_MAX_MB: int = 1
DEFAULT_PDN_HEADER_MAX_MB: int = 16
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import xml.etree.ElementTree as ET
from dataclasses import dataclass, field
from pathlib import Path

import defusedxml.ElementTree as ET # noqa: N817
import structlog

from tagstudio.core.constants import DEFAULT_DUPE_RESULTS_MAX_MB
from tagstudio.core.library.alchemy.library import Library
from tagstudio.core.library.alchemy.models import Entry
from tagstudio.core.utils.types import unwrap
Expand All @@ -22,7 +23,11 @@ class DupeFilesRegistry:
def groups_count(self) -> int:
return len(self.groups)

def refresh_dupe_files(self, results_filepath: str | Path):
def refresh_dupe_files(
self,
results_filepath: str | Path,
max_bytes: int = DEFAULT_DUPE_RESULTS_MAX_MB * 1024 * 1024,
):
"""Refresh the list of duplicate files.

A duplicate file is defined as an identical or near-identical file as determined
Expand All @@ -35,6 +40,13 @@ def refresh_dupe_files(self, results_filepath: str | Path):
if not results_filepath.is_file():
raise ValueError("invalid file path")

# Guard ET.parse against a malicious or accidental over-large results file.
if results_filepath.stat().st_size > max_bytes:
raise ValueError(
f"dupe results file exceeds {max_bytes} byte limit "
f"(see Advanced settings to raise the cap)"
)

self.groups.clear()
tree = ET.parse(results_filepath)
root = tree.getroot()
Expand Down
11 changes: 11 additions & 0 deletions src/tagstudio/qt/global_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@
import toml
from pydantic import BaseModel, Field

from tagstudio.core.constants import (
DEFAULT_COMIC_INFO_MAX_MB,
DEFAULT_DUPE_RESULTS_MAX_MB,
DEFAULT_MDP_HEADER_MAX_MB,
DEFAULT_PDN_HEADER_MAX_MB,
)
from tagstudio.core.enums import ShowFilepathOption, TagClickActionOption

logger = structlog.get_logger(__name__)
Expand Down Expand Up @@ -77,6 +83,11 @@ class GlobalSettings(BaseModel):
hour_format: bool = Field(default=True)
zero_padding: bool = Field(default=True)

dupe_results_max_mb: int = Field(default=DEFAULT_DUPE_RESULTS_MAX_MB)
comic_info_max_mb: int = Field(default=DEFAULT_COMIC_INFO_MAX_MB)
mdp_header_max_mb: int = Field(default=DEFAULT_MDP_HEADER_MAX_MB)
pdn_header_max_mb: int = Field(default=DEFAULT_PDN_HEADER_MAX_MB)

loaded_from: Path = Field(default=DEFAULT_GLOBAL_SETTINGS_PATH, exclude=True)

@staticmethod
Expand Down
5 changes: 4 additions & 1 deletion src/tagstudio/qt/mixed/fix_dupe_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,10 @@ def set_filename(self, filename: str):
self.mirror_modal.refresh_list()

def refresh_dupes(self):
self.tracker.refresh_dupe_files(self.filename)
self.tracker.refresh_dupe_files(
self.filename,
max_bytes=self.driver.settings.dupe_results_max_mb * 1024 * 1024,
)
self.set_dupe_count(self.tracker.groups_count)

def set_dupe_count(self, count: int):
Expand Down
58 changes: 58 additions & 0 deletions src/tagstudio/qt/mixed/settings_panel.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@
QWidget,
)

from tagstudio.core.constants import (
DEFAULT_COMIC_INFO_MAX_MB,
DEFAULT_DUPE_RESULTS_MAX_MB,
DEFAULT_MDP_HEADER_MAX_MB,
DEFAULT_PDN_HEADER_MAX_MB,
)
from tagstudio.core.enums import ShowFilepathOption, TagClickActionOption
from tagstudio.qt.global_settings import (
DEFAULT_THUMB_CACHE_SIZE,
Expand Down Expand Up @@ -102,6 +108,9 @@ def __init__(self, driver: "QtDriver"):
self.__build_global_settings()
self.tab_widget.addTab(self.global_settings_container, Translations["settings.global"])

self.__build_advanced_settings()
self.tab_widget.addTab(self.advanced_settings_container, Translations["settings.advanced"])

# self.__build_library_settings()
# self.tab_widget.addTab(self.library_settings_container, Translations["settings.library"])

Expand Down Expand Up @@ -282,6 +291,39 @@ def __build_library_settings(self): # pyright: ignore[reportUnusedFunction]
todo_label = QLabel("TODO")
form_layout.addRow(todo_label)

def __build_advanced_settings(self):
self.advanced_settings_container = QWidget()
form_layout = QFormLayout(self.advanced_settings_container)
form_layout.setContentsMargins(6, 6, 6, 6)

def add_mib_row(label_key: str, current: int) -> QLineEdit:
container = QWidget()
layout = QHBoxLayout(container)
layout.setContentsMargins(0, 0, 0, 0)
layout.setSpacing(6)
edit = QLineEdit()
edit.setAlignment(Qt.AlignmentFlag.AlignRight)
edit.setValidator(QDoubleValidator(1, 1_000_000_000, 0))
edit.setText(str(current))
layout.addWidget(edit)
layout.setStretch(1, 2)
layout.addWidget(QLabel("MiB"))
form_layout.addRow(Translations[label_key], container)
return edit

self.dupe_results_max_mb_edit = add_mib_row(
"settings.dupe_results_max_mb.label", self.driver.settings.dupe_results_max_mb
)
self.comic_info_max_mb_edit = add_mib_row(
"settings.comic_info_max_mb.label", self.driver.settings.comic_info_max_mb
)
self.mdp_header_max_mb_edit = add_mib_row(
"settings.mdp_header_max_mb.label", self.driver.settings.mdp_header_max_mb
)
self.pdn_header_max_mb_edit = add_mib_row(
"settings.pdn_header_max_mb.label", self.driver.settings.pdn_header_max_mb
)

def __get_language(self) -> str:
return list(LANGUAGES.values())[self.language_combobox.currentIndex()]

Expand All @@ -305,6 +347,18 @@ def get_settings(self) -> dict[str, Any]: # pyright: ignore[reportExplicitAny]
"hour_format": self.hourformat_checkbox.isChecked(),
"zero_padding": self.zeropadding_checkbox.isChecked(),
"splash": self.splash_combobox.currentData(),
"dupe_results_max_mb": max(
int(self.dupe_results_max_mb_edit.text() or DEFAULT_DUPE_RESULTS_MAX_MB), 1
),
"comic_info_max_mb": max(
int(self.comic_info_max_mb_edit.text() or DEFAULT_COMIC_INFO_MAX_MB), 1
),
"mdp_header_max_mb": max(
int(self.mdp_header_max_mb_edit.text() or DEFAULT_MDP_HEADER_MAX_MB), 1
),
"pdn_header_max_mb": max(
int(self.pdn_header_max_mb_edit.text() or DEFAULT_PDN_HEADER_MAX_MB), 1
),
}

def update_settings(self, driver: "QtDriver"):
Expand All @@ -325,6 +379,10 @@ def update_settings(self, driver: "QtDriver"):
driver.settings.hour_format = settings["hour_format"]
driver.settings.zero_padding = settings["zero_padding"]
driver.settings.splash = settings["splash"]
driver.settings.dupe_results_max_mb = settings["dupe_results_max_mb"]
driver.settings.comic_info_max_mb = settings["comic_info_max_mb"]
driver.settings.mdp_header_max_mb = settings["mdp_header_max_mb"]
driver.settings.pdn_header_max_mb = settings["pdn_header_max_mb"]

driver.settings.save()

Expand Down
42 changes: 35 additions & 7 deletions src/tagstudio/qt/previews/renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import sqlite3
import struct
import tarfile
import xml.etree.ElementTree as ET
import zipfile
import zlib
from copy import deepcopy
Expand All @@ -22,6 +21,7 @@
from xml.etree.ElementTree import Element

import cv2
import defusedxml.ElementTree as ET # noqa: N817
import numpy as np
import py7zr
import py7zr.io
Expand Down Expand Up @@ -894,8 +894,7 @@ def _powerpoint_thumb(filepath: Path) -> Image.Image | None:

return im

@staticmethod
def _epub_cover(filepath: Path, ext: str) -> Image.Image | None:
def _epub_cover(self, filepath: Path, ext: str) -> Image.Image | None:
"""Extracts the cover specified by ComicInfo.xml or first image found in the ePub file.

Args:
Expand All @@ -907,9 +906,15 @@ def _epub_cover(filepath: Path, ext: str) -> Image.Image | None:
the first image found in the ePub file, or None by default.
"""
im: Image.Image | None = None
max_bytes = self.driver.settings.comic_info_max_mb * 1024 * 1024
try:
with ThumbRenderer.__open_archive(filepath, ext) as archive:
if "ComicInfo.xml" in archive.namelist():
size = ThumbRenderer.__archive_member_size(archive, "ComicInfo.xml")
if size is not None and size > max_bytes:
raise ValueError(
f"ComicInfo.xml exceeds {max_bytes} byte limit"
)
comic_info = ET.fromstring(archive.read("ComicInfo.xml"))
im = ThumbRenderer.__cover_from_comic_info(archive, comic_info, "FrontCover")
if not im:
Expand Down Expand Up @@ -990,6 +995,19 @@ def __open_archive(filepath: Path, ext: str) -> _Archive:
archiver = _TarFile
return archiver(filepath, "r")

@staticmethod
def __archive_member_size(archive: _Archive, name: str) -> int | None:
"""Uncompressed size of an archive member.

Returns None for archive types that already enforce their own per-member
cap (currently _SevenZipFile via BytesIOFactory(limit=...)).
"""
if isinstance(archive, zipfile.ZipFile | rarfile.RarFile):
return archive.getinfo(name).file_size
if isinstance(archive, _TarFile):
return archive.tar.getmember(name).size
return None

@staticmethod
def __first_image(archive: _Archive) -> Image.Image | None:
"""Find and extract the first renderable image in the archive.
Expand Down Expand Up @@ -1422,8 +1440,7 @@ def _video_thumb(filepath: Path) -> Image.Image | None:
logger.error("Couldn't render thumbnail", filepath=filepath, error=type(e).__name__)
return im

@staticmethod
def _mdp_thumb(filepath: Path) -> Image.Image | None:
def _mdp_thumb(self, filepath: Path) -> Image.Image | None:
"""Extract the thumbnail from a .mdp file.

Args:
Expand All @@ -1433,13 +1450,19 @@ def _mdp_thumb(filepath: Path) -> Image.Image | None:
Image: The embedded thumbnail.
"""
im: Image.Image | None = None
max_bytes = self.driver.settings.mdp_header_max_mb * 1024 * 1024
try:
with open(filepath, "rb") as f:
magic = struct.unpack("<7sx", f.read(8))[0]
if magic != b"mdipack":
return im

bin_header = struct.unpack("<LLL", f.read(12))
if bin_header[1] > max_bytes:
raise ValueError(
f".mdp header exceeds {max_bytes} byte limit "
f"(raise mdp_header_max_mb to render this file)"
)
xml_header = ET.fromstring(f.read(bin_header[1]))
mdibin_count = len(xml_header.findall("./*Layer")) + 1
for _ in range(mdibin_count):
Expand All @@ -1464,8 +1487,7 @@ def _mdp_thumb(filepath: Path) -> Image.Image | None:

return im

@staticmethod
def _pdn_thumb(filepath: Path) -> Image.Image | None:
def _pdn_thumb(self, filepath: Path) -> Image.Image | None:
"""Extract the base64-encoded thumbnail from a .pdn file header.

Args:
Expand All @@ -1475,6 +1497,7 @@ def _pdn_thumb(filepath: Path) -> Image.Image | None:
Image: the decoded PNG thumbnail or None by default.
"""
im: Image.Image | None = None
max_bytes = self.driver.settings.pdn_header_max_mb * 1024 * 1024
with open(filepath, "rb") as f:
try:
# First 4 bytes are the magic number
Expand All @@ -1483,6 +1506,11 @@ def _pdn_thumb(filepath: Path) -> Image.Image | None:

# Header length is a little-endian 24-bit int
header_size = struct.unpack("<i", f.read(3) + b"\x00")[0]
if header_size > max_bytes:
raise ValueError(
f".pdn header exceeds {max_bytes} byte limit "
f"(raise pdn_header_max_mb to render this file)"
)
thumb_element = ET.fromstring(f.read(header_size)).find("./*thumb")
if thumb_element is None:
return im
Expand Down
5 changes: 5 additions & 0 deletions src/tagstudio/resources/translations/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,13 @@
"select.all": "Select All",
"select.clear": "Clear Selection",
"select.inverse": "Invert Selection",
"settings.advanced": "Advanced",
"settings.clear_thumb_cache.title": "Clear Thumbnail Cache",
"settings.comic_info_max_mb.label": "ComicInfo.xml Max Size",
"settings.dateformat.english": "English",
"settings.dupe_results_max_mb.label": "Dupe Results Max Size",
"settings.mdp_header_max_mb.label": "MDIPACK .mdp Header Max Size",
"settings.pdn_header_max_mb.label": "Paint.NET .pdn Header Max Size",
"settings.dateformat.international": "International",
"settings.dateformat.label": "Date Format",
"settings.dateformat.system": "System",
Expand Down
13 changes: 13 additions & 0 deletions tests/macros/test_dupe_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

from pathlib import Path

import pytest

from tagstudio.core.library.alchemy.library import Library
from tagstudio.core.library.alchemy.models import Entry
from tagstudio.core.library.alchemy.registries.dupe_files_registry import DupeFilesRegistry
Expand Down Expand Up @@ -42,3 +44,14 @@ def test_refresh_dupe_files(library: Library):
Path("foo.txt"),
Path("foo/foo.txt"),
]


def test_refresh_dupe_files_rejects_oversized(library: Library, tmp_path: Path):
library.library_dir = Path("/tmp/")
registry = DupeFilesRegistry(library=library)

too_big = tmp_path / "big.dupeguru"
too_big.write_bytes(b"x" * 200)

with pytest.raises(ValueError, match="exceeds 100 byte limit"):
registry.refresh_dupe_files(too_big, max_bytes=100)
34 changes: 34 additions & 0 deletions tests/qt/test_global_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@

from pathlib import Path

from tagstudio.core.constants import (
DEFAULT_COMIC_INFO_MAX_MB,
DEFAULT_DUPE_RESULTS_MAX_MB,
DEFAULT_MDP_HEADER_MAX_MB,
DEFAULT_PDN_HEADER_MAX_MB,
)
from tagstudio.qt.global_settings import GlobalSettings, Theme


Expand Down Expand Up @@ -35,3 +41,31 @@ def test_read_settings(library_dir: Path):
assert settings.date_format == "%x"
assert settings.hour_format
assert settings.zero_padding


def test_security_cap_defaults():
"""Default cap values match the canonical constants in core.constants."""
settings = GlobalSettings()
assert settings.dupe_results_max_mb == DEFAULT_DUPE_RESULTS_MAX_MB
assert settings.comic_info_max_mb == DEFAULT_COMIC_INFO_MAX_MB
assert settings.mdp_header_max_mb == DEFAULT_MDP_HEADER_MAX_MB
assert settings.pdn_header_max_mb == DEFAULT_PDN_HEADER_MAX_MB


def test_security_caps_round_trip(tmp_path: Path):
"""The four cap fields persist through save -> read_settings."""
settings_path = tmp_path / "settings.toml"
settings = GlobalSettings(
loaded_from=settings_path,
dupe_results_max_mb=256,
comic_info_max_mb=4,
mdp_header_max_mb=8,
pdn_header_max_mb=32,
)
settings.save()

reloaded = GlobalSettings.read_settings(settings_path)
assert reloaded.dupe_results_max_mb == 256
assert reloaded.comic_info_max_mb == 4
assert reloaded.mdp_header_max_mb == 8
assert reloaded.pdn_header_max_mb == 32
Loading