diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index 338e7d06..d991f123 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -19,6 +19,27 @@ from ._geotags import GeoInfo, GeoTransform, extract_geo_info from ._header import IFD, TIFFHeader, parse_all_ifds, parse_header +# --------------------------------------------------------------------------- +# Allocation guard: reject TIFF dimensions that would exhaust memory +# --------------------------------------------------------------------------- + +#: Default maximum total pixel count (width * height * samples). +#: ~1 billion pixels, which is ~4 GB for float32 single-band. +#: Override per-call via the ``max_pixels`` keyword argument. +MAX_PIXELS_DEFAULT = 1_000_000_000 + + +def _check_dimensions(width, height, samples, max_pixels): + """Raise ValueError if the requested allocation exceeds *max_pixels*.""" + total = width * height * samples + if total > max_pixels: + raise ValueError( + f"TIFF image dimensions ({width} x {height} x {samples} = " + f"{total:,} pixels) exceed the safety limit of " + f"{max_pixels:,} pixels. Pass a larger max_pixels value to " + f"read_to_array() if this file is legitimate." + ) + # --------------------------------------------------------------------------- # Data source abstraction @@ -292,7 +313,8 @@ def _decode_strip_or_tile(data_slice, compression, width, height, samples, # --------------------------------------------------------------------------- def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader, - dtype: np.dtype, window=None) -> np.ndarray: + dtype: np.dtype, window=None, + max_pixels: int = MAX_PIXELS_DEFAULT) -> np.ndarray: """Read a strip-organized TIFF image. Parameters @@ -307,6 +329,8 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader, Output pixel dtype. window : tuple or None (row_start, col_start, row_stop, col_stop) or None for full image. + max_pixels : int + Maximum allowed pixel count (width * height * samples). Returns ------- @@ -344,6 +368,8 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader, out_h = r1 - r0 out_w = c1 - c0 + _check_dimensions(out_w, out_h, samples, max_pixels) + if samples > 1: result = np.empty((out_h, out_w, samples), dtype=dtype) else: @@ -408,7 +434,8 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader, # --------------------------------------------------------------------------- def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader, - dtype: np.dtype, window=None) -> np.ndarray: + dtype: np.dtype, window=None, + max_pixels: int = MAX_PIXELS_DEFAULT) -> np.ndarray: """Read a tile-organized TIFF image. Parameters @@ -423,6 +450,8 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader, Output pixel dtype. window : tuple or None (row_start, col_start, row_stop, col_stop) or None for full image. + max_pixels : int + Maximum allowed pixel count (width * height * samples). Returns ------- @@ -462,6 +491,8 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader, out_h = r1 - r0 out_w = c1 - c0 + _check_dimensions(out_w, out_h, samples, max_pixels) + _alloc = np.zeros if window is not None else np.empty if samples > 1: result = _alloc((out_h, out_w, samples), dtype=dtype) @@ -545,7 +576,9 @@ def _decode_one(job): # --------------------------------------------------------------------------- def _read_cog_http(url: str, overview_level: int | None = None, - band: int | None = None) -> tuple[np.ndarray, GeoInfo]: + band: int | None = None, + max_pixels: int = MAX_PIXELS_DEFAULT, + ) -> tuple[np.ndarray, GeoInfo]: """Read a COG via HTTP range requests. Parameters @@ -556,6 +589,8 @@ def _read_cog_http(url: str, overview_level: int | None = None, Which overview to read (0 = full res, 1 = first overview, etc.). band : int Band index (0-based, for multi-band files). + max_pixels : int + Maximum allowed pixel count (width * height * samples). Returns ------- @@ -613,6 +648,8 @@ def _read_cog_http(url: str, overview_level: int | None = None, tiles_across = math.ceil(width / tw) tiles_down = math.ceil(height / th) + _check_dimensions(width, height, samples, max_pixels) + if samples > 1: result = np.empty((height, width, samples), dtype=dtype) else: @@ -653,7 +690,9 @@ def _read_cog_http(url: str, overview_level: int | None = None, # --------------------------------------------------------------------------- def read_to_array(source: str, *, window=None, overview_level: int | None = None, - band: int | None = None) -> tuple[np.ndarray, GeoInfo]: + band: int | None = None, + max_pixels: int = MAX_PIXELS_DEFAULT, + ) -> tuple[np.ndarray, GeoInfo]: """Read a GeoTIFF/COG to a numpy array. Parameters @@ -666,13 +705,18 @@ def read_to_array(source: str, *, window=None, overview_level: int | None = None Overview level (0 = full res). band : int Band index for multi-band files. + max_pixels : int + Maximum allowed total pixel count (width * height * samples). + Prevents memory exhaustion from crafted TIFF headers. + Default is 1 billion (~4 GB for float32 single-band). Returns ------- (np.ndarray, GeoInfo) tuple """ if source.startswith(('http://', 'https://')): - return _read_cog_http(source, overview_level=overview_level, band=band) + return _read_cog_http(source, overview_level=overview_level, band=band, + max_pixels=max_pixels) # Local file or cloud storage: read all bytes then parse if _is_fsspec_uri(source): @@ -701,9 +745,11 @@ def read_to_array(source: str, *, window=None, overview_level: int | None = None geo_info = extract_geo_info(ifd, data, header.byte_order) if ifd.is_tiled: - arr = _read_tiles(data, ifd, header, dtype, window) + arr = _read_tiles(data, ifd, header, dtype, window, + max_pixels=max_pixels) else: - arr = _read_strips(data, ifd, header, dtype, window) + arr = _read_strips(data, ifd, header, dtype, window, + max_pixels=max_pixels) # For multi-band with band selection, extract single band if arr.ndim == 3 and ifd.samples_per_pixel > 1 and band is not None: diff --git a/xrspatial/geotiff/_vrt.py b/xrspatial/geotiff/_vrt.py index ca817f14..616c84b1 100644 --- a/xrspatial/geotiff/_vrt.py +++ b/xrspatial/geotiff/_vrt.py @@ -136,6 +136,8 @@ def parse_vrt(xml_str: str, vrt_dir: str = '.') -> VRTDataset: relative.get('relativeToVRT', '0') == '1') if is_relative and not os.path.isabs(filename): filename = os.path.join(vrt_dir, filename) + # Canonicalize to prevent path traversal (e.g. ../) + filename = os.path.realpath(filename) src_band = int(_text(src_elem, 'SourceBand') or '1') diff --git a/xrspatial/geotiff/tests/test_features.py b/xrspatial/geotiff/tests/test_features.py index 06d76cfd..8aabc689 100644 --- a/xrspatial/geotiff/tests/test_features.py +++ b/xrspatial/geotiff/tests/test_features.py @@ -1,6 +1,8 @@ """Tests for new features: multi-band, integer nodata, packbits, zstd, dask, BigTIFF.""" from __future__ import annotations +import os + import numpy as np import pytest import xarray as xr @@ -823,12 +825,10 @@ def test_vrt_parser(self): assert vrt.bands[0].nodata == 0.0 assert len(vrt.bands[0].sources) == 1 src = vrt.bands[0].sources[0] - assert src.filename == '/data/tile.tif' + assert src.filename == os.path.realpath('/data/tile.tif') assert src.src_rect.x_off == 10 -import os - class TestCloudStorage: def test_cloud_scheme_detection(self): diff --git a/xrspatial/geotiff/tests/test_security.py b/xrspatial/geotiff/tests/test_security.py new file mode 100644 index 00000000..a3dd8c88 --- /dev/null +++ b/xrspatial/geotiff/tests/test_security.py @@ -0,0 +1,194 @@ +"""Security tests for the geotiff subpackage. + +Tests for: +- Unbounded allocation guard (issue #1184) +- VRT path traversal prevention (issue #1185) +""" +from __future__ import annotations + +import os +import struct +import tempfile + +import numpy as np +import pytest + +from xrspatial.geotiff._reader import ( + MAX_PIXELS_DEFAULT, + _check_dimensions, + _read_strips, + _read_tiles, + read_to_array, +) +from xrspatial.geotiff._header import parse_header, parse_all_ifds +from xrspatial.geotiff._dtypes import tiff_dtype_to_numpy +from .conftest import make_minimal_tiff + + +# --------------------------------------------------------------------------- +# Cat 1: Unbounded allocation guard +# --------------------------------------------------------------------------- + +class TestDimensionGuard: + def test_check_dimensions_rejects_oversized(self): + """_check_dimensions raises when total pixels exceed the limit.""" + with pytest.raises(ValueError, match="exceed the safety limit"): + _check_dimensions(100_000, 100_000, 1, MAX_PIXELS_DEFAULT) + + def test_check_dimensions_accepts_normal(self): + """_check_dimensions does not raise for normal sizes.""" + _check_dimensions(1000, 1000, 1, MAX_PIXELS_DEFAULT) + + def test_check_dimensions_considers_samples(self): + """Multi-band images multiply the pixel budget.""" + # 50_000 x 50_000 x 3 = 7.5 billion, should be rejected + with pytest.raises(ValueError, match="exceed the safety limit"): + _check_dimensions(50_000, 50_000, 3, MAX_PIXELS_DEFAULT) + + def test_custom_limit(self): + """A custom max_pixels lets callers tighten or relax the limit.""" + # Tight limit: 100 pixels + with pytest.raises(ValueError, match="exceed the safety limit"): + _check_dimensions(20, 20, 1, max_pixels=100) + + # Relaxed: passes with large limit + _check_dimensions(100_000, 100_000, 1, max_pixels=100_000_000_000) + + def test_read_strips_rejects_huge_header(self): + """_read_strips refuses to allocate when header claims huge dims.""" + # Build a valid TIFF with small pixel data but huge header dimensions. + # We fake the header to claim 100000x100000 but only provide 4x4 data. + data = make_minimal_tiff(4, 4, np.dtype('float32')) + header = parse_header(data) + ifds = parse_all_ifds(data, header) + ifd = ifds[0] + + # Monkey-patch the IFD width/height to simulate a crafted header + from xrspatial.geotiff._header import IFDEntry + ifd.entries[256] = IFDEntry(tag=256, type_id=3, count=1, value=100_000) + ifd.entries[257] = IFDEntry(tag=257, type_id=3, count=1, value=100_000) + + dtype = tiff_dtype_to_numpy(ifd.bits_per_sample, ifd.sample_format) + + with pytest.raises(ValueError, match="exceed the safety limit"): + _read_strips(data, ifd, header, dtype, max_pixels=1_000_000) + + def test_read_tiles_rejects_huge_header(self): + """_read_tiles refuses to allocate when header claims huge dims.""" + data = make_minimal_tiff(8, 8, np.dtype('float32'), tiled=True, tile_size=4) + header = parse_header(data) + ifds = parse_all_ifds(data, header) + ifd = ifds[0] + + from xrspatial.geotiff._header import IFDEntry + ifd.entries[256] = IFDEntry(tag=256, type_id=3, count=1, value=100_000) + ifd.entries[257] = IFDEntry(tag=257, type_id=3, count=1, value=100_000) + + dtype = tiff_dtype_to_numpy(ifd.bits_per_sample, ifd.sample_format) + + with pytest.raises(ValueError, match="exceed the safety limit"): + _read_tiles(data, ifd, header, dtype, max_pixels=1_000_000) + + def test_read_to_array_max_pixels_kwarg(self, tmp_path): + """read_to_array passes max_pixels through to the internal readers.""" + expected = np.arange(16, dtype=np.float32).reshape(4, 4) + data = make_minimal_tiff(4, 4, np.dtype('float32'), pixel_data=expected) + path = str(tmp_path / "small.tif") + with open(path, 'wb') as f: + f.write(data) + + # Should succeed with a generous limit + arr, _ = read_to_array(path, max_pixels=1_000_000) + np.testing.assert_array_equal(arr, expected) + + # Should fail with a tiny limit + with pytest.raises(ValueError, match="exceed the safety limit"): + read_to_array(path, max_pixels=10) + + def test_normal_read_unaffected(self, tmp_path): + """Normal reads within the default limit are not affected.""" + expected = np.arange(64, dtype=np.float32).reshape(8, 8) + data = make_minimal_tiff(8, 8, np.dtype('float32'), pixel_data=expected) + path = str(tmp_path / "normal.tif") + with open(path, 'wb') as f: + f.write(data) + + arr, _ = read_to_array(path) + np.testing.assert_array_equal(arr, expected) + + +# --------------------------------------------------------------------------- +# Cat 5: VRT path traversal +# --------------------------------------------------------------------------- + +class TestVRTPathTraversal: + def test_relative_path_canonicalized(self, tmp_path): + """Relative paths in VRT SourceFilename are canonicalized.""" + from xrspatial.geotiff._vrt import parse_vrt + + vrt_xml = ''' + + + ../../../etc/shadow + 1 + + + + +''' + + vrt_dir = str(tmp_path / "subdir") + os.makedirs(vrt_dir) + + vrt = parse_vrt(vrt_xml, vrt_dir) + source_path = vrt.bands[0].sources[0].filename + + # After canonicalization, the path should NOT contain ".." + assert ".." not in source_path + # It should be an absolute path + assert os.path.isabs(source_path) + # Verify it was resolved through realpath + assert source_path == os.path.realpath(source_path) + + def test_normal_relative_path_still_works(self, tmp_path): + """Normal relative paths without traversal still resolve correctly.""" + from xrspatial.geotiff._vrt import parse_vrt + + vrt_xml = ''' + + + data/tile.tif + 1 + + + + +''' + + vrt_dir = str(tmp_path) + vrt = parse_vrt(vrt_xml, vrt_dir) + source_path = vrt.bands[0].sources[0].filename + + expected = os.path.realpath(os.path.join(vrt_dir, "data", "tile.tif")) + assert source_path == expected + + def test_absolute_path_also_canonicalized(self, tmp_path): + """Absolute paths in VRT are also canonicalized.""" + from xrspatial.geotiff._vrt import parse_vrt + + vrt_xml = ''' + + + /tmp/../tmp/test.tif + 1 + + + + +''' + + vrt = parse_vrt(vrt_xml, str(tmp_path)) + source_path = vrt.bands[0].sources[0].filename + + assert ".." not in source_path + assert source_path == os.path.realpath("/tmp/../tmp/test.tif") diff --git a/xrspatial/polygon_clip.py b/xrspatial/polygon_clip.py index f8e2fe65..54c03ed2 100644 --- a/xrspatial/polygon_clip.py +++ b/xrspatial/polygon_clip.py @@ -79,10 +79,14 @@ def _resolve_geometry(geometry): ) -def _crop_to_bbox(raster, geom_pairs): +def _crop_to_bbox(raster, geom_pairs, all_touched=False): """Slice the raster to the bounding box of the geometry. Returns the sliced DataArray and the geometry pairs (unchanged). + + When *all_touched* is True the bounding-box comparison is expanded by + half a pixel on every side so that pixels whose cells overlap the + geometry boundary are not prematurely excluded. """ from shapely.ops import unary_union merged = unary_union([g for g, _ in geom_pairs]) @@ -91,19 +95,24 @@ def _crop_to_bbox(raster, geom_pairs): y_coords = raster.coords[raster.dims[-2]].values x_coords = raster.coords[raster.dims[-1]].values - # Determine coordinate ordering (ascending or descending) - y_ascending = y_coords[-1] >= y_coords[0] - x_ascending = x_coords[-1] >= x_coords[0] - - if y_ascending: - y_mask = (y_coords >= miny) & (y_coords <= maxy) - else: - y_mask = (y_coords >= miny) & (y_coords <= maxy) - - if x_ascending: - x_mask = (x_coords >= minx) & (x_coords <= maxx) - else: - x_mask = (x_coords >= minx) & (x_coords <= maxx) + # When all_touched is set, expand the bbox by half a pixel so that + # pixels whose cells overlap the geometry survive the crop. + if all_touched: + if len(x_coords) > 1: + half_px_x = abs(float(x_coords[1] - x_coords[0])) / 2.0 + else: + half_px_x = 0.0 + if len(y_coords) > 1: + half_py_y = abs(float(y_coords[1] - y_coords[0])) / 2.0 + else: + half_py_y = 0.0 + minx -= half_px_x + maxx += half_px_x + miny -= half_py_y + maxy += half_py_y + + y_mask = (y_coords >= miny) & (y_coords <= maxy) + x_mask = (x_coords >= minx) & (x_coords <= maxx) y_idx = np.where(y_mask)[0] x_idx = np.where(x_mask)[0] @@ -186,7 +195,7 @@ def clip_polygon( # Optionally crop to bounding box first (reduces rasterize cost) if crop: - raster = _crop_to_bbox(raster, geom_pairs) + raster = _crop_to_bbox(raster, geom_pairs, all_touched=all_touched) # Build a binary mask via rasterize, aligned to the (possibly cropped) # raster grid. Always produce a plain numpy mask first, then convert diff --git a/xrspatial/tests/test_polygon_clip.py b/xrspatial/tests/test_polygon_clip.py index e0645662..61373609 100644 --- a/xrspatial/tests/test_polygon_clip.py +++ b/xrspatial/tests/test_polygon_clip.py @@ -152,6 +152,42 @@ def test_all_touched(self): n_touched = np.count_nonzero(np.isfinite(result_touched.values)) assert n_touched >= n_default + def test_all_touched_crop_matches_nocrop(self): + """crop=True with all_touched=True must not drop boundary pixels. + + Regression test for #1197: _crop_to_bbox was comparing pixel + centers against the geometry bounding box without accounting for + pixel cell extent, so pixels whose centers fell just outside the + bbox were excluded even though their cells overlapped the polygon. + """ + raster = _make_raster() + # Polygon whose edges land between pixel centers on all four + # sides. Pixel spacing is 0.5, so the left edge at x=0.15 + # sits inside the cell of pixel x=0.0 (cell [-0.25, 0.25]). + poly = Polygon([(0.15, 0.15), (0.15, 3.35), (2.35, 3.35), + (2.35, 0.15)]) + + result_crop = clip_polygon(raster, poly, crop=True, + all_touched=True) + result_nocrop = clip_polygon(raster, poly, crop=False, + all_touched=True) + + # The crop path must keep every pixel the nocrop path keeps. + n_crop = np.count_nonzero(np.isfinite(result_crop.values)) + n_nocrop = np.count_nonzero(np.isfinite(result_nocrop.values)) + assert n_crop == n_nocrop, ( + f"crop=True lost {n_nocrop - n_crop} boundary pixels" + ) + + # Pixel values must match wherever both arrays have data. + # Align the cropped result back into the full grid for comparison. + aligned = result_nocrop.copy() + crop_y = result_crop.coords['y'].values + crop_x = result_crop.coords['x'].values + aligned_slice = aligned.sel(y=crop_y, x=crop_x) + np.testing.assert_array_equal(result_crop.values, + aligned_slice.values) + def test_single_cell_raster(self): """1x1 raster with polygon that covers it.""" data = np.array([[42.0]]) @@ -207,6 +243,23 @@ def test_crop_matches_numpy(self): dk_result.values, np_result.values, equal_nan=True ) + def test_all_touched_crop_matches_nocrop(self): + """Dask: crop=True with all_touched=True keeps boundary pixels (#1197).""" + dk_raster = _make_raster(backend='dask+numpy', chunks=(4, 3)) + poly = Polygon([(0.15, 0.15), (0.15, 3.35), (2.35, 3.35), + (2.35, 0.15)]) + + result_crop = clip_polygon(dk_raster, poly, crop=True, + all_touched=True) + result_nocrop = clip_polygon(dk_raster, poly, crop=False, + all_touched=True) + + n_crop = np.count_nonzero(np.isfinite(result_crop.values)) + n_nocrop = np.count_nonzero(np.isfinite(result_nocrop.values)) + assert n_crop == n_nocrop, ( + f"crop=True lost {n_nocrop - n_crop} boundary pixels" + ) + # --------------------------------------------------------------------------- # CuPy backend