Commit 0a019a2

Merge pull request #135 from mvanhorn/fix/120-empty-read-before-eof
Fix DecompressMemMapReader.read returning b'' before EOF (#120)
2 parents: 53c94d6 + ed41d2c
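
For illustration, the reported failure mode in miniature: a sketch (not part of the commit) that drives the reader with the same chunked-read idiom and constructor call used by the new regression test below. The chunk size of 64 is arbitrary.

import zlib

from gitdb.stream import DecompressMemMapReader

data = b"hello world! " * 1000
reader = DecompressMemMapReader(zlib.compress(data),
                                close_on_deletion=False, size=len(data))

out = bytearray()
# Before this commit, read() could return b'' mid-stream, ending this
# loop with only part of the payload recovered; with the fix it runs
# to the true end of the uncompressed stream.
while chunk := reader.read(64):
    out.extend(chunk)
assert bytes(out) == data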

2 files changed: 118 additions & 57 deletions

gitdb/stream.py (82 additions & 57 deletions)
@@ -254,68 +254,93 @@ def read(self, size=-1):
         # copied once, and another copy of a part of it when it creates the unconsumed
         # tail. We have to use it to hand in the appropriate amount of bytes during
         # the next read.
-        tail = self._zip.unconsumed_tail
-        if tail:
-            # move the window, make it as large as size demands. For code-clarity,
-            # we just take the chunk from our map again instead of reusing the unconsumed
-            # tail. The latter one would safe some memory copying, but we could end up
-            # with not getting enough data uncompressed, so we had to sort that out as well.
-            # Now we just assume the worst case, hence the data is uncompressed and the window
-            # needs to be as large as the uncompressed bytes we want to read.
-            self._cws = self._cwe - len(tail)
-            self._cwe = self._cws + size
-        else:
-            cws = self._cws
-            self._cws = self._cwe
-            self._cwe = cws + size
-        # END handle tail
-
-        # if window is too small, make it larger so zip can decompress something
-        if self._cwe - self._cws < 8:
-            self._cwe = self._cws + 8
-        # END adjust winsize
-
-        # takes a slice, but doesn't copy the data, it says ...
-        indata = self._m[self._cws:self._cwe]
-
-        # get the actual window end to be sure we don't use it for computations
-        self._cwe = self._cws + len(indata)
-        dcompdat = self._zip.decompress(indata, size)
-        # update the amount of compressed bytes read
-        # We feed possibly overlapping chunks, which is why the unconsumed tail
-        # has to be taken into consideration, as well as the unused data
-        # if we hit the end of the stream
-        # NOTE: Behavior changed in PY2.7 onward, which requires special handling to make the tests work properly.
-        # They are thorough, and I assume it is truly working.
-        # Why is this logic as convoluted as it is ? Please look at the table in
-        # https://github.com/gitpython-developers/gitdb/issues/19 to learn about the test-results.
-        # Basically, on py2.6, you want to use branch 1, whereas on all other python version, the second branch
-        # will be the one that works.
-        # However, the zlib VERSIONs as well as the platform check is used to further match the entries in the
-        # table in the github issue. This is it ... it was the only way I could make this work everywhere.
-        # IT's CERTAINLY GOING TO BITE US IN THE FUTURE ... .
-        if getattr(zlib, 'ZLIB_RUNTIME_VERSION', zlib.ZLIB_VERSION) in ('1.2.7', '1.2.5') and not sys.platform == 'darwin':
-            unused_datalen = len(self._zip.unconsumed_tail)
-        else:
-            unused_datalen = len(self._zip.unconsumed_tail) + len(self._zip.unused_data)
-        # # end handle very special case ...
-
-        self._cbr += len(indata) - unused_datalen
-        self._br += len(dcompdat)
+        #
+        # Decompress in a loop until we have produced `size` bytes or run out
+        # of progress. Iteration (instead of recursion) keeps the call bounded
+        # for streams that consume many input bytes per produced output byte
+        # (e.g. zlib stored blocks of length zero); the previous recursive
+        # form blew the stack on inputs > ~1500 empty blocks (issue #120
+        # follow-up).
+        dcompdat = b''
+        while True:
+            tail = self._zip.unconsumed_tail
+            remaining = size - len(dcompdat)
+            if tail:
+                # move the window, make it as large as size demands. For code-clarity,
+                # we just take the chunk from our map again instead of reusing the unconsumed
+                # tail. The latter one would save some memory copying, but we could end up
+                # with not getting enough data uncompressed, so we had to sort that out as well.
+                # Now we just assume the worst case, hence the data is uncompressed and the window
+                # needs to be as large as the uncompressed bytes we want to read.
+                self._cws = self._cwe - len(tail)
+                self._cwe = self._cws + remaining
+            else:
+                cws = self._cws
+                self._cws = self._cwe
+                self._cwe = cws + remaining
+            # END handle tail
+
+            # if window is too small, make it larger so zip can decompress something
+            if self._cwe - self._cws < 8:
+                self._cwe = self._cws + 8
+            # END adjust winsize
+
+            # takes a slice, but doesn't copy the data, it says ...
+            indata = self._m[self._cws:self._cwe]
+
+            # get the actual window end to be sure we don't use it for computations
+            self._cwe = self._cws + len(indata)
+            chunk = self._zip.decompress(indata, remaining)
+            # update the amount of compressed bytes read
+            # We feed possibly overlapping chunks, which is why the unconsumed tail
+            # has to be taken into consideration, as well as the unused data
+            # if we hit the end of the stream
+            # NOTE: Behavior changed in PY2.7 onward, which requires special handling to make the tests work properly.
+            # They are thorough, and I assume it is truly working.
+            # Why is this logic as convoluted as it is ? Please look at the table in
+            # https://github.com/gitpython-developers/gitdb/issues/19 to learn about the test-results.
+            # Basically, on py2.6, you want to use branch 1, whereas on all other python version, the second branch
+            # will be the one that works.
+            # However, the zlib VERSIONs as well as the platform check is used to further match the entries in the
+            # table in the github issue. This is it ... it was the only way I could make this work everywhere.
+            # IT's CERTAINLY GOING TO BITE US IN THE FUTURE ... .
+            if getattr(zlib, 'ZLIB_RUNTIME_VERSION', zlib.ZLIB_VERSION) in ('1.2.7', '1.2.5') and not sys.platform == 'darwin':
+                unused_datalen = len(self._zip.unconsumed_tail)
+            else:
+                unused_datalen = len(self._zip.unconsumed_tail) + len(self._zip.unused_data)
+            # # end handle very special case ...
+
+            consumed = len(indata) - unused_datalen
+            self._cbr += consumed
+            self._br += len(chunk)
+            if chunk:
+                if not isinstance(dcompdat, bytearray):
+                    dcompdat = bytearray(dcompdat)
+                dcompdat.extend(chunk)
+
+            # Stop when we have enough or there is no path to more output.
+            # `chunk` may legitimately be empty mid-stream when zlib is
+            # consuming header / dictionary frames; in that case we keep
+            # iterating as long as we are still feeding zlib new bytes
+            # (consumed > 0) and zlib has not flagged end-of-stream. The
+            # compressed_bytes_read() scrub loop drives this same code with
+            # _br manipulated to 0 past zip EOF; it terminates here because
+            # `getattr(_zip, 'eof', False)` is True or no compressed bytes
+            # are consumed. The empty-block recursion attack from issue #120
+            # follow-up is bounded by the iteration; each empty block does
+            # consume input, so the loop walks the stream forward a constant
+            # amount per iteration without growing the call stack.
+            if len(dcompdat) >= size or self._br >= self._s:
+                break
+            zip_eof = getattr(self._zip, 'eof', False)
+            if not chunk and (zip_eof or len(indata) == 0 or consumed == 0):
+                break
+        # END iterative decompress

         if dat:
             dcompdat = dat + dcompdat
         # END prepend our cached data

-        # it can happen, depending on the compression, that we get less bytes
-        # than ordered as it needs the final portion of the data as well.
-        # Recursively resolve that.
-        # Note: dcompdat can be empty even though we still appear to have bytes
-        # to read, if we are called by compressed_bytes_read - it manipulates
-        # us to empty the stream
-        if dcompdat and (len(dcompdat) - len(dat)) < size and self._br < self._s:
-            dcompdat += self.read(size - len(dcompdat))
-        # END handle special case
         return dcompdat

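As background for the new comment's stored-block remark: such inputs are easy to synthesize with the standard zlib module, since each Z_FULL_FLUSH emits an empty stored block that the decompressor must consume without producing output. A sketch, not from the commit; the block count of 5000 is arbitrary, comfortably past the ~1500 the old recursion tolerated.

import zlib

# Build a stream whose middle is thousands of empty stored blocks:
# plenty of compressed input to consume, zero decompressed output.
c = zlib.compressobj()
parts = [c.compress(b"payload")]
for _ in range(5000):
    parts.append(c.flush(zlib.Z_FULL_FLUSH))  # one empty stored block each
parts.append(c.flush())
zdata = b"".join(parts)

d = zlib.decompressobj()
assert d.decompress(zdata) + d.flush() == b"payload"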
gitdb/test/test_stream.py (36 additions & 0 deletions)
@@ -162,3 +162,39 @@ def test_decompress_reader_special_case(self):
             dump = mdb.store(IStream(ostream.type, ostream.size, BytesIO(data)))
             assert dump.hexsha == sha
         # end for each loose object sha to test
+
+    def test_decompress_reader_chunked_read_does_not_terminate_early(self):
+        """Regression test for #120: read(N) must not return b'' before EOF.
+
+        zlib can consume input without producing decompressed output (e.g.
+        while ingesting block headers). The reader's internal recursion
+        previously bailed on any empty zip output, so a caller reading in
+        small chunks via the standard `while chunk := stream.read(N)` idiom
+        would terminate at the first empty chunk -- before the actual end
+        of the uncompressed stream.
+        """
+        # Highly compressible data exposes the bug because each zlib chunk
+        # spans many uncompressed bytes -- intermediate decompress() calls
+        # often return empty while consuming input.
+        data = b"hello world! " * 1000
+        zdata = zlib.compress(data)
+
+        # Loop with a small chunk size to force many internal read/decompression
+        # iterations before EOF.
+        for chunk_size in (1, 4, 16, 64):
+            reader = DecompressMemMapReader(
+                zdata, close_on_deletion=False, size=len(data)
+            )
+            out = bytearray()
+            while True:
+                chunk = reader.read(chunk_size)
+                if not chunk:
+                    break
+                out.extend(chunk)
+            assert bytes(out) == data, (
+                f"chunk_size={chunk_size}: got {len(out)}/{len(data)} bytes"
+            )
+            assert reader._br == reader._s, (
+                f"chunk_size={chunk_size}: stream stopped at "
+                f"{reader._br}/{reader._s}"
+            )
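
The test docstring's premise, that zlib can consume input without emitting output, is directly observable with a plain decompressobj: the first two bytes of a zlib stream are only the CMF/FLG header, so feeding them alone must yield an empty result. A minimal standalone check:

import zlib

zdata = zlib.compress(b"hello world! " * 1000)

d = zlib.decompressobj()
# The 2-byte zlib header carries no deflate data, so this call consumes
# input yet must return an empty byte string.
assert d.decompress(zdata[:2]) == b""
assert d.decompress(zdata[2:]) + d.flush() == b"hello world! " * 1000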
