Commit 0a019a2

Merge pull request #135 from mvanhorn/fix/120-empty-read-before-eof
Fix DecompressMemMapReader.read returning b'' before EOF (#120)
2 parents: 53c94d6 + ed41d2c
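
For illustration, the reported failure mode in miniature: a sketch (not part of the commit) that drives the reader with the same chunked-read idiom and constructor call used by the new regression test below. The chunk size of 64 is arbitrary.

import zlib

from gitdb.stream import DecompressMemMapReader

data = b"hello world! " * 1000
reader = DecompressMemMapReader(zlib.compress(data),
                                close_on_deletion=False, size=len(data))

out = bytearray()
# Before this commit, read() could return b'' mid-stream, ending this
# loop with only part of the payload recovered; with the fix it runs
# to the true end of the uncompressed stream.
while chunk := reader.read(64):
    out.extend(chunk)
assert bytes(out) == data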

2 files changed: 118 additions & 57 deletions

gitdb/stream.py (82 additions & 57 deletions)
@@ -254,68 +254,93 @@ def read(self, size=-1):
         # copied once, and another copy of a part of it when it creates the unconsumed
         # tail. We have to use it to hand in the appropriate amount of bytes during
         # the next read.
-        tail = self._zip.unconsumed_tail
-        if tail:
-            # move the window, make it as large as size demands. For code-clarity,
-            # we just take the chunk from our map again instead of reusing the unconsumed
-            # tail. The latter one would safe some memory copying, but we could end up
-            # with not getting enough data uncompressed, so we had to sort that out as well.
-            # Now we just assume the worst case, hence the data is uncompressed and the window
-            # needs to be as large as the uncompressed bytes we want to read.
-            self._cws = self._cwe - len(tail)
-            self._cwe = self._cws + size
-        else:
-            cws = self._cws
-            self._cws = self._cwe
-            self._cwe = cws + size
-        # END handle tail
-
-        # if window is too small, make it larger so zip can decompress something
-        if self._cwe - self._cws < 8:
-            self._cwe = self._cws + 8
-        # END adjust winsize
-
-        # takes a slice, but doesn't copy the data, it says ...
-        indata = self._m[self._cws:self._cwe]
-
-        # get the actual window end to be sure we don't use it for computations
-        self._cwe = self._cws + len(indata)
-        dcompdat = self._zip.decompress(indata, size)
-        # update the amount of compressed bytes read
-        # We feed possibly overlapping chunks, which is why the unconsumed tail
-        # has to be taken into consideration, as well as the unused data
-        # if we hit the end of the stream
-        # NOTE: Behavior changed in PY2.7 onward, which requires special handling to make the tests work properly.
-        # They are thorough, and I assume it is truly working.
-        # Why is this logic as convoluted as it is ? Please look at the table in
-        # https://github.com/gitpython-developers/gitdb/issues/19 to learn about the test-results.
-        # Basically, on py2.6, you want to use branch 1, whereas on all other python version, the second branch
-        # will be the one that works.
-        # However, the zlib VERSIONs as well as the platform check is used to further match the entries in the
-        # table in the github issue. This is it ... it was the only way I could make this work everywhere.
-        # IT's CERTAINLY GOING TO BITE US IN THE FUTURE ... .
-        if getattr(zlib, 'ZLIB_RUNTIME_VERSION', zlib.ZLIB_VERSION) in ('1.2.7', '1.2.5') and not sys.platform == 'darwin':
-            unused_datalen = len(self._zip.unconsumed_tail)
-        else:
-            unused_datalen = len(self._zip.unconsumed_tail) + len(self._zip.unused_data)
-        # # end handle very special case ...
-
-        self._cbr += len(indata) - unused_datalen
-        self._br += len(dcompdat)
+        #
+        # Decompress in a loop until we have produced `size` bytes or run out
+        # of progress. Iteration (instead of recursion) keeps the call bounded
+        # for streams that consume many input bytes per produced output byte
+        # (e.g. zlib stored blocks of length zero); the previous recursive
+        # form blew the stack on inputs > ~1500 empty blocks (issue #120
+        # follow-up).
+        dcompdat = b''
+        while True:
+            tail = self._zip.unconsumed_tail
+            remaining = size - len(dcompdat)
+            if tail:
+                # move the window, make it as large as size demands. For code-clarity,
+                # we just take the chunk from our map again instead of reusing the unconsumed
+                # tail. The latter one would save some memory copying, but we could end up
+                # with not getting enough data uncompressed, so we had to sort that out as well.
+                # Now we just assume the worst case, hence the data is uncompressed and the window
+                # needs to be as large as the uncompressed bytes we want to read.
+                self._cws = self._cwe - len(tail)
+                self._cwe = self._cws + remaining
+            else:
+                cws = self._cws
+                self._cws = self._cwe
+                self._cwe = cws + remaining
+            # END handle tail
+
+            # if window is too small, make it larger so zip can decompress something
+            if self._cwe - self._cws < 8:
+                self._cwe = self._cws + 8
+            # END adjust winsize
+
+            # takes a slice, but doesn't copy the data, it says ...
+            indata = self._m[self._cws:self._cwe]
+
+            # get the actual window end to be sure we don't use it for computations
+            self._cwe = self._cws + len(indata)
+            chunk = self._zip.decompress(indata, remaining)
+            # update the amount of compressed bytes read
+            # We feed possibly overlapping chunks, which is why the unconsumed tail
+            # has to be taken into consideration, as well as the unused data
+            # if we hit the end of the stream
+            # NOTE: Behavior changed in PY2.7 onward, which requires special handling to make the tests work properly.
+            # They are thorough, and I assume it is truly working.
+            # Why is this logic as convoluted as it is ? Please look at the table in
+            # https://github.com/gitpython-developers/gitdb/issues/19 to learn about the test-results.
+            # Basically, on py2.6, you want to use branch 1, whereas on all other python version, the second branch
+            # will be the one that works.
+            # However, the zlib VERSIONs as well as the platform check is used to further match the entries in the
+            # table in the github issue. This is it ... it was the only way I could make this work everywhere.
+            # IT's CERTAINLY GOING TO BITE US IN THE FUTURE ... .
+            if getattr(zlib, 'ZLIB_RUNTIME_VERSION', zlib.ZLIB_VERSION) in ('1.2.7', '1.2.5') and not sys.platform == 'darwin':
+                unused_datalen = len(self._zip.unconsumed_tail)
+            else:
+                unused_datalen = len(self._zip.unconsumed_tail) + len(self._zip.unused_data)
+            # # end handle very special case ...
+
+            consumed = len(indata) - unused_datalen
+            self._cbr += consumed
+            self._br += len(chunk)
+            if chunk:
+                if not isinstance(dcompdat, bytearray):
+                    dcompdat = bytearray(dcompdat)
+                dcompdat.extend(chunk)
+
+            # Stop when we have enough or there is no path to more output.
+            # `chunk` may legitimately be empty mid-stream when zlib is
+            # consuming header / dictionary frames; in that case we keep
+            # iterating as long as we are still feeding zlib new bytes
+            # (consumed > 0) and zlib has not flagged end-of-stream. The
+            # compressed_bytes_read() scrub loop drives this same code with
+            # _br manipulated to 0 past zip EOF; it terminates here because
+            # `getattr(_zip, 'eof', False)` is True or no compressed bytes
+            # are consumed. The empty-block recursion attack from issue #120
+            # follow-up is bounded by the iteration; each empty block does
+            # consume input, so the loop walks the stream forward a constant
+            # amount per iteration without growing the call stack.
+            if len(dcompdat) >= size or self._br >= self._s:
+                break
+            zip_eof = getattr(self._zip, 'eof', False)
+            if not chunk and (zip_eof or len(indata) == 0 or consumed == 0):
+                break
+        # END iterative decompress

         if dat:
             dcompdat = dat + dcompdat
         # END prepend our cached data

-        # it can happen, depending on the compression, that we get less bytes
-        # than ordered as it needs the final portion of the data as well.
-        # Recursively resolve that.
-        # Note: dcompdat can be empty even though we still appear to have bytes
-        # to read, if we are called by compressed_bytes_read - it manipulates
-        # us to empty the stream
-        if dcompdat and (len(dcompdat) - len(dat)) < size and self._br < self._s:
-            dcompdat += self.read(size - len(dcompdat))
-        # END handle special case
         return dcompdat

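As background for the new comment's stored-block remark: such inputs are easy to synthesize with the standard zlib module, since each Z_FULL_FLUSH emits an empty stored block that the decompressor must consume without producing output. A sketch, not from the commit; the block count of 5000 is arbitrary, comfortably past the ~1500 the old recursion tolerated.

import zlib

# Build a stream whose middle is thousands of empty stored blocks:
# plenty of compressed input to consume, zero decompressed output.
c = zlib.compressobj()
parts = [c.compress(b"payload")]
for _ in range(5000):
    parts.append(c.flush(zlib.Z_FULL_FLUSH))  # one empty stored block each
parts.append(c.flush())
zdata = b"".join(parts)

d = zlib.decompressobj()
assert d.decompress(zdata) + d.flush() == b"payload"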
gitdb/test/test_stream.py (36 additions & 0 deletions)
@@ -162,3 +162,39 @@ def test_decompress_reader_special_case(self):
             dump = mdb.store(IStream(ostream.type, ostream.size, BytesIO(data)))
             assert dump.hexsha == sha
         # end for each loose object sha to test
+
+    def test_decompress_reader_chunked_read_does_not_terminate_early(self):
+        """Regression test for #120: read(N) must not return b'' before EOF.
+
+        zlib can consume input without producing decompressed output (e.g.
+        while ingesting block headers). The reader's internal recursion
+        previously bailed on any empty zip output, so a caller reading in
+        small chunks via the standard `while chunk := stream.read(N)` idiom
+        would terminate at the first empty chunk -- before the actual end
+        of the uncompressed stream.
+        """
+        # Highly compressible data exposes the bug because each zlib chunk
+        # spans many uncompressed bytes -- intermediate decompress() calls
+        # often return empty while consuming input.
+        data = b"hello world! " * 1000
+        zdata = zlib.compress(data)
+
+        # Loop with a small chunk size to force many internal read/decompression
+        # iterations before EOF.
+        for chunk_size in (1, 4, 16, 64):
+            reader = DecompressMemMapReader(
+                zdata, close_on_deletion=False, size=len(data)
+            )
+            out = bytearray()
+            while True:
+                chunk = reader.read(chunk_size)
+                if not chunk:
+                    break
+                out.extend(chunk)
+            assert bytes(out) == data, (
+                f"chunk_size={chunk_size}: got {len(out)}/{len(data)} bytes"
+            )
+            assert reader._br == reader._s, (
+                f"chunk_size={chunk_size}: stream stopped at "
+                f"{reader._br}/{reader._s}"
+            )
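
The test docstring's premise, that zlib can consume input without emitting output, is directly observable with a plain decompressobj: the first two bytes of a zlib stream are only the CMF/FLG header, so feeding them alone must yield an empty result. A minimal standalone check:

import zlib

zdata = zlib.compress(b"hello world! " * 1000)

d = zlib.decompressobj()
# The 2-byte zlib header carries no deflate data, so this call consumes
# input yet must return an empty byte string.
assert d.decompress(zdata[:2]) == b""
assert d.decompress(zdata[2:]) + d.flush() == b"hello world! " * 1000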
