DecompressMemMapReader: implemented compressed bytes counting, including test. This is required to properly read packs without the use of an index

Byron · Byron · commit bf4437ef45d9 · 2010-06-16T13:37:59.000+02:00
diff --git a/ext/async b/ext/async
@@ -1 +1 @@
-Subproject commit af0040b0f3c6ede3be5b2d6bc69f6ea5ac53c36c
+Subproject commit 796b5e94f19dfc36a3fb251468192373c76510b0
diff --git a/pack.py b/pack.py
@@ -54,15 +54,15 @@ def pack_object_at(data, as_stream):
 			s += 7
 		# END character loop
 		if as_stream:
-			stream = DecompressMemMapReader(buffer(data, i), False)
+			stream = DecompressMemMapReader(buffer(data, i), False, uncomp_size)
 			return ODeltaPackStream(type_id, uncomp_size, delta_offset, stream)
 		else:
 			return ODeltaPackInfo(type_id, uncomp_size, delta_offset)
 		# END handle stream
 	elif type_id == REF_DELTA:
 		ref_sha = data[:20]
 		if as_stream:
-			stream = DecompressMemMapReader(buffer(data, 20), False)
+			stream = DecompressMemMapReader(buffer(data, 20), False, uncomp_size)
 			return ODeltaPackStream(type_id, uncomp_size, ref_sha, stream)
 		else:
 			return ODeltaPackInfo(type_id, uncomp_size, ref_sha)
@@ -267,7 +267,7 @@ class PackFile(LazyMixin):
 	__slots__ = ('_packpath', '_data', '_size', '_version')
 	
 	# offset into our data at which the first object starts
-	_first_object_offset = 3*4 + 8
+	_first_object_offset = 3*4
 	
 	def __init__(self, packpath):
 		self._packpath = packpath
diff --git a/stream.py b/stream.py
@@ -1,6 +1,8 @@
 
 from cStringIO import StringIO
 import errno
+import mmap
+import os
 
 from util import (
 		LazyMixin,
@@ -13,10 +15,6 @@
 __all__ = ('DecompressMemMapReader', 'FDCompressedSha1Writer')
 
 
-# ZLIB configuration
-# used when compressing objects - 1 to 9 ( slowest )
-Z_BEST_SPEED = 1
-
 #{ RO Streams
 
 class DecompressMemMapReader(LazyMixin):
@@ -36,7 +34,8 @@ class DecompressMemMapReader(LazyMixin):
 		times we actually allocate. An own zlib implementation would be good here
 		to better support streamed reading - it would only need to keep the mmap
 		and decompress it into chunks, thats all ... """
-	__slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close')
+	__slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close', 
+				'_cbr', '_phi')
 	
 	max_read_size = 512*1024		# currently unused
 	
@@ -52,6 +51,8 @@ def __init__(self, m, close_on_deletion, size=None):
 		self._br = 0							# num uncompressed bytes read
 		self._cws = 0							# start byte of compression window
 		self._cwe = 0							# end byte of compression window
+		self._cbr = 0							# number of compressed bytes read
+		self._phi = False						# is True if we parsed the header info
 		self._close = close_on_deletion			# close the memmap on deletion ?
 		
 	def _set_cache_(self, attr):
@@ -85,6 +86,8 @@ def _parse_header_info(self):
 		self._buf = StringIO(hdr[hdrend:])
 		self._buflen = len(hdr) - hdrend
 		
+		self._phi = True
+		
 		return type, size
 		
 	@classmethod
@@ -98,7 +101,55 @@ def new(self, m, close_on_deletion=False):
 		inst = DecompressMemMapReader(m, close_on_deletion, 0)
 		type, size = inst._parse_header_info()
 		return type, size, inst
+
+	def compressed_bytes_read(self):
+		""":return: number of compressed bytes read. This includes the bytes it 
+		took to decompress the header ( if there was one )"""
+		# ABSTRACT: When decompressing a byte stream, it can be that the first
+		# x bytes which were requested match the first x bytes in the loosely 
+		# compressed datastream. This is the worst-case assumption that the reader
+		# makes: it assumes that it will get at least X bytes from X compressed bytes
+		# in all cases.
+		# The caveat is that the object, according to our known uncompressed size, 
+		# is already complete, but there are still some bytes left in the compressed
+		# stream that contribute to the amount of compressed bytes.
+		# How can we know that we are truly done, and have read all bytes we need
+		# to read ? 
+		# Without help, we cannot know, as we need to obtain the status of the 
+		# decompression. If it is not finished, we need to decompress more data
+		# until it is finished, to yield the actual number of compressed bytes
+		# belonging to the decompressed object
+		# We are using a custom zlib module for this; if it's not present, 
+		# we can only hope it works.
+		# Only scrub the stream forward if we are officially done with the
+		# bytes we were to have.
+		if self._br == self._s and hasattr(self._zip, 'status') and self._zip.status == zlib.Z_OK:
+			# manipulate the bytes-read to allow our own read method to continue
+			# but keep the window at its current position
+			self._br = 0
+			while self._zip.status == zlib.Z_OK:
+				self.read(mmap.PAGESIZE)
+			# END scrub-loop
+			# reset bytes read, just to be sure
+			self._br = self._s
+		# END handle stream scrubbing
+		
+		return self._cbr - len(self._zip.unused_data)
 		
+	def seek(self, offset, whence=os.SEEK_SET):
+		"""Allows to reset the stream to restart reading
+		:raise ValueError: If offset and whence are not 0"""
+		if offset != 0 or whence != os.SEEK_SET:
+			raise ValueError("Can only seek to position 0")
+		# END handle offset
+		
+		self._zip = zlib.decompressobj()
+		self._br = self._cws = self._cwe = self._cbr = 0
+		if self._phi:
+			self._phi = False
+			del(self._s)		# trigger header parsing on first access
+		# END skip header
+	
 	def read(self, size=-1):
 		if size < 1:
 			size = self._s - self._br
@@ -109,33 +160,8 @@ def read(self, size=-1):
 		if size == 0:
 			return str()
 		# END handle depletion
-		
-		# protect from memory peaks
-		# If he tries to read large chunks, our memory patterns get really bad
-		# as we end up copying a possibly huge chunk from our memory map right into
-		# memory. This might not even be possible. Nonetheless, try to dampen the 
-		# effect a bit by reading in chunks, returning a huge string in the end.
-		# Our performance now depends on StringIO. This way we don't need two large
-		# buffers in peak times, but only one large one in the end which is 
-		# the return buffer
-		# NO: We don't do it - if the user thinks its best, he is right. If he 
-		# has trouble, he will start reading in chunks. According to our tests
-		# its still faster if we read 10 Mb at once instead of chunking it.
-		
-		# if size > self.max_read_size:
-			# sio = StringIO()
-			# while size:
-				# read_size = min(self.max_read_size, size)
-				# data = self.read(read_size)
-				# sio.write(data)
-				# size -= len(data)
-				# if len(data) < read_size:
-					# break
-			# # END data loop
-			# sio.seek(0)
-			# return sio.getvalue()
-		# # END handle maxread
-		# 
+	
+	
 		# deplete the buffer, then just continue using the decompress object 
 		# which has an own buffer. We just need this to transparently parse the 
 		# header from the zlib stream
@@ -186,8 +212,7 @@ def read(self, size=-1):
 		
 		
 		# if window is too small, make it larger so zip can decompress something
-		win_size = self._cwe - self._cws 
-		if win_size < 8:
+		if self._cwe - self._cws < 8:
 			self._cwe = self._cws + 8
 		# END adjust winsize
 		
@@ -196,10 +221,18 @@ def read(self, size=-1):
 		
 		# get the actual window end to be sure we don't use it for computations
 		self._cwe = self._cws + len(indata)
-			
+		
 		dcompdat = self._zip.decompress(indata, size)
 		
+		# update the amount of compressed bytes read
+		# We feed possibly overlapping chunks, which is why the unconsumed tail
+		# has to be taken into consideration, as well as the unused data
+		# if we hit the end of the stream
+		self._cbr += len(indata) - len(self._zip.unconsumed_tail)
 		self._br += len(dcompdat)
+		
+		print size, self._br, self._cbr, len(indata), self._cws, self._cwe, len(self._zip.unused_data), len(self._zip.unconsumed_tail)
+		
 		if dat:
 			dcompdat = dat + dcompdat
 			
@@ -252,7 +285,7 @@ class FDCompressedSha1Writer(Sha1Writer):
 	def __init__(self, fd):
 		super(FDCompressedSha1Writer, self).__init__()
 		self.fd = fd
-		self.zip = zlib.compressobj(Z_BEST_SPEED)
+		self.zip = zlib.compressobj(zlib.Z_BEST_SPEED)
 
 	#{ Stream Interface
 
diff --git a/test/test_stream.py b/test/test_stream.py
@@ -49,12 +49,20 @@ def _assert_stream_reader(self, stream, cdata, rewind_stream=lambda s: None):
 			assert rest == cdata[-len(rest):]
 		# END handle rest
 		
+		if isinstance(stream, DecompressMemMapReader):
+			assert len(stream._m) == stream.compressed_bytes_read()
+		# END handle special type
+		
 		rewind_stream(stream)
 		
 		# read everything
 		rdata = stream.read()
 		assert rdata == cdata
 		
+		if isinstance(stream, DecompressMemMapReader):
+			assert len(stream._m) == stream.compressed_bytes_read()
+		# END handle special type
+		
 	def test_decompress_reader(self):
 		for close_on_deletion in range(2):
 			for with_size in range(2):
@@ -82,15 +90,7 @@ def test_decompress_reader(self):
 						assert reader._s == len(cdata)
 					# END get reader 
 					
-					def rewind(r):
-						r._zip = zlib.decompressobj()
-						r._br = r._cws = r._cwe = 0
-						if with_size:
-							r._parse_header_info()
-						# END skip header
-					# END make rewind func
-					
-					self._assert_stream_reader(reader, cdata, rewind)
+					self._assert_stream_reader(reader, cdata, lambda r: r.seek(0))
 					
 					# put in a dummy stream for closing
 					dummy = DummyStream()
@@ -99,7 +99,6 @@ def rewind(r):
 					assert not dummy.closed
 					del(reader)
 					assert dummy.closed == close_on_deletion
-					#zdi#
 				# END for each datasize
 			# END whether size should be used
 		# END whether stream should be closed when deleted