Implemented direct pack reading. Not all information is passed on to the caller yet; for example, the absolute offset into the packfile could be of interest to the caller.

Byron · Byron · commit 4977bc52938c · 2010-06-16T17:36:57.000+02:00
diff --git a/fun.py b/fun.py
@@ -53,7 +53,7 @@ def pack_object_header_info(data):
 	The type_id should be interpreted according to the ``type_id_to_type_map`` map
 	The byte-offset specifies the start of the actual zlib compressed datastream
 	:param m: random-access memory, like a string or memory map"""
-	c = b0							# first byte
+	c = ord(data[0])				# first byte
 	i = 1							# next char to read
 	type_id = (c >> 4) & 7			# numeric type
 	size = c & 15					# starting size
@@ -66,7 +66,7 @@ def pack_object_header_info(data):
 	# END character loop
 	
 	try:
-		return (type_id_to_type_map[type_id], size)
+		return (type_id, size, i)
 	except KeyError:
 		# invalid object type - we could try to be smart now and decode part 
 		# of the stream to get the info, problem is that we had trouble finding 
diff --git a/pack.py b/pack.py
@@ -8,6 +8,8 @@
 
 from fun import (
 					pack_object_header_info,
+					stream_copy, 
+					chunk_size,
 					OFS_DELTA, 
 					REF_DELTA
 				)
@@ -20,6 +22,7 @@
 					)
 from stream import (
 						DecompressMemMapReader,
+						NullStream
 					)
 
 from struct import (
@@ -34,50 +37,61 @@
 
 def pack_object_at(data, as_stream):
 	"""
-	:return: info or stream object of the correct type according to the type 
-		of the object, REF_DELTAS will not be resolved in case a stream is desired.
-		The resulting ODeltaPackStream will have None instead of a stream. 
+	:return: tuple(num_header_bytes, PackInfo|PackStream)
+		Tuple of the number of bytes read from data until the data stream begins,
+		and an object whose class matches the type of the packed object.
+		If as_stream is True, the object will contain a stream, allowing the
+		data to be read decompressed.
 	:param data: random accessable data at which the header of an object can be read
 	:param as_stream: if True, a stream object will be returned that can read 
 		the data, otherwise you receive an info object only
 	:note: a bit redundant, but it needs to be as fast as possible !"""
 	type_id, uncomp_size, data_offset = pack_object_header_info(data)
-	
+	total_offset = None				# set later, actual offset until data stream begins
+	obj = None
 	if type_id == OFS_DELTA:
-		i = 0
+		i = data_offset
 		delta_offset = 0
 		s = 7
-		while c & 0x80:
+		while True:
 			c = ord(data[i])
-			i += 1
 			delta_offset += (c & 0x7f) << s
+			i += 1
+			if not (c & 0x80):
+				break
 			s += 7
 		# END character loop
+		total_offset = i
 		if as_stream:
-			stream = DecompressMemMapReader(buffer(data, i), False, uncomp_size)
-			return ODeltaPackStream(type_id, uncomp_size, delta_offset, stream)
+			stream = DecompressMemMapReader(buffer(data, total_offset), False, uncomp_size)
+			obj = ODeltaPackStream(type_id, uncomp_size, delta_offset, stream)
 		else:
-			return ODeltaPackInfo(type_id, uncomp_size, delta_offset)
+			obj = ODeltaPackInfo(type_id, uncomp_size, delta_offset)
 		# END handle stream
 	elif type_id == REF_DELTA:
-		ref_sha = data[:20]
+		total_offset = data_offset+20
+		ref_sha = data[data_offset:total_offset]
+		
 		if as_stream:
-			stream = DecompressMemMapReader(buffer(data, 20), False, uncomp_size)
-			return ODeltaPackStream(type_id, uncomp_size, ref_sha, stream)
+			stream = DecompressMemMapReader(buffer(data, total_offset), False, uncomp_size)
+			obj = ODeltaPackStream(type_id, uncomp_size, ref_sha, stream)
 		else:
-			return ODeltaPackInfo(type_id, uncomp_size, ref_sha)
+			obj = ODeltaPackInfo(type_id, uncomp_size, ref_sha)
 		# END handle stream
 	else:
+		total_offset = data_offset
 		# assume its a base object
 		if as_stream:
 			# if no size is given, it will read the header on first access
-			stream = DecompressMemMapReader(buffer(data, data_offset), False)
-			return OPackStream(type_id, uncomp_size, stream)
+			stream = DecompressMemMapReader(buffer(data, data_offset), False, uncomp_size)
+			obj = OPackStream(type_id, uncomp_size, stream)
 		else:
-			return OPackInfo(type_id, uncomp_size)
+			obj = OPackInfo(type_id, uncomp_size)
 		# END handle as_stream
 	# END handle type id
 	
+	return total_offset, obj
+	
 
 #} END utilities
 
@@ -267,7 +281,8 @@ class PackFile(LazyMixin):
 	__slots__ = ('_packpath', '_data', '_size', '_version')
 	
 	# offset into our data at which the first object starts
-	_first_object_offset = 3*4
+	_first_object_offset = 3*4		# header bytes
+	_footer_size = 20				# final sha
 	
 	def __init__(self, packpath):
 		self._packpath = packpath
@@ -287,16 +302,28 @@ def _set_cache_(self, attr):
 			assert self._version in (2, 3), "Cannot handle pack format version %i" % self._version
 		# END handle header
 		
-	def _iter_objects(self, start_offset, as_stream):
+	def _iter_objects(self, start_offset, as_stream=True):
 		"""Handle the actual iteration of objects within this pack"""
 		data = self._data
-		size = len(data)
+		content_size = len(data) - self._footer_size
 		cur_offset = start_offset or self._first_object_offset
 		
-		while cur_offset < size:
-			ostream = pack_object_at(buffer(data, cur_offset), True)
-			# TODO: Decompressor needs to track the size of bytes actually decompressed
+		null = NullStream()
+		while cur_offset < content_size:
+			header_offset, ostream = pack_object_at(buffer(data, cur_offset), True)
+			# scrub the stream to the end - this decompresses the object, but yields
+			# the amount of compressed bytes we need to get to the next offset
+				
+			stream_copy(ostream.read, null.write, ostream.size, chunk_size)
+			cur_offset += header_offset + ostream.stream.compressed_bytes_read()
+			
 			
+			# If a stream is requested, reset it beforehand.
+			# Otherwise return the stream object directly; it is derived from the
+			# info object.
+			if as_stream:
+				ostream.stream.seek(0)
+			yield ostream
 		# END until we have read everything
 		
 	#{ Interface
@@ -329,6 +356,15 @@ def stream(self, offset):
 		:return: OPackStream instance, the actual type differs depending on the type_id attribute"""
 		raise NotImplementedError()
 		
+	def stream_iter(self, start_offset=0):
+		""":return: iterator yielding OPackStream compatible instances, allowing 
+		to access the data in the pack directly.
+		:param start_offset: offset to the first object to iterate. If 0, iteration 
+			starts at the very first object in the pack.
+		:note: Iterating a pack directly is costly as the datastream has to be decompressed
+			to determine the bounds between the objects"""
+		return self._iter_objects(start_offset, as_stream=True)
+		
 	#} END Read-Database like Interface
 	
 	
diff --git a/stream.py b/stream.py
@@ -17,6 +17,21 @@
 
 #{ RO Streams
 
+class NullStream(object):
+	"""A stream that does nothing but provide a stream interface.
+	Use it like /dev/null"""
+	__slots__ = tuple()
+		
+	def read(self, size=0):
+		return ''
+		
+	def close(self):
+		pass
+		
+	def write(self, data):
+		return len(data)
+
+
 class DecompressMemMapReader(LazyMixin):
 	"""Reads data in chunks from a memory map and decompresses it. The client sees 
 	only the uncompressed data, respective file-like read calls are handling on-demand
@@ -144,7 +159,9 @@ def compressed_bytes_read(self):
 			self._br = self._s
 		# END handle stream scrubbing
 		
-		return self._cbr - len(self._zip.unused_data)
+		# unused data ends up in the unconsumed tail, which was removed
+		# from the count already
+		return self._cbr
 		
 	def seek(self, offset, whence=os.SEEK_SET):
 		"""Allows to reset the stream to restart reading
@@ -243,7 +260,17 @@ def read(self, size=-1):
 		
 		if dat:
 			dcompdat = dat + dcompdat
+		# END prepend our cached data
 			
+		# Depending on the compression, we may get fewer bytes than requested,
+		# as the decompressor needs the final portion of the data as well.
+		# Resolve that by recursively reading the remainder.
+		# Note: dcompdat can be empty even though we still appear to have bytes
+		# to read, if we are called by compressed_bytes_read - it manipulates
+		# us to empty the stream
+		if dcompdat and len(dcompdat) < size and self._br < self._s:
+			dcompdat += self.read(size-len(dcompdat))
+		# END handle special case
 		return dcompdat
 		
 #} END RO streams
diff --git a/test/test_pack.py b/test/test_pack.py
@@ -52,6 +52,9 @@ def _assert_pack_file(self, pack, version, size):
 		assert pack.size() == size
 		assert len(pack.checksum()) == 20
 		
+		objs = list(pack.stream_iter())
+		assert len(objs) == size
+		
 	
 	def test_pack_index(self):
 		# check version 1 and 2