initial version of delta-apply, but more pedantic testing is required

Byron · Byron · commit 6a4eee20486e · 2010-06-17T11:42:42.000+02:00
diff --git a/fun.py b/fun.py
@@ -9,6 +9,7 @@
 from util import zlib
 decompressobj = zlib.decompressobj
 
+import mmap
 
 # INVARIANTS
 OFS_DELTA = 6
@@ -34,7 +35,7 @@
 						)
 
 # used when dealing with larger streams
-chunk_size = 1000*1000
+chunk_size = 1000*mmap.PAGESIZE
 
 __all__ = ('is_loose_object', 'loose_object_header_info', 'object_header_info', 
 			'write_object' )
@@ -83,6 +84,26 @@ def pack_object_header_info(data):
 		raise BadObjectType(type_id)
 	# END handle exceptions
 	
+def msb_size(data, offset=0):
+	"""Read one msb-encoded size from random access data: each byte carries 7 bits,
+	least significant group first, and a clear high bit marks the last byte.
+	:param offset: absolute index into data of the first byte to read
+	:return: tuple(end_offset, size), end_offset being the absolute index of the
+		first byte after the size
+	:raise AssertionError: if data ends before a terminating byte was found"""
+	size = 0
+	i = 0
+	l = len(data)
+	while i+offset < l:		# bound the absolute index, not only the relative one
+		c = ord(data[i+offset])
+		size |= (c & 0x7f) << i*7
+		i += 1
+		if not c & 0x80:
+			return i+offset, size
+		# END check msb bit
+	# END while in range
+	raise AssertionError("Could not find terminating MSB byte in data stream")
+	
 def write_object(type, size, read, write, chunk_size=chunk_size):
 	"""Write the object as identified by type, size and source_stream into the 
 	target_stream
@@ -111,14 +132,78 @@ def stream_copy(read, write, size, chunk_size):
 	# WRITE ALL DATA UP TO SIZE
 	while True:
 		cs = min(chunk_size, size-dbw)
-		data_len = write(read(cs))
+		# NOTE: not all write methods return the amount of written bytes, like
+		# mmap.write. Its bad, but we just deal with it ... perhaps its not 
+		# even less efficient
+		# data_len = write(read(cs))
+		# dbw += data_len
+		data = read(cs)
+		data_len = len(data)
 		dbw += data_len
+		write(data)
 		if data_len < cs or dbw == size:
 			break
 		# END check for stream end
 	# END duplicate data
 	return dbw
 	
 	
+def apply_delta_data(src_buf, src_buf_size, delta_buf, delta_buf_size, target_file):
+	"""Replay the commands in delta_buf, writing the resulting data to target_file
+	:param src_buf: random access data from which the delta was created
+	:param src_buf_size: size of the source buffer in bytes
+	:param delta_buf: random access delta data
+	:param delta_buf_size: size of the delta buffer in bytes
+	:param target_file: file like object to write the result to
+	:raise ValueError: if the delta data contains the invalid opcode 0
+	:note: transcribed to python from the similar routine in patch-delta.c"""
+	i = 0
+	twrite = target_file.write
+	db = delta_buf
+	while i < delta_buf_size:
+		c = ord(db[i])
+		i += 1
+		if c & 0x80:
+			cp_off, cp_size = 0, 0
+			if (c & 0x01):
+				cp_off = ord(db[i])
+				i += 1
+			if (c & 0x02):
+				cp_off |= (ord(db[i]) << 8)
+				i += 1
+			if (c & 0x04):
+				cp_off |= (ord(db[i]) << 16)
+				i += 1
+			if (c & 0x08):
+				cp_off |= (ord(db[i]) << 24)
+				i += 1
+			if (c & 0x10):
+				cp_size = ord(db[i])
+				i += 1
+			if (c & 0x20):
+				cp_size |= (ord(db[i]) << 8)
+				i += 1
+			if (c & 0x40):
+				cp_size |= (ord(db[i]) << 16)
+				i += 1
+			# an encoded copy size of 0 stands for 0x10000 bytes
+			if not cp_size: 
+				cp_size = 0x10000
+			# out-of-range copy: abort, the assert below fires unless this was the last opcode
+			if (cp_off + cp_size < cp_size or
+			    cp_off + cp_size > src_buf_size):
+				break
+			twrite(src_buf[cp_off:cp_off+cp_size])
+		elif c:
+			twrite(db[i:i+c])
+			i += c
+		else:
+			raise ValueError("unexpected delta opcode 0")
+		# END handle command byte
+	# END while processing delta data
+	
+	# yes, lets use the exact same error message that git uses :)
+	assert i == delta_buf_size, "delta replay has gone wild"
+	
 #} END routines
 
diff --git a/stream.py b/stream.py
@@ -4,7 +4,14 @@
 import mmap
 import os
 
+from fun import (
+					msb_size,
+					stream_copy, 
+					apply_delta_data
+				)
+
 from util import (
+		allocate_memory,
 		LazyMixin,
 		make_sha,
 		write, 
@@ -300,9 +307,11 @@ class DeltaApplyReader(LazyMixin):
 	 * cmd == 0 - invalid operation ( or error in delta stream )
 	"""
 	__slots__ = (
-					"_streams",				# tuple of our stream objects
-					"_readers",				# list of read methods from our streams
+					"_bstream",				# base stream to which to apply the deltas
+					"_dstreams",			# tuple of delta stream readers
 					"_mm_target",			# memory map of the delta-applied data
+					"_size",				# actual number of bytes in _mm_target
+					"_br"					# number of bytes read 
 				)
 	
 	def __init__(self, stream_list):
@@ -311,31 +320,81 @@ def __init__(self, stream_list):
 		base object onto which to apply the deltas"""
 		assert len(stream_list) > 1, "Need at least one delta and one base stream"
 		
-		self._streams = tuple(stream_list)
-		self._readers = None					# TODO
+		self._bstream = stream_list[-1]
+		self._dstreams = tuple(stream_list[:-1])
+		self._br = 0
 		
 	def _set_cache_(self, attr):
 		"""If we are here, we apply the actual deltas"""
 		# fill in delta info structures, providing the source and target buffer
 		# sizes.
+		buffer_offset_list = list()
+		final_target_size = None
+		max_target_size = 0
+		for dstream in self._dstreams:
+			buf = dstream.read(512)			# read the header information + X
+			offset, src_size = msb_size(buf)
+			offset, target_size = msb_size(buf, offset)
+			if final_target_size is None:
+				final_target_size = target_size
+			# END set final target size
+			buffer_offset_list.append((buffer(buf, offset), offset))
+			max_target_size = max(max_target_size, target_size)
+		# END for each delta stream
+		
+		# sanity check - the first delta to apply should have the same source
+		# size as our actual base stream
+		base_size = self._bstream.size
+		target_size = max_target_size
+		
+		# if we have more than 1 delta to apply, we will swap buffers, hence we must
+		# assure that all buffers we use are large enough to hold all the results
+		if len(self._dstreams) > 1:
+			base_size = target_size = max(base_size, max_target_size)
+		# END adjust buffer sizes
+			
 		
 		# Allocate private memory map big enough to hold the first base buffer
-		# It can be swapped out if it is too large. We need random access to it
+		# We need random access to it
+		bbuf = allocate_memory(base_size)
 		
 		# allocate memory map large enough for the largest (intermediate) target
 		# We will use it as scratch space for all delta ops. If the final 
 		# target buffer is smaller than our allocated space, we just use parts
-		# of it
+		# of it upon return.
+		tbuf = allocate_memory(target_size)
 		
 		# for each delta to apply, memory map the decompressed delta and 
 		# work on the op-codes to reconstruct everything.
 		# For the actual copying, we use a seek and write pattern of buffer
 		# slices.
-		
-		# NOTE: on py pre 2.5, all memory maps must actually be some kind 
-		# of memory buffer,like StringIO ( ouch ;) )
-		
-		
+		for (dbuf, offset), dstream in reversed(zip(buffer_offset_list, self._dstreams)):
+			# allocate a buffer to hold all delta data - fill in the data for 
+			# fast access. We do this as we know that reading individual bytes
+			# from our stream would be slower than necessary ( although possible )
+			# The dbuf buffer contains commands after the first two MSB sizes, the
+			# offset specifies the amount of bytes read to get the sizes.
+			ddata = allocate_memory(dstream.size - offset)
+			ddata.write(dbuf)
+			# read the rest from the stream. The size we give is larger than necessary
+			stream_copy(dstream.read, ddata.write, dstream.size, 256*mmap.PAGESIZE)
+			
+			################################################################
+			apply_delta_data(bbuf, len(bbuf), ddata, len(ddata), tbuf)
+			################################################################
+			
+			# finally, swap out source and target buffers. The target is now the 
+			# base for the next delta to apply
+			bbuf, tbuf = tbuf, bbuf
+			bbuf.seek(0)
+			tbuf.seek(0)
+		# END for each delta to apply
+		
+		# its already seeked to 0, constrain it to the actual size
+		# NOTE: in the end of the loop, it swaps buffers, hence our target buffer
+		# is not tbuf, but bbuf !
+		self._mm_target = bbuf
+		self._size = final_target_size
 		
 		# TODO: Once that works, figure out the ordering of the opcodes. If they
 		# are always in-order/sequential, an alternate implementation could 
@@ -344,10 +403,21 @@ def _set_cache_(self, attr):
 		# concatenated opcode list which indicates what to copy from which delta
 		# to which position. This preprocessing would allow true streaming
 		
-	def read(self, size=0):
-		# pass the call to our lazy-loaded delta-applied data
-		return self._mm_target.read(size) 
-
+	def read(self, count=0):
+		"""Read up to count bytes; a count below 1 or beyond the rest reads all that is left"""
+		remaining = self._size - self._br
+		count = count if 1 <= count <= remaining else remaining
+		chunk = self._mm_target.read(count)
+		self._br += len(chunk)
+		return chunk
+		
+	def seek(self, offset, whence=os.SEEK_SET):
+		"""Rewind to the start of the delta-applied data so it can be read again
+		:raise ValueError: unless offset is 0 and whence is os.SEEK_SET"""
+		if offset != 0 or whence != os.SEEK_SET:
+			raise ValueError("Can only seek to position 0")
+		self._br = 0					# read() computes the bytes left from this counter
+		self._mm_target.seek(0)			# and reads from the target's current position
 #} END RO streams
 
 
diff --git a/test/test_pack.py b/test/test_pack.py
@@ -78,12 +78,12 @@ def _assert_pack_file(self, pack, version, size):
 				continue
 			# END get deltastream
 			
-			# TODO: TestStream._assert_stream_reader does that already, should 
-			# be used instead
 			# read all
-			dstream.read()
+			assert len(dstream.read())
 			
 			# read chunks
+			# NOTE: the current implementation is safe, it basically transfers
+			# all calls to the underlying memory map
 			
 		# END for each object
 		assert num_obj == size
diff --git a/util.py b/util.py
@@ -94,6 +94,19 @@ def stream_copy(source, destination, chunk_size=512*1024):
 	# END reading output stream
 	return br
 
+def allocate_memory(size):
+	""":return: a readable and writable file-like memory block of the given size"""
+	try:
+		return mmap.mmap(-1, size)	# anonymous map, read-write by default
+	except EnvironmentError:
+		# real memory instead; StringIO(string) would be read-only, so fill a writable one
+		mem = cStringIO.StringIO()
+		mem.write("\0"*size)
+		mem.seek(0)
+		return mem
+	# END handle memory allocation
+	
+
 def file_contents_ro(fd, stream=False, allow_mmap=True):
 	""":return: read-only contents of the file represented by the file descriptor fd
 	:param fd: file descriptor opened for reading