Skip to content

Commit 84c4e5a

Browse files
committed
initial research on possible delta-apply algorithms. True streaming appears to be possible only if delta opcodes access only sequential memory, but through mmaps, it should still be possible to obtain decent performance even on big files
1 parent ca82364 commit 84c4e5a

3 files changed

Lines changed: 86 additions & 3 deletions

File tree

pack.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@
2525
)
2626
from stream import (
2727
DecompressMemMapReader,
28-
NullStream
28+
DeltaApplyReader,
29+
NullStream,
2930
)
3031

3132
from struct import (
@@ -49,7 +50,6 @@ def pack_object_at(data, offset, as_stream):
4950
:param offset: offset into the data at which the object information is located
5051
:param as_stream: if True, a stream object will be returned that can read
5152
the data, otherwise you receive an info object only"""
52-
ldata = len(data) # debug
5353
data = buffer(data, offset)
5454
type_id, uncomp_size, data_rela_offset = pack_object_header_info(data)
5555
total_rela_offset = None # set later, actual offset until data stream begins
@@ -392,7 +392,7 @@ def to_delta_stream(self, stream_list):
392392
# END check stream
393393

394394
# just create the respective stream wrapper
395-
raise NotImplementedError()
395+
return DeltaApplyReader(stream_list)
396396

397397

398398
#} END pack specific

stream.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,82 @@ def read(self, size=-1):
272272
dcompdat += self.read(size-len(dcompdat))
273273
# END handle special case
274274
return dcompdat
275+
276+
277+
class DeltaApplyReader(LazyMixin):
278+
"""A reader which dynamically applies pack deltas to a base object, keeping the
279+
memory demands to a minimum.
280+
281+
The size of the final object is only obtainable once all deltas have been
282+
applied, unless it is retrieved from a pack index.
283+
284+
The uncompressed Delta has the following layout (MSB being a most significant
285+
bit encoded dynamic size):
286+
287+
* MSB Source Size - the size of the base against which the delta was created
288+
* MSB Target Size - the size of the resulting data after the delta was applied
289+
* A list of one byte commands (cmd) which are followed by a specific protocol:
290+
291+
* cmd & 0x80 - copy base_data[offset:offset+size]
292+
293+
* Followed by an encoded offset into the base (source) data
294+
* Followed by an encoded size of the chunk to copy
295+
296+
* cmd & 0x7f - insert
297+
298+
* insert cmd bytes from the delta buffer into the output stream
299+
300+
* cmd == 0 - invalid operation ( or error in delta stream )
301+
"""
302+
__slots__ = (
303+
"_streams", # tuple of our stream objects
304+
"_readers", # list of read methods from our streams
305+
"_mm_target", # memory map of the delta-applied data
306+
)
307+
308+
def __init__(self, stream_list):
309+
"""Initialize this instance with a list of streams, the first stream being
310+
the delta to apply on top of all following deltas, the last stream being the
311+
base object onto which to apply the deltas"""
312+
assert len(stream_list) > 1, "Need at least one delta and one base stream"
313+
314+
self._streams = tuple(stream_list)
315+
self._readers = None # TODO
316+
317+
def _set_cache_(self, attr):
318+
"""If we are here, we apply the actual deltas"""
319+
# fill in delta info structures, providing the source and target buffer
320+
# sizes.
275321

322+
# Allocate private memory map big enough to hold the first base buffer
323+
# It can be swapped out if it is too large. We need random access to it
324+
325+
# allocate memory map large enough for the largest (intermediate) target
326+
# We will use it as scratch space for all delta ops. If the final
327+
# target buffer is smaller than our allocated space, we just use parts
328+
# of it
329+
330+
# for each delta to apply, memory map the decompressed delta and
331+
# work on the op-codes to reconstruct everything.
332+
# For the actual copying, we use a seek and write pattern of buffer
333+
# slices.
334+
335+
# NOTE: on py pre 2.5, all memory maps must actually be some kind
336+
# of memory buffer, like StringIO ( ouch ;) )
337+
338+
339+
340+
# TODO: Once that works, figure out the ordering of the opcodes. If they
341+
# are always in-order/sequential, an alternate implementation could
342+
# use stream access only. Of course this would mean we would read
343+
# all deltas in advance, analyse the opcode ranges to determine a final
344+
# concatenated opcode list which indicates what to copy from which delta
345+
# to which position. This preprocessing would allow true streaming
346+
347+
def read(self, size=0):
348+
# pass the call to our lazy-loaded delta-applied data
349+
return self._mm_target.read(size)
350+
276351
#} END RO streams
277352

278353

test/test_pack.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,14 @@ def _assert_pack_file(self, pack, version, size):
7777
# which we haven't resolved ( as we are without an index )
7878
continue
7979
# END get deltastream
80+
81+
# TODO: TestStream._assert_stream_reader does that already, should
82+
# be used instead
83+
# read all
84+
dstream.read()
85+
86+
# read chunks
87+
8088
# END for each object
8189
assert num_obj == size
8290

0 commit comments

Comments
 (0)