Skip to content

Commit 4977bc5

Browse files
committed
implemented direct pack reading - currently not all information is passed on, the absolute offset into the packfile could be interesting to the caller
1 parent b6db082 commit 4977bc5

4 files changed

Lines changed: 92 additions & 26 deletions

File tree

fun.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def pack_object_header_info(data):
5353
The type_id should be interpreted according to the ``type_id_to_type_map`` map
5454
The byte-offset specifies the start of the actual zlib compressed datastream
5555
:param data: random-access memory, like a string or memory map"""
56-
c = b0 # first byte
56+
c = ord(data[0]) # first byte
5757
i = 1 # next char to read
5858
type_id = (c >> 4) & 7 # numeric type
5959
size = c & 15 # starting size
@@ -66,7 +66,7 @@ def pack_object_header_info(data):
6666
# END character loop
6767

6868
try:
69-
return (type_id_to_type_map[type_id], size)
69+
return (type_id, size, i)
7070
except KeyError:
7171
# invalid object type - we could try to be smart now and decode part
7272
# of the stream to get the info, problem is that we had trouble finding

pack.py

Lines changed: 59 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88

99
from fun import (
1010
pack_object_header_info,
11+
stream_copy,
12+
chunk_size,
1113
OFS_DELTA,
1214
REF_DELTA
1315
)
@@ -20,6 +22,7 @@
2022
)
2123
from stream import (
2224
DecompressMemMapReader,
25+
NullStream
2326
)
2427

2528
from struct import (
@@ -34,50 +37,61 @@
3437

3538
def pack_object_at(data, as_stream):
3639
"""
37-
:return: info or stream object of the correct type according to the type
38-
of the object, REF_DELTAS will not be resolved in case a stream is desired.
39-
The resulting ODeltaPackStream will have None instead of a stream.
40+
:return: tuple(num_header_bytes, PackInfo|PackStream)
41+
Tuple of number of additional bytes read from data until the data stream begins
42+
and object of the correct type according to the type of the object.
43+
If as_stream is True, the object will contain a stream, allowing the
44+
data to be read decompressed.
4045
:param data: randomly accessible data at which the header of an object can be read
4146
:param as_stream: if True, a stream object will be returned that can read
4247
the data, otherwise you receive an info object only
4348
:note: a bit redundant, but it needs to be as fast as possible !"""
4449
type_id, uncomp_size, data_offset = pack_object_header_info(data)
45-
50+
total_offset = None # set later, actual offset until data stream begins
51+
obj = None
4652
if type_id == OFS_DELTA:
47-
i = 0
53+
i = data_offset
4854
delta_offset = 0
4955
s = 7
50-
while c & 0x80:
56+
while True:
5157
c = ord(data[i])
52-
i += 1
5358
delta_offset += (c & 0x7f) << s
59+
i += 1
60+
if not (c & 0x80):
61+
break
5462
s += 7
5563
# END character loop
64+
total_offset = i
5665
if as_stream:
57-
stream = DecompressMemMapReader(buffer(data, i), False, uncomp_size)
58-
return ODeltaPackStream(type_id, uncomp_size, delta_offset, stream)
66+
stream = DecompressMemMapReader(buffer(data, total_offset), False, uncomp_size)
67+
obj = ODeltaPackStream(type_id, uncomp_size, delta_offset, stream)
5968
else:
60-
return ODeltaPackInfo(type_id, uncomp_size, delta_offset)
69+
obj = ODeltaPackInfo(type_id, uncomp_size, delta_offset)
6170
# END handle stream
6271
elif type_id == REF_DELTA:
63-
ref_sha = data[:20]
72+
total_offset = data_offset+20
73+
ref_sha = data[data_offset:total_offset]
74+
6475
if as_stream:
65-
stream = DecompressMemMapReader(buffer(data, 20), False, uncomp_size)
66-
return ODeltaPackStream(type_id, uncomp_size, ref_sha, stream)
76+
stream = DecompressMemMapReader(buffer(data, total_offset), False, uncomp_size)
77+
obj = ODeltaPackStream(type_id, uncomp_size, ref_sha, stream)
6778
else:
68-
return ODeltaPackInfo(type_id, uncomp_size, ref_sha)
79+
obj = ODeltaPackInfo(type_id, uncomp_size, ref_sha)
6980
# END handle stream
7081
else:
82+
total_offset = data_offset
7183
# assume it's a base object
7284
if as_stream:
7385
# if no size is given, it will read the header on first access
74-
stream = DecompressMemMapReader(buffer(data, data_offset), False)
75-
return OPackStream(type_id, uncomp_size, stream)
86+
stream = DecompressMemMapReader(buffer(data, data_offset), False, uncomp_size)
87+
obj = OPackStream(type_id, uncomp_size, stream)
7688
else:
77-
return OPackInfo(type_id, uncomp_size)
89+
obj = OPackInfo(type_id, uncomp_size)
7890
# END handle as_stream
7991
# END handle type id
8092

93+
return total_offset, obj
94+
8195

8296
#} END utilities
8397

@@ -267,7 +281,8 @@ class PackFile(LazyMixin):
267281
__slots__ = ('_packpath', '_data', '_size', '_version')
268282

269283
# offset into our data at which the first object starts
270-
_first_object_offset = 3*4
284+
_first_object_offset = 3*4 # header bytes
285+
_footer_size = 20 # final sha
271286

272287
def __init__(self, packpath):
273288
self._packpath = packpath
@@ -287,16 +302,28 @@ def _set_cache_(self, attr):
287302
assert self._version in (2, 3), "Cannot handle pack format version %i" % self._version
288303
# END handle header
289304

290-
def _iter_objects(self, start_offset, as_stream):
305+
def _iter_objects(self, start_offset, as_stream=True):
291306
"""Handle the actual iteration of objects within this pack"""
292307
data = self._data
293-
size = len(data)
308+
content_size = len(data) - self._footer_size
294309
cur_offset = start_offset or self._first_object_offset
295310

296-
while cur_offset < size:
297-
ostream = pack_object_at(buffer(data, cur_offset), True)
298-
# TODO: Decompressor needs to track the size of bytes actually decompressed
311+
null = NullStream()
312+
while cur_offset < content_size:
313+
header_offset, ostream = pack_object_at(buffer(data, cur_offset), True)
314+
# scrub the stream to the end - this decompresses the object, but yields
315+
# the amount of compressed bytes we need to get to the next offset
316+
317+
stream_copy(ostream.read, null.write, ostream.size, chunk_size)
318+
cur_offset += header_offset + ostream.stream.compressed_bytes_read()
319+
299320

321+
# if a stream is requested, reset it beforehand
322+
# Otherwise return the Stream object directly, it is derived from the
323+
# info object
324+
if as_stream:
325+
ostream.stream.seek(0)
326+
yield ostream
300327
# END until we have read everything
301328

302329
#{ Interface
@@ -329,6 +356,15 @@ def stream(self, offset):
329356
:return: OPackStream instance, the actual type differs depending on the type_id attribute"""
330357
raise NotImplementedError()
331358

359+
def stream_iter(self, start_offset=0):
360+
""":return: iterator yielding OPackStream compatible instances, allowing
361+
to access the data in the pack directly.
362+
:param start_offset: offset to the first object to iterate. If 0, iteration
363+
starts at the very first object in the pack.
364+
:note: Iterating a pack directly is costly as the datastream has to be decompressed
365+
to determine the bounds between the objects"""
366+
return self._iter_objects(start_offset, as_stream=True)
367+
332368
#} END Read-Database like Interface
333369

334370

stream.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,21 @@
1717

1818
#{ RO Streams
1919

20+
class NullStream(object):
21+
"""A stream that does nothing but provide a stream interface.
22+
Use it like /dev/null"""
23+
__slots__ = tuple()
24+
25+
def read(self, size=0):
26+
return ''
27+
28+
def close(self):
29+
pass
30+
31+
def write(self, data):
32+
return len(data)
33+
34+
2035
class DecompressMemMapReader(LazyMixin):
2136
"""Reads data in chunks from a memory map and decompresses it. The client sees
2237
only the uncompressed data, respective file-like read calls are handling on-demand
@@ -144,7 +159,9 @@ def compressed_bytes_read(self):
144159
self._br = self._s
145160
# END handle stream scrubbing
146161

147-
return self._cbr - len(self._zip.unused_data)
162+
# unused data ends up in the unconsumed tail, which was removed
163+
# from the count already
164+
return self._cbr
148165

149166
def seek(self, offset, whence=os.SEEK_SET):
150167
"""Allows to reset the stream to restart reading
@@ -243,7 +260,17 @@ def read(self, size=-1):
243260

244261
if dat:
245262
dcompdat = dat + dcompdat
263+
# END prepend our cached data
246264

265+
# it can happen, depending on the compression, that we get fewer bytes
266+
# than ordered as it needs the final portion of the data as well.
267+
# Recursively resolve that.
268+
# Note: dcompdat can be empty even though we still appear to have bytes
269+
# to read, if we are called by compressed_bytes_read - it manipulates
270+
# us to empty the stream
271+
if dcompdat and len(dcompdat) < size and self._br < self._s:
272+
dcompdat += self.read(size-len(dcompdat))
273+
# END handle special case
247274
return dcompdat
248275

249276
#} END RO streams

test/test_pack.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ def _assert_pack_file(self, pack, version, size):
5252
assert pack.size() == size
5353
assert len(pack.checksum()) == 20
5454

55+
objs = list(pack.stream_iter())
56+
assert len(objs) == size
57+
5558

5659
def test_pack_index(self):
5760
# check version 1 and 2

0 commit comments

Comments
 (0)