Skip to content

Commit ca82364

Browse files
committed
Implemented offset based pack object collection including test, next up is the actual stream delta handling
1 parent 4977bc5 commit ca82364

5 files changed

Lines changed: 191 additions & 75 deletions

File tree

base.py

Lines changed: 39 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@
55
zlib
66
)
77

8-
from fun import type_id_to_type_map
8+
from fun import (
9+
type_id_to_type_map,
10+
type_to_type_id_map
11+
)
912

1013
__all__ = ('OInfo', 'OPackInfo', 'ODeltaPackInfo',
1114
'OStream', 'OPackStream', 'ODeltaPackStream',
@@ -41,6 +44,10 @@ def sha(self):
4144
@property
4245
def type(self):
4346
return self[1]
47+
48+
@property
49+
def type_id(self):
50+
return type_to_type_id_map[self[1]]
4451

4552
@property
4653
def size(self):
@@ -50,46 +57,58 @@ def size(self):
5057

5158
class OPackInfo(tuple):
5259
"""As OInfo, but provides a type_id property to retrieve the numerical type id, and
53-
does not include a sha"""
60+
does not include a sha.
61+
62+
Additionally, the pack_offset is the absolute offset into the packfile at which
63+
all object information is located. The data_offset property points to the absolute
64+
location in the pack at which the actual data stream can be found."""
5465
__slots__ = tuple()
5566

56-
def __new__(cls, type, size):
57-
return tuple.__new__(cls, (type, size))
67+
def __new__(cls, packoffset, dataoffset, type, size):
68+
return tuple.__new__(cls, (packoffset, dataoffset, type, size))
5869

5970
def __init__(self, *args):
6071
tuple.__init__(self)
6172

6273
#{ Interface
6374

75+
@property
76+
def pack_offset(self):
77+
return self[0]
78+
79+
@property
80+
def data_offset(self):
81+
return self[1]
82+
6483
@property
6584
def type(self):
66-
return type_id_to_type_map[self[0]]
85+
return type_id_to_type_map[self[2]]
6786

6887
@property
6988
def type_id(self):
70-
return self[0]
89+
return self[2]
7190

7291
@property
7392
def size(self):
74-
return self[1]
93+
return self[3]
7594

7695
#} END interface
7796

7897

7998
class ODeltaPackInfo(OPackInfo):
8099
"""Adds delta specific information,
81100
Either the 20 byte sha which points to some object in the database,
82-
or the base_offset, being an offset into the pack at which our base
83-
can be found"""
101+
or the negative offset from the pack_offset, so that pack_offset - delta_info yields
102+
the pack offset of the base object"""
84103
__slots__ = tuple()
85104

86-
def __new__(cls, type, size, delta_info):
87-
return tuple.__new__(cls, (type, size, delta_info))
105+
def __new__(cls, packoffset, dataoffset, type, size, delta_info):
106+
return tuple.__new__(cls, (packoffset, dataoffset, type, size, delta_info))
88107

89108
#{ Interface
90109
@property
91110
def delta_info(self):
92-
return self[2]
111+
return self[4]
93112
#} END interface
94113

95114

@@ -123,35 +142,35 @@ class OPackStream(OPackInfo):
123142
is provided"""
124143
__slots__ = tuple()
125144

126-
def __new__(cls, type, size, stream, *args):
145+
def __new__(cls, packoffset, dataoffset, type, size, stream, *args):
127146
"""Helps with the initialization of subclasses"""
128-
return tuple.__new__(cls, (type, size, stream))
147+
return tuple.__new__(cls, (packoffset, dataoffset, type, size, stream))
129148

130149
#{ Stream Reader Interface
131150
def read(self, size=-1):
132-
return self[2].read(size)
151+
return self[4].read(size)
133152

134153
@property
135154
def stream(self):
136-
return self[2]
155+
return self[4]
137156
#} END stream reader interface
138157

139158

140159
class ODeltaPackStream(ODeltaPackInfo):
141160
"""Provides a stream outputting the uncompressed offset delta information"""
142161
__slots__ = tuple()
143162

144-
def __new__(cls, type, size, delta_info, stream):
145-
return tuple.__new__(cls, (type, size, delta_info, stream))
163+
def __new__(cls, packoffset, dataoffset, type, size, delta_info, stream):
164+
return tuple.__new__(cls, (packoffset, dataoffset, type, size, delta_info, stream))
146165

147166

148167
#{ Stream Reader Interface
149168
def read(self, size=-1):
150-
return self[3].read(size)
169+
return self[5].read(size)
151170

152171
@property
153172
def stream(self):
154-
return self[3]
173+
return self[5]
155174
#} END stream reader interface
156175

157176

fun.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,15 @@
2424
REF_DELTA : "REF_DELTA" # REFERENCE DELTA
2525
}
2626

27+
type_to_type_id_map = dict(
28+
commit=1,
29+
tree=2,
30+
blob=3,
31+
tag=4,
32+
OFS_DELTA=OFS_DELTA,
33+
REF_DELTA=REF_DELTA
34+
)
35+
2736
# used when dealing with larger streams
2837
chunk_size = 1000*1000
2938

pack.py

Lines changed: 107 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
"""Contains PackIndexFile and PackFile implementations"""
2+
from gitdb.exc import (
3+
BadObject,
4+
)
25
from util import (
36
LockedFD,
47
LazyMixin,
@@ -31,67 +34,68 @@
3134

3235
__all__ = ('PackIndexFile', 'PackFile')
3336

37+
_delta_types = (OFS_DELTA, REF_DELTA)
3438

3539

3640
#{ Utilities
3741

38-
def pack_object_at(data, as_stream):
42+
def pack_object_at(data, offset, as_stream):
3943
"""
40-
:return: tuple(num_header_bytes, PackInfo|PackStream)
41-
Tuple of number of additional bytes read from data until the data stream begins
42-
and object of the correct type according to the type of the object.
44+
:return: PackInfo|PackStream
45+
an object of the correct type according to the type_id of the object.
4346
If as_stream is True, the object will contain a stream, allowing the
4447
data to be read decompressed.
45-
:param data: random accessable data at which the header of an object can be read
48+
:param data: randomly accessible data containing all required information
49+
:param offset: offset into the data at which the object information is located
4650
:param as_stream: if True, a stream object will be returned that can read
47-
the data, otherwise you receive an info object only
48-
:note: a bit redundant, but it needs to be as fast as possible !"""
49-
type_id, uncomp_size, data_offset = pack_object_header_info(data)
50-
total_offset = None # set later, actual offset until data stream begins
51-
obj = None
51+
the data, otherwise you receive an info object only"""
52+
ldata = len(data) # debug
53+
data = buffer(data, offset)
54+
type_id, uncomp_size, data_rela_offset = pack_object_header_info(data)
55+
total_rela_offset = None # set later, actual offset until data stream begins
56+
delta_info = None
57+
58+
# OFFSET DELTA
5259
if type_id == OFS_DELTA:
53-
i = data_offset
54-
delta_offset = 0
55-
s = 7
56-
while True:
60+
i = data_rela_offset
61+
c = ord(data[i])
62+
i += 1
63+
delta_offset = c & 0x7f
64+
while c & 0x80:
5765
c = ord(data[i])
58-
delta_offset += (c & 0x7f) << s
5966
i += 1
60-
if not (c & 0x80):
61-
break
62-
s += 7
67+
delta_offset += 1
68+
delta_offset = (delta_offset << 7) + (c & 0x7f)
6369
# END character loop
64-
total_offset = i
65-
if as_stream:
66-
stream = DecompressMemMapReader(buffer(data, total_offset), False, uncomp_size)
67-
obj = ODeltaPackStream(type_id, uncomp_size, delta_offset, stream)
68-
else:
69-
obj = ODeltaPackInfo(type_id, uncomp_size, delta_offset)
70-
# END handle stream
70+
delta_info = delta_offset
71+
total_rela_offset = i
72+
# REF DELTA
7173
elif type_id == REF_DELTA:
72-
total_offset = data_offset+20
73-
ref_sha = data[data_offset:total_offset]
74-
75-
if as_stream:
76-
stream = DecompressMemMapReader(buffer(data, total_offset), False, uncomp_size)
77-
obj = ODeltaPackStream(type_id, uncomp_size, ref_sha, stream)
78-
else:
79-
obj = ODeltaPackInfo(type_id, uncomp_size, ref_sha)
80-
# END handle stream
74+
total_rela_offset = data_rela_offset+20
75+
ref_sha = data[data_rela_offset:total_rela_offset]
76+
delta_info = ref_sha
77+
# BASE OBJECT
8178
else:
82-
total_offset = data_offset
8379
# assume it's a base object
84-
if as_stream:
85-
# if no size is given, it will read the header on first access
86-
stream = DecompressMemMapReader(buffer(data, data_offset), False, uncomp_size)
87-
obj = OPackStream(type_id, uncomp_size, stream)
88-
else:
89-
obj = OPackInfo(type_id, uncomp_size)
90-
# END handle as_stream
80+
total_rela_offset = data_rela_offset
9181
# END handle type id
9282

93-
return total_offset, obj
94-
83+
abs_data_offset = offset + total_rela_offset
84+
if as_stream:
85+
stream = DecompressMemMapReader(buffer(data, total_rela_offset), False, uncomp_size)
86+
if delta_info is None:
87+
return OPackStream(offset, abs_data_offset, type_id, uncomp_size, stream)
88+
else:
89+
return ODeltaPackStream(offset, abs_data_offset, type_id, uncomp_size, delta_info, stream)
90+
else:
91+
if delta_info is None:
92+
return OPackInfo(offset, abs_data_offset, type_id, uncomp_size)
93+
else:
94+
return ODeltaPackInfo(offset, abs_data_offset, type_id, uncomp_size, delta_info)
95+
# END handle info
96+
# END handle stream
97+
98+
9599

96100
#} END utilities
97101

@@ -310,12 +314,12 @@ def _iter_objects(self, start_offset, as_stream=True):
310314

311315
null = NullStream()
312316
while cur_offset < content_size:
313-
header_offset, ostream = pack_object_at(buffer(data, cur_offset), True)
317+
ostream = pack_object_at(data, cur_offset, True)
314318
# scrub the stream to the end - this decompresses the object, but yields
315319
# the amount of compressed bytes we need to get to the next offset
316320

317321
stream_copy(ostream.read, null.write, ostream.size, chunk_size)
318-
cur_offset += header_offset + ostream.stream.compressed_bytes_read()
322+
cur_offset += (ostream.data_offset - ostream.pack_offset) + ostream.stream.compressed_bytes_read()
319323

320324

321325
# if a stream is requested, reset it beforehand
@@ -326,7 +330,7 @@ def _iter_objects(self, start_offset, as_stream=True):
326330
yield ostream
327331
# END until we have read everything
328332

329-
#{ Interface
333+
#{ Pack Information
330334

331335
def size(self):
332336
""":return: The amount of objects stored in this pack"""
@@ -340,21 +344,72 @@ def checksum(self):
340344
""":return: 20 byte sha1 hash on all object sha's contained in this file"""
341345
return self._data[-20:]
342346

343-
#} END interface
347+
#} END pack information
348+
349+
#{ Pack Specific
350+
351+
def collect_streams(self, offset):
352+
"""
353+
:return: list of pack streams which are required to build the object
354+
at the given offset. The first entry of the list is the object at offset,
355+
the last one is either a full object, or a REF_Delta stream. The latter
356+
type needs its reference object to be looked up in an ODB to form a valid
357+
delta chain.
358+
:param offset: specifies the first byte of the object within this pack"""
359+
out = list()
360+
while True:
361+
ostream = pack_object_at(self._data, offset, True)
362+
out.append(ostream)
363+
if ostream.type_id == OFS_DELTA:
364+
offset = ostream.pack_offset - ostream.delta_info
365+
else:
366+
# the only thing we can lookup are OFFSET deltas. Everything
367+
# else is either an object, or a ref delta, in the latter
368+
# case someone else has to find it
369+
break
370+
# END handle type
371+
# END while chaining streams
372+
return out
373+
374+
def to_delta_stream(self, stream_list):
375+
"""Convert the given list of streams into a stream which resolves deltas
376+
(if available) when reading from it.
377+
:param stream_list: one or more stream objects. If the first stream is a Delta,
378+
there must be at least two streams in the list. The list's last stream
379+
must be a non-delta stream.
380+
:return: Non-Delta OPackStream object whose stream can be used to obtain
381+
the decompressed resolved data
382+
:raise ValueError: if the stream list cannot be handled due to a missing base object"""
383+
if len(stream_list) == 1:
384+
if stream_list[0].type_id in _delta_types:
385+
raise ValueError("Cannot resolve deltas if only one stream is given", stream_list[0].type)
386+
# it's an object, no need to resolve anything
387+
return stream_list[0]
388+
# END single object special handling
389+
390+
if stream_list[-1].type_id in _delta_types:
391+
raise ValueError("Cannot resolve deltas if there is no base object stream, last one was type: %s" % stream_list[-1].type)
392+
# END check stream
393+
394+
# just create the respective stream wrapper
395+
raise NotImplementedError()
396+
397+
398+
#} END pack specific
344399

345400
#{ Read-Database like Interface
346401

347402
def info(self, offset):
348403
"""Retrieve information about the object at the given file-absolute offset
349404
:param offset: byte offset
350405
:return: OPackInfo instance, the actual type differs depending on the type_id attribute"""
351-
raise NotImplementedError()
406+
return pack_object_at(self._data, offset or self._first_object_offset, False)
352407

353408
def stream(self, offset):
354409
"""Retrieve an object at the given file-absolute offset as stream along with its information
355410
:param offset: byte offset
356411
:return: OPackStream instance, the actual type differs depending on the type_id attribute"""
357-
raise NotImplementedError()
412+
return pack_object_at(self._data, offset or self._first_object_offset, True)
358413

359414
def stream_iter(self, start_offset=0):
360415
""":return: iterator yielding OPackStream compatible instances, allowing
@@ -390,12 +445,14 @@ def _iter_objects(self, as_stream):
390445
def info(self, sha):
391446
"""Retrieve information about the object identified by the given sha
392447
:param sha: 20 byte sha1
448+
:raise BadObject:
393449
:return: OInfo instance"""
394450
raise NotImplementedError()
395451

396452
def stream(self, sha):
397453
"""Retrieve an object stream along with its information as identified by the given sha
398454
:param sha: 20 byte sha1
455+
:raise BadObject:
399456
:return: OStream instance"""
400457
raise NotImplementedError()
401458

0 commit comments

Comments
 (0)