Skip to content

Commit 001f030

Browse files
committed
Initial implementation of stream validation - this is the final hurdle; if that works (which it doesn't yet for everything), then the pack reading would officially work
1 parent f4b6e27 commit 001f030

5 files changed

Lines changed: 146 additions & 40 deletions

File tree

fun.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,11 @@ def msb_size(data, offset=0):
106106
raise AssertionError("Could not find terminating MSB byte in data stream")
107107
return i+offset, size
108108

109+
def loose_object_header(type, size):
    """Build the header that precedes the content of a loose object.

    :param type: object type name, placed first in the header
    :param size: size of the content stream following the header, formatted
        as an integer
    :return: string of the form 'type SP size NULL'"""
    header = "%s %i" % (type, size)
    return header + "\0"
113+
109114
def write_object(type, size, read, write, chunk_size=chunk_size):
110115
"""Write the object as identified by type, size and source_stream into the
111116
target_stream
@@ -120,7 +125,7 @@ def write_object(type, size, read, write, chunk_size=chunk_size):
120125
tbw = 0 # total num bytes written
121126

122127
# WRITE HEADER: type SP size NULL
123-
tbw += write("%s %i\0" % (type, size))
128+
tbw += write(loose_object_header(type, size))
124129
tbw += stream_copy(read, write, size, chunk_size)
125130

126131
return tbw

pack.py

Lines changed: 104 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
BadObject,
44
)
55
from util import (
6+
zlib,
67
LockedFD,
78
LazyMixin,
89
unpack_from,
@@ -12,6 +13,7 @@
1213
from fun import (
1314
pack_object_header_info,
1415
type_id_to_type_map,
16+
write_object,
1517
stream_copy,
1618
chunk_size,
1719
delta_types,
@@ -31,14 +33,16 @@
3133
from stream import (
3234
DecompressMemMapReader,
3335
DeltaApplyReader,
36+
Sha1Writer,
3437
NullStream,
3538
)
3639

3740
from struct import (
3841
pack,
3942
)
4043

41-
__all__ = ('PackIndexFile', 'PackFile')
44+
import os
45+
__all__ = ('PackIndexFile', 'PackFile', 'PackEntity')
4246

4347

4448

@@ -237,6 +241,10 @@ def size(self):
237241
""":return: amount of objects referred to by this index"""
238242
return self._fanout_table[255]
239243

244+
def path(self):
    """Provide the location of the index file.

    :return: the value held in ``_indexpath``, i.e. the path to the
        packindexfile"""
    return self._indexpath
247+
240248
def packfile_checksum(self):
241249
""":return: 20 byte sha representing the sha1 hash of the pack file"""
242250
return self._data[-40:-20]
@@ -288,8 +296,8 @@ class PackFile(LazyMixin):
288296
__slots__ = ('_packpath', '_data', '_size', '_version')
289297

290298
# offset into our data at which the first object starts
291-
_first_object_offset = 3*4 # header bytes
292-
_footer_size = 20 # final sha
299+
first_object_offset = 3*4 # header bytes
300+
footer_size = 20 # final sha
293301

294302
def __init__(self, packpath):
295303
self._packpath = packpath
@@ -312,8 +320,8 @@ def _set_cache_(self, attr):
312320
def _iter_objects(self, start_offset, as_stream=True):
313321
"""Handle the actual iteration of objects within this pack"""
314322
data = self._data
315-
content_size = len(data) - self._footer_size
316-
cur_offset = start_offset or self._first_object_offset
323+
content_size = len(data) - self.footer_size
324+
cur_offset = start_offset or self.first_object_offset
317325

318326
null = NullStream()
319327
while cur_offset < content_size:
@@ -343,10 +351,18 @@ def version(self):
343351
""":return: the version of this pack"""
344352
return self._version
345353

354+
def data(self):
    """Provide access to the raw contents of this pack.

    :return: the object held in ``_data``. It is meant to be treated as
        read-only and offers random access.
        NOTE(review): described as usually being a memory map - confirm
        against the code that fills ``_data``"""
    return self._data
358+
346359
def checksum(self):
347360
""":return: 20 byte sha1 hash on all object sha's contained in this file"""
348361
return self._data[-20:]
349-
362+
363+
def path(self):
    """Provide the location of the pack file.

    :return: the value held in ``_packpath``, i.e. the path to the packfile"""
    return self._packpath
350366
#} END pack information
351367

352368
#{ Pack Specific
@@ -383,13 +399,13 @@ def info(self, offset):
383399
"""Retrieve information about the object at the given file-absolute offset
384400
:param offset: byte offset
385401
:return: OPackInfo instance, the actual type differs depending on the type_id attribute"""
386-
return pack_object_at(self._data, offset or self._first_object_offset, False)
402+
return pack_object_at(self._data, offset or self.first_object_offset, False)
387403

388404
def stream(self, offset):
389405
"""Retrieve an object at the given file-relative offset as stream along with its information
390406
:param offset: byte offset
391407
:return: OPackStream instance, the actual type differs depending on the type_id attribute"""
392-
return pack_object_at(self._data, offset or self._first_object_offset, True)
408+
return pack_object_at(self._data, offset or self.first_object_offset, True)
393409

394410
def stream_iter(self, start_offset=0):
395411
""":return: iterator yielding OPackStream compatible instances, allowing
@@ -403,7 +419,7 @@ def stream_iter(self, start_offset=0):
403419
#} END Read-Database like Interface
404420

405421

406-
class PackFileEntity(object):
422+
class PackEntity(object):
407423
"""Combines the PackIndexFile and the PackFile into one, allowing the
408424
actual objects to be resolved and iterated"""
409425

@@ -412,11 +428,12 @@ class PackFileEntity(object):
412428
IndexFileCls = PackIndexFile
413429
PackFileCls = PackFile
414430

415-
def __init__(self, basename):
431+
def __init__(self, pack_or_index_path):
    """Set ourselves up from the path of either the pack or the index file.

    The extension of the given path is stripped; the index is then opened at
    '<stem>.idx' and the pack at '<stem>.pack' using ``IndexFileCls`` and
    ``PackFileCls`` respectively.

    :param pack_or_index_path: path to the pack file or to its index file"""
    stem = os.path.splitext(pack_or_index_path)[0]
    self._index = self.IndexFileCls("%s.idx" % stem)    # PackIndexFile instance
    self._pack = self.PackFileCls("%s.pack" % stem)     # corresponding PackFile instance
418436

419-
420437
def _sha_to_index(self, sha):
421438
""":return: index for the given sha, or raise"""
422439
index = self._index.sha_to_index(sha)
@@ -426,12 +443,20 @@ def _sha_to_index(self, sha):
426443

427444
def _iter_objects(self, as_stream):
428445
"""Iterate over all objects in our index and yield their OInfo or OStream instences"""
429-
raise NotImplementedError()
430-
431-
def _object(self, sha, as_stream):
432-
""":return: OInfo or OStream object providing information about the given sha"""
446+
indexfile = self._index
447+
_object = self._object
448+
for index in xrange(indexfile.size()):
449+
sha = indexfile.sha(index)
450+
yield _object(sha, as_stream, index)
451+
# END for each index
452+
453+
def _object(self, sha, as_stream, index=-1):
454+
""":return: OInfo or OStream object providing information about the given sha
455+
:param index: if not -1, its assumed to be the sha's index in the IndexFile"""
433456
# its a little bit redundant here, but it needs to be efficient
434-
offset = self._index.offset(self._sha_to_index(sha))
457+
if index < 0:
458+
index = self._sha_to_index(sha)
459+
offset = self._index.offset(index)
435460
type_id, uncomp_size, data_rela_offset = pack_object_header_info(buffer(self._pack._data, offset))
436461
if as_stream:
437462
if type_id not in delta_types:
@@ -447,7 +472,7 @@ def _object(self, sha, as_stream):
447472
offset, src_size = msb_size(buf)
448473
offset, target_size = msb_size(buf, offset)
449474

450-
streams[0].seek(0) # assure it can be read by the delta reader
475+
streams[0].stream.seek(0) # assure it can be read by the delta reader
451476
dstream = DeltaApplyReader.new(streams)
452477

453478
return OStream(sha, dstream.type, target_size, dstream)
@@ -476,20 +501,79 @@ def info(self, sha):
476501
"""Retrieve information about the object identified by the given sha
477502
:param sha: 20 byte sha1
478503
:raise BadObject:
479-
:return: OInfo instance"""
504+
:return: OInfo instance, with 20 byte sha"""
480505
return self._object(sha, as_stream=False)
481506

482507
def stream(self, sha):
483508
"""Retrieve an object stream along with its information as identified by the given sha
484509
:param sha: 20 byte sha1
485510
:raise BadObject:
486-
:return: OStream instance"""
511+
:return: OStream instance, with 20 byte sha"""
487512
return self._object(sha, as_stream=True)
488513

489514
#} END Read-Database like Interface
490515

491516
#{ Interface
492-
517+
518+
def pack(self):
    """Provide the pack half of this entity.

    :return: the pack file instance held in ``_pack``"""
    return self._pack
521+
522+
def index(self):
    """Provide the index half of this entity.

    :return: the pack index file instance held in ``_index``"""
    return self._index
525+
526+
def is_valid_stream(self, sha, use_crc=False):
    """Verify that the stream at the given sha is valid.

    :param sha: 20 byte sha1 of the object whose stream to verify
    :param use_crc: if True, the crc stored in the index for the sha is
        compared with a crc computed over the object's bytes in the pack,
        from the object's offset up to the offset of the object stored
        right behind it (or up to the pack footer for the last object).
        If the object is a delta, this only verifies that the delta's data
        is valid, not the data of the actual undeltified object, as it
        depends on more than just this stream.
        If False, the object is decompressed and written in loose object
        form into a sha1 writer. The resulting sha must match the given one
    :return: True if the stream is valid, False otherwise
    :raise BadObject: sha was not found
    :note: No index version check is made here.
        NOTE(review): version 1 indices are expected to be rejected with
        UnsupportedOperation - confirm what ``crc`` does for them"""
    if not use_crc:
        shawriter = Sha1Writer()
        stream = self._object(sha, as_stream=True)
        # write a loose object, which is the basis for the sha
        write_object(stream.type, stream.size, stream.read, shawriter.write)
        return shawriter.sha(as_hex=False) == sha
    # END handle sha verification

    indexfile = self._index
    index = self._sha_to_index(sha)
    offset = indexfile.offset(index)
    pack_data = self._pack.data()

    # The index is ordered by sha, not by pack offset, so entry index+1 is
    # not the neighbour of our object within the pack. The object ends where
    # the closest following object starts, or at the footer if it is the last.
    # This scan is linear in the number of objects; callers validating a
    # whole pack may want to cache a sorted offset table instead.
    next_offset = len(pack_data) - self._pack.footer_size
    remaining = indexfile.size()
    while remaining:
        remaining -= 1
        candidate = indexfile.offset(remaining)
        if offset < candidate < next_offset:
            next_offset = candidate
        # END keep closest following offset
    # END for each index entry

    # create the current crc value, on the compressed object data
    # Read it in chunks, without copying the data
    crc_update = zlib.crc32
    this_crc_value = 0
    cur_pos = offset
    while cur_pos < next_offset:
        rbound = min(cur_pos + chunk_size, next_offset)
        size = rbound - cur_pos
        # crc32 returns the updated checksum, it must be fed back in
        this_crc_value = crc_update(buffer(pack_data, cur_pos, size), this_crc_value)
        cur_pos += size
    # END window size loop

    # crc32 may yield signed 32 bit numbers, force both sides into unsigned mode
    mask = 0xffffffff
    return (this_crc_value & mask) == (indexfile.crc(index) & mask)
576+
493577
def info_iter(self):
494578
""":return: Iterator over all objects in this pack. The iterator yields
495579
OInfo instances"""

stream.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -474,7 +474,7 @@ class Sha1Writer(object):
474474
__slots__ = "sha1"
475475

476476
def __init__(self):
477-
self.sha1 = make_sha("")
477+
self.sha1 = make_sha()
478478

479479
#{ Stream Interface
480480

test/test_pack.py

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,16 @@
1010
)
1111

1212
from gitdb.pack import (
13+
PackEntity,
1314
PackIndexFile,
1415
PackFile
1516
)
17+
18+
from gitdb.fun import (
19+
delta_types,
20+
)
1621
from gitdb.util import to_bin_sha
22+
from itertools import izip
1723
import os
1824

1925

@@ -84,7 +90,7 @@ def _assert_pack_file(self, pack, version, size):
8490
# END get deltastream
8591

8692
# read all
87-
assert len(dstream.read())
93+
assert len(dstream.read()) == dstream.size
8894

8995
# read chunks
9096
# NOTE: the current implementation is safe, it basically transfers
@@ -109,8 +115,34 @@ def test_pack(self):
109115
# END for each pack to test
110116

111117
def test_pack_entity(self):
112-
# TODO:
113-
pass
118+
for packinfo, indexinfo in ( (self.packfile_v2_1, self.packindexfile_v1),
119+
(self.packfile_v2_2, self.packindexfile_v2)):
120+
packfile, version, size = packinfo
121+
indexfile, version, size = indexinfo
122+
print packfile
123+
entity = PackEntity(packfile)
124+
assert entity.pack().path() == packfile
125+
assert entity.index().path() == indexfile
126+
127+
count = 0
128+
for info, stream in izip(entity.info_iter(), entity.stream_iter()):
129+
count += 1
130+
assert info.sha == stream.sha
131+
assert len(info.sha) == 20
132+
assert info.type_id == stream.type_id
133+
assert info.size == stream.size
134+
135+
# we return fully resolved items, which is implied by the sha centric access
136+
assert not info.type_id in delta_types
137+
138+
# verify the stream
139+
print info
140+
assert entity.is_valid_stream(info.sha, use_crc=True)
141+
#assert entity.is_valid_stream(info.sha, use_crc=False)
142+
# END for each info, stream tuple
143+
assert count == size
144+
145+
# END for each entity
114146

115147
def test_pack_64(self):
116148
# TODO: hex-edit a pack helping us to verify that we can handle 64 byte offsets

util.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -79,21 +79,6 @@ def make_sha(source=''):
7979
sha1 = sha.sha(source)
8080
return sha1
8181

82-
def stream_copy(source, destination, chunk_size=512*1024):
83-
"""Copy all data from the source stream into the destination stream in chunks
84-
of size chunk_size
85-
86-
:return: amount of bytes written"""
87-
br = 0
88-
while True:
89-
chunk = source.read(chunk_size)
90-
destination.write(chunk)
91-
br += len(chunk)
92-
if len(chunk) < chunk_size:
93-
break
94-
# END reading output stream
95-
return br
96-
9782
def allocate_memory(size):
9883
""":return: a file-protocol accessible memory block of the given size"""
9984
try:

0 commit comments

Comments
 (0)