Skip to content

Commit ecb1878

Browse files
committed
CRC verification already works for all packs. SHA-1 verification still needs some work, probably for deltified objects; that is where it will show whether we did it all correctly ;)
1 parent 001f030 commit ecb1878

2 files changed

Lines changed: 67 additions & 22 deletions

File tree

pack.py

Lines changed: 58 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Contains PackIndexFile and PackFile implementations"""
22
from gitdb.exc import (
3-
BadObject,
3+
BadObject,
4+
UnsupportedOperation
45
)
56
from util import (
67
zlib,
@@ -41,6 +42,8 @@
4142
pack,
4243
)
4344

45+
from itertools import izip
46+
import array
4447
import os
4548
__all__ = ('PackIndexFile', 'PackFile', 'PackEntity')
4649

@@ -253,6 +256,21 @@ def indexfile_checksum(self):
253256
""":return: 20 byte sha representing the sha1 hash of this index file"""
254257
return self._data[-20:]
255258

259+
def offsets(self):
    """:return: sequence of all offsets in the order in which they were written
    :note: return value can be random accessed, but may be immutable
    :note: for version 2 indices the values are returned exactly as they are
        stored in the table of 4 byte entries located between ``_pack_offset``
        and ``_pack_64_offset``"""
    if self._version == 2:
        # local import keeps this block self-contained
        import struct

        # The table holds 4 byte unsigned integers in network (big endian)
        # byte order. Decoding with an explicit '>I' format is independent of
        # the host's endianness and of the native size of 'unsigned int',
        # unlike reading into a native array and swapping unconditionally.
        num_offsets = (self._pack_64_offset - self._pack_offset) // 4
        return struct.unpack_from(">%dI" % num_offsets, self._data, self._pack_offset)
    else:
        # no contiguous offset table is read here, query entry by entry
        return tuple(self.offset(index) for index in xrange(self.size()))
    # END handle version
273+
256274
def sha_to_index(self, sha):
257275
"""
258276
:return: index usable with the ``offset`` or ``entry`` method, or None
@@ -419,11 +437,14 @@ def stream_iter(self, start_offset=0):
419437
#} END Read-Database like Interface
420438

421439

422-
class PackEntity(object):
440+
class PackEntity(LazyMixin):
423441
"""Combines the PackIndexFile and the PackFile into one, allowing the
424442
actual objects to be resolved and iterated"""
425443

426-
__slots__ = ('_index', '_pack')
444+
__slots__ = ( '_index', # our index file
445+
'_pack', # our pack file
446+
'_offset_map' # on demand dict mapping one offset to the next consecutive one
447+
)
427448

428449
IndexFileCls = PackIndexFile
429450
PackFileCls = PackFile
@@ -433,6 +454,28 @@ def __init__(self, pack_or_index_path):
433454
basename, ext = os.path.splitext(pack_or_index_path)
434455
self._index = self.IndexFileCls("%s.idx" % basename) # PackIndexFile instance
435456
self._pack = self.PackFileCls("%s.pack" % basename) # corresponding PackFile instance
457+
458+
def _set_cache_(self, attr):
    """Build the on-demand ``_offset_map`` cache.

    The map associates every offset reported by the index with the next larger
    offset. The largest offset maps to ``len(pack data) - footer_size``, i.e.
    the position right behind the last entry.

    :param attr: name of the attribute to fill in. Currently this can only be
        ``_offset_map``, which is why the value is not inspected
    :note: an index without any offsets results in an empty map"""
    starts = sorted(self._index.offsets())
    last_offset = len(self._pack.data()) - self._pack.footer_size

    # Pair each offset with its successor; the final offset has no successor
    # and is paired with the end of the object data instead. This handles
    # zero, one and many offsets uniformly.
    ends = starts[1:]
    ends.append(last_offset)
    self._offset_map = dict(zip(starts, ends))
436479

437480
def _sha_to_index(self, sha):
438481
""":return: index for the given sha, or raise"""
@@ -537,33 +580,31 @@ def is_valid_stream(self, sha, use_crc=False):
537580
:raise UnsupportedOperation: If the index is version 1 only
538581
:raise BadObject: sha was not found"""
539582
if use_crc:
583+
if self._index.version() < 2:
584+
raise UnsupportedOperation("Version 1 indices do not contain crc's, verify by sha instead")
585+
# END handle index version
586+
540587
index = self._sha_to_index(sha)
541588
offset = self._index.offset(index)
542-
pack_data = self._pack.data()
543-
next_index = min(self._index.size()-1, index+1)
544-
next_offset = 0
545-
if next_index == index:
546-
next_offset = len(pack_data) - self._pack.footer_size
547-
else:
548-
next_offset = self._index.offset(next_index)
549-
# END get next offset
589+
next_offset = self._offset_map[offset]
550590
crc_value = self._index.crc(index)
551591

552-
this_crc_value = 0
553-
crc_update = zlib.crc32
554-
555592
# create the current crc value, on the compressed object data
556593
# Read it in chunks, without copying the data
594+
crc_update = zlib.crc32
595+
pack_data = self._pack.data()
557596
cur_pos = offset
597+
this_crc_value = 0
558598
while cur_pos < next_offset:
559599
rbound = min(cur_pos + chunk_size, next_offset)
560600
size = rbound - cur_pos
561-
crc_update(buffer(pack_data, cur_pos, size), this_crc_value)
601+
this_crc_value = crc_update(buffer(pack_data, cur_pos, size), this_crc_value)
562602
cur_pos += size
563603
# END window size loop
564604

565-
assert this_crc_value == crc_value
566-
return this_crc_value == crc_value
605+
# crc returns signed 32 bit numbers, the AND op forces it into unsigned
606+
# mode ... wow, sneaky, from dulwich.
607+
return (this_crc_value & 0xffffffff) == crc_value
567608
else:
568609
shawriter = Sha1Writer()
569610
stream = self._object(sha, as_stream=True)

test/test_pack.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,8 @@
1515
PackFile
1616
)
1717

18-
from gitdb.fun import (
19-
delta_types,
20-
)
18+
from gitdb.fun import delta_types
19+
from gitdb.exc import UnsupportedOperation
2120
from gitdb.util import to_bin_sha
2221
from itertools import izip
2322
import os
@@ -42,6 +41,7 @@ def _assert_index_file(self, index, version, size):
4241
assert len(index.indexfile_checksum()) == 20
4342
assert index.version() == version
4443
assert index.size() == size
44+
assert len(index.offsets()) == size
4545

4646
# get all data of all objects
4747
for oidx in xrange(index.size()):
@@ -137,8 +137,12 @@ def test_pack_entity(self):
137137

138138
# verify the stream
139139
print info
140-
assert entity.is_valid_stream(info.sha, use_crc=True)
141-
#assert entity.is_valid_stream(info.sha, use_crc=False)
140+
try:
141+
assert entity.is_valid_stream(info.sha, use_crc=True)
142+
except UnsupportedOperation:
143+
pass
144+
# END ignore version issues
145+
assert entity.is_valid_stream(info.sha, use_crc=False)
142146
# END for each info, stream tuple
143147
assert count == size
144148

0 commit comments

Comments
 (0)