Skip to content

Commit 099ec0d

Browse files
committed
Index reading from V2 index files implemented and tested.
Added LazyMixin type from git-python
1 parent f50643f commit 099ec0d

8 files changed

Lines changed: 297 additions & 13 deletions

File tree

fun.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,3 +113,4 @@ def stream_copy(read, write, size, chunk_size):
113113

114114

115115
#} END routines
116+

pack.py

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,191 @@
11
"""Contains PackIndex and PackFile implementations"""
2+
from util import (
3+
LockedFD,
4+
LazyMixin,
5+
file_contents_ro,
6+
unpack_from
7+
)
8+
9+
from struct import (
10+
pack,
11+
)
12+
13+
__all__ = ('PackIndex', 'Pack')
14+
15+
16+
class PackIndex(LazyMixin):
17+
"""A pack index provides offsets into the corresponding pack, allowing to find
18+
locations for offsets faster."""
19+
20+
# Don't use slots: we dynamically bind functions for each version, which requires an instance dict
21+
# The slots you see here are just to keep track of our instance variables
22+
# __slots__ = ('_indexpath', '_fanout_table', '_data', '_version',
23+
# '_sha_list_offset', '_crc_list_offset', '_pack_offset', '_pack_64_offset')
24+
25+
# used in v2 indices
26+
_sha_list_offset = 8 + 1024
27+
28+
def __init__(self, indexpath):
    """Remember the location of the index file; nothing is read yet.

    :param indexpath: path to the pack index file, stored as ``_indexpath``"""
    super(PackIndex, self).__init__()
    # the file behind this path is only opened once ``_data`` is requested
    self._indexpath = indexpath
31+
32+
def _set_cache_(self, attr):
    """Lazily create the attribute named ``attr`` on first access.

    * ``_packfile_checksum``: the 20 bytes preceding the last 20 bytes of the data
    * ``_indexfile_checksum``: the last 20 bytes of the data
    * ``_data``: contents of the index file at ``_indexpath``
    * anything else: determine the index version, bind the version specific
      ``entry``, ``offset``, ``sha`` and ``crc`` functions and read the
      fanout table and offsets through ``_initialize``

    :param attr: name of the attribute that was requested but not yet set"""
    if attr == "_packfile_checksum":
        self._packfile_checksum = self._data[-40:-20]
    elif attr == "_indexfile_checksum":
        # this branch previously tested for "_packfile_checksum" a second time,
        # which made it unreachable and left the index checksum uncached
        self._indexfile_checksum = self._data[-20:]
    elif attr == "_data":
        lfd = LockedFD(self._indexpath)
        fd = lfd.open()
        try:
            self._data = file_contents_ro(fd)
        finally:
            # release the descriptor even if reading the contents failed
            lfd.rollback()
        # END assure release
    else:
        # now its time to initialize everything - if we are here, someone wants
        # to access the fanout table or related properties

        # CHECK VERSION
        # v2 files start with a magic header, v1 files start with the fanout table
        self._version = (self._data[:4] == '\377tOc' and 2) or 1
        if self._version == 2:
            version_id = unpack_from(">L", self._data, 4)[0]
            assert version_id == self._version, "Unsupported index version: %i" % version_id
        # END assert version

        # SETUP FUNCTIONS
        # setup our functions according to the actual version
        for fname in ('entry', 'offset', 'sha', 'crc'):
            setattr(self, fname, getattr(self, "_%s_v%i" % (fname, self._version)))
        # END for each function to initialize

        # INITIALIZE DATA
        # byte offset is 8 if version is 2, 0 otherwise
        self._initialize()
    # END handle attributes
64+
65+
66+
#{ Access V1
67+
68+
def _entry_v1(self, i):
    """:return: tuple(offset, binsha) of the entry at index i

    Each v1 entry is 24 bytes long (4 byte offset followed by a 20 byte sha)
    and the entries start after the 1024 byte fanout table.

    :param i: index of the entry"""
    # return the whole unpacked tuple - indexing it with [0] only yielded the offset
    return unpack_from(">L20s", self._data, 1024 + i*24)
71+
72+
def _offset_v1(self, i):
    """:return: the 4 byte big-endian offset stored at the start of entry i,
        see ``_offset_v2``
    :param i: index of the entry"""
    # entries are 24 bytes each and follow the 1024 byte fanout table
    entry_start = 1024 + i * 24
    return unpack_from(">L", self._data, entry_start)[0]
75+
76+
def _sha_v1(self, i):
    """:return: 20 byte sha of the entry at index i, see ``_sha_v2``
    :param i: index of the entry"""
    # every 24 byte entry starts with the 4 byte pack offset, the sha follows it.
    # Without skipping these 4 bytes the result would contain the offset and
    # miss the last 4 bytes of the sha
    base = 1024 + i*24 + 4
    return self._data[base:base+20]
80+
81+
def _crc_v1(self, i):
    """Unsupported for version 1 indices.

    :return: always 0
    :param i: index of the entry, ignored"""
    return 0
84+
85+
#} END access V1
86+
87+
#{ Access V2
88+
def _entry_v2(self, i):
    """:return: tuple(offset, binsha, crc) assembled from the individual
        version 2 accessors
    :param i: index of the entry"""
    pack_offset = self._offset_v2(i)
    binsha = self._sha_v2(i)
    crc = self._crc_v2(i)
    return (pack_offset, binsha, crc)
91+
92+
def _offset_v2(self, i):
    """:return: 32 or 64 bit offset into the pack file. 64 bit offsets will only
        be returned if the pack is larger than 4 GiB, or 2^32
    :param i: index of the entry"""
    offset = unpack_from(">L", self._data, self._pack_offset + i * 4)[0]

    # if the high-bit is set, this indicates that we have to lookup the offset
    # in the 64 bit region of the file. The current offset ( lower 31 bits )
    # are the index into it
    if offset & 0x80000000:
        # mask the local value - ``self.offset`` is the bound accessor function,
        # using it in the bit operation raised a TypeError
        index_64 = offset & 0x7fffffff
        offset = unpack_from(">Q", self._data, self._pack_64_offset + index_64 * 8)[0]
    # END handle 64 bit offset

    return offset
105+
106+
def _sha_v2(self, i):
    """:return: the 20 byte sha stored at position i of the sha list
    :param i: index of the entry"""
    start = self._sha_list_offset + i * 20
    end = start + 20
    return self._data[start:end]
110+
111+
def _crc_v2(self, i):
    """:return: crc of the object at index i, read as a 4 byte big-endian integer
    :param i: index of the entry"""
    crc_position = self._crc_list_offset + i * 4
    return unpack_from(">L", self._data, crc_position)[0]
114+
115+
#} END access V2
116+
117+
#{ Initialization
118+
119+
def _initialize(self):
    """Read the fanout table and, for version 2 indices, compute where the
    crc list, the offset list and the 64 bit offset list begin."""
    # the fanout table starts after the 8 byte header in v2, at 0 otherwise
    fanout_start = 0
    if self._version == 2:
        fanout_start = 8
    self._fanout_table = self._read_fanout(fanout_start)

    if self._version != 2:
        return
    # END skip v1

    # the lists follow each other: shas (20 bytes each), crcs (4), offsets (4)
    num_objects = self.size
    crc_start = self._sha_list_offset + num_objects * 20
    offset_start = crc_start + num_objects * 4
    self._crc_list_offset = crc_start
    self._pack_offset = offset_start
    self._pack_64_offset = offset_start + num_objects * 4
128+
129+
def _read_fanout(self, byte_offset):
    """:return: list of the 256 fanout entries, each read as a 4 byte
        big-endian integer
    :param byte_offset: position in our data at which the table starts"""
    data = self._data
    return [unpack_from('>L', data, byte_offset + slot * 4)[0] for slot in range(256)]
138+
139+
#} END initialization
140+
141+
#{ Properties
142+
@property
def version(self):
    """:return: version of the index file as determined when it was read, 1 or 2"""
    return self._version
145+
146+
@property
def size(self):
    """:return: amount of objects referred to by this index, taken from the
        last entry of the fanout table"""
    fanout = self._fanout_table
    return fanout[255]
150+
151+
@property
def packfile_checksum(self):
    """:return: 20 byte sha representing the sha1 hash of the pack file, stored
        right before the trailing 20 bytes of the index data"""
    data = self._data
    return data[-40:-20]
155+
156+
@property
def indexfile_checksum(self):
    """:return: 20 byte sha representing the sha1 hash of this index file, which
        are the trailing 20 bytes of the index data"""
    data = self._data
    return data[-20:]
160+
161+
def sha_to_index(self, sha):
    """
    :return: index usable with the ``offset`` or ``entry`` method, or None
        if the sha was not found in this pack index
    :param sha: 20 byte sha to lookup"""
    # slicing instead of indexing yields a length-1 string on py2 and py3 alike
    first_byte = ord(sha[:1])

    # the fanout table holds the amount of shas whose first byte is <= its index,
    # hence the previous entry is where shas with our first byte begin
    lo = 0                  # lower index, the left bound of the bisection
    if first_byte != 0:
        lo = self._fanout_table[first_byte-1]
    hi = self._fanout_table[first_byte]     # the upper, right bound of the bisection

    # bisect until we have the sha
    get_sha = self.sha
    while lo < hi:
        mid = (lo + hi) // 2
        mid_sha = get_sha(mid)
        if sha < mid_sha:
            hi = mid
        elif sha == mid_sha:
            return mid
        else:
            # mid was checked already, exclude it. Keeping it as lower bound
            # never terminated if the sha was not contained in the index
            lo = mid + 1
        # END handle midpoint
    # END bisect
    return None
185+
186+
#} END properties
187+
188+
189+
class Pack(LazyMixin):
    """Represents a pack, a file written according to the Version 2 format
    for git packs"""
191+

stream.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,9 @@ def read(self, size=-1):
361361
if win_size < 8:
362362
self._cwe = self._cws + 8
363363
# END adjust winsize
364-
indata = self._m[self._cws:self._cwe] # another copy ... :(
364+
365+
# takes a slice, but doesn't copy the data, it says ...
366+
indata = buffer(self._m, self._cws, self._cwe - self._cws)
365367

366368
# get the actual window end to be sure we don't use it for computations
367369
self._cwe = self._cws + len(indata)

test/db/lib.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Base classes for object db testing"""
22
from gitdb.test.lib import (
33
with_rw_directory,
4-
with_packs,
4+
with_packs_rw,
55
ZippedStoreShaWriter,
66
TestBase
77
)
@@ -20,7 +20,7 @@
2020
from cStringIO import StringIO
2121

2222

23-
__all__ = ('TestDBBase', 'with_rw_directory', 'with_packs' )
23+
__all__ = ('TestDBBase', 'with_rw_directory', 'with_packs_rw' )
2424

2525
class TestDBBase(TestBase):
2626
"""Base class providing testing routines on databases"""

test/db/test_pack.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
class TestPackDB(TestDBBase):
55

66
@with_rw_directory
7-
@with_packs
7+
@with_packs_rw
88
def test_writing(self, path):
99
ldb = PackedDB(path)
1010
# TODO

test/lib.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def wrapper(self):
4747
return wrapper
4848

4949

50-
def with_packs(func):
50+
def with_packs_rw(func):
5151
"""Function that provides a path into which the packs for testing should be
5252
copied. Will pass on the path to the actual function afterwards"""
5353
def wrapper(self, path):

test/test_pack.py

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,38 @@
11
"""Test everything about packs reading and writing"""
2-
32
from lib import (
43
TestBase,
54
with_rw_directory,
6-
with_packs
5+
with_packs_rw,
6+
fixture_path
77
)
8-
8+
from gitdb.pack import (
9+
PackIndex
10+
)
11+
import os
12+
913

1014
class TestPack(TestBase):
1115

12-
@with_rw_directory
13-
@with_packs
14-
def test_reading(self, pack_dir):
15-
# initialze a pack file for reading
16-
pass
16+
def test_pack_index(self):
    """Read a version 2 index fixture and verify that all accessors agree
    with each other for every object it contains."""
    # read v2 index information
    idx_path = fixture_path('packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.idx')
    pack_index = PackIndex(idx_path)

    assert pack_index.packfile_checksum != pack_index.indexfile_checksum
    assert pack_index.version == 2
    assert pack_index.size == 30

    # get all data of all objects
    for position in xrange(pack_index.size):
        binsha = pack_index.sha(position)
        # looking up the sha must lead back to where it came from
        assert position == pack_index.sha_to_index(binsha)

        info = pack_index.entry(position)
        assert len(info) == 3

        # the combined entry has to match the individual accessors
        assert info[0] == pack_index.offset(position)
        assert info[1] == binsha
        assert info[2] == pack_index.crc(position)
    # END for each object index in indexfile
37+
38+

util.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import binascii
22
import os
3+
import mmap
34
import sys
45
import errno
6+
import cStringIO
57

68
try:
79
import async.mod.zlib as zlib
@@ -16,6 +18,22 @@
1618
except ImportError:
1719
import sha
1820

21+
try:
    from struct import unpack_from
except ImportError:
    # struct does not provide unpack_from here, emulate it using unpack
    from struct import unpack, calcsize
    __calcsize_cache = dict()

    def unpack_from(fmt, data, offset=0):
        """:return: tuple of values unpacked from data according to fmt,
            starting at the given offset
        :param fmt: struct format string
        :param data: sliceable buffer to read from
        :param offset: position in data at which unpacking starts"""
        # the size of each format is computed only once
        size = __calcsize_cache.get(fmt)
        if size is None:
            size = calcsize(fmt)
            __calcsize_cache[fmt] = size
        # END handle cache miss
        return unpack(fmt, data[offset : offset + size])
    # END own unpack_from implementation
35+
36+
1937
#{ Globals
2038

2139
# A pool distributing tasks, initially with zero threads, hence everything
@@ -76,6 +94,28 @@ def stream_copy(source, destination, chunk_size=512*1024):
7694
# END reading output stream
7795
return br
7896

97+
def file_contents_ro(fd, stream=False, allow_mmap=True):
    """:return: read-only contents of the file represented by the file descriptor fd
    :param fd: file descriptor opened for reading
    :param stream: if False, random access is provided, otherwise the stream interface
        is provided.
    :param allow_mmap: if True, its allowed to map the contents into memory, which
        allows large files to be handled and accessed efficiently. The file-descriptor
        will change its position if this is False or if the file could not be mapped"""
    if allow_mmap:
        try:
            # supports stream and random access
            return mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
        except (mmap.error, EnvironmentError, ValueError):
            # Mapping is an optimization only. mmap reports failures as
            # mmap.error / EnvironmentError depending on the python version, and
            # refuses empty files with a ValueError - read manually in these cases
            pass
        # END exception handling
    # END handle mmap

    # read manually
    contents = os.read(fd, os.fstat(fd).st_size)
    if stream:
        return cStringIO.StringIO(contents)
    return contents
118+
79119
def to_hex_sha(sha):
80120
""":return: hexified version of sha"""
81121
if len(sha) == 40:
@@ -93,6 +133,35 @@ def to_bin_sha(sha):
93133

94134
#{ Utilities
95135

136+
class LazyMixin(object):
    """
    Mixin which creates attribute values lazily, the first time they are accessed.

    Python only calls ``__getattr__`` if the normal lookup failed, so once
    ``_set_cache_`` stored a value on the instance ( in its dict or a slot ),
    later accesses return that value directly.
    """
    __slots__ = ()

    def __getattr__(self, attr):
        """
        Called for attributes which are not set yet: ask ``_set_cache_`` to
        create the attribute, then return whatever it stored.
        """
        self._set_cache_(attr)
        # raises AttributeError if _set_cache_ did not create the attribute
        return object.__getattribute__(self, attr)

    def _set_cache_(self, attr):
        """Hook to be overridden in derived classes.

        Implementations set the attribute named by attr on the instance if
        they know how to create it, and may set any number of additional
        attributes at the same time. The default implementation does nothing."""
        return None
164+
96165

97166
class FDStreamWrapper(object):
98167
"""A simple wrapper providing the most basic functions on a file descriptor

0 commit comments

Comments
 (0)