Skip to content

Commit 325742c

Browse files
committed
Implemented basic info and stream retrieval as well as pack file handling of PackedDB - it's now operational. Next up is a performance test.
1 parent 5af5cd9 commit 325742c

4 files changed

Lines changed: 193 additions & 19 deletions

File tree

db/pack.py

Lines changed: 119 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,29 +4,95 @@
44
ObjectDBR
55
)
66

7+
from gitdb.util import (
8+
to_bin_sha,
9+
LazyMixin
10+
)
11+
712
from gitdb.exc import (
13+
BadObject,
814
UnsupportedOperation,
915
)
1016

17+
from gitdb.pack import PackEntity
18+
19+
import os
20+
import glob
1121
__all__ = ('PackedDB', )
1222

13-
class PackedDB(FileDBBase, ObjectDBR):
23+
24+
#{ Utilities
25+
26+
27+
class PackedDB(FileDBBase, ObjectDBR, LazyMixin):
1428
"""A database operating on a set of object packs"""
1529

30+
# sort the priority list every N queries
31+
_sort_interval = 15
32+
1633
def __init__(self, root_path):
1734
super(PackedDB, self).__init__(root_path)
35+
# list of lists with three items:
36+
# * hits - number of times the pack was hit with a request
37+
# * entity - Pack entity instance
38+
# * sha_to_index - PackIndexFile.sha_to_index method for direct cache query
39+
# self._entities = list() # lazy loaded list
40+
self._hit_count = 0 # amount of hits
41+
self._st_mtime = 0 # last modification date of our root path
42+
43+
def _set_cache_(self, attr):
44+
# currently it can only be our _entities attribute
45+
self._entities = list()
46+
self.update_pack_entity_cache()
47+
48+
def _sort_entities(self):
49+
self._entities.sort(key=lambda l: l[0], reverse=True)
50+
51+
def _pack_info(self, sha):
52+
""":return: tuple(entity, index) for an item at the given sha
53+
:param sha: 20 or 40 byte sha
54+
:raise BadObject:
55+
:note: This method is not thread-safe, but may be hit in multi-threaded
56+
operation. The worst thing that can happen though is a counter that
57+
was not incremented, or the list being in wrong order. So we save
58+
the time for locking here, let's see how that goes"""
59+
# presort ?
60+
if self._hit_count % self._sort_interval == 0:
61+
self._sort_entities()
62+
# END update sorting
63+
64+
sha = to_bin_sha(sha)
65+
for item in self._entities:
66+
index = item[2](sha)
67+
if index is not None:
68+
item[0] += 1 # one hit for you
69+
self._hit_count += 1 # general hit count
70+
return (item[1], index)
71+
# END index found in pack
72+
# END for each item
1873

74+
# no hit, see whether we have to update packs
75+
# NOTE: considering packs don't change very often, we save this call
76+
# and leave it to the super-caller to trigger that
77+
raise BadObject(sha)
1978

2079
#{ Object DB Read
2180

2281
def has_object(self, sha):
23-
raise NotImplementedError()
82+
try:
83+
self._pack_info(sha)
84+
return True
85+
except BadObject:
86+
return False
87+
# END exception handling
2488

2589
def info(self, sha):
26-
raise NotImplementedError()
90+
entity, index = self._pack_info(sha)
91+
return entity.info_at_index(index)
2792

2893
def stream(self, sha):
29-
raise NotImplementedError()
94+
entity, index = self._pack_info(sha)
95+
return entity.stream_at_index(index)
3096

3197
#} END object db read
3298

@@ -39,6 +105,55 @@ def store(self, istream):
39105
raise UnsupportedOperation()
40106

41107
def store_async(self, reader):
108+
# TODO: add ObjectDBRW before implementing this
42109
raise NotImplementedError()
43110

44111
#} END object db write
112+
113+
114+
#{ Interface
115+
116+
def update_pack_entity_cache(self, force=False):
117+
"""Update our cache with the actually existing packs on disk. Add new ones,
118+
and remove deleted ones. We keep the unchanged ones
119+
:param force: If True, the cache will be updated even though the directory
120+
does not appear to have changed according to its modification timestamp.
121+
:return: True if the packs have been updated so there is new information,
122+
False if there was no change to the pack database"""
123+
stat = os.stat(self.root_path())
124+
if not force and stat.st_mtime <= self._st_mtime:
125+
return False
126+
# END abort early on no change
127+
self._st_mtime = stat.st_mtime
128+
129+
# packs are supposed to be prefixed with pack- by git-convention
130+
# get all pack files, figure out what changed
131+
pack_files = set(glob.glob(os.path.join(self.root_path(), "pack-*.pack")))
132+
our_pack_files = set(item[1].pack().path() for item in self._entities)
133+
134+
# new packs
135+
for pack_file in (pack_files - our_pack_files):
136+
# init the hit-counter/priority with the size, a good measure for hit-
137+
# probability. It's implemented so that only 12 bytes will be read
138+
entity = PackEntity(pack_file)
139+
self._entities.append([entity.pack().size(), entity, entity.index().sha_to_index])
140+
# END for each new packfile
141+
142+
# removed packs
143+
for pack_file in (our_pack_files - pack_files):
144+
del_index = -1
145+
for i, item in enumerate(self._entities):
146+
if item[1].pack().path() == pack_file:
147+
del_index = i
148+
break
149+
# END found index
150+
# END for each entity
151+
assert del_index != -1
152+
del(self._entities[del_index])
153+
# END for each removed pack
154+
155+
# reinitialize priorities
156+
self._sort_entities()
157+
return True
158+
159+
#} END interface

pack.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040

4141
from struct import (
4242
pack,
43+
unpack,
4344
)
4445

4546
from itertools import izip
@@ -196,7 +197,7 @@ def _offset_v2(self, i):
196197
# in the 64 bit region of the file. The current offset ( lower 31 bits )
197198
# are the index into it
198199
if offset & 0x80000000:
199-
offset = unpack_from(">Q", self._data, self._pack_64_offset + (self.offset & ~0x80000000) * 8)[0]
200+
offset = unpack_from(">Q", self._data, self._pack_64_offset + (offset & ~0x80000000) * 8)[0]
200201
# END handle 64 bit offset
201202

202203
return offset
@@ -291,7 +292,7 @@ def sha_to_index(self, sha):
291292
elif not c:
292293
return mid
293294
else:
294-
lo = mid
295+
lo = mid + 1
295296
# END handle midpoint
296297
# END bisect
297298
return None
@@ -326,13 +327,15 @@ def _set_cache_(self, attr):
326327
fd = ldb.open()
327328
self._data = file_contents_ro(fd)
328329
ldb.rollback()
329-
# TODO: figure out whether we should better keep the lock, or maybe
330-
# add a .keep file instead ?
331-
else:
330+
332331
# read the header information
333332
type_id, self._version, self._size = unpack_from(">4sLL", self._data, 0)
334-
assert type_id == "PACK", "Pack file format is invalid: %r" % type_id
335-
assert self._version in (2, 3), "Cannot handle pack format version %i" % self._version
333+
334+
# TODO: figure out whether we should better keep the lock, or maybe
335+
# add a .keep file instead ?
336+
else: # must be '_size' or '_version'
337+
# read header info - we do that just with a file stream
338+
type_id, self._version, self._size = unpack(">4sLL", open(self._packpath).read(12))
336339
# END handle header
337340

338341
def _iter_objects(self, start_offset, as_stream=True):
@@ -545,14 +548,23 @@ def info(self, sha):
545548
:param sha: 20 byte sha1
546549
:raise BadObject:
547550
:return: OInfo instance, with 20 byte sha"""
548-
return self._object(sha, as_stream=False)
551+
return self._object(sha, False)
549552

550553
def stream(self, sha):
551554
"""Retrieve an object stream along with its information as identified by the given sha
552555
:param sha: 20 byte sha1
553556
:raise BadObject:
554557
:return: OStream instance, with 20 byte sha"""
555-
return self._object(sha, as_stream=True)
558+
return self._object(sha, True)
559+
560+
def info_at_index(self, index):
561+
"""As ``info``, but uses a PackIndexFile compatible index to refer to the object"""
562+
return self._object(None, False, index)
563+
564+
def stream_at_index(self, index):
565+
"""As ``stream``, but uses a PackIndexFile compatible index to refer to the
566+
object"""
567+
return self._object(None, True, index)
556568

557569
#} END Read-Database like Interface
558570

test/db/test_pack.py

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,51 @@
11
from lib import *
22
from gitdb.db import PackedDB
3-
3+
from gitdb.test.lib import fixture_path
4+
5+
import os
6+
import random
7+
48
class TestPackDB(TestDBBase):
59

610
@with_rw_directory
711
@with_packs_rw
812
def test_writing(self, path):
9-
ldb = PackedDB(path)
10-
# TODO
13+
pdb = PackedDB(path)
14+
15+
# on demand, we init our pack cache
16+
num_packs = 2
17+
assert len(pdb._entities) == num_packs
18+
assert pdb._st_mtime != 0
19+
20+
# test pack directory changed:
21+
# packs removed - rename a file, should affect the glob
22+
pack_path = pdb._entities[0][1].pack().path()
23+
new_pack_path = pack_path + "renamed"
24+
os.rename(pack_path, new_pack_path)
1125

26+
pdb.update_pack_entity_cache(force=True)
27+
assert len(pdb._entities) == num_packs - 1
28+
29+
# packs added
30+
os.rename(new_pack_path, pack_path)
31+
pdb.update_pack_entity_cache(force=True)
32+
assert len(pdb._entities) == num_packs
1233

34+
# bang on the cache
35+
# access the Entities directly, as there is no iteration interface
36+
# yet ( or required for now )
37+
sha_list = list()
38+
for entity in (item[1] for item in pdb._entities):
39+
for index in xrange(entity.index().size()):
40+
41+
sha_list.append(entity.index().sha(index))
42+
# END for each index
43+
# END for each entity
44+
45+
# hit all packs in random order
46+
random.shuffle(sha_list)
47+
48+
for sha in sha_list:
49+
info = pdb.info(sha)
50+
stream = pdb.stream(sha)
51+
# END for each sha to query

test/test_pack.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,19 @@
55
with_packs_rw,
66
fixture_path
77
)
8-
from gitdb.stream import (
9-
DeltaApplyReader
10-
)
8+
from gitdb.stream import DeltaApplyReader
119

1210
from gitdb.pack import (
1311
PackEntity,
1412
PackIndexFile,
1513
PackFile
1614
)
1715

16+
from gitdb.base import (
17+
OInfo,
18+
OStream,
19+
)
20+
1821
from gitdb.fun import delta_types
1922
from gitdb.exc import UnsupportedOperation
2023
from gitdb.util import to_bin_sha
@@ -140,6 +143,11 @@ def test_pack_entity(self):
140143
# we return fully resolved items, which is implied by the sha centric access
141144
assert not info.type_id in delta_types
142145

146+
# try all calls
147+
assert len(entity.collect_streams(info.sha))
148+
assert isinstance(entity.info(info.sha), OInfo)
149+
assert isinstance(entity.stream(info.sha), OStream)
150+
143151
# verify the stream
144152
try:
145153
assert entity.is_valid_stream(info.sha, use_crc=True)

0 commit comments

Comments
 (0)