Skip to content

Commit 8ab9b4f

Browse files
committed
PackedDB: added sha_iter and size methods, these should move to the ObjectDBR actually
Added performance test, packed stream reading still runs into errors, which is interesting as it dealt with the sample packs very well before
1 parent 325742c commit 8ab9b4f

4 files changed

Lines changed: 84 additions & 15 deletions

File tree

db/pack.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,9 @@ class PackedDB(FileDBBase, ObjectDBR, LazyMixin):
2828
"""A database operating on a set of object packs"""
2929

3030
# sort the priority list every N queries
31-
_sort_interval = 15
31+
# Higher values are better, performance tests don't show this has
32+
# any effect, but it should have one
33+
_sort_interval = 500
3234

3335
def __init__(self, root_path):
3436
super(PackedDB, self).__init__(root_path)
@@ -156,4 +158,19 @@ def update_pack_entity_cache(self, force=False):
156158
self._sort_entities()
157159
return True
158160

161+
def sha_iter(self):
    """Return iterator yielding 20 byte shas for the packed objects in this database

    Entities are visited in the order they currently have in ``self._entities``;
    within one pack, shas are produced by ascending index position."""
    for entity in (item[1] for item in self._entities):
        pack_index = entity.index()
        # bind the lookup method once per pack so the inner loop stays tight
        sha_by_index = pack_index.sha
        # use a dedicated loop variable instead of rebinding the index object
        for position in xrange(pack_index.size()):
            yield sha_by_index(position)
        # END for each index
    # END for each entity
171+
172+
def size(self):
    """:return: amount of packed objects in this database

    Adds up the object counts reported by the index of every pack entity
    and yields 0 if there are no entities at all."""
    # sum() starts at 0, so an empty entity list is handled; reduce() without
    # an initial value would raise a TypeError in that case
    return sum(item[1].index().size() for item in self._entities)
159176
#} END interface

test/db/test_pack.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,8 @@ def test_writing(self, path):
3434
# bang on the cache
3535
# access the Entities directly, as there is no iteration interface
3636
# yet ( or required for now )
37-
sha_list = list()
38-
for entity in (item[1] for item in pdb._entities):
39-
for index in xrange(entity.index().size()):
40-
41-
sha_list.append(entity.index().sha(index))
42-
# END for each index
43-
# END for each entity
37+
sha_list = list(pdb.sha_iter())
38+
assert len(sha_list) == pdb.size()
4439

4540
# hit all packs in random order
4641
random.shuffle(sha_list)

test/performance/lib.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def setUpAll(cls):
4444
except AttributeError:
4545
pass
4646
cls.gitrepopath = resolve_or_fail(k_env_git_repo)
47+
assert cls.gitrepopath.endswith('.git')
4748

4849

4950
#} END base classes

test/performance/test_db.py

Lines changed: 63 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,71 @@
11
"""Performance tests for object store"""
2+
from lib import (
3+
TestBigRepoR
4+
)
5+
6+
from gitdb.db.pack import PackedDB
27

38
import sys
9+
import os
410
from time import time
5-
6-
from lib import (
7-
TestBigRepoR
8-
)
11+
import random
912

1013
class TestGitDBPerformance(TestBigRepoR):
1114

12-
def test_random_access(self):
13-
pass
14-
# TODO: use the actual db for this
15+
def test_pack_random_access(self):
    """Time sha iteration, sha lookup and info/stream retrieval of a PackedDB
    opened on the ``objects/pack`` directory below ``self.gitrepopath``.

    All timings are written to stderr. The only hard requirement checked here
    is that the database holds more than one pack entity."""
    pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))
    assert len(pdb._entities) > 1

    # sha lookup
    st = time()
    sha_list = list(pdb.sha_iter())
    elapsed = time() - st
    ns = len(sha_list)
    print >> sys.stderr, "PDB: looked up %i shas by index in %f s ( %f shas/s )" % (ns, elapsed, ns / elapsed)

    # sha lookup: best-case and worst case access
    pdb_pack_info = pdb._pack_info
    access_times = list()
    for rand in range(2):
        # first pass uses the order produced by sha_iter, second a shuffled one
        if rand:
            random.shuffle(sha_list)
        # END shuffle shas
        st = time()
        for sha in sha_list:
            pdb_pack_info(sha)
        # END for each sha to look up
        elapsed = time() - st
        access_times.append(elapsed)

        # discard cache
        # NOTE(review): deleting and re-reading the attribute presumably makes
        # the LazyMixin base rebuild the entity list - confirm
        del(pdb._entities)
        pdb._entities
        print >> sys.stderr, "PDB: looked up %i sha (random=%i) in %f s ( %f shas/s )" % (ns, rand, elapsed, ns / elapsed)
    # END for each random mode
    elapsed_order, elapsed_rand = access_times

    # well, it's never really sequential regarding the memory patterns, but it
    # shows how well the priority cache performs
    print >> sys.stderr, "PDB: sequential access is %f %% faster than random-access" % (100 - ((elapsed_order / elapsed_rand) * 100))

    # query info and streams only
    max_items = 10000       # can wait longer when testing memory
    # the repository may hold fewer objects than max_items, so report the
    # amount of shas actually queried instead of the requested maximum
    query_shas = sha_list[:max_items]
    num_queried = len(query_shas)
    for pdb_fun in (pdb.info, pdb.stream):
        st = time()
        for sha in query_shas:
            pdb_fun(sha)
        elapsed = time() - st
        print >> sys.stderr, "PDB: Obtained %i object %s by sha in %f s ( %f info/s )" % (num_queried, pdb_fun.__name__.upper(), elapsed, num_queried / elapsed)
    # END for each function

    # retrieve stream and read all
    max_items = 5000
    read_shas = sha_list[:max_items]
    num_read = len(read_shas)
    pdb_stream = pdb.stream
    st = time()
    for sha in read_shas:
        stream = pdb_stream(sha)
        stream.read()
    elapsed = time() - st
    print >> sys.stderr, "PDB: Obtained %i streams by sha and read all bytes in %f s ( %f info/s )" % (num_read, elapsed, num_read / elapsed)

0 commit comments

Comments
 (0)