11
22from cStringIO import StringIO
33import errno
4+ import mmap
5+ import os
46
57from util import (
68 LazyMixin ,
1315__all__ = ('DecompressMemMapReader' , 'FDCompressedSha1Writer' )
1416
1517
16- # ZLIB configuration
17- # used when compressing objects - 1 to 9 ( slowest )
18- Z_BEST_SPEED = 1
19-
2018#{ RO Streams
2119
2220class DecompressMemMapReader (LazyMixin ):
@@ -36,7 +34,8 @@ class DecompressMemMapReader(LazyMixin):
3634 times we actually allocate. An own zlib implementation would be good here
3735 to better support streamed reading - it would only need to keep the mmap
3836 and decompress it into chunks, thats all ... """
39- __slots__ = ('_m' , '_zip' , '_buf' , '_buflen' , '_br' , '_cws' , '_cwe' , '_s' , '_close' )
37+ __slots__ = ('_m' , '_zip' , '_buf' , '_buflen' , '_br' , '_cws' , '_cwe' , '_s' , '_close' ,
38+ '_cbr' , '_phi' )
4039
4140 max_read_size = 512 * 1024 # currently unused
4241
@@ -52,6 +51,8 @@ def __init__(self, m, close_on_deletion, size=None):
5251 self ._br = 0 # num uncompressed bytes read
5352 self ._cws = 0 # start byte of compression window
5453 self ._cwe = 0 # end byte of compression window
54+ self ._cbr = 0 # number of compressed bytes read
55+ self ._phi = False # is True if we parsed the header info
5556 self ._close = close_on_deletion # close the memmap on deletion ?
5657
5758 def _set_cache_ (self , attr ):
@@ -85,6 +86,8 @@ def _parse_header_info(self):
8586 self ._buf = StringIO (hdr [hdrend :])
8687 self ._buflen = len (hdr ) - hdrend
8788
89+ self ._phi = True
90+
8891 return type , size
8992
9093 @classmethod
@@ -98,7 +101,55 @@ def new(self, m, close_on_deletion=False):
98101 inst = DecompressMemMapReader (m , close_on_deletion , 0 )
99102 type , size = inst ._parse_header_info ()
100103 return type , size , inst
104+
105+ def compressed_bytes_read (self ):
106+ """:return: number of compressed bytes read. This includes the bytes it
107+ took to decompress the header ( if there was one )"""
108+ # ABSTRACT: When decompressing a byte stream, it can be that the first
109+ # x bytes which were requested match the first x bytes in the loosely
110+ # compressed datastream. This is the worst-case assumption that the reader
111+ # does, it assumes that it will get at least X bytes from X compressed bytes
112+ # in call cases.
113+ # The caveat is that the object, according to our known uncompressed size,
114+ # is already complete, but there are still some bytes left in the compressed
115+ # stream that contribute to the amount of compressed bytes.
116+ # How can we know that we are truly done, and have read all bytes we need
117+ # to read ?
118+ # Without help, we cannot know, as we need to obtain the status of the
119+ # decompression. If it is not finished, we need to decompress more data
120+ # until it is finished, to yield the actual number of compressed bytes
121+ # belonging to the decompressed object
122+ # We are using a custom zlib module for this, if its not present,
123+ # we can only hope it works.
124+ # Only scrub the stream forward if we are officially done with the
125+ # bytes we were to have.
126+ if self ._br == self ._s and hasattr (self ._zip , 'status' ) and self ._zip .status == zlib .Z_OK :
127+ # manipulate the bytes-read to allow our own read method to coninute
128+ # but keep the window at its current position
129+ self ._br = 0
130+ while self ._zip .status == zlib .Z_OK :
131+ self .read (mmap .PAGESIZE )
132+ # END scrub-loop
133+ # reset bytes read, just to be sure
134+ self ._br = self ._s
135+ # END handle stream scrubbing
136+
137+ return self ._cbr - len (self ._zip .unused_data )
101138
139+ def seek (self , offset , whence = os .SEEK_SET ):
140+ """Allows to reset the stream to restart reading
141+ :raise ValueError: If offset and whence are not 0"""
142+ if offset != 0 or whence != os .SEEK_SET :
143+ raise ValueError ("Can only seek to position 0" )
144+ # END handle offset
145+
146+ self ._zip = zlib .decompressobj ()
147+ self ._br = self ._cws = self ._cwe = self ._cbr = 0
148+ if self ._phi :
149+ self ._phi = False
150+ del (self ._s ) # trigger header parsing on first access
151+ # END skip header
152+
102153 def read (self , size = - 1 ):
103154 if size < 1 :
104155 size = self ._s - self ._br
@@ -109,33 +160,8 @@ def read(self, size=-1):
109160 if size == 0 :
110161 return str ()
111162 # END handle depletion
112-
113- # protect from memory peaks
114- # If he tries to read large chunks, our memory patterns get really bad
115- # as we end up copying a possibly huge chunk from our memory map right into
116- # memory. This might not even be possible. Nonetheless, try to dampen the
117- # effect a bit by reading in chunks, returning a huge string in the end.
118- # Our performance now depends on StringIO. This way we don't need two large
119- # buffers in peak times, but only one large one in the end which is
120- # the return buffer
121- # NO: We don't do it - if the user thinks its best, he is right. If he
122- # has trouble, he will start reading in chunks. According to our tests
123- # its still faster if we read 10 Mb at once instead of chunking it.
124-
125- # if size > self.max_read_size:
126- # sio = StringIO()
127- # while size:
128- # read_size = min(self.max_read_size, size)
129- # data = self.read(read_size)
130- # sio.write(data)
131- # size -= len(data)
132- # if len(data) < read_size:
133- # break
134- # # END data loop
135- # sio.seek(0)
136- # return sio.getvalue()
137- # # END handle maxread
138- #
163+
164+
139165 # deplete the buffer, then just continue using the decompress object
140166 # which has an own buffer. We just need this to transparently parse the
141167 # header from the zlib stream
@@ -186,8 +212,7 @@ def read(self, size=-1):
186212
187213
188214 # if window is too small, make it larger so zip can decompress something
189- win_size = self ._cwe - self ._cws
190- if win_size < 8 :
215+ if self ._cwe - self ._cws < 8 :
191216 self ._cwe = self ._cws + 8
192217 # END adjust winsize
193218
@@ -196,10 +221,18 @@ def read(self, size=-1):
196221
197222 # get the actual window end to be sure we don't use it for computations
198223 self ._cwe = self ._cws + len (indata )
199-
224+
200225 dcompdat = self ._zip .decompress (indata , size )
201226
227+ # update the amount of compressed bytes read
228+ # We feed possibly overlapping chunks, which is why the unconsumed tail
229+ # has to be taken into consideration, as well as the unused data
230+ # if we hit the end of the stream
231+ self ._cbr += len (indata ) - len (self ._zip .unconsumed_tail )
202232 self ._br += len (dcompdat )
233+
234+ print size , self ._br , self ._cbr , len (indata ), self ._cws , self ._cwe , len (self ._zip .unused_data ), len (self ._zip .unconsumed_tail )
235+
203236 if dat :
204237 dcompdat = dat + dcompdat
205238
@@ -252,7 +285,7 @@ class FDCompressedSha1Writer(Sha1Writer):
252285 def __init__ (self , fd ):
253286 super (FDCompressedSha1Writer , self ).__init__ ()
254287 self .fd = fd
255- self .zip = zlib .compressobj (Z_BEST_SPEED )
288+ self .zip = zlib .compressobj (zlib . Z_BEST_SPEED )
256289
257290 #{ Stream Interface
258291
0 commit comments