Package obitools :: Module gzip
[hide private]
[frames] | no frames]

Source Code for Module obitools.gzip

  1  """Functions that read and write gzipped files. 
  2   
  3  The user of the file doesn't have to worry about the compression, 
  4  but random access is not allowed. 
  5   
  6  This consists of a patched version of the standard Python gzip 
  7  module, based on Andrew Kuchling's minigzip.py distributed with the zlib module. 
  8   
  9  """ 
 10   
 11  # based on Andrew Kuchling's minigzip.py distributed with the zlib module 
 12   
 13  import struct, sys, time 
 14  import zlib 
 15  import __builtin__ 
 16   
 17  __all__ = ["GzipFile","open"]  # public names exported by "from obitools.gzip import *"
 18   
 19  FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16  # bit masks tested against the gzip header flag byte
 20   
 21  READ, WRITE = 1, 2  # values stored in GzipFile.mode
 22   
def U32(i):
    """Return i as an unsigned integer, assuming it fits in 32 bits.

    A negative i is interpreted as the two's-complement bit pattern of a
    32-bit value and mapped to the matching non-negative number; any other
    value is returned unchanged.  On Python 2 the result is promoted to a
    long automatically when it does not fit in a plain int.
    """
    if i < 0:
        # No "L" suffix: integer promotion is automatic, and the bare
        # literal also parses on Python 3.
        i += 1 << 32
    return i
31
def LOWU32(i):
    """Return the low-order 32 bits of an int, as a non-negative int."""
    # Plain hex literal (no "L" suffix) so the mask parses on Python 2 and 3;
    # the masked value is the same either way.
    return i & 0xFFFFFFFF
35
def write32(output, value):
    """Write value to output as a signed 32-bit little-endian integer."""
    packed = struct.pack("<l", value)
    output.write(packed)
38
def write32u(output, value):
    """Write value to output as an unsigned 32-bit little-endian integer."""
    # The L format writes the bit pattern correctly whether signed
    # or unsigned.
    packed = struct.pack("<L", value)
    output.write(packed)
43
def read32(input):
    """Read 4 bytes from input and decode them as a signed 32-bit
    little-endian integer."""
    raw = input.read(4)
    (value,) = struct.unpack("<l", raw)
    return value
46
def unpack32(buf):
    """Decode the 4-byte string buf as a signed 32-bit little-endian integer."""
    (value,) = struct.unpack("<l", buf)
    return value
49
def open(filename, mode="rb", compresslevel=9):
    """Open a gzip file; shorthand for GzipFile(filename, mode, compresslevel).

    filename is mandatory.  mode falls back to 'rb' and compresslevel
    to 9 when they are not supplied.

    """
    gz = GzipFile(filename, mode, compresslevel)
    return gz
58
class GzipFile:
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    Compressed input is consumed strictly sequentially through an internal
    buffer (inputbuf); the read path never seeks the underlying file.  Only
    rewind(), and therefore a backward seek() in read mode, calls seek() on
    the underlying file object.

    """

    # File object opened by the constructor itself (and therefore closed by
    # close()); stays None when the caller supplied fileobj.
    myfileobj = None
    # Upper bound on the size of a single compressed chunk requested by read().
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        Raises IOError when mode starts with anything other than 'r', 'w'
        or 'a'.

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'):
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Decompressed data not yet handed to the caller, and its length
            self.extrabuf = ""
            self.extrasize = 0
            self.filename = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate stream, the gzip header and
            # trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # Don't leak a file this constructor opened itself.
            if self.myfileobj is not None:
                self.myfileobj.close()
                self.myfileobj = None
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        # Position in the uncompressed stream, as reported by tell()
        self.offset = 0
        # Compressed bytes read from fileobj but not yet consumed
        self.inputbuf = ''
        # The most recent (up to) 8 compressed bytes consumed; at the end of
        # a member they hold the CRC32 / size trailer checked by _read_eof()
        self.last8 = ''

        if self.mode == WRITE:
            self._write_gzip_header()

    def __repr__(self):
        """Return '<gzip ...>' wrapping the repr of the underlying file."""
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the write-side state (name, running CRC and size)."""
        if filename[-3:] != '.gz':
            filename = filename + '.gz'
        self.filename = filename
        self.crc = zlib.crc32("")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        # self.filename always ends in '.gz' here (see _init_write); the
        # header stores the name without that suffix.
        fname = self.filename[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        write32u(self.fileobj, long(time.time()))  # modification time
        self.fileobj.write('\002')                 # extra flags byte
        self.fileobj.write('\377')                 # OS byte
        if fname:
            self.fileobj.write(fname + '\000')

    def _init_read(self):
        """Reset the running CRC and size before decoding a new member."""
        self.crc = zlib.crc32("")
        self.size = 0

    def _read_internal(self, size):
        """Return up to size compressed bytes, refilling inputbuf as needed.

        Fewer than size bytes are returned only when the underlying file
        is exhausted.  As a side effect last8 is updated to the last 8
        bytes consumed so far.
        """
        if len(self.inputbuf) < size:
            self.inputbuf += self.fileobj.read(size - len(self.inputbuf))
        chunk = self.inputbuf[:size]
        # Slide the window over what was actually returned (which may be
        # shorter than size at EOF).  Slicing the concatenation keeps the
        # true last 8 bytes even while fewer than 8 have been seen.
        self.last8 = (self.last8 + chunk)[-8:]
        self.inputbuf = self.inputbuf[size:]
        return chunk

    def _read_gzip_header(self):
        """Consume and validate the header of the next gzip member.

        Raises EOFError when no further member is available, and IOError
        when the magic number or compression method is not recognised.
        """
        magic = self._read_internal(2)
        if len(magic) != 2:
            raise EOFError("Reached EOF")
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self._read_internal(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self._read_internal(1) )
        # Skip modtime (4 bytes), extraflag (1 byte) and os (1 byte)
        self._read_internal(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self._read_internal(1))
            xlen = xlen + 256*ord(self._read_internal(1))
            self._read_internal(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self._read_internal(1)
                if not s or s=='\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self._read_internal(1)
                if not s or s=='\000':
                    break
        if flag & FHCRC:
            self._read_internal(2)     # Read & discard the 16-bit header CRC

    def write(self,data):
        """Compress data and write it to the underlying file object.

        Raises IOError (EBADF) on an instance opened for reading and
        ValueError once the instance has been closed.
        """
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

    def read(self, size=-1):
        """Return up to size bytes of uncompressed data.

        A negative size reads until the end of the stream.  Returns ''
        when nothing is buffered and the instance is closed.  Raises
        IOError (EBADF) on an instance opened for writing.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push buf back in front of the pending decompressed data."""
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Decompress one more chunk of input into extrabuf.

        size is the number of compressed bytes requested from the
        underlying file.  Raises EOFError when there is nothing left to
        decode, and propagates the IOError raised by the header and
        trailer checks.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # _read_gzip_header will raise EOFError exception
            # if there no more members to read.
            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self._read_internal(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so put back unused_data and initialize last8 by reading them.
            self.inputbuf = self.decompress.unused_data + self.inputbuf
            self._read_internal(8)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to extrabuf and update CRC and size."""
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf = self.extrabuf + data
        self.extrasize = self.extrasize + len(data)
        self.size = self.size + len(data)

    def _read_eof(self):
        """Check the member trailer held in last8 against the computed values.

        Raises IOError when the stored CRC32 or the stored length does not
        match what was computed while decompressing.
        """
        # last8 holds the 8 trailer bytes: the CRC followed by the size of
        # the uncompressed data.  Note that the size stored is the true
        # file size mod 2**32.
        crc32 = unpack32(self.last8[:4])
        isize = U32(unpack32(self.last8[4:]))   # may exceed 2GB
        if U32(crc32) != U32(self.crc):
            raise IOError("CRC check failed")
        elif isize != LOWU32(self.size):
            raise IOError("Incorrect length of data produced")

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  A file object supplied by the caller is
        left open; one opened by the constructor is closed.  Calling
        close() again is a no-op.
        """
        if self.mode == WRITE:
            # fileobj is None once closed: don't write the trailer twice
            # (and don't fail with AttributeError on the second call).
            if self.fileobj is not None:
                self.fileobj.write(self.compress.flush())
                # The native zlib crc is an unsigned 32-bit integer, but
                # the Python wrapper implicitly casts that to a signed C
                # long.  So, on a 32-bit box self.crc may "look negative",
                # while the same crc on a 64-bit box may "look positive".
                # To avoid irksome warnings from the `struct` module, force
                # it to look positive on all boxes.
                write32u(self.fileobj, LOWU32(self.crc))
                # self.size may exceed 2GB, or even 4GB
                write32u(self.fileobj, LOWU32(self.size))
                self.fileobj = None
        elif self.mode == READ:
            self.fileobj = None
        if self.myfileobj:
            self.myfileobj.close()
            self.myfileobj = None

    def __del__(self):
        """Close the stream on garbage collection if still open."""
        try:
            if (self.myfileobj is None and
                self.fileobj is None):
                return
        except AttributeError:
            # __init__ failed before fileobj was assigned
            return
        self.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush the compressor (write mode) and the underlying file."""
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
        # NOTE(review): statement nesting was reconstructed from a listing
        # without indentation; this follows the upstream gzip module this
        # file is patched from, where the file flush runs in both modes.
        self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def isatty(self):
        """Always return False."""
        return False

    def tell(self):
        """Return the current position in the uncompressed stream."""
        return self.offset

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file.

        Needs an underlying file object that supports seek().  Raises
        IOError in write mode.'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.offset = 0
        # Compressed bytes buffered past the old position (for instance the
        # start of the next member) no longer match the file position;
        # drop them so the header is parsed from the real start of file.
        self.inputbuf = ''
        self.last8 = ''

    def seek(self, offset):
        """Move to the absolute position offset in the uncompressed stream.

        In write mode only forward seeks are possible and the gap is filled
        with zero bytes; a backward seek raises IOError.  In read mode a
        backward seek rewinds and re-reads from the start.
        """
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in range(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

    def readline(self, size=-1):
        """Return the next line, including its trailing newline if present.

        A non-negative size caps the number of bytes returned.  Returns ''
        at end of stream.
        """
        if size < 0:
            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            # Remember a larger starting chunk for the next call, up to 512
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs) # Return resulting line

    def readlines(self, sizehint=0):
        """Return a list of lines, stopping once about sizehint bytes
        have been collected (all remaining lines when sizehint <= 0)."""
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = sys.maxint
        lines = []
        while sizehint > 0:
            line = self.readline()
            if line == "":
                break
            lines.append(line)
            sizehint = sizehint - len(line)

        return lines

    def writelines(self, L):
        """Write every string of the iterable L, in order."""
        for line in L:
            self.write(line)

    def __iter__(self):
        """Return self; iteration yields one line at a time."""
        return self

    def next(self):
        """Return the next line, raising StopIteration at end of stream."""
        line = self.readline()
        if line:
            return line
        else:
            raise StopIteration
def _test():
    """Tiny command-line driver: compress, or with -d decompress, each
    file named on the command line ("-" or no argument means
    stdin/stdout)."""
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    argv = sys.argv[1:]
    decompress = argv and argv[0] == "-d"
    if decompress:
        argv = argv[1:]
    if not argv:
        argv = ["-"]
    for name in argv:
        use_std = (name == "-")
        if decompress:
            if use_std:
                src = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                dst = sys.stdout
            elif name[-3:] != ".gz":
                # Single parenthesised string: prints the same text under
                # the Python 2 print statement.
                print("filename doesn't end in .gz:" + " " + repr(name))
                continue
            else:
                src = open(name, "rb")
                dst = __builtin__.open(name[:-3], "wb")
        elif use_std:
            src = sys.stdin
            dst = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
        else:
            src = __builtin__.open(name, "rb")
            dst = open(name + ".gz", "wb")
        # Copy in 1 KiB chunks until the source is exhausted
        while True:
            chunk = src.read(1024)
            if not chunk:
                break
            dst.write(chunk)
        # Never close the process-wide standard streams
        if dst is not sys.stdout:
            dst.close()
        if src is not sys.stdin:
            src.close()


if __name__ == '__main__':
    _test()