1 """Functions that read and write gzipped files.
2
3 The user of the file doesn't have to worry about the compression,
4 but random access is not allowed.
5
This is a patched version of the standard Python gzip module, which is
based on Andrew Kuchling's minigzip.py distributed with the zlib module.
8
9 """
10
11
12
13 import struct, sys, time
14 import zlib
15 import __builtin__
16
17 __all__ = ["GzipFile","open"]
18
19 FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
20
21 READ, WRITE = 1, 2
22
def U32(i):
    """Return i as an unsigned integer, assuming it fits in 32 bits.

    A negative i is reinterpreted as the same 32-bit pattern read as an
    unsigned value (so -1 becomes 0xFFFFFFFF); a non-negative i is returned
    unchanged.  If the result is >= 2GB it may be a long on builds whose
    plain int is 32 bits wide.
    """
    if i < 0:
        # Adding 2**32 maps the negative half of the signed 32-bit range
        # onto the upper half of the unsigned range.
        i += 1 << 32
    return i
31
def LOWU32(i):
    """Return the low-order 32 bits of an int, as a non-negative int."""
    # Masking discards everything above bit 31 and always yields a value
    # in the range 0 .. 2**32 - 1, even for negative input.
    return i & 0xFFFFFFFF
35
38
43
45 return struct.unpack("<l", input.read(4))[0]
46
def unpack32(buf):
    """Decode buf, a 4-byte string, as a little-endian signed 32-bit int.

    struct.unpack raises struct.error when buf is not exactly 4 bytes long.
    """
    value, = struct.unpack("<l", buf)
    return value
49
def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    Returns the newly constructed GzipFile object.
    """
    return GzipFile(filename, mode, compresslevel)
58
60 """The GzipFile class simulates most of the methods of a file object with
61 the exception of the readinto() and truncate() methods.
62
63 """
64
65 myfileobj = None
66 max_read_chunk = 10 * 1024 * 1024
67
def __init__(self, filename=None, mode=None,
             compresslevel=9, fileobj=None):
    """Constructor for the GzipFile class.

    At least one of fileobj and filename must be given a
    non-trivial value.

    The new class instance is based on fileobj, which can be a regular
    file, a StringIO object, or any other object which simulates a file.
    It defaults to None, in which case filename is opened to provide
    a file object.

    When fileobj is not None, the filename argument is only used to be
    included in the gzip file header, which may include the original
    filename of the uncompressed file.  It defaults to the filename of
    fileobj, if discernible; otherwise, it defaults to the empty string,
    and in this case the original filename is not included in the header.

    The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
    depending on whether the file will be read or written.  The default
    is the mode of fileobj if discernible; otherwise, the default is 'rb'.
    Be aware that only the 'rb', 'ab', and 'wb' values should be used
    for cross-platform portability.

    The compresslevel argument is an integer from 1 to 9 controlling the
    level of compression; 1 is fastest and produces the least compression,
    and 9 is slowest and produces the most compression.  The default is 9.

    Raises IOError if mode does not start with 'r', 'w' or 'a'.

    """

    # Guarantee the file is opened in binary mode on platforms that
    # distinguish binary from text files.
    if mode and 'b' not in mode:
        mode += 'b'
    if fileobj is None:
        # Remember that we opened this file ourselves (myfileobj).
        fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
    if filename is None:
        if hasattr(fileobj, 'name'):
            filename = fileobj.name
        else:
            filename = ''
    if mode is None:
        if hasattr(fileobj, 'mode'):
            mode = fileobj.mode
        else:
            mode = 'rb'

    kind = mode[0:1]
    if kind == 'r':
        self.mode = READ
        # The first _read() call must start by parsing a member header.
        self._new_member = True
        # Decompressed data not yet handed out by read().
        self.extrabuf = ""
        self.extrasize = 0
        self.filename = filename
        # Initial chunk size used by readline(); it grows as lines are read.
        self.min_readsize = 100

    elif kind == 'w' or kind == 'a':
        self.mode = WRITE
        self._init_write(filename)
        # A negative window size makes zlib emit a raw deflate stream
        # with no zlib header or trailer.
        self.compress = zlib.compressobj(compresslevel,
                                         zlib.DEFLATED,
                                         -zlib.MAX_WBITS,
                                         zlib.DEF_MEM_LEVEL,
                                         0)
    else:
        # Do not leak a file that this constructor opened itself.
        if self.myfileobj is not None:
            self.myfileobj.close()
            self.myfileobj = None
        raise IOError("Mode " + mode + " not supported")

    self.fileobj = fileobj
    # Position in the uncompressed stream.
    self.offset = 0
    # Raw (compressed) bytes pushed back for _read_internal() to re-serve.
    self.inputbuf = ''
    # The most recent raw bytes read, used to locate the member trailer.
    self.last8 = ''

    if self.mode == WRITE:
        self._write_gzip_header()
139
141 s = repr(self.fileobj)
142 return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
143
152
def _write_gzip_header(self):
    """Write the gzip member header to the underlying file object.

    The original file name is stored (and the FNAME flag set) only when
    self.filename, minus its last three characters, is non-empty.
    """
    out = self.fileobj
    out.write('\037\213')             # magic header
    out.write('\010')                 # compression method (8 = deflate)
    # NOTE(review): the last three characters are dropped unconditionally,
    # presumably a '.gz' suffix -- confirm where self.filename is set.
    fname = self.filename[:-3]
    flags = 0
    if fname:
        flags = FNAME
    out.write(chr(flags))
    write32u(out, long(time.time()))  # modification time
    out.write('\002')                 # extra flags byte
    out.write('\377')                 # operating system byte (255)
    if fname:
        out.write(fname + '\000')     # zero-terminated file name
166
168 self.crc = zlib.crc32("")
169 self.size = 0
170
def _read_internal(self, size):
    """Return up to size raw bytes, tracking the last 8 bytes handed out.

    Bytes waiting in self.inputbuf are served first; the underlying file
    object is read only for the shortfall.  self.last8 is updated to the
    final (up to) 8 bytes returned so far, which is where _read_eof()
    looks for the member trailer.
    """
    shortfall = size - len(self.inputbuf)
    if shortfall > 0:
        self.inputbuf += self.fileobj.read(shortfall)
    chunk = self.inputbuf[:size]

    # Slide the 8-byte window over everything returned so far.  Joining
    # first and then trimming is also correct while fewer than 8 bytes
    # have been seen; slicing last8 by len(chunk) would lose bytes then.
    self.last8 = (self.last8 + chunk)[-8:]
    self.inputbuf = self.inputbuf[size:]
    return chunk
182
217
218
def write(self, data):
    """Compress data and write it to the underlying file object.

    Updates the running CRC, the uncompressed size and the stream offset.
    Empty data is ignored.

    Raises IOError (errno.EBADF) if the object was not opened for
    writing, and ValueError if it has already been closed.
    """
    if self.mode != WRITE:
        import errno
        raise IOError(errno.EBADF, "write() on read-only GzipFile object")

    if self.fileobj is None:
        raise ValueError("write() on closed GzipFile object")
    if len(data) > 0:
        self.size = self.size + len(data)
        self.crc = zlib.crc32(data, self.crc)
        self.fileobj.write(self.compress.compress(data))
        self.offset += len(data)
231
def read(self, size=-1):
    """Return up to size bytes of decompressed data.

    A negative size (the default) reads until end of input.  Fewer than
    size bytes are returned only when the input is exhausted; an empty
    string is returned once nothing is left.

    Raises IOError (errno.EBADF) if the object was not opened for reading.
    """
    if self.mode != READ:
        import errno
        raise IOError(errno.EBADF, "read() on write-only GzipFile object")

    if self.extrasize <= 0 and self.fileobj is None:
        return ''

    want_all = size < 0
    readsize = 1024
    try:
        # Keep decompressing until enough data is buffered, or, when
        # everything was requested, until _read() signals end of input.
        while want_all or size > self.extrasize:
            self._read(readsize)
            # Grow the chunk size geometrically, capped at max_read_chunk.
            readsize = min(self.max_read_chunk, readsize * 2)
    except EOFError:
        # Input exhausted: hand out whatever is buffered.
        if want_all or size > self.extrasize:
            size = self.extrasize

    chunk = self.extrabuf[:size]
    self.extrabuf = self.extrabuf[size:]
    self.extrasize = self.extrasize - size

    self.offset += size
    return chunk
263
def _unread(self, buf):
    """Push buf back onto the front of the decompressed-data buffer.

    The stream offset moves back by len(buf), so the next read() returns
    buf again before any newer data.
    """
    self.extrabuf = buf + self.extrabuf
    self.extrasize = len(buf) + self.extrasize
    self.offset -= len(buf)
268
def _read(self, size=1024):
    """Decompress one more chunk of raw input into the read buffer.

    Up to size raw bytes are fetched through _read_internal(); whatever
    they decompress to is appended with _add_read_data().  The start of a
    gzip member (header parsing) and its end (trailer check) are handled
    here as well.

    Raises EOFError when the object is closed or no raw input is left;
    read() relies on that exception to stop its loop.
    """
    if self.fileobj is None:
        raise EOFError("Reached EOF")

    if self._new_member:
        # Beginning of a member: run the per-member initialisation, parse
        # the gzip header and start a fresh raw-deflate decompressor.
        self._init_read()
        self._read_gzip_header()
        self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
        self._new_member = False

    # Read a chunk of raw data.
    buf = self._read_internal(size)

    # No more input: flush the decompressor, verify the trailer and tell
    # the caller that this stream is finished.
    if not buf:
        uncompress = self.decompress.flush()
        self._read_eof()
        self._add_read_data(uncompress)
        raise EOFError('Reached EOF')

    uncompress = self.decompress.decompress(buf)
    self._add_read_data(uncompress)

    if self.decompress.unused_data:
        # The deflate stream ended inside this chunk.  Hand the bytes
        # that followed it back to the input buffer and re-read 8 of
        # them, so that self.last8 holds exactly the member trailer.
        self.inputbuf = self.decompress.unused_data + self.inputbuf
        self._read_internal(8)

        # Check the CRC and size, and arrange for the next call to
        # start a new member.
        self._read_eof()
        self._new_member = True
309
def _add_read_data(self, data):
    """Append decompressed data to the read buffer and update checksums.

    The running CRC-32 and total size are the values that _read_eof()
    later compares with the member trailer.
    """
    self.crc = zlib.crc32(data, self.crc)
    self.extrabuf = self.extrabuf + data
    self.extrasize = self.extrasize + len(data)
    self.size = self.size + len(data)
315
def _read_eof(self):
    """Check the member trailer held in self.last8.

    The first four bytes are read as the CRC-32 and the last four as the
    uncompressed length (both little-endian).  They are compared with the
    running self.crc and the low 32 bits of self.size.

    Raises IOError when either value does not match.
    """
    crc32 = unpack32(self.last8[:4])
    isize = U32(unpack32(self.last8[4:]))
    # Compare as unsigned values so the sign convention of the CRC
    # does not matter.
    if U32(crc32) != U32(self.crc):
        raise IOError("CRC check failed")
    elif isize != LOWU32(self.size):
        raise IOError("Incorrect length of data produced")
328
347
349 try:
350 if (self.myfileobj is None and
351 self.fileobj is None):
352 return
353 except AttributeError:
354 return
355 self.close()
356
357 - def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
362
def fileno(self):
    """Invoke the underlying file object's fileno() method.

    This will raise AttributeError if the underlying file object
    doesn't support fileno().
    """
    return self.fileobj.fileno()
370
373
376
def rewind(self):
    '''Return the uncompressed stream file position indicator to the
    beginning of the file.

    Raises IOError if the object was not opened for reading.
    '''
    if self.mode != READ:
        raise IOError("Can't rewind in write mode")
    self.fileobj.seek(0)
    self._new_member = True
    self.extrabuf = ""
    self.extrasize = 0
    self.offset = 0
    # Drop raw bytes pushed back by _read(); otherwise _read_internal()
    # would serve them ahead of the data at the start of the file.
    self.inputbuf = ''
    self.last8 = ''
387
def seek(self, offset, whence=0):
    """Move to position offset in the uncompressed stream.

    whence follows the file-object convention: 0 (the default) makes
    offset absolute and 1 makes it relative to the current position.
    Seeking relative to the end of the stream is not supported.

    In write mode only forward seeks are possible; the gap is filled
    with zero bytes.  In read mode a backward seek rewinds the stream
    and reads forward again, so it can be slow.

    Raises ValueError for an unsupported whence, and IOError for a
    backward seek in write mode.
    """
    if whence:
        if whence == 1:
            offset = self.offset + offset
        else:
            raise ValueError('Seek from end not supported')
    if self.mode == WRITE:
        if offset < self.offset:
            raise IOError('Negative seek in write mode')
        count = offset - self.offset
        # Pad with zero bytes in 1 KB pieces, then the remainder.
        for i in range(count // 1024):
            self.write(1024 * '\0')
        self.write((count % 1024) * '\0')
    elif self.mode == READ:
        if offset < self.offset:
            # for negative seek, rewind and do positive seek
            self.rewind()
        count = offset - self.offset
        # Skip forward by reading and discarding data.
        for i in range(count // 1024):
            self.read(1024)
        self.read(count % 1024)
404
406 if size < 0:
407 size = sys.maxint
408 readsize = self.min_readsize
409 else:
410 readsize = size
411 bufs = []
412 while size != 0:
413 c = self.read(readsize)
414 i = c.find('\n')
415
416
417
418
419
420 if (size <= i) or (i == -1 and len(c) > size):
421 i = size - 1
422
423 if i >= 0 or c == '':
424 bufs.append(c[:i + 1])
425 self._unread(c[i + 1:])
426 break
427
428
429 bufs.append(c)
430 size = size - len(c)
431 readsize = min(size, readsize * 2)
432 if readsize > self.min_readsize:
433 self.min_readsize = min(readsize, self.min_readsize * 2, 512)
434 return ''.join(bufs)
435
437
438 if sizehint <= 0:
439 sizehint = sys.maxint
440 L = []
441 while sizehint > 0:
442 line = self.readline()
443 if line == "":
444 break
445 L.append(line)
446 sizehint = sizehint - len(line)
447
448 return L
449
453
456
463
464
466
467
468
def _test():
    """Act like gzip; with -d as the first argument, act like gunzip.

    Each remaining command-line argument is a file name, or "-" for
    standard input/output; with no file arguments "-" is assumed.
    Input files are not deleted, and no other gzip options are supported.
    """
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                g = sys.stdout
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz: " + repr(arg))
                    continue
                # open() here is this module's gzip-aware open().
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        try:
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            # Close both ends even when copying fails, but never close
            # the process's own standard streams.
            try:
                if g is not sys.stdout:
                    g.close()
            finally:
                if f is not sys.stdin:
                    f.close()
502
503 if __name__ == '__main__':
504 _test()
505