Package obitools :: Package ecopcr :: Module sequence
[hide private]
[frames | no frames]

Source Code for Module obitools.ecopcr.sequence

  1  from obitools import NucSequence 
  2  from obitools.ecopcr import EcoPCRDBFile 
  3  from obitools.ecopcr.taxonomy import EcoTaxonomyDB 
  4  from obitools.ecopcr.annotation import EcoPCRDBAnnotationWriter 
  5  from obitools.utils import universalOpen 
  6  from glob import glob 
  7  import struct 
  8  import gzip 
  9   
 10   
class EcoPCRDBSequenceIterator(EcoPCRDBFile):
    '''
    Build an iterator over the sequences included in a sequence database
    formatted for ecoPCR.

    Iterating over an instance walks every C{<path>_???.sdx} file found at
    construction time, in sorted file name order, and yields one
    C{NucSequence} per stored record.
    '''

    def __init__(self,path,taxonomy=None):
        '''
        ecoPCR data iterator constructor

        @param path: path to the ecoPCR database including the database prefix name
        @type path: C{str}
        @param taxonomy: a taxonomy can be given to the reader to decode the taxonomic data
                         associated to the sequences. If no taxonomy is furnished, it will be read
                         before the sequence database files using the same path.
        @type taxonomy: L{obitools.ecopcr.taxonomy.Taxonomy}
        '''
        self._path = path

        if taxonomy is not None:
            self._taxonomy=taxonomy
        else:
            self._taxonomy=EcoTaxonomyDB(path)

        # One database may be split over several numbered files; sorting
        # the names gives a stable, index-ordered traversal.
        self._seqfilesFiles = glob('%s_???.sdx' % self._path)
        self._seqfilesFiles.sort()

    def __ecoSequenceIterator(self,file):
        '''
        Decode every record of one sequence file.

        Records are obtained from C{self._ecoRecordIterator}, which is not
        defined in this class (presumably inherited from C{EcoPCRDBFile}).

        @param file: the sequence file handed over to C{self._ecoRecordIterator}

        @return: a generator of C{NucSequence} instances
        '''
        for record in self._ecoRecordIterator(file):
            # Fixed part of a record: four 4-byte integers plus the 20-byte
            # identifier; whatever remains is definition + compressed data.
            lrecord = len(record)
            lnames = lrecord - (4*4+20)

            # The taxon field is unpacked as a *signed* integer: the writer
            # of this module packs it with 'i' and stores -1 for "no taxon".
            # Reading it unsigned would turn -1 into 4294967295 and break the
            # taxonomy lookup below.
            (taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> i 20s I I I %ds' % lnames, record)

            # The identifier is NUL padded up to 20 bytes.
            seqid=seqid.strip('\x00')
            de = string[:deflength]
            seq = gzip.zlib.decompress(string[deflength:])

            if taxid < 0:
                # No taxon recorded for this sequence: build it without any
                # taxonomic annotation.
                # NOTE(review): assumes NucSequence accepts being built
                # without the taxidx/taxid keywords - confirm.
                bioseq = NucSequence(seqid,seq,de)
            else:
                # The stored value is an index into the taxonomy table; the
                # first field of that entry is used as the taxid.
                bioseq = NucSequence(seqid,seq,de,taxidx=taxid,taxid=self._taxonomy._taxonomy[taxid][0])
            yield bioseq

    def __iter__(self):
        '''
        Iterate over all the sequences of all the files of the database.
        '''
        for seqfile in self._seqfilesFiles:
            for seq in self.__ecoSequenceIterator(seqfile):
                yield seq
53
class EcoPCRDBSequenceWriter(object):
    '''
    Writer for one sequence file (C{<dbname>_NNN.sdx}) of an ecoPCR database.

    The file starts with a 4-byte big-endian record count followed by the
    packed records. The count is only written when the writer is
    finalized, so call L{close} (or use the instance in a C{with}
    statement) once all the sequences have been stored. C{__del__} still
    finalizes the file as a fallback.
    '''

    def __init__(self,dbname,fileidx=1,taxonomy=None,ftid=None,type=None,definition=None,append=False):
        '''
        Open (or reopen) the sequence file C{"%s_%03d.sdx" % (dbname,fileidx)}.

        @param dbname: path of the database including the database prefix name
        @param fileidx: index of the sequence file inside the database
        @param taxonomy: optional taxonomy used to translate the C{taxid}
                         of each sequence into a taxon index
        @param ftid: feature identifier forwarded to the annotation writer;
                     required when C{type} is given
        @param type: when not C{None}, an C{EcoPCRDBAnnotationWriter} is
                     created and fed with every sequence passed to L{put}
        @param definition: forwarded to the annotation writer
        @param append: if true, new records are added at the end of an
                       existing file instead of overwriting it
        '''
        self._taxonomy=taxonomy
        self._filename="%s_%03d.sdx" % (dbname,fileidx)
        if append:
            # Read the current record count from the file header and
            # release that handle explicitly before reopening for update.
            f = universalOpen(self._filename)
            try:
                (recordCount,) = struct.unpack('> I',f.read(4))
            finally:
                closer = getattr(f,'close',None)
                if closer is not None:
                    closer()
            self._sequenceCount=recordCount

            self._file = open(self._filename,'r+b')
            # The header count is zeroed while the file is being extended;
            # the real value is written back by close().
            self._file.seek(0,0)
            self._file.write(struct.pack('> I',0))
            self._file.seek(0,2)
        else:
            self._sequenceCount=0
            self._file = open(self._filename,'wb')
            self._file.write(struct.pack('> I',self._sequenceCount))

        if type is not None:
            assert ftid is not None,"You must specify an id attribute for features"
            self._annotation = EcoPCRDBAnnotationWriter(dbname, ftid, fileidx, type, definition)
        else:
            self._annotation = None

    def _ecoSeqPacker(self,seq):
        '''
        Pack one sequence into its binary record.

        Big-endian layout: record size (excluding this 4-byte field),
        signed taxon index (-1 when there is no taxonomy or the sequence
        has no C{taxid}), 20-byte identifier (struct's C{s} code pads with
        NUL bytes or truncates), definition length, sequence length,
        compressed length, definition, zlib-compressed upper-cased sequence.

        @param seq: the sequence to pack; it must provide C{str()},
                    C{len()}, C{id}, C{definition} and key lookup

        @return: the packed record
        '''
        compactseq = gzip.zlib.compress(str(seq).upper(),9)
        cptseqlength = len(compactseq)
        delength = len(seq.definition)

        # taxon + id + three length fields + variable-size payload
        totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength

        if self._taxonomy is None or 'taxid' not in seq:
            taxon=-1
        else:
            taxon=self._taxonomy.findIndex(seq['taxid'])

        packed = struct.pack('> I i 20s I I I %ds %ds' % (delength,cptseqlength),
                             totalSize,
                             taxon,
                             seq.id,
                             delength,
                             len(seq),
                             cptseqlength,
                             seq.definition,
                             compactseq)

        # +4 accounts for the leading size field itself
        assert len(packed) == totalSize+4, "error in sequence packing"

        return packed

    def put(self,sequence):
        '''
        Append a sequence to the file, and to the annotation writer if any.

        @param sequence: the sequence to store
        '''
        if self._taxonomy is not None:
            # Give the sequence a chance to compute its taxid by itself
            if 'taxid' not in sequence and hasattr(sequence, 'extractTaxon'):
                sequence.extractTaxon()
        self._file.write(self._ecoSeqPacker(sequence))
        if self._annotation is not None:
            # The annotation is keyed by the index of the sequence, i.e. the
            # count before it is incremented.
            self._annotation.put(sequence, self._sequenceCount)
        self._sequenceCount+=1

    def close(self):
        '''
        Write the final record count in the file header and close the file.

        Safe to call several times, and on an instance whose constructor
        failed before the file was opened.
        '''
        f = getattr(self,'_file',None)
        if f is None or getattr(f,'closed',False):
            return
        f.seek(0,0)
        f.write(struct.pack('> I',self._sequenceCount))
        f.close()

    def __enter__(self):
        return self

    def __exit__(self,exc_type,exc_value,traceback):
        self.close()
        return False

    def __del__(self):
        # Fallback finalization; prefer an explicit close().
        self.close()
124