1 from obitools import NucSequence
2 from obitools.ecopcr import EcoPCRDBFile
3 from obitools.ecopcr.taxonomy import EcoTaxonomyDB
4 from obitools.ecopcr.annotation import EcoPCRDBAnnotationWriter
5 from obitools.utils import universalOpen
6 from glob import glob
7 import struct
8 import gzip
9
10
12 '''
13 Build an iterator over the sequences included in a sequence database
14 formatted for ecoPCR
15 '''
16
18 '''
19 ecoPCR data iterator constructor
20
21 @param path: path to the ecoPCR database including the database prefix name
22 @type path: C{str}
23 @param taxonomy: a taxonomy can be given to the reader to decode the taxonomic data
24 associated with the sequences. If no taxonomy is provided, it will be read
25 before the sequence database files using the same path.
26 @type taxonomy: L{obitools.ecopcr.taxonomy.Taxonomy}
27 '''
28 self._path = path
29
30 if taxonomy is not None:
31 self._taxonomy=taxonomy
32 else:
33 self._taxonomy=EcoTaxonomyDB(path)
34
35 self._seqfilesFiles = glob('%s_???.sdx' % self._path)
36 self._seqfilesFiles.sort()
37
39 for record in self._ecoRecordIterator(file):
40 lrecord = len(record)
41 lnames = lrecord - (4*4+20)
42 (taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record)
43 seqid=seqid.strip('\x00')
44 de = string[:deflength]
45 seq = gzip.zlib.decompress(string[deflength:])
46 bioseq = NucSequence(seqid,seq,de,taxidx=taxid,taxid=self._taxonomy._taxonomy[taxid][0])
47 yield bioseq
48
53
55
def __init__(self, dbname, fileidx=1, taxonomy=None, ftid=None, type=None,
             definition=None, append=False):
    '''
    ecoPCR sequence file writer constructor

    Opens (or creates) the sequence file C{<dbname>_<fileidx>.sdx}, where
    the index is zero-padded to three digits, and leaves the file
    positioned for writing records.

    @param dbname: path to the ecoPCR database including the database prefix name
    @type dbname: C{str}
    @param fileidx: index of the sequence file, used to build the file name
    @type fileidx: C{int}
    @param taxonomy: taxonomy stored on the writer (not used by the constructor itself)
    @param ftid: id attribute forwarded to the annotation writer; required
                 when C{type} is given
    @param type: when not None, an annotation writer is created and
                 C{ftid}, C{type} and C{definition} are forwarded to it
    @param definition: forwarded to the annotation writer
    @param append: if True, the existing file is reopened and new records
                   are added after the existing ones; otherwise the file is
                   created (or truncated) with a record count of 0
    @type append: C{bool}

    @raise AssertionError: if C{type} is given without C{ftid}
    '''
    self._taxonomy = taxonomy
    self._filename = "%s_%03d.sdx" % (dbname, fileidx)

    if append:
        # Read the record count stored in the 4-byte big-endian header.
        header = universalOpen(self._filename)
        try:
            (recordCount,) = struct.unpack('> I', header.read(4))
        finally:
            # Release the read handle explicitly instead of relying on
            # garbage collection.
            closer = getattr(header, 'close', None)
            if closer is not None:
                closer()
        self._sequenceCount = recordCount

        self._file = open(self._filename, 'r+b')
        # The on-disk count is reset to 0 while the file is open for
        # appending.
        # NOTE(review): presumably the real count is written back when the
        # writer is finalized -- confirm.
        self._file.seek(0, 0)
        self._file.write(struct.pack('> I', 0))
        # Move to the end of the file so new records follow existing ones.
        self._file.seek(0, 2)
    else:
        self._sequenceCount = 0
        self._file = open(self._filename, 'wb')
        self._file.write(struct.pack('> I', self._sequenceCount))

    if type is not None:
        # Raised explicitly (rather than with `assert`) so the check is
        # still enforced when Python runs with -O; the exception type is
        # unchanged.
        if ftid is None:
            raise AssertionError("You must specify an id attribute for features")
        self._annotation = EcoPCRDBAnnotationWriter(dbname, ftid, fileidx, type, definition)
    else:
        self._annotation = None
82
84
85 compactseq = gzip.zlib.compress(str(seq).upper(),9)
86 cptseqlength = len(compactseq)
87 delength = len(seq.definition)
88
89 totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength
90
91 if self._taxonomy is None or 'taxid' not in seq:
92 taxon=-1
93 else:
94 taxon=self._taxonomy.findIndex(seq['taxid'])
95
96 packed = struct.pack('> I i 20s I I I %ds %ds' % (delength,cptseqlength),
97 totalSize,
98 taxon,
99 seq.id,
100 delength,
101 len(seq),
102 cptseqlength,
103 seq.definition,
104 compactseq)
105
106 assert len(packed) == totalSize+4, "error in sequence packing"
107
108 return packed
109
110
111 - def put(self,sequence):
119
121 self._file.seek(0,0)
122 self._file.write(struct.pack('> I',self._sequenceCount))
123 self._file.close()
124