import struct
import sys
import os
import gzip

from itertools import count,imap

from obitools.ecopcr import EcoPCRDBFile
from obitools.utils import universalOpen, universalTell, fileSize, progressBar
from obitools.utils import ColumnFile


class Taxonomy(object):

    def __init__(self):
        '''
        The base taxonomy constructor.

        It assumes the taxonomy tables (C{_taxonomy}, C{_index}, C{_ranks}
        and C{_name}) have already been loaded by the subclass, and builds
        the rank shortcuts and the taxon name index from them.
        '''
        self._speciesidx = self._ranks.index('species')
        self._genusidx = self._ranks.index('genus')
        self._familyidx = self._ranks.index('family')
        self._orderidx = self._ranks.index('order')
        self._nameidx = dict((x[0],x[2]) for x in self._name)

    def findTaxonByTaxid(self,taxid):
        return self._taxonomy[self._index[taxid]]

    def findTaxonByName(self,name):
        return self._taxonomy[self._nameidx[name]]

    def findRankByName(self,rank):
        try:
            return self._ranks.index(rank)
        except ValueError:
            return None

    def findIndex(self,taxid):
        return self._index[taxid]

52 "return subtree for given taxonomic id "
53 idx = self._index[taxid]
54 yield self._taxonomy[idx]
55 for t in self._taxonomy:
56 if t[2] == idx:
57 for subt in self.subTreeIterator(t[0]):
58 yield subt
59
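    # Note: each recursion level rescans the whole taxon table to find the
    # children of the current node, so a full subtree walk is quadratic in the
    # worst case; fine for one-off queries, costly for repeated deep traversals.
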
61 """
62 return parental tree for given taxonomic id starting from
63 first ancester to the root.
64 """
65 taxon=self.findTaxonByTaxid(taxid)
66 while taxon[2]!= 0:
67 yield taxon
68 taxon = self._taxonomy[taxon[2]]
69 yield self._taxonomy[0]
70

    def lastCommonTaxon(self,*taxids):
        if not taxids:
            return None
        if len(taxids)==1:
            return taxids[0]

        if len(taxids)==2:
            # walk both lineages from the root down and keep the last
            # taxid they share
            t1 = [x[0] for x in self.parentalTreeIterator(taxids[0])]
            t2 = [x[0] for x in self.parentalTreeIterator(taxids[1])]
            t1.reverse()
            t2.reverse()

            count = min(len(t1),len(t2))
            i=0
            while i < count and t1[i]==t2[i]:
                i+=1
            i-=1

            return t1[i]

        # more than two taxids: reduce pairwise
        ancetre = taxids[0]
        for taxon in taxids[1:]:
            ancetre = self.lastCommonTaxon(ancetre,taxon)

        return ancetre

    def getRankId(self,taxid):
        return self.findTaxonByTaxid(taxid)[1]

    def getTaxonAtRank(self,taxid,rankid):
        if isinstance(rankid, str):
            rankid=self._ranks.index(rankid)
        try:
            return [x[0] for x in self.parentalTreeIterator(taxid)
                    if self.getRankId(x[0])==rankid][0]
        except IndexError:
            return None

    def getSpecies(self,taxid):
        return self.getTaxonAtRank(taxid, self._speciesidx)

    def getGenus(self,taxid):
        return self.getTaxonAtRank(taxid, self._genusidx)

    def getFamily(self,taxid):
        return self.getTaxonAtRank(taxid, self._familyidx)

    def getOrder(self,taxid):
        return self.getTaxonAtRank(taxid, self._orderidx)

    def rankIterator(self):
        # yields (rank name, rank index) couples
        for x in imap(None,self._ranks,xrange(len(self._ranks))):
            yield x

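
# Illustrative sketch of how the Taxonomy query methods above combine; the
# 'taxonomy' argument and the two taxids are placeholders (any loaded
# subclass instance, e.g. an EcoTaxonomyDB, behaves the same way).
def _exampleLastCommonTaxon(taxonomy,taxid1,taxid2):
    # parentalTreeIterator walks from a taxon toward the root, so the last
    # common taxon has to appear in both root-ward lineages.
    lineage1 = [t[0] for t in taxonomy.parentalTreeIterator(taxid1)]
    lineage2 = [t[0] for t in taxonomy.parentalTreeIterator(taxid2)]
    lca = taxonomy.lastCommonTaxon(taxid1,taxid2)
    assert lca in lineage1 and lca in lineage2
    return lca
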

class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
    '''
    A taxonomy database class
    '''

    def __init__(self,path):
        '''
        The taxonomy database constructor

        @param path: path to the ecoPCR database including the database prefix name
        @type path: C{str}
        '''
        self._path = path
        self._taxonFile = "%s.tdx" % self._path
        self._ranksFile = "%s.rdx" % self._path
        self._namesFile = "%s.ndx" % self._path
        self._aliasFile = "%s.adx" % self._path

        print >> sys.stderr,"Reading binary taxonomy database...",

        self.__readNodeTable()

        print >> sys.stderr," ok"

        Taxonomy.__init__(self)


    def __ecoNameIterator(self):
        for record in self._ecoRecordIterator(self._namesFile):
            lrecord = len(record)
            lnames = lrecord - 16
            (isScientificName,namelength,classLength,indextaxid,names)=struct.unpack('> I I I I %ds' % lnames, record)
            name=names[:namelength]
            classname=names[namelength:]
            yield (name,classname,indextaxid)

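    # Name record layout assumed here (and produced by ecoNamePacker below):
    # after the leading record-size word consumed by _ecoRecordIterator, four
    # big-endian uint32 fields (scientific-name flag, name length, class-name
    # length, taxon index) are followed by the name and the class name bytes
    # stored back to back.
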
    def __ecoTaxonomicIterator(self):
        for record in self._ecoRecordIterator(self._taxonFile):
            lrecord = len(record)
            lnames = lrecord - 16
            (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record)
            yield (taxid,rankid,parentidx,name)

    def __ecoRankIterator(self):
        # a rank record contains only the rank name string
        for record in self._ecoRecordIterator(self._ranksFile):
            yield record

    def __ecoAliasIterator(self):
        # an alias record is a (taxid, taxon row index) pair
        for record in self._ecoRecordIterator(self._aliasFile):
            (taxid,index) = struct.unpack('> I I',record)
            yield taxid,index

    def __ecoTaxonomyIndex(self):
        taxonomy = []

        try:
            index = dict(self.__ecoAliasIterator())
            print >> sys.stderr, " [Alias file found] ",
            buildIndex=False
        except:
            print >> sys.stderr, " [Alias file not found] ",
            index={}
            i = 0
            buildIndex=True

        for x in self.__ecoTaxonomicIterator():
            taxonomy.append(x)
            if buildIndex:
                index[x[0]] = i
                i+=1
        return taxonomy, index

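    # The alias file (.adx) stores a precomputed taxid -> taxon row mapping
    # (deleted taxids are written as -1 by ecoAliasWriter below).  When it is
    # missing, the index is rebuilt from the record order of the taxon file.
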
    def __readNodeTable(self):
        self._taxonomy,self._index = self.__ecoTaxonomyIndex()
        self._ranks = [r for r in self.__ecoRankIterator()]
        self._name = [n for n in self.__ecoNameIterator()]


class TaxonomyDump(Taxonomy):

    def _taxonCmp(t1,t2):
        if t1[0] < t2[0]:
            return -1
        elif t1[0] > t2[0]:
            return +1
        return 0

    _taxonCmp=staticmethod(_taxonCmp)

    def _bsearchTaxon(self,taxid):
        taxCount = len(self._taxonomy)
        begin = 0
        end = taxCount
        oldcheck=taxCount
        check = (begin + end) / 2
        while check != oldcheck and self._taxonomy[check][0]!=taxid:
            if self._taxonomy[check][0] < taxid:
                begin=check
            else:
                end=check
            oldcheck=check
            check = (begin + end) / 2

        if self._taxonomy[check][0]==taxid:
            return check
        else:
            return None

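    # _bsearchTaxon assumes self._taxonomy is sorted by ascending taxid (see
    # the _taxonCmp sort in _readNodeTable below); it returns the row index of
    # the taxid, or None when the taxid is absent.
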
    def _readNodeTable(self,file):
        file = universalOpen(file)

        nodes = ColumnFile(file,
                           sep='|',
                           types=(int,int,str,
                                  str,str,bool,
                                  int,bool,int,
                                  bool,bool,bool,str))
        print >>sys.stderr,"Reading taxonomy dump file..."

        taxonomy=[[n[0],n[2],n[1]] for n in nodes]
        print >>sys.stderr,"Listing all taxonomy ranks..."
        ranks =list(set(x[1] for x in taxonomy))
        ranks.sort()
        rankidx = dict(map(None,ranks,xrange(len(ranks))))

        print >>sys.stderr,"Sorting taxons..."
        taxonomy.sort(TaxonomyDump._taxonCmp)

        self._taxonomy=taxonomy

        print >>sys.stderr,"Indexing taxonomy..."
        index = {}
        for t in self._taxonomy:
            index[t[0]]=self._bsearchTaxon(t[0])

        print >>sys.stderr,"Indexing parent and rank..."
        for t in self._taxonomy:
            t[1]=rankidx[t[1]]
            t[2]=index[t[2]]

        self._ranks=ranks
        self._index=index

    def _nameIterator(file):
        file = universalOpen(file)
        names = ColumnFile(file,
                           sep='|',
                           types=(int,str,
                                  str,str))
        for taxid,name,unique,classname,white in names:
            yield taxid,name,classname

    _nameIterator=staticmethod(_nameIterator)

    def _mergedNodeIterator(file):
        # merged.dmp maps an old (merged) taxid to its current taxid
        file = universalOpen(file)
        merged = ColumnFile(file,
                            sep='|',
                            types=(int,int,str))
        for taxid,current,white in merged:
            yield taxid,current

    _mergedNodeIterator=staticmethod(_mergedNodeIterator)

    def _deletedNodeIterator(file):
        # delnodes.dmp lists taxids removed from the taxonomy
        file = universalOpen(file)
        deleted = ColumnFile(file,
                             sep='|',
                             types=(int,str))
        for taxid,white in deleted:
            yield taxid

    _deletedNodeIterator=staticmethod(_deletedNodeIterator)


def ecoTaxPacker(tx):
    # binary taxon record: total size, taxid, rank id, parent index,
    # name length, then the scientific name itself
    namelength = len(tx[3])

    totalSize = 4 + 4 + 4 + 4 + namelength

    packed = struct.pack('> I I I I I %ds' % namelength,
                         totalSize,
                         tx[0],
                         tx[1],
                         tx[2],
                         namelength,
                         tx[3])

    return packed

def ecoRankPacker(rank):
    # binary rank record: name length followed by the rank name
    namelength = len(rank)

    packed = struct.pack('> I %ds' % namelength,
                         namelength,
                         rank)

    return packed

def ecoAliasPacker(taxid,index):
    # binary alias record: total size, taxid, taxon row index
    totalSize = 4 + 4
    packed = struct.pack('> I I I',
                         totalSize,
                         taxid,
                         index)

    return packed

def ecoNamePacker(name):
    # binary name record: total size, scientific-name flag, name length,
    # class-name length, taxon index, then the name and the class name
    namelength = len(name[0])
    classlength = len(name[1])
    totalSize = namelength + classlength + 4 + 4 + 4 + 4

    packed = struct.pack('> I I I I I %ds %ds' % (namelength,classlength),
                         totalSize,
                         int(name[1]=='scientific name'),
                         namelength,
                         classlength,
                         name[2],
                         name[0],
                         name[1])

    return packed

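# Sketch of a round-trip check showing that the packers above produce the
# layout the EcoTaxonomyDB iterators expect, assuming the reader strips the
# leading total-size word; the sample name and taxon index are placeholders.
def _exampleNameRoundTrip():
    record = ecoNamePacker(('Homo sapiens','scientific name',42))[4:]
    lnames = len(record) - 16
    (isScientificName,namelength,classlength,indextaxid,names) = \
        struct.unpack('> I I I I %ds' % lnames, record)
    assert isScientificName == 1 and indextaxid == 42
    assert names[:namelength] == 'Homo sapiens'
    assert names[namelength:] == 'scientific name'
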
def ecoTaxWriter(file,taxonomy):
    output = open(file,'wb')
    output.write(struct.pack('> I',len(taxonomy)))

    for tx in taxonomy:
        output.write(ecoTaxPacker(tx))

    output.close()

def ecoRankWriter(file,ranks):
    output = open(file,'wb')
    output.write(struct.pack('> I',len(ranks)))

    for rank in ranks:
        output.write(ecoRankPacker(rank))

    output.close()

def ecoAliasWriter(file,index):
    output = open(file,'wb')
    output.write(struct.pack('> I',len(index)))

    for taxid in index:
        i=index[taxid]
        if i is None:
            # deleted taxids have no taxon row; store -1 instead
            i=-1
        output.write(ecoAliasPacker(taxid, i))

    output.close()

def nameCmp(n1,n2):
    # case-insensitive comparison used to sort name records
    name1=n1[0].upper()
    name2=n2[0].upper()
    if name1 < name2:
        return -1
    elif name1 > name2:
        return 1
    return 0


def ecoNameWriter(file,names):
    output = open(file,'wb')
    output.write(struct.pack('> I',len(names)))

    names.sort(nameCmp)

    for name in names:
        output.write(ecoNamePacker(name))

    output.close()


def ecoTaxonomyWriter(prefix,taxonomy):
    ecoRankWriter('%s.rdx' % prefix, taxonomy._ranks)
    ecoTaxWriter('%s.tdx' % prefix, taxonomy._taxonomy)
    ecoNameWriter('%s.ndx' % prefix, taxonomy._name)
    ecoAliasWriter('%s.adx' % prefix, taxonomy._index)
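

# Usage sketch: load a binary ecoPCR taxonomy database and query it.  The
# database prefix and the taxids (9606: Homo sapiens, 9598: Pan troglodytes)
# are placeholders.
if __name__ == '__main__':
    taxonomy = EcoTaxonomyDB('ecodb/mydb')   # reads ecodb/mydb.{tdx,rdx,ndx,adx}
    lca = taxonomy.lastCommonTaxon(9606, 9598)
    family = taxonomy.getTaxonAtRank(9606, 'family')
    print >> sys.stderr, "last common taxon:", lca, "family taxid:", family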