
Source Code for Module obitools.ecopcr.taxonomy

import struct
import sys
import os
import gzip

from itertools import count, imap

from obitools.ecopcr import EcoPCRDBFile
from obitools.utils import universalOpen, universalTell, fileSize, progressBar
from obitools.utils import ColumnFile

class Taxonomy(object):
    def __init__(self):
        '''
        Build the rank and name indexes of an already loaded taxonomy.

        C{self._ranks}, C{self._name} and C{self._taxonomy} must have been
        filled by the subclass constructor before this method is called.
        '''
        self._speciesidx = self._ranks.index('species')
        self._genusidx = self._ranks.index('genus')
        self._familyidx = self._ranks.index('family')
        self._orderidx = self._ranks.index('order')
        self._nameidx = dict((x[0], x[2]) for x in self._name)

    def findTaxonByTaxid(self, taxid):
        return self._taxonomy[self._index[taxid]]

    def findTaxonByName(self, name):
        return self._taxonomy[self._nameidx[name]]

    def findRankByName(self, rank):
        try:
            return self._ranks.index(rank)
        except ValueError:
            return None

    def findIndex(self, taxid):
        return self._index[taxid]

    #####
    #
    # PUBLIC METHODS
    #
    #####

    def subTreeIterator(self, taxid):
        "return the subtree rooted at the given taxonomic id"
        idx = self._index[taxid]
        yield self._taxonomy[idx]
        for t in self._taxonomy:
            if t[2] == idx:
                for subt in self.subTreeIterator(t[0]):
                    yield subt

    def parentalTreeIterator(self, taxid):
        """
        Return the parental tree for the given taxonomic id,
        from the taxon itself up to the root.
        """
        taxon = self.findTaxonByTaxid(taxid)
        while taxon[2] != 0:
            yield taxon
            taxon = self._taxonomy[taxon[2]]
        yield self._taxonomy[0]

    def isAncestor(self, parent, taxid):
        return parent in [x[0] for x in self.parentalTreeIterator(taxid)]

    def lastCommonTaxon(self, *taxids):
        if not taxids:
            return None
        if len(taxids) == 1:
            return taxids[0]

        if len(taxids) == 2:
            # pairwise case: compare both root-to-taxon paths from the root
            # down and return the last taxid they share
            t1 = [x[0] for x in self.parentalTreeIterator(taxids[0])]
            t2 = [x[0] for x in self.parentalTreeIterator(taxids[1])]
            t1.reverse()
            t2.reverse()

            count = min(len(t1), len(t2))
            i = 0
            while i < count and t1[i] == t2[i]:
                i += 1
            i -= 1

            return t1[i]

        # more than two taxids: fold the pairwise computation
        ancetre = taxids[0]
        for taxon in taxids[1:]:
            ancetre = self.lastCommonTaxon(ancetre, taxon)

        return ancetre

    def getScientificName(self, taxid):
        return self.findTaxonByTaxid(taxid)[3]

    def getRankId(self, taxid):
        return self.findTaxonByTaxid(taxid)[1]

    def getRank(self, taxid):
        return self._ranks[self.getRankId(taxid)]

    def getTaxonAtRank(self, taxid, rankid):
        if isinstance(rankid, str):
            rankid = self._ranks.index(rankid)
        try:
            return [x[0] for x in self.parentalTreeIterator(taxid)
                    if self.getRankId(x[0]) == rankid][0]
        except IndexError:
            return None

    def getSpecies(self, taxid):
        return self.getTaxonAtRank(taxid, self._speciesidx)

    def getGenus(self, taxid):
        return self.getTaxonAtRank(taxid, self._genusidx)

    def getFamily(self, taxid):
        return self.getTaxonAtRank(taxid, self._familyidx)

    def getOrder(self, taxid):
        return self.getTaxonAtRank(taxid, self._orderidx)

    def rankIterator(self):
        for x in imap(None, self._ranks, xrange(len(self._ranks))):
            yield x

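# Usage sketch (not part of the original module): how the Taxonomy traversal
# methods compose. `tax` stands for any loaded Taxonomy subclass instance
# (e.g. an EcoTaxonomyDB, defined below); the helper names are hypothetical.

def _example_lineage(tax, taxid):
    # (rank, scientific name) pairs from the taxon itself up to the root,
    # as produced by parentalTreeIterator
    return [(tax.getRank(t[0]), tax.getScientificName(t[0]))
            for t in tax.parentalTreeIterator(taxid)]

def _example_lca_name(tax, *taxids):
    # scientific name of the last common taxon of one or more taxids
    return tax.getScientificName(tax.lastCommonTaxon(*taxids))
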
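# Usage sketch (hypothetical, same assumptions as above): rank-based lookups
# with getTaxonAtRank and its getSpecies/getGenus/getFamily/getOrder shortcuts.

def _example_species_and_family(tax, taxid):
    # getTaxonAtRank accepts a rank name or a rank index and returns the
    # taxid of the ancestor at that rank, or None if there is none
    species = tax.getSpecies(taxid)
    family = tax.getTaxonAtRank(taxid, 'family')
    return species, family
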
class EcoTaxonomyDB(Taxonomy, EcoPCRDBFile):
    '''
    A taxonomy database class
    '''

    def __init__(self, path):
        '''
        The taxonomy database constructor

        @param path: path to the ecoPCR database including the database prefix name
        @type path: C{str}
        '''
        self._path = path
        self._taxonFile = "%s.tdx" % self._path
        self._ranksFile = "%s.rdx" % self._path
        self._namesFile = "%s.ndx" % self._path
        self._aliasFile = "%s.adx" % self._path

        print >> sys.stderr, "Reading binary taxonomy database...",

        self.__readNodeTable()

        print >> sys.stderr, " ok"

        Taxonomy.__init__(self)

    #####
    #
    # Iterator functions
    #
    #####

    def __ecoNameIterator(self):
        for record in self._ecoRecordIterator(self._namesFile):
            lrecord = len(record)
            lnames = lrecord - 16
            # name record: scientific-name flag, name length, class length,
            # taxon index, then the name and name-class strings
            (isScientificName, namelength, classLength, indextaxid, names) = struct.unpack('> I I I I %ds' % lnames, record)
            name = names[:namelength]
            classname = names[namelength:]
            yield (name, classname, indextaxid)

    def __ecoTaxonomicIterator(self):
        for record in self._ecoRecordIterator(self._taxonFile):
            lrecord = len(record)
            lnames = lrecord - 16
            (taxid, rankid, parentidx, nameLength, name) = struct.unpack('> I I I I %ds' % lnames, record)
            yield (taxid, rankid, parentidx, name)

    def __ecoRankIterator(self):
        for record in self._ecoRecordIterator(self._ranksFile):
            yield record

    def __ecoAliasIterator(self):
        for record in self._ecoRecordIterator(self._aliasFile):
            (taxid, index) = struct.unpack('> I I', record)
            yield taxid, index

    #####
    #
    # Indexes
    #
    #####

    def __ecoNameIndex(self):
        indexName = [x for x in self.__ecoNameIterator()]
        return indexName

    def __ecoRankIndex(self):
        rank = [r for r in self.__ecoRankIterator()]
        return rank

    def __ecoTaxonomyIndex(self):
        taxonomy = []

        try:
            index = dict(self.__ecoAliasIterator())
            print >> sys.stderr, " [Alias file found] ",
            buildIndex = False
        except:
            print >> sys.stderr, " [Alias file not found] ",
            index = {}
            i = 0
            buildIndex = True

        for x in self.__ecoTaxonomicIterator():
            taxonomy.append(x)
            if buildIndex:
                index[x[0]] = i
                i += 1
        return taxonomy, index

    def __readNodeTable(self):
        self._taxonomy, self._index = self.__ecoTaxonomyIndex()
        self._ranks = self.__ecoRankIndex()
        self._name = self.__ecoNameIndex()

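# Usage sketch (hypothetical, not part of the original module): loading the
# binary taxonomy written by ecoTaxonomyWriter (see the end of this module).
# The prefix is an assumption; it must point at the <prefix>.tdx/.rdx/.ndx
# (and optional .adx) files of an ecoPCR database.

def _example_load_ecopcr_taxonomy(prefix, taxid):
    # reads the binary taxonomy files and resolves one taxid
    tax = EcoTaxonomyDB(prefix)
    return tax.getScientificName(taxid)
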
class TaxonomyDump(Taxonomy):

    def __init__(self, taxdir):

        self._path = taxdir
        self._readNodeTable('%s/nodes.dmp' % taxdir)

        print >> sys.stderr, "Adding scientific name..."

        self._name = []
        for taxid, name, classname in TaxonomyDump._nameIterator('%s/names.dmp' % taxdir):
            self._name.append((name, classname, self._index[taxid]))
            if classname == 'scientific name':
                self._taxonomy[self._index[taxid]].append(name)

        print >> sys.stderr, "Adding taxid alias..."
        for taxid, current in TaxonomyDump._mergedNodeIterator('%s/merged.dmp' % taxdir):
            self._index[taxid] = self._index[current]

        print >> sys.stderr, "Adding deleted taxid..."
        for taxid in TaxonomyDump._deletedNodeIterator('%s/delnodes.dmp' % taxdir):
            self._index[taxid] = None

    def _taxonCmp(t1, t2):
        if t1[0] < t2[0]:
            return -1
        elif t1[0] > t2[0]:
            return +1
        return 0

    _taxonCmp = staticmethod(_taxonCmp)

    def _bsearchTaxon(self, taxid):
        # binary search of a taxid in the taxid-sorted self._taxonomy list
        taxCount = len(self._taxonomy)
        begin = 0
        end = taxCount
        oldcheck = taxCount
        check = (begin + end) / 2
        while check != oldcheck and self._taxonomy[check][0] != taxid:
            if self._taxonomy[check][0] < taxid:
                begin = check
            else:
                end = check
            oldcheck = check
            check = (begin + end) / 2

        if self._taxonomy[check][0] == taxid:
            return check
        else:
            return None

    def _readNodeTable(self, file):

        file = universalOpen(file)

        nodes = ColumnFile(file,
                           sep='|',
                           types=(int, int, str,
                                  str, str, bool,
                                  int, bool, int,
                                  bool, bool, bool, str))
        print >> sys.stderr, "Reading taxonomy dump file..."
        # (taxid,rank,parent)
        taxonomy = [[n[0], n[2], n[1]] for n in nodes]

        print >> sys.stderr, "List all taxonomy rank..."
        ranks = list(set(x[1] for x in taxonomy))
        ranks.sort()
        rankidx = dict(map(None, ranks, xrange(len(ranks))))

        print >> sys.stderr, "Sorting taxons..."
        taxonomy.sort(TaxonomyDump._taxonCmp)

        self._taxonomy = taxonomy

        print >> sys.stderr, "Indexing taxonomy..."
        index = {}
        for t in self._taxonomy:
            index[t[0]] = self._bsearchTaxon(t[0])

        print >> sys.stderr, "Indexing parent and rank..."
        for t in self._taxonomy:
            t[1] = rankidx[t[1]]
            t[2] = index[t[2]]

        self._ranks = ranks
        self._index = index

    def _nameIterator(file):
        file = universalOpen(file)
        names = ColumnFile(file,
                           sep='|',
                           types=(int, str,
                                  str, str))
        for taxid, name, unique, classname, white in names:
            yield taxid, name, classname

    _nameIterator = staticmethod(_nameIterator)

    def _mergedNodeIterator(file):
        file = universalOpen(file)
        merged = ColumnFile(file,
                            sep='|',
                            types=(int, int, str))
        for taxid, current, white in merged:
            yield taxid, current

    _mergedNodeIterator = staticmethod(_mergedNodeIterator)

    def _deletedNodeIterator(file):
        file = universalOpen(file)
        deleted = ColumnFile(file,
                             sep='|',
                             types=(int, str))
        for taxid, white in deleted:
            yield taxid

    _deletedNodeIterator = staticmethod(_deletedNodeIterator)

#####
#
#
# Binary writer
#
#
#####

def ecoTaxonomyWriter(prefix, taxonomy):

    def ecoTaxPacker(tx):

        namelength = len(tx[3])

        totalSize = 4 + 4 + 4 + 4 + namelength

        packed = struct.pack('> I I I I I %ds' % namelength,
                             totalSize,
                             tx[0],
                             tx[1],
                             tx[2],
                             namelength,
                             tx[3])

        return packed

    def ecoRankPacker(rank):

        namelength = len(rank)

        packed = struct.pack('> I %ds' % namelength,
                             namelength,
                             rank)

        return packed

    def ecoAliasPacker(taxid, index):

        totalSize = 4 + 4
        packed = struct.pack('> I I I',
                             totalSize,
                             taxid,
                             index)

        return packed

    def ecoNamePacker(name):

        namelength = len(name[0])
        classlength = len(name[1])
        totalSize = namelength + classlength + 4 + 4 + 4 + 4

        packed = struct.pack('> I I I I I %ds %ds' % (namelength, classlength),
                             totalSize,
                             int(name[1] == 'scientific name'),
                             namelength,
                             classlength,
                             name[2],
                             name[0],
                             name[1])

        return packed

    def ecoTaxWriter(file, taxonomy):
        output = open(file, 'wb')
        output.write(struct.pack('> I', len(taxonomy)))

        for tx in taxonomy:
            output.write(ecoTaxPacker(tx))

        output.close()

    def ecoRankWriter(file, ranks):
        output = open(file, 'wb')
        output.write(struct.pack('> I', len(ranks)))

        for rank in ranks:
            output.write(ecoRankPacker(rank))

        output.close()

    def ecoAliasWriter(file, index):
        output = open(file, 'wb')
        output.write(struct.pack('> I', len(index)))

        for taxid in index:
            i = index[taxid]
            if i is None:
                i = -1
            output.write(ecoAliasPacker(taxid, i))

        output.close()

    def nameCmp(n1, n2):
        name1 = n1[0].upper()
        name2 = n2[0].upper()
        if name1 < name2:
            return -1
        elif name1 > name2:
            return 1
        return 0

    def ecoNameWriter(file, names):
        output = open(file, 'wb')
        output.write(struct.pack('> I', len(names)))

        names.sort(nameCmp)

        for name in names:
            output.write(ecoNamePacker(name))

        output.close()

    ecoRankWriter('%s.rdx' % prefix, taxonomy._ranks)
    ecoTaxWriter('%s.tdx' % prefix, taxonomy._taxonomy)
    ecoNameWriter('%s.ndx' % prefix, taxonomy._name)
    ecoAliasWriter('%s.adx' % prefix, taxonomy._index)
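
# Usage sketch (hypothetical, not part of the original module): converting an
# NCBI taxdump directory (nodes.dmp, names.dmp, merged.dmp, delnodes.dmp) into
# the binary files read back by EcoTaxonomyDB. Both paths are assumptions.

def _example_build_binary_taxonomy(taxdump_dir, prefix):
    taxo = TaxonomyDump(taxdump_dir)    # parse the NCBI dump files
    ecoTaxonomyWriter(prefix, taxo)     # write <prefix>.tdx/.rdx/.ndx/.adx
    return EcoTaxonomyDB(prefix)        # reload from the binary files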