Commit 262617d2 by Eric Coissac

Add scripts for ecopcr analysis

parent e891ba03
#!/usr/local/bin/python
from obitools.ecopcr import taxonomy
from obitools.ecopcr import sequence
from obitools.ecopcr import EcoPCRFile
from obitools.options import getOptionManager
def addTaxonomyOptions(optionManager):
optionManager.add_option('-d','--ecopcrdb',
action="store", dest="db",
metavar="<FILENAME>",
type="string",
help="ecoPCR Database "
"name")
if __name__=='__main__':
optionParser = getOptionManager([addTaxonomyOptions],
entryIterator=EcoPCRFile
)
(options, entries) = optionParser()
tax = taxonomy.Taxonomy(options.db)
ranks = set(x for x in tax.rankIterator())
results = [seq for seq in entries]
indexbyseq={}
for seq in results:
s = str(seq)
if s in indexbyseq:
indexbyseq[s].add(seq)
else:
indexbyseq[s]=set([seq])
print '%-20s\t%10s\t%10s\t%7s' % ('rank','taxon_ok','taxon_total','percent')
for rank,rankid in ranks:
if rank != 'no rank':
indexbytaxid={}
for seq in results:
t = tax.getTaxonAtRank(seq['taxid'],rankid)
if t is not None:
if t in indexbytaxid:
indexbytaxid[t].add(str(seq))
else:
indexbytaxid[t]=set([str(seq)])
taxoncount=0
taxonok=0
for taxon in indexbytaxid:
taxlist = set()
for tag in indexbytaxid[taxon]:
taxlist |=indexbyseq[tag]
taxoncount+=1
if len(taxlist)==1:
taxonok+=1
if taxoncount:
print '%-20s\t%10d\t%10d\t%8.2f' % (rank,taxonok,taxoncount,float(taxonok)/taxoncount*100)
\ No newline at end of file
#!/usr/local/bin/python
from obitools.ecopcr import taxonomy
from obitools.ecopcr import sequence
from obitools.ecopcr import EcoPCRFile
from obitools.options import getOptionManager
def addTaxonomyOptions(optionManager):
optionManager.add_option('-d','--ecopcrdb',
action="store", dest="db",
metavar="<FILENAME>",
type="string",
help="ecoPCR Database "
"name")
if __name__=='__main__':
optionParser = getOptionManager([addTaxonomyOptions],
entryIterator=EcoPCRFile
)
(options, entries) = optionParser()
tax = taxonomy.Taxonomy(options.db)
seqd= sequence.ecoPCRDBSequenceIterator(options.db,taxonomy=tax)
ranks = set(x for x in tax.rankIterator())
listtaxonbyrank = {}
for seq in seqd:
for rank,rankid in ranks:
if rank != 'no rank':
t = tax.getTaxonAtRank(seq['taxid'],rankid)
if t is not None:
if rank in listtaxonbyrank:
listtaxonbyrank[rank].add(t)
else:
listtaxonbyrank[rank]=set([t])
stats = dict((x,len(listtaxonbyrank[x])) for x in listtaxonbyrank)
listtaxonbyrank = {}
for seq in entries:
for rank,rankid in ranks:
if rank != 'no rank':
t = tax.getTaxonAtRank(seq['taxid'],rankid)
if t is not None:
if rank in listtaxonbyrank:
listtaxonbyrank[rank].add(t)
else:
listtaxonbyrank[rank]=set([t])
dbstats= dict((x,len(listtaxonbyrank[x])) for x in listtaxonbyrank)
ranknames = [x[0] for x in ranks]
ranknames.sort()
print '%-20s\t%10s\t%10s\t%7s' % ('rank','ecopcr','db','percent')
for r in ranknames:
if r in dbstats and dbstats[r]:
print '%-20s\t%10d\t%10d\t%8.2f' % (r,dbstats[r],stats[r],float(dbstats[r])/stats[r]*100)
from obitools import utils
from obitools import NucSequence
from obitools.utils import universalOpen, universalTell, fileSize, progressBar
import struct
class EcoPCRFile(utils.ColumnFile):
def __init__(self,stream):
......@@ -34,4 +37,15 @@ class EcoPCRFile(utils.ColumnFile):
return seq
\ No newline at end of file
class EcoPCRDBFile(object):
def _ecoRecordIterator(self,file):
file = universalOpen(file)
(recordCount,) = struct.unpack('> I',file.read(4))
for i in xrange(recordCount):
(recordSize,)=struct.unpack('>I',file.read(4))
record = file.read(recordSize)
yield record
from obitools import NucSequence
from obitools.ecopcr import EcoPCRDBFile
from obitools.ecopcr.taxonomy import Taxonomy
from glob import glob
import struct
import gzip
class ecoPCRDBSequenceIterator(EcoPCRDBFile):
def __init__(self,path,taxonomy=None):
self._path = path
if taxonomy is not None:
self._taxonomy=taxonomy
else:
self._taxonomy=Taxonomy(path)
self._seqfilesFiles = glob('%s_???.sdx' % self._path)
self._seqfilesFiles.sort()
def __ecoSequenceIterator(self,file):
for record in self._ecoRecordIterator(file):
lrecord = len(record)
lnames = lrecord - (4*4+20)
(taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record)
de = string[:deflength]
seq = gzip.zlib.decompress(string[deflength:])
bioseq = NucSequence(seqid,seq,de,taxidx=taxid,taxid=self._taxonomy._taxonomy[taxid][0])
yield bioseq
def __iter__(self):
for seqfile in self._seqfilesFiles:
for seq in self.__ecoSequenceIterator(seqfile):
yield seq
......@@ -3,12 +3,15 @@ import sys
import os
import gzip
from itertools import count
from obitools.ecopcr import EcoPCRDBFile
from obitools.utils import universalOpen, universalTell, fileSize, progressBar
class Taxonomy(object):
class Taxonomy(EcoPCRDBFile):
def __init__(self,path):
......@@ -31,18 +34,10 @@ class Taxonomy(object):
def __ecoRecordIterator(self,file):
file = universalOpen(file)
(recordCount,) = struct.unpack('> I',file.read(4))
for i in xrange(recordCount):
(recordSize,)=struct.unpack('>I',file.read(4))
record = file.read(recordSize)
yield record
def __ecoNameIterator(self):
for record in self.__ecoRecordIterator(self._namesFile):
for record in self._ecoRecordIterator(self._namesFile):
lrecord = len(record)
lnames = lrecord - 16
(isScientificName,namelength,classLength,indextaxid,names)=struct.unpack('> I I I I %ds' % lnames, record)
......@@ -52,14 +47,14 @@ class Taxonomy(object):
def __ecoTaxonomicIterator(self):
for record in self.__ecoRecordIterator(self._taxonFile):
for record in self._ecoRecordIterator(self._taxonFile):
lrecord = len(record)
lnames = lrecord - 16
(taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record)
yield (taxid,rankid,parentidx,name)
def __ecoRankIterator(self):
for record in self.__ecoRecordIterator(self._ranksFile):
for record in self._ecoRecordIterator(self._ranksFile):
yield record
......@@ -172,5 +167,8 @@ class Taxonomy(object):
def getOrder(self,taxid):
return self.getTaxonAtRank(taxid, self._orderidx)
def rankIterator(self):
for x in map(None,self._ranks,xrange(len(self._ranks))):
yield x
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment