ecotaxstat.py 3.84 KB
Newer Older
1
#!/usr/local/bin/python
2
'''
Frédéric Boyer committed
3 4
:py:mod:`ecotaxstat` : getting the coverage of an ecoPCR output compared to the original ecoPCR database
========================================================================================================
5 6 7

.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>

Frédéric Boyer committed
8
The :py:mod:`ecotaxstat` command requires two parameters : an *ecoPCR* formatted database (specified 
Frédéric Boyer committed
9 10
with the `-d` option, (see :doc:`obiconvert <obiconvert>` for a description of the database format) 
and an ecoPCR output (ideally computed using the specified ecoPCR database).
11

Frédéric Boyer committed
12 13 14
The command outputs, for every rank, the coverage (Bc) of the ecoPCR output. The coverage (Bc) is the 
fraction of *taxids* that have a sequence in the database and have also have a sequence in the ecoPCR 
output file.
15

Frédéric Boyer committed
16
Optionally, *taxids* can be specified to focus the coverage on a smaller part of the taxonomy.
17
'''
18 19 20 21 22 23

from obitools.ecopcr import taxonomy
from obitools.ecopcr import sequence
from obitools.ecopcr import EcoPCRFile

from obitools.options import getOptionManager
24
from obitools.ecopcr.options import loadTaxonomyDatabase
25

26
import sys
27 28 29 30 31 32 33 34 35 36

def addTaxonomyOptions(optionManager):
        
    optionManager.add_option('-d','--ecopcrdb',
                             action="store", dest="db",
                             metavar="<FILENAME>",
                             type="string",
                             help="ecoPCR Database "
                                  "name")

37 38 39 40 41 42 43 44
    optionManager.add_option('-r','--required',
                             action="append", 
                             dest='required',
                             metavar="<TAXID>",
                             type="int",
                             default=[],
                             help="required taxid")

45 46 47
if __name__=='__main__':

    optionParser = getOptionManager([addTaxonomyOptions],
48
                                    entryIterator=EcoPCRFile)
49 50 51
    
    (options, entries) = optionParser()
    
52
    if (options.db is None):
53
        print>>sys.stderr, "-d option is required"
54
        sys.exit(1)
55 56 57 58

    if len(options.required)==0:
        print>>sys.stderr, "-r option is required"
        sys.exit(1)
59
    
Eric Coissac committed
60
    tax = taxonomy.EcoTaxonomyDB(options.db)
Eric Coissac committed
61
    seqd= sequence.EcoPCRDBSequenceIterator(options.db,taxonomy=tax)
62 63 64 65 66 67
    
    ranks = set(x for x in tax.rankIterator())
    
    listtaxonbyrank = {}
    
    for seq in seqd:
68 69 70 71 72 73 74 75 76 77 78 79 80 81
        taxid = seq['taxid']
        if (options.required and
            reduce(lambda x,y: x or y,
                      (tax.isAncestor(r,taxid) for r in options.required),
                      False)):

            for rank,rankid in ranks:
                if rank != 'no rank':
                    t = tax.getTaxonAtRank(seq['taxid'],rankid)
                    if t is not None:
                        if rank in listtaxonbyrank:
                            listtaxonbyrank[rank].add(t)
                        else:
                            listtaxonbyrank[rank]=set([t])
82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
                        
    stats = dict((x,len(listtaxonbyrank[x])) for x in listtaxonbyrank)
    
    listtaxonbyrank = {}
        
    for seq in entries:
        for rank,rankid in ranks:
            if rank != 'no rank':
                t = tax.getTaxonAtRank(seq['taxid'],rankid)
                if t is not None:
                    if rank in listtaxonbyrank:
                        listtaxonbyrank[rank].add(t)
                    else:
                        listtaxonbyrank[rank]=set([t])

    dbstats= dict((x,len(listtaxonbyrank[x])) for x in listtaxonbyrank)
    
    ranknames = [x[0] for x in ranks]
    ranknames.sort()
    
    print '%-20s\t%10s\t%10s\t%7s' % ('rank','ecopcr','db','percent')
    
    for r in ranknames:
105
        if  r in dbstats and r in stats and dbstats[r]:
106 107 108 109
            print '%-20s\t%10d\t%10d\t%8.2f' % (r,dbstats[r],stats[r],float(dbstats[r])/stats[r]*100)