ecodbtaxstat.py 2.92 KB
Newer Older
1
#!/usr/local/bin/python
Eric Coissac committed
2
'''
Frédéric Boyer committed
3
:py:mod:`ecodbtaxstat`: gives taxonomic rank frequency of a given ``ecopcr`` database   
Eric Coissac committed
4
=====================================================================================
5 6 7

.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>

Eric Coissac committed
8
The :py:mod:`ecodbtaxstat` command requires an ``ecopcr`` database and a taxonomic rank 
9 10 11
(specified by the ``--rank`` option, default *species*). The command outputs first 
the total number of sequence records in the database having taxonomic information at this rank, 
and then the number of sequence records for each value of this rank.
Eric Coissac committed
12 13 14 15 16 17 18 19 20 21 22 23 24

'''

from obitools.options import getOptionManager

from obitools.options.taxonomyfilter import addTaxonomyFilterOptions,  \
                                            taxonomyFilterIteratorGenerator

from obitools.ecopcr.taxonomy import EcoTaxonomyDB
from obitools.ecopcr.sequence import EcoPCRDBSequenceIterator

def addRankOptions(optionManager):
    
Eric Coissac committed
25
    group = optionManager.add_option_group('ecodbtaxstat specific option')
26
    group.add_option('--rank',
Eric Coissac committed
27 28 29 30
                             action="store", dest="rank",
                             metavar="<taxonomic rank>",
                             type="string",
                             default="species",
31 32 33
                             help="The taxonomic rank at which frequencies have to be computed. " 
                                  "Possible values are: "
                                  "class, family, forma, genus, infraclass, infraorder, kingdom, "
Eric Coissac committed
34
                                  "order, parvorder, phylum, species, species group, "
35 36 37
                                  "species subgroup, subclass, subfamily, subgenus, subkingdom, "
                                  "suborder, subphylum, subspecies, subtribe, superclass, "
                                  "superfamily, superkingdom, superorder, superphylum, tribe or varietas. "
Eric Coissac committed
38
                                  "(Default: species)")
Eric Coissac committed
39 40 41 42 43 44 45 46 47 48


def cmptax(taxonomy):
    def cmptaxon(t1,t2):
        return cmp(taxonomy.getScientificName(t1),
                   taxonomy.getScientificName(t2))
    return cmptaxon

if __name__=='__main__':
    
49
    optionParser = getOptionManager([addRankOptions,addTaxonomyFilterOptions], progdoc=__doc__)
Eric Coissac committed
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
    
    
    (options, entries) = optionParser()


    filter = taxonomyFilterIteratorGenerator(options)
    seqdb = EcoPCRDBSequenceIterator(options.ecodb,options.taxonomy)

    stats = {}
    i=0
    tot=0
    for seq in filter(seqdb):
        tot+=1
        t = options.taxonomy.getTaxonAtRank(seq['taxid'],options.rank)
        if t is not None:
            i+=1
            stats[t]=stats.get(t,0)+1
       
    print "#sequence count : %d" % tot
    print "#considered sequences : %d" % i     
    print "# %s : %d" % (options.rank,len(stats))
    taxons = stats.keys()
    taxons.sort(cmptax(options.taxonomy))
    
    for t in taxons:
        print "%s\t%d" % (options.taxonomy.getScientificName(t),stats[t])