obistat.py 7.55 KB
Newer Older
1 2
#!/usr/local/bin/python
'''
Aurélie Bonin committed
3
:py:mod:`obistat`: computes basic statistics for attribute values 
Aurélie Bonin committed
4
=================================================================
5

Aurélie Bonin committed
6 7 8
.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>

:py:mod:`obistats` computes basic statistics for attribute values of sequence records.
Eric Coissac committed
9
The sequence records can be categorized or not using one or several ``-c`` options.
Aurélie Bonin committed
10
By default, only the number of sequence records and the total count are computed for each category. 
Eric Coissac committed
11 12 13 14 15 16 17 18
Additional statistics can be computed for attribute values in each category, like:

    - minimum value (``-m`` option) 
    - maximum value (``-M`` option) 
    - mean value (``-a`` option) 
    - variance (``-v`` option) 
    - standard deviation (``-s`` option)
    
Aurélie Bonin committed
19 20
The result is a contingency table with the different categories in rows, and the 
computed statistics in columns. 
21

Aurélie Bonin committed
22
'''
23 24
from obitools.options import getOptionManager
from obitools.format.options import addInputFormatOption
25
from obitools.ecopcr.options import addTaxonomyDBOptions, loadTaxonomyDatabase
26
import math
27 28

def addStatOptions(optionManager):
Aurélie Bonin committed
29
    group = optionManager.add_option_group('obistat specific options')
Aurélie Bonin committed
30
    group.add_option('-c','--category-attribute',
31 32 33
                             action="append", dest="categories",
                             metavar="<Attribute Name>",
                             default=[],
Aurélie Bonin committed
34
                             help="Attribute used to categorize the sequence records.")
35
    
Aurélie Bonin committed
36
    group.add_option('-m','--min',
37 38 39
                             action="append", dest="minimum",
                             metavar="<Attribute Name>",
                             default=[],
Aurélie Bonin committed
40
                             help="Computes the minimum value of attribute for each category.")
41
    
Aurélie Bonin committed
42
    group.add_option('-M','--max',
43 44 45
                             action="append", dest="maximum",
                             metavar="<Attribute Name>",
                             default=[],
Aurélie Bonin committed
46
                             help="Computes the maximum value of attribute for each category.")
47

Aurélie Bonin committed
48
    group.add_option('-a','--mean',
49 50 51
                             action="append", dest="mean",
                             metavar="<Attribute Name>",
                             default=[],
Aurélie Bonin committed
52
                             help="Computes the mean value of attribute for each category.")
53

Aurélie Bonin committed
54
    group.add_option('-v','--variance',
55 56 57
                             action="append", dest="var",
                             metavar="<Attribute Name>",
                             default=[],
Aurélie Bonin committed
58
                             help="Computes the variance of attribute for each category.")
59

Aurélie Bonin committed
60
    group.add_option('-s','--std-dev',
61 62 63
                             action="append", dest="sd",
                             metavar="<Attribute Name>",
                             default=[],
Aurélie Bonin committed
64
                             help="Computes the standard deviation of attribute for each category.")
65

66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
    
def statistics(values,attribute,func):
    stat={}
    lstat={}
    
    for var in attribute:
        if var in values:
            stat[var]={}
            lstat[var]=0
            for c in values[var]:
                v = values[var][c]
                m = func(v)
                stat[var][c]=m
                lm=len(str(m))
                if lm > lstat[var]:
                    lstat[var]=lm
                
    return stat,lstat

def minimum(values,options):
    return statistics(values, options.minimum, min)
    

def maximum(values,options):
    return statistics(values, options.maximum, max)

def mean(values,options):
    def average(v):
        s = reduce(lambda x,y:x+y,v,0)
        return float(s)/len(v)
    return statistics(values, options.mean, average)


99
def variance(v):
100 101
    if len(v)==1: 
        return 0 
102 103 104
    s = reduce(lambda x,y:(x[0]+y,x[1]+y**2),v,(0.,0.))
    return s[1]/(len(v)-1) - s[0]**2/len(v)/(len(v)-1)

Eric Coissac committed
105
def varpop(values,options):
106 107 108 109 110 111 112 113
    return statistics(values, options.var, variance)
    
def sd(values,options):
    def stddev(v):
        return math.sqrt(variance(v))
    return statistics(values, options.sd, stddev)
    

114
if __name__ == "__main__":
Eric Coissac committed
115 116
    optionParser = getOptionManager([addStatOptions,addInputFormatOption,addTaxonomyDBOptions],
                                    progdoc=__doc__)
117 118 119
    
    (options, entries) = optionParser()
    
120 121
    loadTaxonomyDatabase(options)

122 123 124
    options.statistics = set(options.minimum) | set(options.maximum) | set(options.mean)
    total = 0
    catcount={}
125
    totcount={}
126 127 128 129 130 131
    values={}
    lcat=0
    
    for s in entries:
        category = []
        for c in options.categories:
132 133 134 135 136 137 138
            try:
                if hasattr(options, 'taxonomy') and options.taxonomy is not None:
                    environ = {'taxonomy' : options.taxonomy,'sequence':s}
                else:
                    environ = {'sequence':s}
        
                v = eval(c,environ,s)
139 140 141 142
                lv=len(str(v))
                if lv > lcat:
                    lcat=lv
                category.append(v)
143
            except:
144 145 146 147 148
                category.append(None)
                if 4 > lcat:
                    lcat=4
        category=tuple(category)
        catcount[category]=catcount.get(category,0)+1
149 150 151 152
        try: 
            totcount[category]=totcount.get(category,0)+s['count']
        except KeyError:
            totcount[category]=totcount.get(category,0)+1
153 154 155 156 157 158 159 160 161 162
        for var in options.statistics:
            if var in s:
                v = s[var]
                if var not in values:
                    values[var]={}
                if category not in values[var]:
                    values[var][category]=[]
                values[var][category].append(v)
                
    
163 164 165
    mini,lmini  = minimum(values, options)
    maxi,lmaxi  = maximum(values, options)
    avg ,lavg   = mean(values, options)
Eric Coissac committed
166
    varp ,lvarp = varpop(values, options)
167
    sigma,lsigma= sd(values, options)
Eric Coissac committed
168
    
169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185
            
    pcat  = "%%-%ds" % lcat
    if options.minimum:
        minvar= "min_%%-%ds" % max(len(x) for x in options.minimum)
    else:
        minvar= "%s"
        
    if options.maximum:
        maxvar= "max_%%-%ds" % max(len(x) for x in options.maximum)
    else:
        maxvar= "%s"
        
    if options.mean:
        meanvar= "mean_%%-%ds" % max(len(x) for x in options.mean)
    else:
        meanvar= "%s"
        
186 187 188 189 190 191 192 193 194 195
    if options.var:
        varvar= "var_%%-%ds" % max(len(x) for x in options.var)
    else:
        varvar= "%s"
        
    if options.sd:
        sdvar= "sd_%%-%ds" % max(len(x) for x in options.sd)
    else:
        sdvar= "%s"
        
196 197 198
    hcat = "\t".join([pcat % x for x in options.categories]) + "\t" +\
           "\t".join([minvar % x for x in options.minimum])  + "\t" +\
           "\t".join([maxvar % x for x in options.maximum])  + "\t" +\
199 200 201
           "\t".join([meanvar % x for x in options.mean])  + "\t" +\
           "\t".join([varvar % x for x in options.var])  + "\t" +\
           "\t".join([sdvar % x for x in options.sd]) + \
202 203
           "\t   count" + \
           "\t   total" 
204 205 206 207 208 209 210 211 212 213
    print hcat
    for c in catcount:
        for v in c:
            print pcat % str(v)+"\t",
        for m in options.minimum:
            print (("%%%dd" % lmini[m]) % mini[m][c])+"\t",
        for m in options.maximum:
            print (("%%%dd" % lmaxi[m]) % maxi[m][c])+"\t",
        for m in options.mean:
            print (("%%%df" % lavg[m]) % avg[m][c])+"\t",
214 215 216 217
        for m in options.var:
            print (("%%%df" % lvarp[m]) % varp[m][c])+"\t",
        for m in options.sd:
            print (("%%%df" % lsigma[m]) % sigma[m][c])+"\t",
218 219
        print "%7d" %catcount[c],
        print "%9d" %totcount[c]
220 221