obistat.py 7.46 KB
Newer Older
1 2
#!/usr/local/bin/python
'''
Aurélie Bonin committed
3 4
:py:mod:`obistat`: Computes basic statistics for attribute values 
=================================================================
5

Aurélie Bonin committed
6 7 8 9 10 11 12 13 14 15
.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>

:py:mod:`obistats` computes basic statistics for attribute values of sequence records.
The sequence records can be categorized or not using the ``-c`` option, and several ``-c`` options can be combined.
By default, only the number of sequence records and the total count are computed for each category. 
Additional statitics can be computed for attribute values in each category, like 
minimum value (``-m`` option), maximum value (``-M`` option), mean value 
(``-a`` option), variance (``-v`` option) or standard deviation (``-s`` option).
The result is a contingency table with the different categories in rows, and the 
computed statistics in columns. 
16

Aurélie Bonin committed
17
'''
18 19
from obitools.options import getOptionManager
from obitools.format.options import addInputFormatOption
20
from obitools.ecopcr.options import addTaxonomyDBOptions, loadTaxonomyDatabase
21
import math
22 23

def addStatOptions(optionManager):
Aurélie Bonin committed
24 25
    group = optionManager.add_option_group('Obistat specific options')
    group.add_option('-c','--category-attribute',
26 27 28
                             action="append", dest="categories",
                             metavar="<Attribute Name>",
                             default=[],
Aurélie Bonin committed
29
                             help="Attribute used to categorize the sequence records.")
30
    
Aurélie Bonin committed
31
    group.add_option('-m','--min',
32 33 34
                             action="append", dest="minimum",
                             metavar="<Attribute Name>",
                             default=[],
Aurélie Bonin committed
35
                             help="Computes the minimum value of attribute for each category.")
36
    
Aurélie Bonin committed
37
    group.add_option('-M','--max',
38 39 40
                             action="append", dest="maximum",
                             metavar="<Attribute Name>",
                             default=[],
Aurélie Bonin committed
41
                             help="Computes the maximum value of attribute for each category.")
42

Aurélie Bonin committed
43
    group.add_option('-a','--mean',
44 45 46
                             action="append", dest="mean",
                             metavar="<Attribute Name>",
                             default=[],
Aurélie Bonin committed
47
                             help="Computes the mean value of attribute for each category.")
48

Aurélie Bonin committed
49
    group.add_option('-v','--variance',
50 51 52
                             action="append", dest="var",
                             metavar="<Attribute Name>",
                             default=[],
Aurélie Bonin committed
53
                             help="Computes the variance of attribute for each category.")
54

Aurélie Bonin committed
55
    group.add_option('-s','--std-dev',
56 57 58
                             action="append", dest="sd",
                             metavar="<Attribute Name>",
                             default=[],
Aurélie Bonin committed
59
                             help="Computes the standard deviation of attribute for each category.")
60

61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
    
def statistics(values,attribute,func):
    stat={}
    lstat={}
    
    for var in attribute:
        if var in values:
            stat[var]={}
            lstat[var]=0
            for c in values[var]:
                v = values[var][c]
                m = func(v)
                stat[var][c]=m
                lm=len(str(m))
                if lm > lstat[var]:
                    lstat[var]=lm
                
    return stat,lstat

def minimum(values,options):
    return statistics(values, options.minimum, min)
    

def maximum(values,options):
    return statistics(values, options.maximum, max)

def mean(values,options):
    def average(v):
        s = reduce(lambda x,y:x+y,v,0)
        return float(s)/len(v)
    return statistics(values, options.mean, average)


94 95 96 97
def variance(v):
    s = reduce(lambda x,y:(x[0]+y,x[1]+y**2),v,(0.,0.))
    return s[1]/(len(v)-1) - s[0]**2/len(v)/(len(v)-1)

Eric Coissac committed
98
def varpop(values,options):
99 100 101 102 103 104 105 106
    return statistics(values, options.var, variance)
    
def sd(values,options):
    def stddev(v):
        return math.sqrt(variance(v))
    return statistics(values, options.sd, stddev)
    

107
if __name__ == "__main__":
108
    optionParser = getOptionManager([addStatOptions,addInputFormatOption,addTaxonomyDBOptions])
109 110 111
    
    (options, entries) = optionParser()
    
112 113
    loadTaxonomyDatabase(options)

114 115 116
    options.statistics = set(options.minimum) | set(options.maximum) | set(options.mean)
    total = 0
    catcount={}
117
    totcount={}
118 119 120 121 122 123
    values={}
    lcat=0
    
    for s in entries:
        category = []
        for c in options.categories:
124 125 126 127 128 129 130
            try:
                if hasattr(options, 'taxonomy') and options.taxonomy is not None:
                    environ = {'taxonomy' : options.taxonomy,'sequence':s}
                else:
                    environ = {'sequence':s}
        
                v = eval(c,environ,s)
131 132 133 134
                lv=len(str(v))
                if lv > lcat:
                    lcat=lv
                category.append(v)
135
            except:
136 137 138 139 140
                category.append(None)
                if 4 > lcat:
                    lcat=4
        category=tuple(category)
        catcount[category]=catcount.get(category,0)+1
141 142 143 144
        try: 
            totcount[category]=totcount.get(category,0)+s['count']
        except KeyError:
            totcount[category]=totcount.get(category,0)+1
145 146 147 148 149 150 151 152 153 154
        for var in options.statistics:
            if var in s:
                v = s[var]
                if var not in values:
                    values[var]={}
                if category not in values[var]:
                    values[var][category]=[]
                values[var][category].append(v)
                
    
155 156 157
    mini,lmini  = minimum(values, options)
    maxi,lmaxi  = maximum(values, options)
    avg ,lavg   = mean(values, options)
Eric Coissac committed
158
    varp ,lvarp = varpop(values, options)
159
    sigma,lsigma= sd(values, options)
Eric Coissac committed
160
    
161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
            
    pcat  = "%%-%ds" % lcat
    if options.minimum:
        minvar= "min_%%-%ds" % max(len(x) for x in options.minimum)
    else:
        minvar= "%s"
        
    if options.maximum:
        maxvar= "max_%%-%ds" % max(len(x) for x in options.maximum)
    else:
        maxvar= "%s"
        
    if options.mean:
        meanvar= "mean_%%-%ds" % max(len(x) for x in options.mean)
    else:
        meanvar= "%s"
        
178 179 180 181 182 183 184 185 186 187
    if options.var:
        varvar= "var_%%-%ds" % max(len(x) for x in options.var)
    else:
        varvar= "%s"
        
    if options.sd:
        sdvar= "sd_%%-%ds" % max(len(x) for x in options.sd)
    else:
        sdvar= "%s"
        
188 189 190
    hcat = "\t".join([pcat % x for x in options.categories]) + "\t" +\
           "\t".join([minvar % x for x in options.minimum])  + "\t" +\
           "\t".join([maxvar % x for x in options.maximum])  + "\t" +\
191 192 193
           "\t".join([meanvar % x for x in options.mean])  + "\t" +\
           "\t".join([varvar % x for x in options.var])  + "\t" +\
           "\t".join([sdvar % x for x in options.sd]) + \
194 195
           "\t   count" + \
           "\t   total" 
196 197 198 199 200 201 202 203 204 205
    print hcat
    for c in catcount:
        for v in c:
            print pcat % str(v)+"\t",
        for m in options.minimum:
            print (("%%%dd" % lmini[m]) % mini[m][c])+"\t",
        for m in options.maximum:
            print (("%%%dd" % lmaxi[m]) % maxi[m][c])+"\t",
        for m in options.mean:
            print (("%%%df" % lavg[m]) % avg[m][c])+"\t",
206 207 208 209
        for m in options.var:
            print (("%%%df" % lvarp[m]) % varp[m][c])+"\t",
        for m in options.sd:
            print (("%%%df" % lsigma[m]) % sigma[m][c])+"\t",
210 211
        print "%7d" %catcount[c],
        print "%9d" %totcount[c]
212 213