obiselect.py 8.96 KB
Newer Older
1
#!/usr/local/bin/python
2
"""
Aurélie Bonin committed
3
:py:mod:`obiselect` : selects representative sequence records
4
=============================================================
5

6 7
.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>

8 9 10
:py:mod:`obiselect` command allows to select a subset of sequences records from a sequence
file by describing sequence record groups and defining how many and which sequence records
from each group must be retrieved.
Frédéric Boyer committed
11

12
"""
13 14
from obitools.format.options import addInOutputOption, sequenceWriterGenerator,\
    addInputFormatOption
15
from obitools.options import getOptionManager
16 17
from obitools.ecopcr.options import addTaxonomyDBOptions, loadTaxonomyDatabase
from random import random
18
from obitools.utils import progressBar
19
import math
20
import sys
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37

def minimum(seqs):
    return min(s['select'] for s in seqs)

def maximum(seqs):
    return max(s['select'] for s in seqs)

def mean(seqs):
    ss= reduce(lambda x,y: x + y,(s['select'] for s in seqs),0)
    return float(ss) / len(seqs)

def median(seqs):
    ss = [s['select'] for s in seqs]
    ss.sort()
    return ss[len(ss)/2]

    
38 39 40

def addSelectOptions(optionManager):
    
41 42 43 44
    group = optionManager.add_option_group('obiselect specific options')

    
    group.add_option('-c','--category-attribute',
45 46 47 48
                             action="append", dest="categories",
                             metavar="<Attribute Name>",
                             default=[],
                             help="Add one attribute to the list of"
49
                                  " attribute used for categorizing sequence records")
50

51
    group.add_option('-n','--number',
52 53 54 55
                             action="store", dest="number",
                             metavar="",
                             type="int",
                             default=1,
56
                             help="number of sequence records to keep in each category")
57

58
    group.add_option('-f','--function',
59 60 61
                             action="store", dest="function",
                             metavar="",
                             default="random",
62
                             help="python code evaluated for each sequence record [default: random value]")
63 64

   
65
    group.add_option('-m','--min',
66 67 68 69
                             action="store_const", dest="method",
                             metavar="",
                             default=maximum,
                             const=minimum,
70 71
                             help="select sequence record in each group minimizing the function"
                                  " (exclusive with -M, -a, --median)")
72
    
73
    group.add_option('-M','--max',
74
                             action="store_const", dest="method",
75
                             metavar="",
76 77
                             default=maximum,
                             const=maximum,
78 79
                             help="select sequence record in each group maximizing the function"
                                  " (exclusive with -m, -a, --median)")
80

81
    group.add_option('-a','--mean',
82
                             action="store_const", dest="method",
83
                             metavar="",
84 85
                             default=maximum,
                             const=mean,
86 87
                             help="select sequence record in each group closest to the mean of the function"
                                  " (exclusive with -m, -M, --median)")
88

89
    group.add_option('--median',
90 91 92 93
                             action="store_const", dest="method",
                             metavar="<Attribute Name>",
                             default=maximum,
                             const=median,
94 95
                             help="select sequence record in each group closest to the median of the function"
                                  " (exclusive with -m, -M, -a)")
96

97
    group.add_option('--merge',
98 99
                             action="append", dest="merge",
                             metavar="<TAG NAME>",
100
                             type="string",
101
                             default=[],
102
                             help="attributes to merge within each group")
103

104
    group.add_option('--merge-ids',
105
                             action="store_true", dest="mergeids",
Eric Coissac committed
106
                             default=False,
107 108
                             help="add the merged id data to output")
    
Eric Coissac committed
109

110

111 112 113 114 115 116
def sortclass(seqs,options):
    cible = float(options.method(seqs))
    for s in seqs:
        s['distance']=math.sqrt((float(s['select'])-cible)**2)
    seqs.sort(lambda s1,s2 : cmp(s1['distance'],s2['distance']))
                                                
117 118 119


if __name__ == '__main__':
120 121
    
    optionParser = getOptionManager([addSelectOptions,addInOutputOption,addTaxonomyDBOptions])
122 123 124
    
    (options, entries) = optionParser()
    
125 126
    taxonomy=loadTaxonomyDatabase(options)

Eric Coissac committed
127 128
    writer = sequenceWriterGenerator(options)
    
129 130
    classes = {}
    
131 132
    print >>sys.stderr,"\nLoading sequences...\n"

133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
    for s in entries:
        category = []
        for c in options.categories:
            try:
                if hasattr(options, 'taxonomy') and options.taxonomy is not None:
                    environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()}
                else:
                    environ = {'sequence':s,'random':random()}
        
                v = eval(c,environ,s)
                category.append(v)
            except:
                category.append(None)

        category=tuple(category)
        group = classes.get(category,[])
        group.append(s)
        classes[category]= group
            
        if hasattr(options, 'taxonomy') and options.taxonomy is not None:
            environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()}
Eric Coissac committed
154
        else:
155
            environ = {'sequence':s, 'random':random()}
156 157 158 159 160 161 162 163 164 165 166 167 168 169

        try:    
            select =  eval(options.function,environ,s)
            s['select']=select
        except:
            s['select']=None
 
    mergedKey = options.merge
    mergeIds = options.mergeids 
          
    if mergedKey is not None:
        mergedKey=set(mergedKey)
    else:
        mergedKey=set() 
170
    
171 172 173 174
    if taxonomy is not None:
        mergedKey.add('taxid')
                
    
175 176 177 178 179
    print >>sys.stderr,"\nSelecting sequences...\n"
    
    lclasses=len(classes)
    progressBar(1,lclasses,True,'Selecting')
    i=0
180
    for c in classes:
181 182
        i+=1
        progressBar(i,lclasses,False,"%15s" % c)
183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
        seqs = classes[c]
        sortclass(seqs, options)
        if len(c)==1:
            c=c[0]
            
        if options.number==1:
            s = seqs[0]
            
            for key in mergedKey:
                if key=='taxid' and mergeIds:
                    if 'taxid_dist' not in s:
                        s["taxid_dist"]={}
                    if 'taxid' in s:
                        s["taxid_dist"][s.id]=s['taxid']
                mkey = "merged_%s" % key 
                if mkey not in s:
199 200 201 202
                    if key in s:
                        s[mkey]={s[key]:1}
                    else:
                        s[mkey]={}
203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221

            if 'count' not in s:
                s['count']=1
            if mergeIds:        
                s['merged']=[s.id]

            for seq in seqs[1:]:
                
                s['count']+=seq['count']
                
                for key in mergedKey:
                    if key=='taxid' and mergeIds:
                        if 'taxid_dist' in seq:
                            s["taxid_dist"].update(seq["taxid_dist"])
                        if 'taxid' in seq:
                            s["taxid_dist"][seq.id]=seq['taxid']
                            
                    mkey = "merged_%s" % key 
                    if mkey in seq:
222 223 224 225 226 227 228 229 230 231 232 233 234 235
                        m = seq[mkey]
                    else:
                        if key in seq:
                            m={seq[key]:1}
                            
                    allmkey = set(m.keys()) | set(s[mkey].keys())
                    s[mkey] = dict((k,m.get(k,0)+s[mkey].get(k,0)) for k in allmkey)

#                     if mkey in seq:
#                         for skey in seq[mkey]:
#                             if skey in s:
#                                 s[mkey][skey]=s[mkey].get(seq[skey],0)+seq[mkey][skey]
#                             else:
#                                 s[mkey][skey]=seq[mkey][skey]
236
                                
237 238 239 240
                #for key in seq.iterkeys():
                #    # Merger proprement l'attribut merged s'il exist
                #    if key in s and s[key]!=seq[key] and key!='count' and key[0:7]!='merged_' and key!='merged' and key!='select':
                #        del(s[key])
241 242 243 244 245 246 247 248
                    
                            
                if mergeIds:        
                    s['merged'].append(seq.id)
 
        if taxonomy is not None:
            mergeTaxonomyClassification(seqs, taxonomy)
                    
249
        
250 251
        for s in seqs[0:options.number]:
            s['class']=c
252
            del s['select']
253 254 255
            writer(s)
            
    print >>sys.stderr