#!/usr/local/bin/python

'''
:py:mod:`obiclean`: tags a set of sequences for PCR/sequencing errors identification 
====================================================================================

.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>

:py:mod:`obiclean` is a command that classifies sequence records either as ``head``, ``internal`` or ``singleton``.

For that purpose, two pieces of information are used:
    - sequence record counts
    - sequence similarities

*S1* a sequence record is considered as a variant of *S2* another sequence record if and only if:
    - ``count`` of *S1* divided by ``count`` of *S2* is lesser than the ratio *R*.
      *R* default value is set to 0.5, and can be adjusted between 0 and 1 with the ``-r`` option.
    - both sequences are *related* to one another (they can align with some differences, 
      the maximum number of differences can be specified by the ``-d`` option).

Considering *S* a sequence record, the following properties hold for *S* tagged as:
    - ``head``: 
       + there exists **at least one** sequence record in the dataset that is a variant of *S*
       + there exists **no** sequence record in the dataset such that *S* is a variant of this 
         sequence record
    - ``internal``:
       + there exists **at least one** sequence record in the dataset such that *S* is a variant
         of this sequence record
    - ``singleton``: 
       + there exists **no** sequence record in the dataset that is a variant of *S*
       + there exists **no** sequence record in the dataset such that *S* is a variant of this 
         sequence record

By default, tagging is done once for the whole dataset, but it can also be done sample by sample
by specifying the ``-s`` option. In such a case, the counts are extracted from the sample 
information.

Finally, each sequence record is annotated with three new attributes ``head``, ``internal`` and 
``singleton``. The attribute values are the numbers of samples in which the sequence record has 
been classified in this manner.
'''

import math
import sys

from obitools.align import LCS, isLCSReachable
from obitools.format.options import addInOutputOption, sequenceWriterGenerator
from obitools.graph import UndirectedGraph, Indexer
from obitools.graph.dag import DAG
from obitools.options import getOptionManager
from obitools.utils import progressBar


def addCleanOptions(optionManager):
    """Register the obiclean-specific command-line options.

    :param optionManager: option manager to which the ``-d``, ``-s``,
                          ``-g``, ``-r``, ``-H`` and ``-C`` options
                          are added.
    """
    optionManager.add_option('-d','--distance',
                             action="store", dest="dist",
                             metavar="###",
                             type="int",
                             default=1,
                             help="Maximum numbers of errors between two variant sequences [default: 1]")
    optionManager.add_option('-s','--sample',
                             action="store", dest="sample",
                             metavar="<TAGNAME>",
                             type="str",
                             default=None,
                             help="Tag containing sample descriptions")

    optionManager.add_option('-g','--graph',
                             action="store", dest="graph",
                             metavar="<TAGNAME>",
                             type="str",
                             default=None,
                             help="File name where the clustering graphs are saved")

    optionManager.add_option('-r','--ratio',
                             action="store", dest="ratio",
                             metavar="<FLOAT>",
                             type="float",
                             # Pass the default as a float rather than the
                             # string "0.5": the value is used directly in
                             # numeric comparisons and should not depend on
                             # optparse's implicit string-to-type conversion.
                             default=0.5,
                             help="Minimum ratio between counts of two sequence records so that the less abundant "
                                  "one can be considered as a variant of the more abundant "
                                  "[default: 0.5]")
    optionManager.add_option('-H','--head',
                             action="store_true", dest="onlyhead",
                             default=False,
                             help="Outputs only head tagged sequence records")

    optionManager.add_option('-C','--cluster',
                             action="store_true", dest="clustermode",
                             default=False,
                             help="Set obiclean in clustering mode")

def lookforFather(node,sample):
    """Climb the abundance gradient from *node* and return its top ancestors.

    A neighbour is followed when it occurs in *sample* with a strictly
    larger count than *node*; the recursion collects the set of nodes
    from which no more abundant neighbour is reachable.  When no such
    neighbour exists, *node* is its own father.
    """
    ancestors = set()

    for nb in node.neighbourIterator():
        if sample in nb['_sample'] and nb['_sample'][sample] > node['_sample'][sample]:
            ancestors.update(lookforFather(nb, sample))

    return ancestors if ancestors else set([node])

def cmpseqcount(s1,s2):
    """Comparator ordering two sequence records by decreasing ``count``.

    Returns -1, 0 or 1 so that a sort using it puts the most abundant
    record first.  Side effect: a missing ``count`` attribute is
    initialised to 1 on either record.
    """
    if 'count' not in s1:
        s1['count']=1
    if 'count' not in s2:
        s2['count']=1

    # cmp() was removed in Python 3; the subtraction idiom yields the
    # same -1/0/1 result and behaves identically on Python 2.
    return (s2['count'] > s1['count']) - (s2['count'] < s1['count'])

if __name__ == '__main__':
    
    # Command-line parsing; `entries` iterates over the input sequence records.
    optionParser = getOptionManager([addCleanOptions,addInOutputOption], progdoc=__doc__)
    (options, entries) = optionParser()
    
    # Outputting only head records (-H) requires the clustering
    # annotations, so it implies cluster mode (-C).
    if (options.onlyhead):
        options.clustermode=True
       
    globalIndex = Indexer()         # Keeps, for all graphs, the correspondence
                                    # between node id and sequence id
                                     
    db = []                         # Sequences are stored in a list. The indexes in the list
                                    # correspond to the node indexes in the graphs
                                    
    sampleOccurrences = []          # List of count distributions per sample.
                                    # The indexes in the list correspond to the node 
                                    # indexes in the graphs
                                    
    graph = UndirectedGraph("error",indexer=globalIndex)
    pcr= {}                         # For each sample, the set of node ids occurring in this PCR
    
    if options.graph is not None:
        graphfile=open(options.graph,"w")
    else:
        graphfile=None
    
    #
    # First pass: register every input sequence as a node of the global
    # graph and collect, per sample, the set of nodes observed in it.
    #
    for s in entries:
        nodeid = globalIndex.getIndex(s.id)
        db.append(s)

        if options.sample is None:
            # Without a sample tag, the whole dataset is handled as a
            # single pseudo-sample named "XXX".
            sample = {"XXX":s['count'] if 'count' in s else 1}
        else:
            # NOTE(review): assumes the sample tag holds a
            # {sample name: count} mapping -- confirm against input format.
            sample = s[options.sample]
            
        sampleOccurrences.append(sample)
            
        graph.addNode(s.id,shape='circle')

        for sp in sample:
            spcr = pcr.get(sp,set())
            spcr.add(nodeid)
            pcr[sp]=spcr
        
    
    writer = sequenceWriterGenerator(options)            
    
    ldb = len(db)    
    digit = int(math.ceil(math.log10(ldb)))     # width of the progress counters
    aligncount = ldb*(ldb+1)/2                  # number of pairwise comparisons
    edgecount = 0
    print >>sys.stderr
    
    header = "Alignment  : %%0%dd x %%0%dd -> %%0%dd " % (digit,digit,digit)
    progressBar(1,aligncount,True,"Alignment  : %s x %s -> %s " % ('-'*digit,'-'*digit, '0'*digit))
    pos=1
    aligner = LCS()
    
    #
    # We build the global Levenshtein graph.
    # Two sequences are linked if their distance is at most
    # options.dist (usually 1).
    #
    
    for i in xrange(ldb):

        aligner.seqA = db[i]
        li = len(db[i])
        
        for j in xrange(i+1,ldb):
            progressBar(pos,aligncount,head=header % (i,j,edgecount))
            pos+=1
            
            lj = len(db[j])
            
            lm = max(li,lj)
            # Minimum LCS length compatible with at most options.dist
            # errors; isLCSReachable is a cheap pre-filter that avoids
            # running the full alignment on hopeless pairs.
            lcsmin = lm - options.dist
            
            if isLCSReachable(db[i],db[j],lcsmin):
                aligner.seqB=db[j]
                ali = aligner()
                llcs=ali.score
                lali = len(ali[0])
                # alignment length minus LCS length = observed error count
                obsdist = lali-llcs
                if obsdist >= 1 and obsdist <= options.dist:
                    graph.addEdge(index1=i, index2=j)
                    edgecount+=1               
            
    print >>sys.stderr

    header = "Clustering sample  : %20s "
    samplecount = len(pcr)
    
    print >>sys.stderr,"Sample count : %d" % samplecount

    
    progressBar(1,samplecount,True,head=header % "--")
    isample=0
    
    #
    # We iterate through all PCRs (samples)
    #
    
    for sample in pcr:
        
        isample+=1
        progressBar(isample,samplecount,head=header % sample)
        
        seqids   = list(pcr[sample])
        nnodes    = len(seqids)
        
        #
        # We build a sub DAG for each sample; its nodes carry the counts
        # observed in this sample only.
        #
        
        sub = DAG(sample,indexer=globalIndex)
        counts = []
        
        for i in seqids:
            c=sampleOccurrences[i][sample]
            sub.addNode(index=i,count=c,oricount=c)
            counts.append(c)

        # Python 2 idiom: map(None, a, b) zips counts and seqids into
        # (count, seqid) pairs; sorted by decreasing count below.
        order  = map(None,counts,seqids)
        order.sort(key=lambda a : a[0],reverse=True)
        
        # Link the more abundant record to the less abundant one when the
        # count ratio does not exceed options.ratio and the pair is an
        # edge of the global error graph.
        for j in xrange(nnodes - 1):
            count1,index1 = order[j]
            for k in xrange(j+1,nnodes):
                count2,index2 = order[k]
                r = float(count2)/float(count1)
                if r <= options.ratio and graph.hasEdge(index1=index1,index2=index2):
                    sub.addEdge(index1=index1, 
                                index2=index2,
                                ratio=r,
                                arette = "%d -> %d" % (count1,count2))
                    
        if (options.clustermode):
            # We transfer the weight of errors to the parent sequence;
            # when an error has several parents, we distribute its
            # weight to each of its parents proportionally to the parent 
            # weight. 
                        
            leaves = sub.getLeaves()
            
            while leaves:
                for l in leaves:
                    l['color']='red'
                    l['done']=True
                    c = l['count']
                    p = l.getParents()
                    pc = [float(x['count']) for x in p]
                    ps = sum(pc)
                    # share of this leaf's count assigned to each parent
                    pc = [x / ps * c for x in pc]
                    for i in xrange(len(pc)):
                        p[i]['count']+=int(round(pc[i]))
                
                    # Next wave of leaves: not-done nodes whose not-done
                    # neighbourhood is empty.
                    # NOTE(review): this statement is indented inside the
                    # `for l in leaves` loop; it looks like it was meant to
                    # run once per `while` iteration -- confirm.
                    leaves = [x for x in sub.nodeIterator(lambda n : 'done' not in n and not [y for y in n.neighbourIterator(lambda  k : 'done' not in k)])]
            
    
            
            # Just clean the 'done' tag set by the preceding loop
            
            for x in sub.nodeIterator():
                del x["done"]
                
        
        # Annotate each sequence with its most probable parent.
        # When a sequence has several potential parents, it is
        # annotated with the heaviest one.
        
        heads = sub.getRoots()
        sons  = []
        for h in heads:
            h['cluster']=h.label
            
            if (options.clustermode):
                h['head']   =True
            
            # enqueue children not yet assigned to a cluster
            sons.extend(h.neighbourIterator(lambda  k : 'cluster' not in k))

            #
            # Annotate the corresponding sequence
            #
            
            seq = db[h.index]
            
            # a sequence that is at least head in one PCR gets the
            # obiclean_head attribute
            seq['obiclean_head']=True
            
            if (options.clustermode):
                # Store for each sample the cluster center related to
                # this sequence 
                if "obiclean_cluster" not in seq:
                    seq['obiclean_cluster']={}
                seq['obiclean_cluster'][sample]=h.label
            
            
                # Store for each sample the count of this sequence plus 
                # the count of all its relatives
                if "obiclean_count" not in seq:
                    seq["obiclean_count"]={}
                
                seq["obiclean_count"][sample]=h['count']
            
            if "obiclean_status" not in seq:
                seq["obiclean_status"]={}
                
            # NOTE(review): len(h) presumably reflects the node's number
            # of edges -- a connected root is a head ('h'), an isolated
            # one a singleton ('s'); confirm against the graph node API.
            if len(h) > 0:
                seq["obiclean_status"][sample]='h'
            else:
                seq["obiclean_status"][sample]='s'

        
        # Breadth-first propagation of the cluster label from the heads
        # down to their variants; every reached record is 'internal'.
        heads=sons
        sons  = []
        
        while heads:
            for h in heads:
                parents = h.getParents()
                maxp=None
                # pick the heaviest parent
                for p in parents:
                    if maxp is None or p['count']>maxp['count']:
                        maxp=p
                        
                if 'cluster' in maxp:
                    cluster = maxp['cluster']
                    h['cluster']=cluster
                    sons.extend(h.neighbourIterator(lambda  k : 'cluster' not in k))
                    
                    #
                    # Annotate the corresponding sequence
                    #
                    
                    seq = db[h.index]
                    if (options.clustermode):
                        if "obiclean_cluster" not in seq:
                            seq['obiclean_cluster']={}
                        seq['obiclean_cluster'][sample]=cluster
                        
                        if "obiclean_count" not in seq:
                            seq["obiclean_count"]={}
                        seq["obiclean_count"][sample]=h['count']
    
                # NOTE(review): when the heaviest parent has no cluster
                # yet, `seq` here is the record bound in a previous
                # iteration (or unbound on the very first one) -- this
                # tagging looks misplaced; confirm whether it belongs
                # inside the branch above.
                if "obiclean_status" not in seq:
                    seq["obiclean_status"]={}
                    
                seq["obiclean_status"][sample]='i'

            heads=sons
            sons  = []
            
        if graphfile is not None:
            print >>graphfile,sub

    print >>sys.stderr

    #
    # Final pass: aggregate the per-sample statuses and write the
    # annotated records out.
    #
    seqcount = len(db)
    sc=0
    progressBar(1,seqcount,True,head="Writing sequences")
        
    for node in db:
        sc+=1
        progressBar(sc,seqcount,head="Writing sequences")
        
        if (not options.onlyhead or 'obiclean_head' in node):
            status = node["obiclean_status"]
            i=0     # samples where the record is internal
            h=0     # samples where the record is head
            s=0     # samples where the record is singleton
            for sample in status:
                st=status[sample]
                if st=="i":
                    i+=1
                elif st=="s":
                    s+=1
                else:
                    h+=1
            node['obiclean_headcount']=h
            node['obiclean_internalcount']=i
            node['obiclean_singletoncount']=s
            node['obiclean_samplecount']=s+i+h
            
            if 'obiclean_head' not in node:
                node['obiclean_head']=False
            
#            if (not options.clustermode):
#                del node["obiclean_status"]
            
            writer(node)
                
    print >>sys.stderr