tango.py 105 KB
Newer Older
Eric Coissac committed
1
"""
2 3 4 5
.. module:: orgasm.tango
   :platform: Unix
   :synopsis: The :py:mod:`~orgasm.tango` package contains a set of functions useful to manage assembling structure.

Eric Coissac committed
6 7 8
The :py:mod:`orgasm.tango` package
==================================

9 10 11 12
The :py:mod:`~orgasm.tango` package contains a set of functions useful to manage assembling structure.

.. moduleauthor:: Eric Coissac <eric.coissac@inria.fr>

Eric Coissac committed
13 14 15 16 17 18 19 20 21
:author:  Eric Coissac
:contact: eric.coissac@inria.fr


.. warning::

    The :py:mod:`~orgasm.tango` package functions aims to be integrated 
    into other packages as standalone functions of class methods.
"""
Eric Coissac committed
22
import sys
Eric Coissac committed
23 24
from orgasm.multialign import multiAlignReads, consensus  # @UnresolvedImport
from orgasm.assembler import Assembler                    # @UnresolvedImport
Eric Coissac committed
25
from orgasm.assembler import buildstem                    # @UnresolvedImport
26
from orgasm.assembler import tango, getusedreads          # @UnresolvedImport
27
from orgasm.utils import tags2str
Eric Coissac committed
28

29
from time import time
Eric Coissac committed
30
import math
Eric Coissac committed
31
from functools import reduce
32
from sys import stderr
33
from orgasm.multialign._multi import alignSequence # @UnresolvedImport
Eric Coissac committed
34

35 36 37 38 39
def logOrPrint(logger,message,level='info'):
    if logger is not None:
        getattr(logger, level)(message)
    else:
        print(message,file=sys.stderr)
Eric Coissac committed
40

Eric Coissac committed
41
def cutLowCoverage(self,mincov,terminal=True):
Eric Coissac committed
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
    '''
    Remove sequences in the assembling graph with a coverage below ``mincov``.
    
    .. code-block :: ipython
    
        In [159]: asm = Assembler(r)

        In [160]: s = matchtoseed(m,r)

        In [161]: a = tango(asm,s,mincov=1,minread=10,minoverlap=30,maxjump=0,cycle=1)
    
        In [162]: asm.cleanDeadBranches(maxlength=10)

        Remaining edges : 424216 node : 423896
        Out[162]: 34821
        In [162]: cutLowCoverage(asm,10,terminal=False)

    :param mincov: coverage threshold 
    :type mincov: :py:class:`int`
    :param terminal: if set to ``True`` only terminal edges are removed from the assembling graph
    :type terminal: :py:class:`bool`
    :return: the count of deleted node
    :rtype: :py:class:`int`
    
    :seealso: :py:meth:`~orgasm.assambler.Assembler.cleanDeadBranches`
Eric Coissac committed
67
    '''    
Eric Coissac committed
68 69 70
    def isTerminal(g,n):
        return len(list(g.parentIterator(n)))==0 or len(list(g.neighbourIterator(n)))==0
    
Eric Coissac committed
71 72 73 74 75 76
    def endnodeset(g):
        return set(g.nodeIterator(predicate = lambda n : (len(list(g.neighbourIterator(n)))==0)))
    
    def startnodeset(g):
        return set(g.nodeIterator(predicate = lambda n : (len(list(g.parentIterator(n)))==0)))
    
77
    ontty = sys.stderr.isatty()
Eric Coissac committed
78
    
Eric Coissac committed
79 80 81 82
    if terminal:
        tstates=[True]
    else:
        tstates=[True,False]
Eric Coissac committed
83
    ilength=len(self)
Eric Coissac committed
84
    cg = self.compactAssembling(verbose=False)
Eric Coissac committed
85
    
Eric Coissac committed
86
    index = self.index
Eric Coissac committed
87
    readSize = index.getReadSize()
Eric Coissac committed
88
    for terminal in tstates:
Eric Coissac committed
89
        print('',file=sys.stderr)
Eric Coissac committed
90
        if terminal:
Eric Coissac committed
91
            print("Deleting terminal branches",file=sys.stderr)
Eric Coissac committed
92
        else:
Eric Coissac committed
93
            print("Deleting internal branches",file=sys.stderr)
Eric Coissac committed
94
        extremities = endnodeset(cg) | startnodeset(cg)
Eric Coissac committed
95 96 97 98 99
        go = True
        while go:
            go = False
            stems = [x for x in cg.edgeIterator() 
                     if not terminal or (isTerminal(cg, x[0]) or isTerminal(cg, x[1]))]
100
            if stems:
Eric Coissac committed
101
                stems.sort(key=lambda i:cg.getEdgeAttr(*i)['weight'],reverse=True)
102
                lightest = stems.pop()
Eric Coissac committed
103
                lattr = cg.getEdgeAttr(*lightest)
104

105 106 107
                if lattr['weight'] < mincov: 
                    if stems:
                        go=True
Eric Coissac committed
108 109
                    for n in lattr['path'][1:-1]:
                        if n in self.graph:
Eric Coissac committed
110 111 112 113
                            try:
                                del self.graph[n]
                            except KeyError:
                                pass
Eric Coissac committed
114 115 116 117
                    if lightest[0] in extremities and lightest[0] in self.graph:
                        del self.graph[lightest[0]]
                    if lightest[1] in extremities and lightest[1] in self.graph:
                        del self.graph[lightest[1]]
Eric Coissac committed
118
                        
119 120 121 122
                    if ontty:
                        print("Remaining edges : %d node : %d\r" % (self.graph.edgeCount(),len(self)),
                              end='\r',
                              file=sys.stderr)
123
        
Eric Coissac committed
124
                    cg.deleteEdge(*lightest)
125 126 127 128 129 130 131 132 133
                    tojoin=[]
                    if lightest[0] in extremities:
                        del cg[lightest[0]]
                    else:
                        tojoin.append(lightest[0])
                    if lightest[1] in extremities:
                        del cg[lightest[1]]
                    else:
                        tojoin.append(lightest[1])
Eric Coissac committed
134
#                    print >>sys.stderr,lightest[0] in extremities,lightest[1] in extremities,tojoin
135
                    for c in tojoin:
Eric Coissac committed
136 137 138 139 140 141 142 143 144 145 146 147 148
                        if c in cg:
                            begin = list(cg.parentIterator(c))
                            end   = list(cg.neighbourIterator(c))
                            if len(begin)==1 and len(end)==1:
                                begin = begin[0]
                                end = end[0]
                                e1s = list(cg.edgeIterator(edgePredicate = lambda e:e[0]==begin and e[1]==c))
                                e2s = list(cg.edgeIterator(edgePredicate = lambda e:e[0]==c and e[1]==end))
                                if len(e1s)==1 and len(e2s)==1:
                                    e1 = e1s[0]
                                    e2 = e2s[0]
                                    attr1 = cg.getEdgeAttr(*e1)
                                    attr2 = cg.getEdgeAttr(*e2)
Eric Coissac committed
149

Eric Coissac committed
150 151 152
                                    sequence=attr1['sequence'] + attr2['sequence']
                                    path=attr1['path'] + attr2['path'][1:]
                                    
Eric Coissac committed
153 154 155 156 157 158 159 160 161 162 163
                                    stem =  buildstem(self,
                                                      begin,
                                                      end,
                                                      sequence,
                                                      path,
                                                      False)
                                    
                                    stem['graphics']={'width':int(stem['weight']/readSize)+1,
                                                      'arrow':'last'}
                                    
                                    cg.addStem(stem)
Eric Coissac committed
164 165 166
    
                                    cg.deleteEdge(*e2)                                        
                                    cg.deleteEdge(*e1)  
Eric Coissac committed
167 168
                                    
                                    
169
                                    del cg[c]                  
170 171 172
    if ontty:                            
        print('',file=sys.stderr)
        
Eric Coissac committed
173
    return ilength - len(self)
Eric Coissac committed
174 175

def cutSNPs(self,maxlength=500):
176 177 178
    
    ontty = sys.stderr.isatty()
    
Eric Coissac committed
179 180 181 182
    cg = self.compactAssembling(verbose=False)
    snps = [(snp, 
             1 if snp[2][0][1][2] > snp[2][1][1][2] else 0)   # indicate which allele needs to be elimimated
             for snp in [(i,                                  # start node
Eric Coissac committed
183
                         next(cg.neighbourIterator(i)),      # end node
Eric Coissac committed
184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
                         [(k[2],                              # allele id (0 or 1) 
                           (cg.getEdgeAttr(*k)['stemid'],     # allele stemid
                            cg.getEdgeAttr(*k)['length'],     # allele length
                            cg.getEdgeAttr(*k)['weight'])     # allele coverage
                           ) 
                          for k in cg.edgeIterator(edgePredicate=lambda j:j[0]==i)
                         ]
                        ) 
                        for i in cg.nodeIterator(predicate=lambda n : 
                                len(list(cg.neighbourIterator(n))) == 1 
                                and len(list(cg.edgeIterator(edgePredicate=lambda e : e[0]==n)))==2)
                      ] if  snp[2][0][1][0]>0                 # keep one copy of the snp
                        and snp[2][0][1][0]!=-snp[2][1][1][0] # exclude pairs of reverse-complement allele
                        and snp[2][0][1][1]<maxlength               # exclude allele longer than 500 bp 
                        and snp[2][1][1][1]<maxlength             #
           ]
           
    for snp in snps:
        edge  = cg.getEdgeAttr(snp[0][0],snp[0][1],snp[1])
        nodes = edge['path'][1:-1]
204 205 206 207 208 209
        
        if ontty:
            print("Remaining edges : %d node : %d" % (self.graph.edgeCount(),len(self)),
                  end='\r',
                  file=sys.stderr)
            
Eric Coissac committed
210 211 212
        for node in nodes:
            if node in self.graph:
                del self.graph[node]
213 214 215
    
    if ontty:
        print("",file=sys.stderr)
Eric Coissac committed
216
    
Eric Coissac committed
217
  
Eric Coissac committed
218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237
def mode(data):
    '''
    Compute a raw estimation of the mode of a data set
    
    :param data: The data set to analyse
    :type data: a permanent iterable object (list, tuble...)
    
    '''

    data = list(data)
    
    if len(data) == 0:
        return None
    
    if len(data) < 8:
        return int(sum(data)/len(data))
    
    data.sort()
    xx = [0,0,0]
    for j in range(3):
Eric Coissac committed
238
        windows = len(data)//2
Eric Coissac committed
239 240 241 242 243 244 245 246 247 248 249 250 251 252
        intervals = [data[i+windows]-data[i] for i in range(windows)]
        mininter = min(intervals)
        begininter = intervals.index(mininter)
        data = data[begininter:(begininter+windows)]
        xx[j]=sum(data)/windows

    return sum(xx)/3

def weightedMode(data):
    d = []
    for x,w in data:
        d.extend([x] * w)
    return mode(d)   
    
253
def matchtoseed(matches,index,new=None):
Eric Coissac committed
254
    s=[]
255 256 257 258
    if new is None:
        new=list(matches.keys())
        
    for p in new:
259 260 261 262
        m = matches[p][1]
        k = m.keys()
        for x in k:
            s.extend((index.getIds(y[0])[0],(x,),0) for y in m[x])  
Eric Coissac committed
263 264 265

    return s

Eric Coissac committed
266 267
def matchtogene(matches):
    genes = {}
268
    if matches is not None:
269
        for p in matches:        # Loop over probe set
270 271
            m = matches[p][1]
            k = m.keys()
272 273 274 275 276 277 278
            for x in k:          # Loop over a probes in a set
                for i in m[x]:   # loop over matches in a probe
                    if i[0] < 0:
                        pos = -i[5]
                    else:
                        pos = i[5]
                    genes[abs(i[0])]=(x,pos)
Eric Coissac committed
279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296
            
    return genes

def genesincontig(cg,index,matches):
    def vread(i):
        if abs(i) > len(index):
            v=0
        elif abs(i) in genes:
            v=1
        else:
            v=-1
        return v
    
    genes = matchtogene(matches)
    ei = cg.edgeIterator(edgePredicate=lambda e: 'path' in cg.getEdgeAttr(*e))
    
    for e in ei:
        ea = cg.getEdgeAttr(*e)
297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313
        if ea['class']=="sequence":
            path = ea['path']
            eg = [vread(i) for i in path]
            ep = [genes.get(abs(i),()) for i in path]
                
            g = sum(i for i in eg if i > 0)
            if g > 0:
                g=max(min(math.ceil(math.log10(g)),4)*64-1,0)
                color="#00%02X%02X" % (g,255-g)
            else:
                color="#0000FF"
            graphics = ea.get('graphics',{})
            graphics['arrow']='last'
            graphics['fill']=color
            ea['graphics']=graphics
            ea['ingene']=g
            ea['genepos']=ep
Eric Coissac committed
314
          
315 316 317 318 319 320 321 322 323 324 325 326 327 328
def testOverlap(seqfrom,seqto,headto,readsize):
    seqto=headto+seqto
    lseq=min(len(seqfrom),len(seqto),readsize)
    seqfrom = seqfrom[-lseq:]
    seqto   = seqto[0:lseq] 
    ali = alignSequence(seqfrom,seqto)
    ok = (not ali[0] \
          and ali[1] > 0 \
          and len(ali[2])==1 \
          and len(ali[3])==1)
    if ok:
        return lseq - ali[2][0]
    else:
        return -1
329
def scaffold(self,assgraph,minlink=5,back=200,addConnectedLink=False,forcedLink=set(),logger=None):
Eric Coissac committed
330 331 332 333
    '''
    Add relationships between edges of the assembling graph related to the
    par ended links.
    
334 335 336
    :param assgraph: The compact assembling graph as produced by the
                     :py:meth:`~orgasm.assembler.Assembler.compactAssembling` method
    :type assgraph:  :py:class:`~orgasm.graph.DiGraphMultiEdge`
Eric Coissac committed
337 338 339 340 341 342 343 344 345
    :param minlink:  the minimum count of pair ended link to consider 
                     for asserting the relationship
    :type minlink:   :py:class:`int`
    :param back:     How many base pairs must be considered at the end of each edge
    :type back:      :py:class:`int`
    :param addConnectedLink: add to the assembling graph green edges for each directly 
                             connected edge pair representing the pair ended links 
                             asserting the connection.
    :type addConnectedLink: :py:class:`bool`
Eric Coissac committed
346 347
     '''
    
348
    frglen,frglensd = estimateFragmentLength(self)
349
    readsize=self.index.getReadSize()
350 351 352 353 354 355 356 357 358 359 360 361 362
    
    if frglen is None:
        logger.warning("Insert size cannot be estimated")
        frglen=300
        frglensd=1
        logger.warning("Insert size set to %d" % frglen)
    
    nforcedLink = set((-p[1],-p[0]) 
                      if abs(p[0]) > abs(p[1]) 
                      else (p[0],p[1]) 
                      for p in forcedLink)
        
        
363

364 365 366
    eiat = dict((assgraph.getEdgeAttr(*i)['stemid'],
                 assgraph.getEdgeAttr(*i)) for i in assgraph.edgeIterator())
    maxstemid = max(eiat)
367
    
Eric Coissac committed
368 369 370 371 372 373 374 375 376 377 378 379
    if addConnectedLink:
        # Fake function just testing the stemid attribute
        def isInitial(n):
            return  'stemid' in assgraph.getEdgeAttr(*n) 
        def isTerminal(n):
            return  'stemid' in assgraph.getEdgeAttr(*n) 
    else:
        def isInitial(n):
            return len(list(assgraph.parentIterator(n[0],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0 
        def isTerminal(n):
            return len(list(assgraph.neighbourIterator(n[1],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0

Eric Coissac committed
380
        
Eric Coissac committed
381 382 383 384 385
    ei = [i for i in assgraph.edgeIterator(edgePredicate=isInitial)]
    et = [i for i in assgraph.edgeIterator(edgePredicate=isTerminal)]
    
    eiid = [assgraph.getEdgeAttr(*i)['stemid'] for i in ei]
    etid = [assgraph.getEdgeAttr(*i)['stemid'] for i in et]
386
        
Eric Coissac committed
387 388 389
    nei = len(ei)
    net = len(et)

Eric Coissac committed
390
    s=[]
Eric Coissac committed
391 392
    for e1 in range(net):
        for e2 in range(nei):
393 394 395 396
            npair = ((etid[e1], eiid[e2]) 
                     if (abs(etid[e1]) < abs(eiid[e2])) 
                     else (-eiid[e2],-etid[e1]))
            
Eric Coissac committed
397
            connected = et[e1][1]==ei[e2][0]
398 399 400
            linkedby,ml,sl,delta  = pairEndedConnected(self,assgraph,etid[e1],eiid[e2],back)
            first=assgraph.getEdgeAttr(*et[e1])['last']
            last=assgraph.getEdgeAttr(*ei[e2])['first']
401
            
Eric Coissac committed
402
            if connected and addConnectedLink:
Eric Coissac committed
403
                if linkedby >= minlink:
404
                    s.append(('l',et[e1][1],ei[e2][0],linkedby,etid[e1], eiid[e2],"#00FF00",ml,sl,delta,first,last))
405 406
                    logOrPrint(logger,
                               "%d -> %d connection asserted by %d pair ended links" % (etid[e1], eiid[e2],linkedby))
Eric Coissac committed
407
                else:
408 409 410
                    logOrPrint(logger,
                               "%d -> %d connection not asserted by pair ended link" % (etid[e1], eiid[e2]))
                    
411
            elif not connected and npair in nforcedLink:
412 413 414 415 416 417 418 419 420
                s.append(('f',et[e1][1],ei[e2][0],linkedby,etid[e1], eiid[e2],"#FF0000",ml,sl,delta,first,last))
                if linkedby > 0:
                    logOrPrint(logger,
                                   "%d -> %d forced but supported by %d pair ended links" % (etid[e1], eiid[e2],linkedby))
                else:
                    logOrPrint(logger,
                                   "%d -> %d forced and not supported by pair ended links" % (etid[e1], eiid[e2]))
                    
            elif not connected and linkedby >= minlink:
421 422 423
                overlap = testOverlap(eiat[etid[e1]]['sequence'], 
                                      eiat[eiid[e2]]['sequence'], 
                                      eiat[eiid[e2]]['head'], 
424
                                      readsize)
425 426 427 428 429 430 431 432
                if overlap >= 0:
                    s.append(('o',et[e1][1],ei[e2][0],linkedby,etid[e1], eiid[e2],"#FF00FF",overlap,0,[],first,last))
                    logOrPrint(logger,
                                   "%d -> %d overlap of %dbp supported by %d pair ended links" % (etid[e1], eiid[e2],overlap,linkedby))
                else:
                    s.append(('s',et[e1][1],ei[e2][0],linkedby,etid[e1], eiid[e2],"#FF6600",ml,sl,delta,first,last))
                    logOrPrint(logger,
                                   "%d -> %d scaffolded by %d pair ended links" % (etid[e1], eiid[e2],linkedby))
433 434 435 436 437 438 439 440 441 442 443 444 445 446
    nstemid={}
    for kind,x,y,z,s1,s2,color,ml,sl,delta,first,last in s:
        if (abs(x) > abs(y)):
            npair=(-y,-x)
            nsign=-1
        else:
            npair=(x,y)
            nsign=1
        if npair not in nstemid:
            maxstemid+=1
            nstemid[npair]=maxstemid
            
        nid=nstemid[npair] * nsign 
        
Eric Coissac committed
447
        attr = assgraph.addEdge(x,y)
448 449
        
        if kind=="s":
450
            glengths = [frglen - i - 2 * readsize 
451
                        for i in delta]
452
            pglengths = [i for i in glengths if i >=0]
453
            if len(pglengths) > 1:
454 455 456 457 458 459
                glength  = sum(pglengths) / len(pglengths)
                glengthsd= math.sqrt(sum((i-glength)**2 for i in pglengths) /(len(pglengths)-1))
            else:
                glength  = sum(glengths) / len(glengths)
                glengthsd= math.sqrt(sum((i-glength)**2 for i in glengths) /(len(glengths)-1))
                
460
            attr['label']="%d : Gap (%dbp)  [%d,%d] %d -> %d" % (nid,glength,z,len(pglengths),s1,s2)
461
            attr['length']=int(glength) if int(glength) > 0 else 10
462 463 464 465
            attr['first']=first
            attr['last']=last
            attr['weight']=0
            attr['gappairs']=len(glengths)
466
            attr['gaplength']=int(glength)
467
            attr['gapsd']=int(math.sqrt(frglensd**2+glengthsd**2))
468
            attr['gapdeltas']=[frglen - i - 2 * readsize for i in delta]
469
            attr['pairendlink']=z
470
            attr['ingene']=0
471
            attr['link']=(s1,s2)
472 473 474 475 476 477
            attr['graphics']={'width':z // 10.,
                              'arrow':'last',
                              'fill':color
                              }
            attr['stemid']=nid
            attr['sequence']=b"N" * attr['length']
478 479
            attr['path']=[first] + [0] * (attr['length']-1) + [last]
            attr['class']='scaffold:paired-end'
480
        elif kind=="o":
481 482
            attr['label']="%d : Overlap (%dbp)  [%d] %d -> %d" % (nid,ml,z,s1,s2)
            attr['length']=-ml
483 484 485 486 487 488 489 490 491 492 493 494
            attr['first']=first
            attr['last']=last
            attr['weight']=0
            attr['pairendlink']=z
            attr['ingene']=0
            attr['link']=(s1,s2)
            attr['graphics']={'width':1,
                              'arrow':'last',
                              'fill':color
                              }
            attr['stemid']=nid
            attr['sequence']=b''
495
            attr['path']=[first]+ [0] * (readsize-ml-1) +[last]
496
            attr['class']='scaffold:overlap'
497
        elif kind=="f":
498
            glengths = [frglen - i - 2 * readsize 
499 500 501 502 503 504 505 506 507 508
                        for i in delta]
            pglengths = [i for i in glengths if i >=0]
            
            if len(pglengths) > 1:
                glength  = sum(pglengths) / len(pglengths)
                glengthsd= math.sqrt(sum((i-glength)**2 for i in pglengths) /(len(pglengths)-1))
            else:
                glength  = sum(glengths) / len(glengths)
                glengthsd= math.sqrt(sum((i-glength)**2 for i in glengths) /(len(glengths)-1))
                
509
            attr['label']="%d : Forced [%d] %d -> %d" % (nid,z,s1,s2)
510
            attr['length']= int(glength) if glength > 0 else 10
511 512 513
            attr['first']=first
            attr['last']=last
            attr['weight']=0
514 515 516
            attr['gappairs']=len(glengths)
            attr['gaplength']=int(glength)
            attr['gapsd']=int(math.sqrt(frglensd**2+glengthsd**2))
517
            attr['gapdeltas']=[frglen - i - readsize for i in delta]
518 519
            attr['pairendlink']=z
            attr['ingene']=0
520
            attr['link']=(s1,s2)
521 522
            attr['graphics']={'width':z // 10.,
                              'arrow':'last',
523
                              'fill' :color
524 525 526 527 528 529
                              }
            attr['stemid']=nid
            attr['sequence']=b"N" * attr['length']
            attr['path']=[first] + [0] * (attr['length']-1) + [last]
            attr['class']='scaffold:forced'
            
530 531
        else:
            attr['class']='internal'
532
            
Frédéric Boyer committed
533 534
__cacheAli = set()
__cacheAli2 = set()
Eric Coissac committed
535 536

def fillGaps(self,minlink=5,
Eric Coissac committed
537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553
                  back=200,
                  kmer=12,
                  smin=40,
                  delta=0,
                  cmincov=5,
                  minread=20,
                  minratio=0.1, 
                  emincov=1,
                  maxlength=None,
                  gmincov=1,
                  minoverlap=60,
                  lowfilter=True,
                  adapters5=(),
                  adapters3=(),
                  maxjump=0,
                  snp=False,
                  nodeLimit=1000000,
Eric Coissac committed
554
                  onlyLinking=False,
555
                  useonce=True,
Eric Coissac committed
556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580
                  logger=None):
    '''
    
    :param minlink:
    :param back:
    :param kmer:
    :param smin:
    :param delta:
    :param cmincov:
    :param minread:
    :param minratio:
    :param emincov:
    :param maxlength:
    :param gmincov:
    :param minoverlap:
    :param lowfilter:
    :param maxjump:
    :param snp: If set to True (default value is False) erase SNP variation
                by conserving the most abundant version
    '''
    global __cacheAli
    global __cacheAli2
    __cacheAli = __cacheAli2
    __cacheAli2 = set()
    
581
    
Eric Coissac committed
582 583 584 585 586 587 588 589 590
    def isInitial(n):
        return len(list(assgraph.parentIterator(n[0],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0 
    def isTerminal(n):
        return len(list(assgraph.neighbourIterator(n[1],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0

    assgraph = self.compactAssembling(verbose=False)
    
    index = self.index
    
591
    #List of edges not connected at their beginning
Eric Coissac committed
592
    ei = [i for i in assgraph.edgeIterator(edgePredicate=isInitial)]
593 594
    
    #List of edges not connected at their end    
Eric Coissac committed
595 596
    et = [i for i in assgraph.edgeIterator(edgePredicate=isTerminal)]
    
597
    #Corresponding id of these edges
Eric Coissac committed
598 599 600
    eiid = [assgraph.getEdgeAttr(*i)['stemid'] for i in ei]
    etid = [assgraph.getEdgeAttr(*i)['stemid'] for i in et]

601
    #epi = [set(assgraph.getEdgeAttr(*i)['path'][0:back]) for i in ei]
Eric Coissac committed
602 603 604 605
    ept = [set(assgraph.getEdgeAttr(*i)['path'][-back:]) for i in et]

    eei = [set(assgraph.getEdgeAttr(*i)['path'][0:100]) for i in ei]
    eet = [set(assgraph.getEdgeAttr(*i)['path'][-100:]) for i in et]
606
    print(eiid,file=sys.stderr) # <EC>
607
    
Eric Coissac committed
608 609 610 611 612 613 614 615 616 617 618 619 620 621 622
    exi = [getPairedRead(self,assgraph,i,back,end=False) for i in eiid]
    ext = [getPairedRead(self,assgraph,i,back,end=True) for i in etid]
    
    nei = len(ei)
    net = len(et)
    s=[]
    maxcycle=max(self.graph.getNodeAttr(i)['cycle'] for i in self.graph)
    lassemb = len(self)
    cycle = maxcycle
    linked=set()
    extended=set()
    for e1 in range(net):
        for e2 in range(nei):
            connected = et[e1][1]==ei[e2][0]
            if not connected:
623
                linkedby,ml,sl,pdelta  = pairEndedConnected(self,assgraph,etid[e1],eiid[e2],back)  # @UnusedVariable
Eric Coissac committed
624 625 626 627 628 629 630 631 632
            
                if linkedby >= minlink and abs(etid[e1]) <= abs(eiid[e2]):
                    extended.add(etid[e1])
                    extended.add(-eiid[e2])
                    if (etid[e1],eiid[e2]) not in linked:
                        linked.add((-eiid[e2],-etid[e1]))
                        print("\n\nLinking Stems %d -> %d" % (etid[e1],eiid[e2]),
                              file=sys.stderr)

633 634
#                        ex = frozenset(((ext[e1] | exi[e2]) - ept[e1] - epi[e2]) | eet[e1] | eei[e2])
                        ex = frozenset(ext[e1] | exi[e2] | eet[e1] | eei[e2])
Eric Coissac committed
635 636 637 638
                        
                        ingraph = sum(i in self.graph for i in ex)
                        nreads = len(ex)
                        if ingraph < nreads:
639 640 641
                            logOrPrint(logger,
                                        "--> %d | %d = %d reads to align (%d already assembled)" % (len(ext[e1]),len(exi[e2]),nreads,ingraph),
                                       )
Eric Coissac committed
642 643 644 645 646 647 648

                            if nreads > 10:
                                __cacheAli2.add(ex)
                                if ex not in __cacheAli:
                                    ali= multiAlignReads(ex,index,kmer,smin,delta)
                                    print('',file=sys.stderr)
                
649 650
                                    #goodali = [i for i in ali if len(i) >= nreads/4]
                                    goodali=ali
651 652
                                    logOrPrint(logger,
                                               "--> %d consensus to add" % len(goodali))
Eric Coissac committed
653 654

                                    for a in goodali:
655
                                        #print(b'\n'.join(alignment2bytes(a,index)).decode('ascii'))
Eric Coissac committed
656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678
                                        cycle+=1
                                        c = consensus(a,index,cmincov)
                                        s = insertFragment(self,c,cycle=cycle)
                                        print("     %d bp (%d reads) added on cycle %d" % (len(c),len(s),cycle),
                                              file=sys.stderr)

            
                                        a = tango(self,
                                                  seeds      = s,
                                                  minread    = minread,
                                                  minratio   = minratio,
                                                  mincov     = emincov,
                                                  minoverlap = minoverlap,
                                                  lowfilter  = lowfilter,
                                                  adapters5   = adapters5,
                                                  adapters3   = adapters3,
                                                  maxjump    = maxjump,
                                                  cycle      = cycle,
                                                  nodeLimit  = nodeLimit)
                                        
                                        
                                        print('',file=sys.stderr)
                                else:
679 680
                                    logOrPrint(logger,
                                               "--> already aligned")
Eric Coissac committed
681

Eric Coissac committed
682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697
    if not onlyLinking:
        for e1 in range(net):
            if etid[e1] not in extended:
                print("\n\nExtending Stems %d" % (etid[e1]),
                      file=sys.stderr)
    
                ex = frozenset((ext[e1] - ept[e1]) | eet[e1])
                nreads = len(ex)
                print("--> %d reads to align" % (nreads),
                      file=sys.stderr)
    
                if nreads > 10:
                    __cacheAli2.add(ex)
                    if ex not in __cacheAli:
                        ali= multiAlignReads(ex,index,kmer,smin,delta)
                        print('',file=sys.stderr)
698 699
                        #goodali = [i for i in ali if len(i) >= nreads/4]
                        goodali=ali
Eric Coissac committed
700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725
                        print("--> %d consensus to add" % len(goodali),
                              file=sys.stderr)
    
                        for a in goodali:
                            c = consensus(a,index,cmincov)
                            if c:
                                cycle+=1
                                s = insertFragment(self,c,cycle=cycle)
                                print("     %d bp (%d reads) added on cycle %d" % (len(c),len(s),cycle),
                                      file=sys.stderr)
    
                                a = tango(self,
                                          seeds      = s,
                                          minread    = minread,
                                          minratio   = minratio,
                                          mincov     = emincov,
                                          minoverlap = minoverlap,
                                          lowfilter  = lowfilter,
                                          adapters5  = adapters5,
                                          adapters3  = adapters3,
                                          maxjump    = maxjump,
                                          cycle      = cycle,
                                          nodeLimit  = nodeLimit)
                            print("",file=sys.stderr)
                    else:
                        print("--> already aligned",file=sys.stderr)
Eric Coissac committed
726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755

    self.cleanDeadBranches(maxlength=10)
    cutLowCoverage(self,gmincov,terminal=True)
#    cutLowCoverage(self,int(gmincov/3),terminal=False)   
    
    if maxlength is not None:
        smallbranches = maxlength
    else:
        smallbranches = estimateDeadBrancheLength(self)
        print("     Dead branch length setup to : %d bp" % smallbranches,
              file=sys.stderr)

    self.cleanDeadBranches(maxlength=smallbranches)

    if snp:
        cutSNPs(self)

    newnodes = len(self) - lassemb
    
    print('',file=sys.stderr)
    print("#######################################################",file=sys.stderr)
    print("#",file=sys.stderr)
    print("# Added : %d bp (total=%d bp)" % (newnodes/2,len(self)/2),file=sys.stderr)
    print("#",file=sys.stderr)
    print("#######################################################",file=sys.stderr)
    print('',file=sys.stderr)
    
    return newnodes

                        
756
                        
Eric Coissac committed
757

758 759

        
760
def insertFragment(self,seq,cycle=1,useonce=True):
761 762 763 764
    index = self.index
    rsize = index.getReadSize()
    readmax=len(index)+1
    seeds = set()
765 766
    usedreads = getusedreads()

767 768 769 770 771 772 773 774 775 776
    ireadidE=None
    
    if len(seq) >= rsize:
        graph = self.graph
        probe = seq[0:rsize]
        readid = index.getReadIds(probe)
        
        for i in range(1,len(seq)-rsize+1):
            coverage = readid[1]
            ireadid   = readid[0]
777 778
            if not useonce or ireadid not in usedreads:
                seeds.add(ireadid)
779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816
            
            if ireadid not in graph:
                node  = graph.addNode(ireadid)
                if 'cycle' not in node:
                    node['cycle']=cycle
                if ireadid < readmax:
                    node['fake5']=0
                    node['fake3']=0
                else:
                    node['graphics']={'fill':"#0000FF"}
                
            probe = seq[i:i+rsize]
            readidE = index.getReadIds(probe)
            coverage = readidE[1]
            ireadidE   = readidE[0]
            
            if ireadidE not in graph:
                node  = graph.addNode(ireadidE)
                if 'cycle' not in node:
                    node['cycle']=cycle
                if ireadidE < readmax:
                    node['fake5']=0
                    node['fake3']=0
                else:
                    node['graphics']={'fill':"#0000FF"}
                    
            
            #if ireadid!=-ireadidE:
            edges = graph.addEdge(ireadid,ireadidE)
            edges[1]['coverage']=coverage
            edges[2]['coverage']=coverage
            edges[1]['label']="%s (%d)" % (edges[1]['ext'],coverage)
            edges[2]['label']="%s (%d)" % (edges[2]['ext'],coverage)
            #else:
            #    print >>sys.stderr,"\nWARNING : self loop on %d" % ireadidE
        
            readid = readidE
        
817
        if ireadidE is not None and (not useonce or ireadid not in usedreads):    
818 819 820 821 822 823
            seeds.add(ireadidE)
        
    return [(-abs(i),("Consensus",),0) for i in seeds]
                

def getPairedRead(self,assgraph,stemid,back,end=True): 
Eric Coissac committed
824 825
    '''
    
826 827 828 829
    :param assgraph:
    :type assgraph:
    :param stemid:
    :type stemid:
Eric Coissac committed
830
    :param back:
831 832 833
    :type back:
    :param end:
    :type end:
Eric Coissac committed
834
    '''
835 836
    r = self.index 
    lr = len(r)
Frédéric Boyer committed
837
    
838 839
    if not end:
        stemid=-stemid  
840
    
841 842 843 844 845 846 847
    try:
        stem = next(assgraph.edgeIterator(edgePredicate=lambda e:'stemid' in assgraph.getEdgeAttr(*e) 
                                                     and assgraph.getEdgeAttr(*e)['stemid']==stemid))
    except StopIteration:
        print("ERROR in getPairedRead : requesting stemid=%s" % str(stemid),
              file=stderr)
        
848
    path=assgraph.getEdgeAttr(*stem)['path'][-back:]
849
    
850 851
    reads=[set(r.normalizedPairedEndsReads(abs(i))[1]) for i in path if  abs(i) < lr]
    paired = set(reduce(lambda a,b: a | b,reads,set()))
Eric Coissac committed
852
    
853 854 855 856
    if not end:
        paired = set(-i for i in paired)
    
    return paired
Eric Coissac committed
857

858
def pairEndedConnected(self,assgraph,edge1,edge2,back=250):
Eric Coissac committed
859 860 861
    '''
    Returns how many pair ended reads link two edges in a compact assembling graph
    
862 863 864
    :param assgraph: The compact assembling graph as produced by the
                     :py:meth:`~orgasm.assembler.Assembler.compactAssembling` method
    :type assgraph:  :py:class:`~orgasm.graph.DiGraphMultiEdge`
Eric Coissac committed
865 866 867 868 869 870 871 872 873 874 875
    :param edge1:    The ``stemid`` of the first edge
    :type edge1:     :py:class:`int`
    :param edge2:    The ``stemid`` of the second edge
    :type edge2:     :py:class:`int`
    :param back:     How many base pairs must be considered at the end of each edge
    :type back:      :py:class:`int`
    
    :return:         The count of pair ended reads linking both the edges
    :rtype:          :py:class:`int`
    '''
    
876 877
    ri = self.index
    lri= len(ri)
Eric Coissac committed
878 879 880
        
    s1 = next(assgraph.edgeIterator(edgePredicate=lambda e:'stemid' in assgraph.getEdgeAttr(*e) 
                                                 and assgraph.getEdgeAttr(*e)['stemid']==edge1))
881
                                                 
Eric Coissac committed
882 883
    s2 = next(assgraph.edgeIterator(edgePredicate=lambda e:'stemid' in assgraph.getEdgeAttr(*e) 
                                                 and assgraph.getEdgeAttr(*e)['stemid']==edge2))
884 885 886
                                                 
    e1 = assgraph.getEdgeAttr(*s1)['path'][-back:]
    e2 = assgraph.getEdgeAttr(*s2)['path'][0:back]
Eric Coissac committed
887 888 889

    de1 = {}
    de2 = {}
890
    
Eric Coissac committed
891 892
    j   = 0
    for i in e1:
893
        de1[i] = j
Eric Coissac committed
894
        j+=1
895 896 897 898 899
#     for i in e1:
#         l = de1.get(i,[])
#         de1[i] = l
#         l.append(j)
#         j+=1
Eric Coissac committed
900 901 902

    j   = 0
    for i in e2:
903
        de2[i] = j
Eric Coissac committed
904 905
        j+=1
        
906
    pe1 = [k for k in 
Eric Coissac committed
907
           [(a,[j for j in b if j in de2]) 
908 909
            for a,b in [ri.normalizedPairedEndsReads(i) 
                        for i in e1 if i > 0 and i < lri]] if k[1]]
Eric Coissac committed
910
    
911
    pe2 = [k for k in 
Eric Coissac committed
912
           [(a,[j for j in b if j in de1]) 
913 914
            for a,b in [ri.normalizedPairedEndsReads(i) 
                        for i in e2 if i < 0 and -i < lri]] if k[1]]
Eric Coissac committed
915 916 917
    le1=len(e1)
    peset = set()
    delta = []
918 919
    for f,rs in pe1:
        for r in rs:
Eric Coissac committed
920 921 922
            p = (f,abs(r)) if f < abs(r) else (abs(r),-f)
            if p not in peset:
                peset.add(p)
923 924 925 926
                p1 = de1[f]
                p2 = de2[r]
                delta.append((le1 - p1) + 1 + p2)
     
927 928
    for f,rs in pe2:
        for r in rs:
Eric Coissac committed
929 930 931
            p = (-f,abs(r)) if -f < abs(r) else (abs(r),f)
            if p not in peset:
                peset.add(p)
932 933 934
                p1 = de1[r]
                p2 = de2[f]
                delta.append(le1 - p1 + 1 + p2)
Eric Coissac committed
935
 
936
    if (len(delta) > 1):
Eric Coissac committed
937
        dmean = sum(delta) / len(delta)
938 939
        variance = sum((x -dmean)** 2 for x in delta)  / (len(delta) - 1)       
        dsd   = math.sqrt(variance)
Eric Coissac committed
940 941 942 943 944
        dmean+=ri.getReadSize()
    else:
        dmean = None
        dsd  = None
        
945
    return len(peset),dmean,dsd,delta
Eric Coissac committed
946
    
947
def coverageEstimate(self,matches=None,index=None,timeout=60.0):
Eric Coissac committed
948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966
    '''
    Estimates the average coverage depth of the sequence.
    
    The algorithm is masic and can be very slow. To avoid infinity
    computation time a timeout limits it to 60 secondes by default.
    
    Three values are returned by the function :
    
        - The number of bp considered to estimate the coverage
        - The length of the segment used for the estimation
        - The coverage depth 
        
    
    
    
    :param timeout: Maximum computation time.
    
    :return: a triplet (int,int,float) 
    '''
967
    def weight(a,b):
Eric Coissac committed
968 969 970 971 972 973 974
        maxw=0
        for e in cg.edgeIterator(edgePredicate = lambda i:i[0]==a and i[1]==b):
            attr = cg.getEdgeAttr(*e)
            w = attr['length'] * attr['weight']
            if w > maxw:
                maxw=w        
        return maxw 
975 976
    
    def coverage(start):
977
        wp = cg.minpath(start, distance=weight,allowCycle=True)
Eric Coissac committed
978
#        print ">>>",start,wp
979 980 981 982
        maxpath = max(wp[0].values())
        maxend=[i for i in wp[0] if wp[0][i]==maxpath][0]
        i=maxend
        path=[i]
Eric Coissac committed
983 984 985
        if i == start and wp[0][i] > 0:
            i=wp[1][i]
            path.append(i)
986 987 988 989 990
        while i!=start:
            i=wp[1][i]
            path.append(i)
        return(maxpath,path)
    
Eric Coissac committed
991
    cg = self.compactAssembling(verbose=False)
Eric Coissac committed
992
        
993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005
    if matches is not None and index is not None:
        genesincontig(cg, index, matches)
        # A first and fast approach is to look for long sequence (> 1kb)
        
        longweight = [(cg.getEdgeAttr(*i)['weight'],cg.getEdgeAttr(*i)['length'])
                        for i in cg.edgeIterator(edgePredicate=lambda e:'weight' in cg.getEdgeAttr(*e)) 
                        if cg.getEdgeAttr(*i)['length'] > 1000 and cg.getEdgeAttr(*i)['ingene']>0]
    else:
        # A first and fast approach is to look for long sequence (> 1kb)
        
        longweight = [(cg.getEdgeAttr(*i)['weight'],cg.getEdgeAttr(*i)['length'])
                        for i in cg.edgeIterator(edgePredicate=lambda e:'weight' in cg.getEdgeAttr(*e)) 
                        if cg.getEdgeAttr(*i)['length'] > 1000]
Eric Coissac committed
1006 1007 1008
                    
    if longweight:
        maxpath = sum(i[1] for i in longweight)
Eric Coissac committed
1009
        cov = weightedMode((i[0],i[1]//100) for i in longweight)
Eric Coissac committed
1010 1011
        return maxpath,maxpath,cov
    
Eric Coissac committed
1012 1013
    # Seconde strategy : we look for the longest of the shortest path
    
1014 1015
    maxpath=0
    path=None
1016 1017 1018 1019 1020 1021 1022
    btime = time()
    n=list(cg)
    # n = [i  for i in cg if len(list(cg.parentIterator(i)))==0]
    # dest = [i  for i in cg if len(list(cg.neighbourIterator(i)))==0]
    e = [max([cg.getEdgeAttr(*j)['length'] 
              for j in list(cg.edgeIterator(edgePredicate=lambda i: i[1]==k))]+[0]) 
         for k in n]
Eric Coissac committed
1023
    ne = list(map(lambda a,b:(a,b),n,e))
1024 1025 1026 1027 1028
    ne.sort(key=lambda i:i[1],reverse=True)
    n = [i[0] for i in ne]
    
    for i in n:
        w,p = coverage(i)
1029 1030
        if w > maxpath:
            maxpath=w
1031 1032 1033
            path=p
        if (time()-btime) > timeout:
            break 
Eric Coissac committed
1034 1035
#    print path        

Eric Coissac committed
1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062
    if path:
        path.reverse()
        j = path[0]
        cumlength=0
        for i in path[1:]:
            maxw=0
            maxlength=0
            for e in cg.edgeIterator(edgePredicate = lambda k:k[0]==j and k[1]==i):
                attr = cg.getEdgeAttr(*e)
                w = attr['length'] * attr['weight']
#                print w , attr['length'] , attr['weight']
                if w > maxw:
                    maxw=w
                    maxlength=attr['length']       

            cumlength+=maxlength
            j=i
    else:
        l = [(cg.getEdgeAttr(*e)['length']) 
                for e in cg.edgeIterator(edgePredicate=lambda e: 'stemid' in cg.getEdgeAttr(*e))]
        w = [(cg.getEdgeAttr(*e)['weight']) 
                for e in cg.edgeIterator(edgePredicate=lambda e: 'stemid' in cg.getEdgeAttr(*e))]
        if l:
            maxpath=max(l)
            coverage = w[l.index(maxpath)]
            return maxpath,maxpath,coverage
        else:
Eric Coissac committed
1063
            return None,None,None        # <-- ???
1064
    return maxpath,cumlength,maxpath/float(cumlength)
Eric Coissac committed
1065

1066

1067 1068


1069
def path2fasta(self,assgraph,path,identifier="contig",minlink=10,nlength=20,back=200,logger=None,tags=[]):
Eric Coissac committed
1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094
    '''
    Convert a path in an compact assembling graph in a fasta formated sequences.
    
    :param assgraph: The compact assembling graph as produced by the
                     :py:meth:`~orgasm.assembler.Assembler.compactAssembling` method
    :type assgraph:  :py:class:`~orgasm.graph.DiGraphMultiEdge`
    :param path:     an ``iterable`` providing an ordered list of ``stemid`` indicating
                     the path to follow.  
    :type path:      an ``iterable`` over :py:class:`int`
    :param identifier: the identifier used in the header of the fasta formated sequence
    :type identifier:  :py:class:`bytes`
    :param minlink:  the minimum count of pair ended link to consider 
                     for asserting the relationship
    :type minlink:   :py:class:`int`
    :param nlength:  how many ``N`` must be added between two segment of sequences only connected
                     by pair ended links
    :type nlength:   :py:class:`int`
    :param back:     How many base pairs must be considered at the end of each edge
    :type back:      :py:class:`int`
    
    :returns: a string containing the fasta formated sequence
    :rtype: :py:class:`bytes`

    :raises: :py:class:`AssertionError`
    '''
1095
    frglen,frglensd = estimateFragmentLength(self) # @UnusedVariable
Eric Coissac committed
1096
    
1097 1098 1099 1100 1101
    #
    # Build a dictionary with:
    #      - Keys   = stemid
    #      - Values = edge descriptor (from,to,x)
    #
Eric Coissac committed
1102 1103 1104
    alledges = dict((assgraph.getEdgeAttr(*e)['stemid'],e) 
                    for e in assgraph.edgeIterator(edgePredicate = lambda i: 'stemid' in assgraph.getEdgeAttr(*i)))
        
1105
    coverage=[]
Eric Coissac committed
1106 1107
    slength=[]
    seq=[]
1108
    label=[]
Eric Coissac committed
1109 1110
    oldstem=None
    oldid=None
1111 1112
    oldstemclass=None
     
1113
    rank = 1
Eric Coissac committed
1114
    forceconnection=False
1115
    
Eric Coissac committed
1116 1117
    for stemid in path:
        if stemid != 0:
1118 1119 1120 1121
            stem              = alledges[stemid]
            attr              = assgraph.getEdgeAttr(*stem)
            stemclass         = attr['class']
            sequence          = attr['sequence']
Eric Coissac committed
1122
            
1123
            if rank==1 and stemclass!="sequence":
1124
                raise RuntimeError("A path cannot start on a gap")
1125 1126 1127 1128 1129

                                                    # Switch the stem to a dashed style
            graphics          = attr.get('graphics',{})
            attr['graphics']  = graphics
            graphics['style'] = 'dashed'
Eric Coissac committed
1130
            
1131
            # Manage step rank information of each step
Eric Coissac committed
1132 1133 1134 1135 1136 1137
            allsteps = attr.get('steps',{})
            steps = allsteps.get(identifier,[])
            steps.append(rank)
            allsteps[identifier]=steps
            attr['steps']=allsteps
            
1138 1139 1140 1141 1142 1143 1144 1145 1146
            if oldstem is not None:
                connected,ml,sl,delta = pairEndedConnected(self,assgraph,oldid,stemid,back)  # @UnusedVariable
                if oldstem[1]==stem[0]:
                    if oldstemclass=="sequence":
                        if stemclass=="sequence":                   # Link between 2 sequences
                            if ml is not None:
                                logOrPrint(logger,
                                           "Both segments %d and %d are connected (paired-end=%d frg length=%f sd=%f)" % 
                                           (oldid,stemid,connected,float(ml),float(sl)))
Eric Coissac committed
1147
            
1148 1149 1150 1151 1152
                                label.append('{connection: %d - length: %d, sd: %d}' % (connected,int(ml),int(sl)))
                            else:
                                logOrPrint(logger,
                                           "Both segments %d and %d are connected but covered by 0 paired-end" % 
                                           (oldid,stemid))
Eric Coissac committed
1153
            
1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187
                                label.append('{connection: 0}')
                                                            
                        elif stemclass[0:9]=="scaffold:":           # Link a sequence and a gap
                            logOrPrint(logger,
                                        "Both segments %d and %d are disconnected" % attr['link'])
    
                            if stemclass=="scaffold:paired-end":
                                logOrPrint(logger,
                                           "   But linked by %d pair ended links (gap length=%f sd=%f)" % 
                                           (attr['pairendlink'],
                                            attr['length'],
                                            attr['gapsd']))
                                           
                                label.append('{N-connection: %d - Gap length: %d, sd: %d}' % 
                                             (attr['pairendlink'],
                                              attr['length'],
                                              attr['gapsd']))
    
                            elif stemclass=="scaffold:forced":
                                logOrPrint(logger,"   Connection is forced")
                                
                                if attr['pairendlink'] > 0:
                                    logOrPrint(logger,
                                               "   But asserted by %d pair ended links (gap length=%f sd=%f)" % 
                                               (attr['pairendlink'],
                                                attr['length'],
                                                attr['gapsd']))

                                    label.append('{F-connection: %d - Gap length: %d, sd: %d}' % 
                                             (attr['pairendlink'],
                                              attr['length'],
                                              attr['gapsd']))
                                else:
                                    label.append('{F-connection: %d}' % connected)
1188 1189 1190 1191 1192
                            elif stemclass=="scaffold:overlap":
                                logOrPrint(logger,
                                           "   But overlap by %dbp supported by %d pair ended links" % 
                                           (-attr['length'],
                                            attr['pairendlink']))
1193
                                    
1194 1195 1196 1197 1198 1199
                                label.append('{O-connection: %d - Overlap length: %d}' % 
                                             (attr['pairendlink'],
                                              -attr['length']))
                                # Remove the overlap length on the last inserted sequence
                                seq[-1]=seq[-1][:attr['length']]
    
1200 1201 1202 1203 1204 1205
                    elif oldstemclass[0:9]=="scaffold:":
                        if stemclass=="sequence":
                            seq.append(self.index.getRead(attr['path'][0],
                                                          0,
                                                          self.index.getReadSize()).lower())
                            slength.append(self.index.getReadSize())
Eric Coissac committed
1206

1207 1208 1209 1210 1211
                        else:
                            raise RuntimeError('A scaffold link must be followed by a sequence %d --> %d' %
                                               (oldid,stemid))           
                        
                elif forceconnection:
1212
                    logOrPrint(logger,"   Connection is forced")
1213 1214
                    if connected > 0:
                        glength = int(frglen-ml - self.index.getReadSize()) 
1215

1216
                        logOrPrint(logger,
1217 1218 1219 1220
                                   "   But asserted by %d pair ended links (gap length=%f sd=%f)" % 
                                   (connected,
                                    glength,
                                    sl))
1221

1222 1223 1224 1225
                        label.append('{F-connection: %d - Gap length: %d, sd: %d}' % 
                                      (connected,
                                       glength,
                                       sl))
1226 1227

                    else:
1228
                        
1229
                        logOrPrint(logger,
1230
                                   "Without any support from pair ended links")
1231
                        
1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260
                        label.append('{F-connection: %d}' % connected)

                        glength =  nlength
                    
                    seq.append(b'N'*int(glength))
                    slength.append(int(glength))
                    
                    seq.append(self.index.getRead(attr['path'][0],
                                                  0,
                                                  self.index.getReadSize()).lower())
                    slength.append(self.index.getReadSize())


                    # Add the foced link to the compact assembly graph    
                    flink = assgraph.addEdge(oldstem[1],stem[0])
                    rlink = assgraph.addEdge(-stem[0],-oldstem[1])
                    flink['label']="Forced (%d)  %d -> %d" % (connected,oldid,stemid)
                    flink['graphics']={'width':1,
                                       'arrow':'last',
                                       'fill':'0xFF0000',
                                       'style':'dashed'

                                      }
                    rlink['label']="Forced (%d)  %d -> %d" % (connected,-stemid,-oldid)
                    rlink['graphics']={'width':1,
                                       'arrow':'last',
                                       'fill':'0xFF0000',
                                       'style':'dashed'
                                      }
1261
                else:
1262 1263
                    raise AssertionError('Disconnected path between stem '
                                         '%d and %d only %d pair ended links' % (oldid,stemid,connected))
1264 1265 1266



1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279
            if stemclass=="sequence":
                if attr['length'] > 10:
                    attr['label']="%d : %s->(%d)->%s  [%d] @ %s" % (stemid,sequence[0:5].decode('ascii'),
                                                                    attr['length'],
                                                                    sequence[-5:].decode('ascii'),
                                                                    int(attr['weight']),
                                                                    attr['steps'])
                else:
                    attr['label']="%d : %s->(%d)  [%d] @ %s" % (stemid,
                                                                sequence.decode('ascii'),
                                                                attr['length'],
                                                                int(attr['weight']),
                                                                attr['steps'])
1280
    
1281 1282 1283 1284
                label.append(attr['label'])
                

            seq.append(sequence)
1285 1286
            coverage.append(attr['weight'])
            slength.append(attr['length'])
1287 1288

            rank+=1
1289 1290
            oldstem = stem
            oldid=stemid
1291
            oldstemclass=stemclass
1292 1293 1294 1295
            
            forceconnection=False
        else:
            forceconnection=True
1296 1297 1298 1299

    
                        
            
1300 1301 1302 1303 1304
        
    s1 = alledges[path[-1]]
    s2 = alledges[path[0]]
    sid1=assgraph.getEdgeAttr(*s1)['stemid']
    sid2=assgraph.getEdgeAttr(*s2)['stemid']
1305
    sclass2=assgraph.getEdgeAttr(*s2)['class']
1306 1307 1308 1309 1310 1311 1312 1313
    connected,ml,sl,delta = pairEndedConnected(self,            # @UnusedVariable
                                               assgraph,
                                               sid1,
                                               sid2,
                                               back)  
    
    if s1[1]==s2[0]:
        
1314 1315 1316
        logOrPrint(logger, "Path is circular and connected by %d  (length: %d, sd: %d)" %
                        (connected,int(ml),int(sl))
                   )        
1317
        circular=True
1318 1319
        if sclass2=="sequence":
            label.append('{connection: %d - length: %d, sd: %d}' % (connected,int(ml),int(sl)))
1320
    else:
1321 1322 1323 1324
        if sclass2!="sequence":
            raise RuntimeError("A path cannot ends on a gap")
        
        if forceconnection:
1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346
            logOrPrint(logger,"Circular connection forced",)
            logOrPrint(logger,"Linked by %d pair ended links" %  connected)
                
            label.append('{F-connection: %d}' % connected)
            seq.append(b'N'*int(nlength))
            circular=True
        else:
            logOrPrint(logger,"Path is linear")
            circular=False

        seq.insert(0,self.index.getRead(s2[0],0,self.index.getReadSize()).lower())
        slength.insert(0,self.index.getReadSize())

    
    sequence = b''.join(seq)
    length = sum(slength)
    mcov = oneXcoverage(assgraph)
    
    fasta=[">%s seq_length=%d; coverage=%5.1f; circular=%s; %s%s" % (identifier,length,
                                                                  mcov,circular,
                                                                  tags2str(tags) + " ",
                                                                  '.'.join(label))]
1347 1348 1349 1350
    fasta.extend(sequence[i:i+60].decode('ascii') 
                 for i in range(0,len(sequence),60)
                )
    
1351 1352
    return "\n".join(fasta)
 
1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378
def pathConstraints(self,cg,threshold=5.,back=500, minlink=5):

    def minisatDoubleConstraints(self,cg):
        e = [i for i in cg.edgeIterator(edgePredicate=lambda j: len(list(cg.edgeIterator(edgePredicate=lambda k: k[0]==j[1] and k[1]==j[0] and 'stemid' in cg.getEdgeAttr(*k))))>0) if abs(i[0]) < abs(i[1])]
        constraints = {}
        for start,end,index in e:  # @UnusedVariable
            input1 = list(cg.edgeIterator(edgePredicate=lambda i: i[1]==start and i[0]!=end and 'stemid' in cg.getEdgeAttr(*i)))
            input2 = list(cg.edgeIterator(edgePredicate=lambda i: i[1]==end and i[0]!=start and 'stemid' in cg.getEdgeAttr(*i)))
            output1 = list(cg.edgeIterator(edgePredicate=lambda i: i[0]==end and i[1]!=start and 'stemid' in cg.getEdgeAttr(*i)))
            output2 = list(cg.edgeIterator(edgePredicate=lambda i: i[0]==start and i[1]!=end and 'stemid' in cg.getEdgeAttr(*i)))
            if input1 and output1:
                middleF = list(cg.edgeIterator(edgePredicate=lambda j: j[0]==start and j[1]==end and 'stemid' in cg.getEdgeAttr(*j)))
                middleR = list(cg.edgeIterator(edgePredicate=lambda j: j[0]==end and j[1]==start and 'stemid' in cg.getEdgeAttr(*j)))
                inedge = input1
                output= output1
            else:
                middleF = list(cg.edgeIterator(edgePredicate=lambda j: j[0]==end and j[1]==start and 'stemid' in cg.getEdgeAttr(*j)))
                middleR = list(cg.edgeIterator(edgePredicate=lambda j: j[0]==start and j[1]==end and 'stemid' in cg.getEdgeAttr(*j)))
                inedge = input2
                output= output2
    
            if len(inedge)==1 and len(middleF)==1 and len(middleR)==1 and len(output)==1:
                inedge   = inedge[0]
                middleF = middleF[0]
                middleR = middleR[0]
                output  = output[0]
Eric Coissac committed
1379
            
1380 1381 1382 1383
                inputcov   = cg.getEdgeAttr(*inedge)['weight']     
                middleFcov = cg.getEdgeAttr(*middleF)['weight']     
                middleRcov = cg.getEdgeAttr(*middleR)['weight']     
                outputcov  = cg.getEdgeAttr(*output)['weight']     
Eric Coissac committed
1384
            
1385 1386
                r1 = round(middleFcov/float(inputcov))
                r2 = round(middleRcov/float(inputcov)) + 1
Eric Coissac committed
1387
            
1388 1389
                r3 = round(middleFcov/float(outputcov))
                r4 = round(middleRcov/float(outputcov)) + 1
Eric Coissac committed
1390
            
1391
                r = int(round((r1+r2+r3+r4)/4)) - 1
Eric Coissac committed
1392
                    
1393 1394 1395 1396 1397 1398 1399 1400 1401 1402
                constraints[cg.getEdgeAttr(*inedge)['stemid']] = [[cg.getEdgeAttr(*middleF)['stemid'],cg.getEdgeAttr(*middleR)['stemid']] * r + \
                                                                 [cg.getEdgeAttr(*middleF)['stemid'],cg.getEdgeAttr(*output)['stemid']]]
        return constraints

    def minisatSimpleConstraints(self,cg):
        e = [i for i in cg.edgeIterator(edgePredicate=lambda j: j[0]==j[1]  and 'stemid' in cg.getEdgeAttr(*j))]
        constraints = {}
        for start,end,index in e:
            inedge = list(cg.edgeIterator(edgePredicate=lambda i: i[1]==start and i[0]!=end  and 'stemid' in cg.getEdgeAttr(*i)))
            output = list(cg.edgeIterator(edgePredicate=lambda i: i[0]==end and i[1]!=start  and 'stemid' not in cg.getEdgeAttr(*i)))
Eric Coissac committed
1403
        
1404
            middle = (start,end,index)
Eric Coissac committed
1405
    
1406 1407 1408
            if len(inedge)==1 and len(output)==1:
                inedge   = inedge[0]
                output  = output[0]
Eric Coissac committed
1409
            
1410 1411 1412
                inputcov   = cg.getEdgeAttr(*inedge)['weight']     
                middlecov  = cg.getEdgeAttr(*middle)['weight']     
                outputcov  = cg.getEdgeAttr(*output)['weight']     
Eric Coissac committed
1413
            
1414 1415
                r1 = round(middlecov/float(inputcov))            
                r2 = round(middlecov/float(outputcov))
Eric Coissac committed
1416
            
1417
                r = int(round((r1+r2)/2))
Eric Coissac committed
1418
                    
1419 1420 1421
                constraints[cg.getEdgeAttr(*inedge)['stemid']] = [[cg.getEdgeAttr(*middle)['stemid']] * r + \
                                                                [cg.getEdgeAttr(*output)['stemid']]]
        return constraints
Eric Coissac committed
1422 1423


Eric Coissac committed
1424
        
1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461
    def pairEndedConstraints(asm,cg,threshold=5.,back=500):
    
        fork =   [e for e in cg.edgeIterator(edgePredicate=lambda i: \
                                                           'stemid' in cg.getEdgeAttr(*i) and
                                                           len(list(cg.parentIterator(i[0],
                                                                                      edgePredicate=lambda j: j[0]!=i[1] and 'stemid' in cg.getEdgeAttr(*j))
                                                                   )
                                                              )==2 
                                                       and len(list(cg.neighbourIterator(i[1],
                                                                                         edgePredicate=lambda j:j[1]!=i[0] and 'stemid' in cg.getEdgeAttr(*j))
                                                                   )
                                                              )==2
                                             )
                 ]
    
        bifork = [([cg.getEdgeAttr(*i)['stemid'] for i in 
                    cg.edgeIterator(edgePredicate=lambda j: j[1]==e[0] and j[0]!=e[1]  and 'stemid' in cg.getEdgeAttr(*j))],
                   cg.getEdgeAttr(*e)['stemid'],
                   [cg.getEdgeAttr(*i)['stemid'] for i in 
                    cg.edgeIterator(edgePredicate=lambda j: j[0]==e[1] and j[1]!=e[0]  and 'stemid' in cg.getEdgeAttr(*j))]) for e in fork]
    
        links = [(i,
                  (pairEndedConnected(asm,cg,i[0][0],i[2][0],back)[0]+
                   pairEndedConnected(asm,cg,i[0][1],i[2][1],back)[0]+0.0001)/
                  (pairEndedConnected(asm,cg,i[0][0],i[2][1],back)[0]+
                   pairEndedConnected(asm,cg,i[0][1],i[2][0],back)[0]+0.0001)) 
                 for i in bifork]
    
        constraints = {}
    
        for (starts,middle,ends),ratio in links:
            if  ratio >= threshold:
                constraints[starts[0]]=[middle,ends[0]]
                constraints[starts[1]]=[middle,ends[1]]
            elif ratio <= 1/threshold:
                constraints[starts[0]]=[middle,ends[1]]
                constraints[starts[1]]=[middle,ends[0]]
Eric Coissac committed
1462
            
1463
    
1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482
        changed=True
        while changed:
            n = iter(constraints)
            changed=False
            while not changed:
                try:
                    k=next(n)
                    nk=constraints[k][-1]
                    if nk in constraints:
                        constraints[k].extend(constraints[nk])
                        # del constraints[nk]
                        changed=True
                except StopIteration:
                    break

        for k in constraints:
            constraints[k]=[constraints[k]]

        return constraints