tango.py 94.8 KB
Newer Older
Eric Coissac committed
1
"""
2 3 4 5
.. module:: orgasm.tango
   :platform: Unix
   :synopsis: The :py:mod:`~orgasm.tango` package contains a set of functions useful to manage assembling structure.

Eric Coissac committed
6 7 8
The :py:mod:`orgasm.tango` package
==================================

9 10 11 12
The :py:mod:`~orgasm.tango` package contains a set of functions useful to manage assembling structure.

.. moduleauthor:: Eric Coissac <eric.coissac@inria.fr>

Eric Coissac committed
13 14 15 16 17 18 19 20 21
:author:  Eric Coissac
:contact: eric.coissac@inria.fr


.. warning::

    The :py:mod:`~orgasm.tango` package functions aims to be integrated 
    into other packages as standalone functions of class methods.
"""
Eric Coissac committed
22
import sys
Eric Coissac committed
23 24
from orgasm.multialign import multiAlignReads, consensus  # @UnresolvedImport
from orgasm.assembler import Assembler                    # @UnresolvedImport
Eric Coissac committed
25
from orgasm.assembler import buildstem                    # @UnresolvedImport
26
from orgasm.assembler import tango                        # @UnresolvedImport
27
from orgasm.utils import tags2str
Eric Coissac committed
28

29
from time import time
Eric Coissac committed
30
import math
Eric Coissac committed
31
from functools import reduce
32
from sys import stderr
33
from orgasm.multialign._multi import alignSequence # @UnresolvedImport
Eric Coissac committed
34

35 36 37 38 39
def logOrPrint(logger,message,level='info'):
    if logger is not None:
        getattr(logger, level)(message)
    else:
        print(message,file=sys.stderr)
Eric Coissac committed
40

Eric Coissac committed
41
def cutLowCoverage(self,mincov,terminal=True):
Eric Coissac committed
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
    '''
    Remove sequences in the assembling graph with a coverage below ``mincov``.
    
    .. code-block :: ipython
    
        In [159]: asm = Assembler(r)

        In [160]: s = matchtoseed(m,r)

        In [161]: a = tango(asm,s,mincov=1,minread=10,minoverlap=30,maxjump=0,cycle=1)
    
        In [162]: asm.cleanDeadBranches(maxlength=10)

        Remaining edges : 424216 node : 423896
        Out[162]: 34821
        In [162]: cutLowCoverage(asm,10,terminal=False)

    :param mincov: coverage threshold 
    :type mincov: :py:class:`int`
    :param terminal: if set to ``True`` only terminal edges are removed from the assembling graph
    :type terminal: :py:class:`bool`
    :return: the count of deleted node
    :rtype: :py:class:`int`
    
    :seealso: :py:meth:`~orgasm.assambler.Assembler.cleanDeadBranches`
Eric Coissac committed
67
    '''    
Eric Coissac committed
68 69 70
    def isTerminal(g,n):
        return len(list(g.parentIterator(n)))==0 or len(list(g.neighbourIterator(n)))==0
    
Eric Coissac committed
71 72 73 74 75 76
    def endnodeset(g):
        return set(g.nodeIterator(predicate = lambda n : (len(list(g.neighbourIterator(n)))==0)))
    
    def startnodeset(g):
        return set(g.nodeIterator(predicate = lambda n : (len(list(g.parentIterator(n)))==0)))
    
77
    ontty = sys.stderr.isatty()
Eric Coissac committed
78
    
Eric Coissac committed
79 80 81 82
    if terminal:
        tstates=[True]
    else:
        tstates=[True,False]
Eric Coissac committed
83
    ilength=len(self)
Eric Coissac committed
84
    cg = self.compactAssembling(verbose=False)
Eric Coissac committed
85
    
Eric Coissac committed
86
    index = self.index
Eric Coissac committed
87
    readSize = index.getReadSize()
Eric Coissac committed
88
    for terminal in tstates:
Eric Coissac committed
89
        print('',file=sys.stderr)
Eric Coissac committed
90
        if terminal:
Eric Coissac committed
91
            print("Deleting terminal branches",file=sys.stderr)
Eric Coissac committed
92
        else:
Eric Coissac committed
93
            print("Deleting internal branches",file=sys.stderr)
Eric Coissac committed
94
        extremities = endnodeset(cg) | startnodeset(cg)
Eric Coissac committed
95 96 97 98 99
        go = True
        while go:
            go = False
            stems = [x for x in cg.edgeIterator() 
                     if not terminal or (isTerminal(cg, x[0]) or isTerminal(cg, x[1]))]
100
            if stems:
Eric Coissac committed
101
                stems.sort(key=lambda i:cg.getEdgeAttr(*i)['weight'],reverse=True)
102
                lightest = stems.pop()
Eric Coissac committed
103
                lattr = cg.getEdgeAttr(*lightest)
104

105 106 107
                if lattr['weight'] < mincov: 
                    if stems:
                        go=True
Eric Coissac committed
108 109
                    for n in lattr['path'][1:-1]:
                        if n in self.graph:
Eric Coissac committed
110 111 112 113
                            try:
                                del self.graph[n]
                            except KeyError:
                                pass
Eric Coissac committed
114 115 116 117
                    if lightest[0] in extremities and lightest[0] in self.graph:
                        del self.graph[lightest[0]]
                    if lightest[1] in extremities and lightest[1] in self.graph:
                        del self.graph[lightest[1]]
Eric Coissac committed
118
                        
119 120 121 122
                    if ontty:
                        print("Remaining edges : %d node : %d\r" % (self.graph.edgeCount(),len(self)),
                              end='\r',
                              file=sys.stderr)
123
        
Eric Coissac committed
124
                    cg.deleteEdge(*lightest)
125 126 127 128 129 130 131 132 133
                    tojoin=[]
                    if lightest[0] in extremities:
                        del cg[lightest[0]]
                    else:
                        tojoin.append(lightest[0])
                    if lightest[1] in extremities:
                        del cg[lightest[1]]
                    else:
                        tojoin.append(lightest[1])
Eric Coissac committed
134
#                    print >>sys.stderr,lightest[0] in extremities,lightest[1] in extremities,tojoin
135
                    for c in tojoin:
Eric Coissac committed
136 137 138 139 140 141 142 143 144 145 146 147 148
                        if c in cg:
                            begin = list(cg.parentIterator(c))
                            end   = list(cg.neighbourIterator(c))
                            if len(begin)==1 and len(end)==1:
                                begin = begin[0]
                                end = end[0]
                                e1s = list(cg.edgeIterator(edgePredicate = lambda e:e[0]==begin and e[1]==c))
                                e2s = list(cg.edgeIterator(edgePredicate = lambda e:e[0]==c and e[1]==end))
                                if len(e1s)==1 and len(e2s)==1:
                                    e1 = e1s[0]
                                    e2 = e2s[0]
                                    attr1 = cg.getEdgeAttr(*e1)
                                    attr2 = cg.getEdgeAttr(*e2)
Eric Coissac committed
149

Eric Coissac committed
150 151 152
                                    sequence=attr1['sequence'] + attr2['sequence']
                                    path=attr1['path'] + attr2['path'][1:]
                                    
Eric Coissac committed
153 154 155 156 157 158 159 160 161 162 163
                                    stem =  buildstem(self,
                                                      begin,
                                                      end,
                                                      sequence,
                                                      path,
                                                      False)
                                    
                                    stem['graphics']={'width':int(stem['weight']/readSize)+1,
                                                      'arrow':'last'}
                                    
                                    cg.addStem(stem)
Eric Coissac committed
164 165 166
    
                                    cg.deleteEdge(*e2)                                        
                                    cg.deleteEdge(*e1)  
Eric Coissac committed
167 168
                                    
                                    
169
                                    del cg[c]                  
170 171 172
    if ontty:                            
        print('',file=sys.stderr)
        
Eric Coissac committed
173
    return ilength - len(self)
Eric Coissac committed
174 175

def cutSNPs(self,maxlength=500):
176 177 178
    
    ontty = sys.stderr.isatty()
    
Eric Coissac committed
179 180 181 182
    cg = self.compactAssembling(verbose=False)
    snps = [(snp, 
             1 if snp[2][0][1][2] > snp[2][1][1][2] else 0)   # indicate which allele needs to be elimimated
             for snp in [(i,                                  # start node
Eric Coissac committed
183
                         next(cg.neighbourIterator(i)),      # end node
Eric Coissac committed
184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
                         [(k[2],                              # allele id (0 or 1) 
                           (cg.getEdgeAttr(*k)['stemid'],     # allele stemid
                            cg.getEdgeAttr(*k)['length'],     # allele length
                            cg.getEdgeAttr(*k)['weight'])     # allele coverage
                           ) 
                          for k in cg.edgeIterator(edgePredicate=lambda j:j[0]==i)
                         ]
                        ) 
                        for i in cg.nodeIterator(predicate=lambda n : 
                                len(list(cg.neighbourIterator(n))) == 1 
                                and len(list(cg.edgeIterator(edgePredicate=lambda e : e[0]==n)))==2)
                      ] if  snp[2][0][1][0]>0                 # keep one copy of the snp
                        and snp[2][0][1][0]!=-snp[2][1][1][0] # exclude pairs of reverse-complement allele
                        and snp[2][0][1][1]<maxlength               # exclude allele longer than 500 bp 
                        and snp[2][1][1][1]<maxlength             #
           ]
           
    for snp in snps:
        edge  = cg.getEdgeAttr(snp[0][0],snp[0][1],snp[1])
        nodes = edge['path'][1:-1]
204 205 206 207 208 209
        
        if ontty:
            print("Remaining edges : %d node : %d" % (self.graph.edgeCount(),len(self)),
                  end='\r',
                  file=sys.stderr)
            
Eric Coissac committed
210 211 212
        for node in nodes:
            if node in self.graph:
                del self.graph[node]
213 214 215
    
    if ontty:
        print("",file=sys.stderr)
Eric Coissac committed
216
    
Eric Coissac committed
217
  
Eric Coissac committed
218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237
def mode(data):
    '''
    Compute a raw estimation of the mode of a data set
    
    :param data: The data set to analyse
    :type data: a permanent iterable object (list, tuble...)
    
    '''

    data = list(data)
    
    if len(data) == 0:
        return None
    
    if len(data) < 8:
        return int(sum(data)/len(data))
    
    data.sort()
    xx = [0,0,0]
    for j in range(3):
Eric Coissac committed
238
        windows = len(data)//2
Eric Coissac committed
239 240 241 242 243 244 245 246 247 248 249 250 251 252
        intervals = [data[i+windows]-data[i] for i in range(windows)]
        mininter = min(intervals)
        begininter = intervals.index(mininter)
        data = data[begininter:(begininter+windows)]
        xx[j]=sum(data)/windows

    return sum(xx)/3

def weightedMode(data):
    d = []
    for x,w in data:
        d.extend([x] * w)
    return mode(d)   
    
253
def matchtoseed(matches,index,new=None):
Eric Coissac committed
254
    s=[]
255 256 257 258
    if new is None:
        new=list(matches.keys())
        
    for p in new:
259 260 261 262
        m = matches[p][1]
        k = m.keys()
        for x in k:
            s.extend((index.getIds(y[0])[0],(x,),0) for y in m[x])  
Eric Coissac committed
263 264 265

    return s

Eric Coissac committed
266 267
def matchtogene(matches):
    genes = {}
268
    if matches is not None:
269
        for p in matches:        # Loop over probe set
270 271
            m = matches[p][1]
            k = m.keys()
272 273 274 275 276 277 278
            for x in k:          # Loop over a probes in a set
                for i in m[x]:   # loop over matches in a probe
                    if i[0] < 0:
                        pos = -i[5]
                    else:
                        pos = i[5]
                    genes[abs(i[0])]=(x,pos)
Eric Coissac committed
279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296
            
    return genes

def genesincontig(cg,index,matches):
    def vread(i):
        if abs(i) > len(index):
            v=0
        elif abs(i) in genes:
            v=1
        else:
            v=-1
        return v
    
    genes = matchtogene(matches)
    ei = cg.edgeIterator(edgePredicate=lambda e: 'path' in cg.getEdgeAttr(*e))
    
    for e in ei:
        ea = cg.getEdgeAttr(*e)
297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313
        if ea['class']=="sequence":
            path = ea['path']
            eg = [vread(i) for i in path]
            ep = [genes.get(abs(i),()) for i in path]
                
            g = sum(i for i in eg if i > 0)
            if g > 0:
                g=max(min(math.ceil(math.log10(g)),4)*64-1,0)
                color="#00%02X%02X" % (g,255-g)
            else:
                color="#0000FF"
            graphics = ea.get('graphics',{})
            graphics['arrow']='last'
            graphics['fill']=color
            ea['graphics']=graphics
            ea['ingene']=g
            ea['genepos']=ep
Eric Coissac committed
314
          
315 316 317 318 319 320 321 322 323 324 325 326 327 328
def testOverlap(seqfrom,seqto,headto,readsize):
    seqto=headto+seqto
    lseq=min(len(seqfrom),len(seqto),readsize)
    seqfrom = seqfrom[-lseq:]
    seqto   = seqto[0:lseq] 
    ali = alignSequence(seqfrom,seqto)
    ok = (not ali[0] \
          and ali[1] > 0 \
          and len(ali[2])==1 \
          and len(ali[3])==1)
    if ok:
        return lseq - ali[2][0]
    else:
        return -1
329
def scaffold(self,assgraph,minlink=5,back=200,addConnectedLink=False,forcedLink=set(),logger=None):
Eric Coissac committed
330 331 332 333
    '''
    Add relationships between edges of the assembling graph related to the
    par ended links.
    
334 335 336
    :param assgraph: The compact assembling graph as produced by the
                     :py:meth:`~orgasm.assembler.Assembler.compactAssembling` method
    :type assgraph:  :py:class:`~orgasm.graph.DiGraphMultiEdge`
Eric Coissac committed
337 338 339 340 341 342 343 344 345
    :param minlink:  the minimum count of pair ended link to consider 
                     for asserting the relationship
    :type minlink:   :py:class:`int`
    :param back:     How many base pairs must be considered at the end of each edge
    :type back:      :py:class:`int`
    :param addConnectedLink: add to the assembling graph green edges for each directly 
                             connected edge pair representing the pair ended links 
                             asserting the connection.
    :type addConnectedLink: :py:class:`bool`
Eric Coissac committed
346 347
     '''
    
348
    frglen,frglensd = estimateFragmentLength(self)
349
    readsize=self.index.getReadSize()
350 351 352 353 354 355 356 357 358 359 360 361 362
    
    if frglen is None:
        logger.warning("Insert size cannot be estimated")
        frglen=300
        frglensd=1
        logger.warning("Insert size set to %d" % frglen)
    
    nforcedLink = set((-p[1],-p[0]) 
                      if abs(p[0]) > abs(p[1]) 
                      else (p[0],p[1]) 
                      for p in forcedLink)
        
        
363

364 365 366
    eiat = dict((assgraph.getEdgeAttr(*i)['stemid'],
                 assgraph.getEdgeAttr(*i)) for i in assgraph.edgeIterator())
    maxstemid = max(eiat)
367
    
Eric Coissac committed
368 369 370 371 372 373 374 375 376 377 378 379
    if addConnectedLink:
        # Fake function just testing the stemid attribute
        def isInitial(n):
            return  'stemid' in assgraph.getEdgeAttr(*n) 
        def isTerminal(n):
            return  'stemid' in assgraph.getEdgeAttr(*n) 
    else:
        def isInitial(n):
            return len(list(assgraph.parentIterator(n[0],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0 
        def isTerminal(n):
            return len(list(assgraph.neighbourIterator(n[1],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0

Eric Coissac committed
380
        
Eric Coissac committed
381 382 383 384 385
    ei = [i for i in assgraph.edgeIterator(edgePredicate=isInitial)]
    et = [i for i in assgraph.edgeIterator(edgePredicate=isTerminal)]
    
    eiid = [assgraph.getEdgeAttr(*i)['stemid'] for i in ei]
    etid = [assgraph.getEdgeAttr(*i)['stemid'] for i in et]
386
        
Eric Coissac committed
387 388 389
    nei = len(ei)
    net = len(et)

Eric Coissac committed
390
    s=[]
Eric Coissac committed
391 392
    for e1 in range(net):
        for e2 in range(nei):
393 394 395 396
            npair = ((etid[e1], eiid[e2]) 
                     if (abs(etid[e1]) < abs(eiid[e2])) 
                     else (-eiid[e2],-etid[e1]))
            
Eric Coissac committed
397
            connected = et[e1][1]==ei[e2][0]
398 399 400
            linkedby,ml,sl,delta  = pairEndedConnected(self,assgraph,etid[e1],eiid[e2],back)
            first=assgraph.getEdgeAttr(*et[e1])['last']
            last=assgraph.getEdgeAttr(*ei[e2])['first']
401
            
Eric Coissac committed
402
            if connected and addConnectedLink:
Eric Coissac committed
403
                if linkedby >= minlink:
404
                    s.append(('l',et[e1][1],ei[e2][0],linkedby,etid[e1], eiid[e2],"#00FF00",ml,sl,delta,first,last))
405 406
                    logOrPrint(logger,
                               "%d -> %d connection asserted by %d pair ended links" % (etid[e1], eiid[e2],linkedby))
Eric Coissac committed
407
                else:
408 409 410
                    logOrPrint(logger,
                               "%d -> %d connection not asserted by pair ended link" % (etid[e1], eiid[e2]))
                    
411
            elif not connected and npair in nforcedLink:
412 413 414 415 416 417 418 419 420
                s.append(('f',et[e1][1],ei[e2][0],linkedby,etid[e1], eiid[e2],"#FF0000",ml,sl,delta,first,last))
                if linkedby > 0:
                    logOrPrint(logger,
                                   "%d -> %d forced but supported by %d pair ended links" % (etid[e1], eiid[e2],linkedby))
                else:
                    logOrPrint(logger,
                                   "%d -> %d forced and not supported by pair ended links" % (etid[e1], eiid[e2]))
                    
            elif not connected and linkedby >= minlink:
421 422 423
                overlap = testOverlap(eiat[etid[e1]]['sequence'], 
                                      eiat[eiid[e2]]['sequence'], 
                                      eiat[eiid[e2]]['head'], 
424
                                      readsize)
425 426 427 428 429 430 431 432
                if overlap >= 0:
                    s.append(('o',et[e1][1],ei[e2][0],linkedby,etid[e1], eiid[e2],"#FF00FF",overlap,0,[],first,last))
                    logOrPrint(logger,
                                   "%d -> %d overlap of %dbp supported by %d pair ended links" % (etid[e1], eiid[e2],overlap,linkedby))
                else:
                    s.append(('s',et[e1][1],ei[e2][0],linkedby,etid[e1], eiid[e2],"#FF6600",ml,sl,delta,first,last))
                    logOrPrint(logger,
                                   "%d -> %d scaffolded by %d pair ended links" % (etid[e1], eiid[e2],linkedby))
433 434 435 436 437 438 439 440 441 442 443 444 445 446
    nstemid={}
    for kind,x,y,z,s1,s2,color,ml,sl,delta,first,last in s:
        if (abs(x) > abs(y)):
            npair=(-y,-x)
            nsign=-1
        else:
            npair=(x,y)
            nsign=1
        if npair not in nstemid:
            maxstemid+=1
            nstemid[npair]=maxstemid
            
        nid=nstemid[npair] * nsign 
        
Eric Coissac committed
447
        attr = assgraph.addEdge(x,y)
448 449
        
        if kind=="s":
450
            glengths = [frglen - i - 2 * readsize 
451
                        for i in delta]
452
            pglengths = [i for i in glengths if i >=0]
453
            if len(pglengths) > 1:
454 455 456 457 458 459
                glength  = sum(pglengths) / len(pglengths)
                glengthsd= math.sqrt(sum((i-glength)**2 for i in pglengths) /(len(pglengths)-1))
            else:
                glength  = sum(glengths) / len(glengths)
                glengthsd= math.sqrt(sum((i-glength)**2 for i in glengths) /(len(glengths)-1))
                
460
            attr['label']="%d : Gap (%dbp)  [%d,%d] %d -> %d" % (nid,glength,z,len(pglengths),s1,s2)
461
            attr['length']=int(glength) if int(glength) > 0 else 10
462 463 464 465
            attr['first']=first
            attr['last']=last
            attr['weight']=0
            attr['gappairs']=len(glengths)
466
            attr['gaplength']=int(glength)
467
            attr['gapsd']=int(math.sqrt(frglensd**2+glengthsd**2))
468
            attr['gapdeltas']=[frglen - i - 2 * readsize for i in delta]
469
            attr['pairendlink']=z
470
            attr['ingene']=0
471
            attr['link']=(s1,s2)
472 473 474 475 476 477
            attr['graphics']={'width':z // 10.,
                              'arrow':'last',
                              'fill':color
                              }
            attr['stemid']=nid
            attr['sequence']=b"N" * attr['length']
478 479
            attr['path']=[first] + [0] * (attr['length']-1) + [last]
            attr['class']='scaffold:paired-end'
480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496
        elif kind=="o":
            attr['label']="%d : Overlap (%dbp)  [%d] %d -> %d" % (nid,overlap,z,s1,s2)
            attr['length']=-overlap
            attr['first']=first
            attr['last']=last
            attr['weight']=0
            attr['pairendlink']=z
            attr['ingene']=0
            attr['link']=(s1,s2)
            attr['graphics']={'width':1,
                              'arrow':'last',
                              'fill':color
                              }
            attr['stemid']=nid
            attr['sequence']=b''
            attr['path']=[]
            attr['class']='scaffold:overlap'
497
        elif kind=="f":
498
            glengths = [frglen - i - 2 * readsize 
499 500 501 502 503 504 505 506 507 508
                        for i in delta]
            pglengths = [i for i in glengths if i >=0]
            
            if len(pglengths) > 1:
                glength  = sum(pglengths) / len(pglengths)
                glengthsd= math.sqrt(sum((i-glength)**2 for i in pglengths) /(len(pglengths)-1))
            else:
                glength  = sum(glengths) / len(glengths)
                glengthsd= math.sqrt(sum((i-glength)**2 for i in glengths) /(len(glengths)-1))
                
509
            attr['label']="%d : Forced [%d] %d -> %d" % (nid,z,s1,s2)
510
            attr['length']= int(glength) if glength > 0 else 10
511 512 513
            attr['first']=first
            attr['last']=last
            attr['weight']=0
514 515 516
            attr['gappairs']=len(glengths)
            attr['gaplength']=int(glength)
            attr['gapsd']=int(math.sqrt(frglensd**2+glengthsd**2))
517
            attr['gapdeltas']=[frglen - i - readsize for i in delta]
518 519
            attr['pairendlink']=z
            attr['ingene']=0
520
            attr['link']=(s1,s2)
521 522
            attr['graphics']={'width':z // 10.,
                              'arrow':'last',
523
                              'fill' :color
524 525 526 527 528 529
                              }
            attr['stemid']=nid
            attr['sequence']=b"N" * attr['length']
            attr['path']=[first] + [0] * (attr['length']-1) + [last]
            attr['class']='scaffold:forced'
            
530 531
        else:
            attr['class']='internal'
532
            
Frédéric Boyer committed
533 534
__cacheAli = set()
__cacheAli2 = set()
Eric Coissac committed
535 536

def fillGaps(self,minlink=5,
Eric Coissac committed
537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553
                  back=200,
                  kmer=12,
                  smin=40,
                  delta=0,
                  cmincov=5,
                  minread=20,
                  minratio=0.1, 
                  emincov=1,
                  maxlength=None,
                  gmincov=1,
                  minoverlap=60,
                  lowfilter=True,
                  adapters5=(),
                  adapters3=(),
                  maxjump=0,
                  snp=False,
                  nodeLimit=1000000,
Eric Coissac committed
554
                  onlyLinking=False,
Eric Coissac committed
555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588
                  logger=None):
    '''
    
    :param minlink:
    :param back:
    :param kmer:
    :param smin:
    :param delta:
    :param cmincov:
    :param minread:
    :param minratio:
    :param emincov:
    :param maxlength:
    :param gmincov:
    :param minoverlap:
    :param lowfilter:
    :param maxjump:
    :param snp: If set to True (default value is False) erase SNP variation
                by conserving the most abundant version
    '''
    global __cacheAli
    global __cacheAli2
    __cacheAli = __cacheAli2
    __cacheAli2 = set()
    
    def isInitial(n):
        return len(list(assgraph.parentIterator(n[0],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0 
    def isTerminal(n):
        return len(list(assgraph.neighbourIterator(n[1],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0

    assgraph = self.compactAssembling(verbose=False)
    
    index = self.index
    
589
    #List of edges not connected at their beginning
Eric Coissac committed
590
    ei = [i for i in assgraph.edgeIterator(edgePredicate=isInitial)]
591 592
    
    #List of edges not connected at their end    
Eric Coissac committed
593 594
    et = [i for i in assgraph.edgeIterator(edgePredicate=isTerminal)]
    
595
    #Corresponding id of these edges
Eric Coissac committed
596 597 598
    eiid = [assgraph.getEdgeAttr(*i)['stemid'] for i in ei]
    etid = [assgraph.getEdgeAttr(*i)['stemid'] for i in et]

599
    #epi = [set(assgraph.getEdgeAttr(*i)['path'][0:back]) for i in ei]
Eric Coissac committed
600 601 602 603
    ept = [set(assgraph.getEdgeAttr(*i)['path'][-back:]) for i in et]

    eei = [set(assgraph.getEdgeAttr(*i)['path'][0:100]) for i in ei]
    eet = [set(assgraph.getEdgeAttr(*i)['path'][-100:]) for i in et]
604
    print(eiid,file=sys.stderr) # <EC>
605
    
Eric Coissac committed
606 607 608 609 610 611 612 613 614 615 616 617 618 619 620
    exi = [getPairedRead(self,assgraph,i,back,end=False) for i in eiid]
    ext = [getPairedRead(self,assgraph,i,back,end=True) for i in etid]
    
    nei = len(ei)
    net = len(et)
    s=[]
    maxcycle=max(self.graph.getNodeAttr(i)['cycle'] for i in self.graph)
    lassemb = len(self)
    cycle = maxcycle
    linked=set()
    extended=set()
    for e1 in range(net):
        for e2 in range(nei):
            connected = et[e1][1]==ei[e2][0]
            if not connected:
621
                linkedby,ml,sl,pdelta  = pairEndedConnected(self,assgraph,etid[e1],eiid[e2],back)  # @UnusedVariable
Eric Coissac committed
622 623 624 625 626 627 628 629 630
            
                if linkedby >= minlink and abs(etid[e1]) <= abs(eiid[e2]):
                    extended.add(etid[e1])
                    extended.add(-eiid[e2])
                    if (etid[e1],eiid[e2]) not in linked:
                        linked.add((-eiid[e2],-etid[e1]))
                        print("\n\nLinking Stems %d -> %d" % (etid[e1],eiid[e2]),
                              file=sys.stderr)

631 632
#                        ex = frozenset(((ext[e1] | exi[e2]) - ept[e1] - epi[e2]) | eet[e1] | eei[e2])
                        ex = frozenset(ext[e1] | exi[e2] | eet[e1] | eei[e2])
Eric Coissac committed
633 634 635 636
                        
                        ingraph = sum(i in self.graph for i in ex)
                        nreads = len(ex)
                        if ingraph < nreads:
637 638 639
                            logOrPrint(logger,
                                        "--> %d | %d = %d reads to align (%d already assembled)" % (len(ext[e1]),len(exi[e2]),nreads,ingraph),
                                       )
Eric Coissac committed
640 641 642 643 644 645 646

                            if nreads > 10:
                                __cacheAli2.add(ex)
                                if ex not in __cacheAli:
                                    ali= multiAlignReads(ex,index,kmer,smin,delta)
                                    print('',file=sys.stderr)
                
647 648
                                    #goodali = [i for i in ali if len(i) >= nreads/4]
                                    goodali=ali
649 650
                                    logOrPrint(logger,
                                               "--> %d consensus to add" % len(goodali))
Eric Coissac committed
651 652

                                    for a in goodali:
653
                                        #print(b'\n'.join(alignment2bytes(a,index)).decode('ascii'))
Eric Coissac committed
654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676
                                        cycle+=1
                                        c = consensus(a,index,cmincov)
                                        s = insertFragment(self,c,cycle=cycle)
                                        print("     %d bp (%d reads) added on cycle %d" % (len(c),len(s),cycle),
                                              file=sys.stderr)

            
                                        a = tango(self,
                                                  seeds      = s,
                                                  minread    = minread,
                                                  minratio   = minratio,
                                                  mincov     = emincov,
                                                  minoverlap = minoverlap,
                                                  lowfilter  = lowfilter,
                                                  adapters5   = adapters5,
                                                  adapters3   = adapters3,
                                                  maxjump    = maxjump,
                                                  cycle      = cycle,
                                                  nodeLimit  = nodeLimit)
                                        
                                        
                                        print('',file=sys.stderr)
                                else:
677 678
                                    logOrPrint(logger,
                                               "--> already aligned")
Eric Coissac committed
679

Eric Coissac committed
680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695
    if not onlyLinking:
        for e1 in range(net):
            if etid[e1] not in extended:
                print("\n\nExtending Stems %d" % (etid[e1]),
                      file=sys.stderr)
    
                ex = frozenset((ext[e1] - ept[e1]) | eet[e1])
                nreads = len(ex)
                print("--> %d reads to align" % (nreads),
                      file=sys.stderr)
    
                if nreads > 10:
                    __cacheAli2.add(ex)
                    if ex not in __cacheAli:
                        ali= multiAlignReads(ex,index,kmer,smin,delta)
                        print('',file=sys.stderr)
696 697
                        #goodali = [i for i in ali if len(i) >= nreads/4]
                        goodali=ali
Eric Coissac committed
698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723
                        print("--> %d consensus to add" % len(goodali),
                              file=sys.stderr)
    
                        for a in goodali:
                            c = consensus(a,index,cmincov)
                            if c:
                                cycle+=1
                                s = insertFragment(self,c,cycle=cycle)
                                print("     %d bp (%d reads) added on cycle %d" % (len(c),len(s),cycle),
                                      file=sys.stderr)
    
                                a = tango(self,
                                          seeds      = s,
                                          minread    = minread,
                                          minratio   = minratio,
                                          mincov     = emincov,
                                          minoverlap = minoverlap,
                                          lowfilter  = lowfilter,
                                          adapters5  = adapters5,
                                          adapters3  = adapters3,
                                          maxjump    = maxjump,
                                          cycle      = cycle,
                                          nodeLimit  = nodeLimit)
                            print("",file=sys.stderr)
                    else:
                        print("--> already aligned",file=sys.stderr)
Eric Coissac committed
724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753

    self.cleanDeadBranches(maxlength=10)
    cutLowCoverage(self,gmincov,terminal=True)
#    cutLowCoverage(self,int(gmincov/3),terminal=False)   
    
    if maxlength is not None:
        smallbranches = maxlength
    else:
        smallbranches = estimateDeadBrancheLength(self)
        print("     Dead branch length setup to : %d bp" % smallbranches,
              file=sys.stderr)

    self.cleanDeadBranches(maxlength=smallbranches)

    if snp:
        cutSNPs(self)

    newnodes = len(self) - lassemb
    
    print('',file=sys.stderr)
    print("#######################################################",file=sys.stderr)
    print("#",file=sys.stderr)
    print("# Added : %d bp (total=%d bp)" % (newnodes/2,len(self)/2),file=sys.stderr)
    print("#",file=sys.stderr)
    print("#######################################################",file=sys.stderr)
    print('',file=sys.stderr)
    
    return newnodes

                        
754
                        
Eric Coissac committed
755

756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818

        
def insertFragment(self,seq,cycle=1):
    index = self.index
    rsize = index.getReadSize()
    readmax=len(index)+1
    seeds = set()
    ireadidE=None
    
    if len(seq) >= rsize:
        graph = self.graph
        probe = seq[0:rsize]
        readid = index.getReadIds(probe)
        
        for i in range(1,len(seq)-rsize+1):
            coverage = readid[1]
            ireadid   = readid[0]
            seeds.add(ireadid)
            
            if ireadid not in graph:
                node  = graph.addNode(ireadid)
                if 'cycle' not in node:
                    node['cycle']=cycle
                if ireadid < readmax:
                    node['fake5']=0
                    node['fake3']=0
                else:
                    node['graphics']={'fill':"#0000FF"}
                
            probe = seq[i:i+rsize]
            readidE = index.getReadIds(probe)
            coverage = readidE[1]
            ireadidE   = readidE[0]
            
            if ireadidE not in graph:
                node  = graph.addNode(ireadidE)
                if 'cycle' not in node:
                    node['cycle']=cycle
                if ireadidE < readmax:
                    node['fake5']=0
                    node['fake3']=0
                else:
                    node['graphics']={'fill':"#0000FF"}
                    
            
            #if ireadid!=-ireadidE:
            edges = graph.addEdge(ireadid,ireadidE)
            edges[1]['coverage']=coverage
            edges[2]['coverage']=coverage
            edges[1]['label']="%s (%d)" % (edges[1]['ext'],coverage)
            edges[2]['label']="%s (%d)" % (edges[2]['ext'],coverage)
            #else:
            #    print >>sys.stderr,"\nWARNING : self loop on %d" % ireadidE
        
            readid = readidE
        
        if ireadidE is not None:    
            seeds.add(ireadidE)
        
    return [(-abs(i),("Consensus",),0) for i in seeds]
                

def getPairedRead(self,assgraph,stemid,back,end=True): 
Eric Coissac committed
819 820
    '''
    
821 822 823 824
    :param assgraph:
    :type assgraph:
    :param stemid:
    :type stemid:
Eric Coissac committed
825
    :param back:
826 827 828
    :type back:
    :param end:
    :type end:
Eric Coissac committed
829
    '''
830 831
    r = self.index 
    lr = len(r)
Frédéric Boyer committed
832
    
833 834
    if not end:
        stemid=-stemid  
835
    
836 837 838 839 840 841 842
    try:
        stem = next(assgraph.edgeIterator(edgePredicate=lambda e:'stemid' in assgraph.getEdgeAttr(*e) 
                                                     and assgraph.getEdgeAttr(*e)['stemid']==stemid))
    except StopIteration:
        print("ERROR in getPairedRead : requesting stemid=%s" % str(stemid),
              file=stderr)
        
843
    path=assgraph.getEdgeAttr(*stem)['path'][-back:]
844
    
845 846
    reads=[set(r.normalizedPairedEndsReads(abs(i))[1]) for i in path if  abs(i) < lr]
    paired = set(reduce(lambda a,b: a | b,reads,set()))
Eric Coissac committed
847
    
848 849 850 851
    if not end:
        paired = set(-i for i in paired)
    
    return paired
Eric Coissac committed
852

853
def pairEndedConnected(self,assgraph,edge1,edge2,back=250):
Eric Coissac committed
854 855 856
    '''
    Returns how many pair ended reads link two edges in a compact assembling graph
    
857 858 859
    :param assgraph: The compact assembling graph as produced by the
                     :py:meth:`~orgasm.assembler.Assembler.compactAssembling` method
    :type assgraph:  :py:class:`~orgasm.graph.DiGraphMultiEdge`
Eric Coissac committed
860 861 862 863 864 865 866 867 868 869 870
    :param edge1:    The ``stemid`` of the first edge
    :type edge1:     :py:class:`int`
    :param edge2:    The ``stemid`` of the second edge
    :type edge2:     :py:class:`int`
    :param back:     How many base pairs must be considered at the end of each edge
    :type back:      :py:class:`int`
    
    :return:         The count of pair ended reads linking both the edges
    :rtype:          :py:class:`int`
    '''
    
871 872
    ri = self.index
    lri= len(ri)
Eric Coissac committed
873 874 875
        
    s1 = next(assgraph.edgeIterator(edgePredicate=lambda e:'stemid' in assgraph.getEdgeAttr(*e) 
                                                 and assgraph.getEdgeAttr(*e)['stemid']==edge1))
876
                                                 
Eric Coissac committed
877 878
    s2 = next(assgraph.edgeIterator(edgePredicate=lambda e:'stemid' in assgraph.getEdgeAttr(*e) 
                                                 and assgraph.getEdgeAttr(*e)['stemid']==edge2))
879 880 881
                                                 
    e1 = assgraph.getEdgeAttr(*s1)['path'][-back:]
    e2 = assgraph.getEdgeAttr(*s2)['path'][0:back]
Eric Coissac committed
882 883 884

    de1 = {}
    de2 = {}
885
    
Eric Coissac committed
886 887
    j   = 0
    for i in e1:
888
        de1[i] = j
Eric Coissac committed
889
        j+=1
890 891 892 893 894
#     for i in e1:
#         l = de1.get(i,[])
#         de1[i] = l
#         l.append(j)
#         j+=1
Eric Coissac committed
895 896 897

    j   = 0
    for i in e2:
898
        de2[i] = j
Eric Coissac committed
899 900
        j+=1
        
901
    pe1 = [k for k in 
Eric Coissac committed
902
           [(a,[j for j in b if j in de2]) 
903 904
            for a,b in [ri.normalizedPairedEndsReads(i) 
                        for i in e1 if i > 0 and i < lri]] if k[1]]
Eric Coissac committed
905
    
906
    pe2 = [k for k in 
Eric Coissac committed
907
           [(a,[j for j in b if j in de1]) 
908 909
            for a,b in [ri.normalizedPairedEndsReads(i) 
                        for i in e2 if i < 0 and -i < lri]] if k[1]]
Eric Coissac committed
910 911 912
    le1=len(e1)
    peset = set()
    delta = []
913 914
    for f,rs in pe1:
        for r in rs:
Eric Coissac committed
915 916 917
            p = (f,abs(r)) if f < abs(r) else (abs(r),-f)
            if p not in peset:
                peset.add(p)
918 919 920 921
                p1 = de1[f]
                p2 = de2[r]
                delta.append((le1 - p1) + 1 + p2)
     
922 923
    for f,rs in pe2:
        for r in rs:
Eric Coissac committed
924 925 926
            p = (-f,abs(r)) if -f < abs(r) else (abs(r),f)
            if p not in peset:
                peset.add(p)
927 928 929
                p1 = de1[r]
                p2 = de2[f]
                delta.append(le1 - p1 + 1 + p2)
Eric Coissac committed
930
 
931
    if (len(delta) > 1):
Eric Coissac committed
932
        dmean = sum(delta) / len(delta)
933 934
        variance = sum((x -dmean)** 2 for x in delta)  / (len(delta) - 1)       
        dsd   = math.sqrt(variance)
Eric Coissac committed
935 936 937 938 939
        dmean+=ri.getReadSize()
    else:
        dmean = None
        dsd  = None
        
940
    return len(peset),dmean,dsd,delta
Eric Coissac committed
941
    
942
def coverageEstimate(self,matches=None,index=None,timeout=60.0):
Eric Coissac committed
943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961
    '''
    Estimates the average coverage depth of the sequence.
    
    The algorithm is masic and can be very slow. To avoid infinity
    computation time a timeout limits it to 60 secondes by default.
    
    Three values are returned by the function :
    
        - The number of bp considered to estimate the coverage
        - The length of the segment used for the estimation
        - The coverage depth 
        
    
    
    
    :param timeout: Maximum computation time.
    
    :return: a triplet (int,int,float) 
    '''
962
    def weight(a,b):
Eric Coissac committed
963 964 965 966 967 968 969
        maxw=0
        for e in cg.edgeIterator(edgePredicate = lambda i:i[0]==a and i[1]==b):
            attr = cg.getEdgeAttr(*e)
            w = attr['length'] * attr['weight']
            if w > maxw:
                maxw=w        
        return maxw 
970 971
    
    def coverage(start):
972
        wp = cg.minpath(start, distance=weight,allowCycle=True)
Eric Coissac committed
973
#        print ">>>",start,wp
974 975 976 977
        maxpath = max(wp[0].values())
        maxend=[i for i in wp[0] if wp[0][i]==maxpath][0]
        i=maxend
        path=[i]
Eric Coissac committed
978 979 980
        if i == start and wp[0][i] > 0:
            i=wp[1][i]
            path.append(i)
981 982 983 984 985
        while i!=start:
            i=wp[1][i]
            path.append(i)
        return(maxpath,path)
    
Eric Coissac committed
986
    cg = self.compactAssembling(verbose=False)
Eric Coissac committed
987
        
988 989 990 991 992 993 994 995 996 997 998 999 1000
    if matches is not None and index is not None:
        genesincontig(cg, index, matches)
        # A first and fast approach is to look for long sequence (> 1kb)
        
        longweight = [(cg.getEdgeAttr(*i)['weight'],cg.getEdgeAttr(*i)['length'])
                        for i in cg.edgeIterator(edgePredicate=lambda e:'weight' in cg.getEdgeAttr(*e)) 
                        if cg.getEdgeAttr(*i)['length'] > 1000 and cg.getEdgeAttr(*i)['ingene']>0]
    else:
        # A first and fast approach is to look for long sequence (> 1kb)
        
        longweight = [(cg.getEdgeAttr(*i)['weight'],cg.getEdgeAttr(*i)['length'])
                        for i in cg.edgeIterator(edgePredicate=lambda e:'weight' in cg.getEdgeAttr(*e)) 
                        if cg.getEdgeAttr(*i)['length'] > 1000]
Eric Coissac committed
1001 1002 1003
                    
    if longweight:
        maxpath = sum(i[1] for i in longweight)
Eric Coissac committed
1004
        cov = weightedMode((i[0],i[1]//100) for i in longweight)
Eric Coissac committed
1005 1006
        return maxpath,maxpath,cov
    
Eric Coissac committed
1007 1008
    # Seconde strategy : we look for the longest of the shortest path
    
1009 1010
    maxpath=0
    path=None
1011 1012 1013 1014 1015 1016 1017
    btime = time()
    n=list(cg)
    # n = [i  for i in cg if len(list(cg.parentIterator(i)))==0]
    # dest = [i  for i in cg if len(list(cg.neighbourIterator(i)))==0]
    e = [max([cg.getEdgeAttr(*j)['length'] 
              for j in list(cg.edgeIterator(edgePredicate=lambda i: i[1]==k))]+[0]) 
         for k in n]
Eric Coissac committed
1018
    ne = list(map(lambda a,b:(a,b),n,e))
1019 1020 1021 1022 1023
    ne.sort(key=lambda i:i[1],reverse=True)
    n = [i[0] for i in ne]
    
    for i in n:
        w,p = coverage(i)
1024 1025
        if w > maxpath:
            maxpath=w
1026 1027 1028
            path=p
        if (time()-btime) > timeout:
            break 
Eric Coissac committed
1029 1030
#    print path        

Eric Coissac committed
1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057
    if path:
        path.reverse()
        j = path[0]
        cumlength=0
        for i in path[1:]:
            maxw=0
            maxlength=0
            for e in cg.edgeIterator(edgePredicate = lambda k:k[0]==j and k[1]==i):
                attr = cg.getEdgeAttr(*e)
                w = attr['length'] * attr['weight']
#                print w , attr['length'] , attr['weight']
                if w > maxw:
                    maxw=w
                    maxlength=attr['length']       

            cumlength+=maxlength
            j=i
    else:
        l = [(cg.getEdgeAttr(*e)['length']) 
                for e in cg.edgeIterator(edgePredicate=lambda e: 'stemid' in cg.getEdgeAttr(*e))]
        w = [(cg.getEdgeAttr(*e)['weight']) 
                for e in cg.edgeIterator(edgePredicate=lambda e: 'stemid' in cg.getEdgeAttr(*e))]
        if l:
            maxpath=max(l)
            coverage = w[l.index(maxpath)]
            return maxpath,maxpath,coverage
        else:
Eric Coissac committed
1058
            return None,None,None        # <-- ???
1059
    return maxpath,cumlength,maxpath/float(cumlength)
Eric Coissac committed
1060

1061

1062 1063


1064
def path2fasta(self,assgraph,path,identifier="contig",minlink=10,nlength=20,back=200,logger=None,tags=[]):
Eric Coissac committed
1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089
    '''
    Convert a path in an compact assembling graph in a fasta formated sequences.
    
    :param assgraph: The compact assembling graph as produced by the
                     :py:meth:`~orgasm.assembler.Assembler.compactAssembling` method
    :type assgraph:  :py:class:`~orgasm.graph.DiGraphMultiEdge`
    :param path:     an ``iterable`` providing an ordered list of ``stemid`` indicating
                     the path to follow.  
    :type path:      an ``iterable`` over :py:class:`int`
    :param identifier: the identifier used in the header of the fasta formated sequence
    :type identifier:  :py:class:`bytes`
    :param minlink:  the minimum count of pair ended link to consider 
                     for asserting the relationship
    :type minlink:   :py:class:`int`
    :param nlength:  how many ``N`` must be added between two segment of sequences only connected
                     by pair ended links
    :type nlength:   :py:class:`int`
    :param back:     How many base pairs must be considered at the end of each edge
    :type back:      :py:class:`int`
    
    :returns: a string containing the fasta formated sequence
    :rtype: :py:class:`bytes`

    :raises: :py:class:`AssertionError`
    '''
1090
    frglen,frglensd = estimateFragmentLength(self) # @UnusedVariable
Eric Coissac committed
1091
    
1092 1093 1094 1095 1096
    #
    # Build a dictionary with:
    #      - Keys   = stemid
    #      - Values = edge descriptor (from,to,x)
    #
Eric Coissac committed
1097 1098 1099
    alledges = dict((assgraph.getEdgeAttr(*e)['stemid'],e) 
                    for e in assgraph.edgeIterator(edgePredicate = lambda i: 'stemid' in assgraph.getEdgeAttr(*i)))
        
1100
    coverage=[]
Eric Coissac committed
1101 1102
    slength=[]
    seq=[]
1103
    label=[]
Eric Coissac committed
1104 1105
    oldstem=None
    oldid=None
1106 1107
    oldstemclass=None
     
1108
    rank = 1
Eric Coissac committed
1109
    forceconnection=False
1110
    
Eric Coissac committed
1111 1112
    for stemid in path:
        if stemid != 0:
1113 1114 1115 1116
            stem              = alledges[stemid]
            attr              = assgraph.getEdgeAttr(*stem)
            stemclass         = attr['class']
            sequence          = attr['sequence']
Eric Coissac committed
1117
            
1118 1119 1120 1121 1122 1123 1124
            if rank==1 and stemclass!="sequence":
                raise RuntimeError("A path cannot ends on a gap")

                                                    # Switch the stem to a dashed style
            graphics          = attr.get('graphics',{})
            attr['graphics']  = graphics
            graphics['style'] = 'dashed'
Eric Coissac committed
1125
            
1126
            # Manage step rank information of each step
Eric Coissac committed
1127 1128 1129 1130 1131 1132
            allsteps = attr.get('steps',{})
            steps = allsteps.get(identifier,[])
            steps.append(rank)
            allsteps[identifier]=steps
            attr['steps']=allsteps
            
1133 1134 1135 1136 1137 1138 1139 1140 1141
            if oldstem is not None:
                connected,ml,sl,delta = pairEndedConnected(self,assgraph,oldid,stemid,back)  # @UnusedVariable
                if oldstem[1]==stem[0]:
                    if oldstemclass=="sequence":
                        if stemclass=="sequence":                   # Link between 2 sequences
                            if ml is not None:
                                logOrPrint(logger,
                                           "Both segments %d and %d are connected (paired-end=%d frg length=%f sd=%f)" % 
                                           (oldid,stemid,connected,float(ml),float(sl)))
Eric Coissac committed
1142
            
1143 1144 1145 1146 1147
                                label.append('{connection: %d - length: %d, sd: %d}' % (connected,int(ml),int(sl)))
                            else:
                                logOrPrint(logger,
                                           "Both segments %d and %d are connected but covered by 0 paired-end" % 
                                           (oldid,stemid))
Eric Coissac committed
1148
            
1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182
                                label.append('{connection: 0}')
                                                            
                        elif stemclass[0:9]=="scaffold:":           # Link a sequence and a gap
                            logOrPrint(logger,
                                        "Both segments %d and %d are disconnected" % attr['link'])
    
                            if stemclass=="scaffold:paired-end":
                                logOrPrint(logger,
                                           "   But linked by %d pair ended links (gap length=%f sd=%f)" % 
                                           (attr['pairendlink'],
                                            attr['length'],
                                            attr['gapsd']))
                                           
                                label.append('{N-connection: %d - Gap length: %d, sd: %d}' % 
                                             (attr['pairendlink'],
                                              attr['length'],
                                              attr['gapsd']))
    
                            elif stemclass=="scaffold:forced":
                                logOrPrint(logger,"   Connection is forced")
                                
                                if attr['pairendlink'] > 0:
                                    logOrPrint(logger,
                                               "   But asserted by %d pair ended links (gap length=%f sd=%f)" % 
                                               (attr['pairendlink'],
                                                attr['length'],
                                                attr['gapsd']))

                                    label.append('{F-connection: %d - Gap length: %d, sd: %d}' % 
                                             (attr['pairendlink'],
                                              attr['length'],
                                              attr['gapsd']))
                                else:
                                    label.append('{F-connection: %d}' % connected)
1183 1184 1185 1186 1187
                            elif stemclass=="scaffold:overlap":
                                logOrPrint(logger,
                                           "   But overlap by %dbp supported by %d pair ended links" % 
                                           (-attr['length'],
                                            attr['pairendlink']))
1188
                                    
1189 1190 1191 1192 1193 1194
                                label.append('{O-connection: %d - Overlap length: %d}' % 
                                             (attr['pairendlink'],
                                              -attr['length']))
                                # Remove the overlap length on the last inserted sequence
                                seq[-1]=seq[-1][:attr['length']]
    
1195 1196 1197 1198 1199 1200
                    elif oldstemclass[0:9]=="scaffold:":
                        if stemclass=="sequence":
                            seq.append(self.index.getRead(attr['path'][0],
                                                          0,
                                                          self.index.getReadSize()).lower())
                            slength.append(self.index.getReadSize())
Eric Coissac committed
1201

1202 1203 1204 1205 1206 1207 1208
                        else:
                            raise RuntimeError('A scaffold link must be followed by a sequence %d --> %d' %
                                               (oldid,stemid))           
                        
                elif forceconnection:
                    if connected > 0:
                        glength = int(frglen-ml - self.index.getReadSize()) 
1209

1210 1211 1212
                        logOrPrint(logger,
                                   "Connection is forced with only %d pair ended links (gap length=%f sd=%f)" % 
                                   (connected,float(glength),float(sl)))              
1213

1214 1215 1216 1217
                        label.append('{F-connection: %d - Gap length: %d, sd: %d}' % 
                                      (connected,
                                       glength,
                                       sl))
1218 1219

                    else:
1220
                        
1221
                        logOrPrint(logger,
1222
                                   "Connection is forced without pair ended links")
1223
                        
1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252
                        label.append('{F-connection: %d}' % connected)

                        glength =  nlength
                    
                    seq.append(b'N'*int(glength))
                    slength.append(int(glength))
                    
                    seq.append(self.index.getRead(attr['path'][0],
                                                  0,
                                                  self.index.getReadSize()).lower())
                    slength.append(self.index.getReadSize())


                    # Add the foced link to the compact assembly graph    
                    flink = assgraph.addEdge(oldstem[1],stem[0])
                    rlink = assgraph.addEdge(-stem[0],-oldstem[1])
                    flink['label']="Forced (%d)  %d -> %d" % (connected,oldid,stemid)
                    flink['graphics']={'width':1,
                                       'arrow':'last',
                                       'fill':'0xFF0000',
                                       'style':'dashed'

                                      }
                    rlink['label']="Forced (%d)  %d -> %d" % (connected,-stemid,-oldid)
                    rlink['graphics']={'width':1,
                                       'arrow':'last',
                                       'fill':'0xFF0000',
                                       'style':'dashed'
                                      }
1253
                else:
1254 1255
                    raise AssertionError('Disconnected path between stem '
                                         '%d and %d only %d pair ended links' % (oldid,stemid,connected))
1256 1257 1258



1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271
            if stemclass=="sequence":
                if attr['length'] > 10:
                    attr['label']="%d : %s->(%d)->%s  [%d] @ %s" % (stemid,sequence[0:5].decode('ascii'),
                                                                    attr['length'],
                                                                    sequence[-5:].decode('ascii'),
                                                                    int(attr['weight']),
                                                                    attr['steps'])
                else:
                    attr['label']="%d : %s->(%d)  [%d] @ %s" % (stemid,
                                                                sequence.decode('ascii'),
                                                                attr['length'],
                                                                int(attr['weight']),
                                                                attr['steps'])
1272
    
1273 1274 1275 1276
                label.append(attr['label'])
                

            seq.append(sequence)
1277 1278
            coverage.append(attr['weight'])
            slength.append(attr['length'])
1279 1280

            rank+=1
1281 1282
            oldstem = stem
            oldid=stemid
1283
            oldstemclass=stemclass
1284 1285 1286 1287
            
            forceconnection=False
        else:
            forceconnection=True
1288 1289 1290 1291

    
                        
            
1292 1293 1294 1295 1296
        
    s1 = alledges[path[-1]]
    s2 = alledges[path[0]]
    sid1=assgraph.getEdgeAttr(*s1)['stemid']
    sid2=assgraph.getEdgeAttr(*s2)['stemid']
1297
    sclass2=assgraph.getEdgeAttr(*s2)['class']
1298 1299 1300 1301 1302 1303 1304 1305
    connected,ml,sl,delta = pairEndedConnected(self,            # @UnusedVariable
                                               assgraph,
                                               sid1,
                                               sid2,
                                               back)  
    
    if s1[1]==s2[0]:
        
1306 1307 1308
        logOrPrint(logger, "Path is circular and connected by %d  (length: %d, sd: %d)" %
                        (connected,int(ml),int(sl))
                   )        
1309
        circular=True
1310 1311
        if sclass2=="sequence":
            label.append('{connection: %d - length: %d, sd: %d}' % (connected,int(ml),int(sl)))
1312
    else:
1313 1314 1315 1316
        if sclass2!="sequence":
            raise RuntimeError("A path cannot ends on a gap")
        
        if forceconnection:
1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338
            logOrPrint(logger,"Circular connection forced",)
            logOrPrint(logger,"Linked by %d pair ended links" %  connected)
                
            label.append('{F-connection: %d}' % connected)
            seq.append(b'N'*int(nlength))
            circular=True
        else:
            logOrPrint(logger,"Path is linear")
            circular=False

        seq.insert(0,self.index.getRead(s2[0],0,self.index.getReadSize()).lower())
        slength.insert(0,self.index.getReadSize())

    
    sequence = b''.join(seq)
    length = sum(slength)
    mcov = oneXcoverage(assgraph)
    
    fasta=[">%s seq_length=%d; coverage=%5.1f; circular=%s; %s%s" % (identifier,length,
                                                                  mcov,circular,
                                                                  tags2str(tags) + " ",
                                                                  '.'.join(label))]
1339 1340 1341 1342
    fasta.extend(sequence[i:i+60].decode('ascii') 
                 for i in range(0,len(sequence),60)
                )
    
1343 1344
    return "\n".join(fasta)
 
1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370
def pathConstraints(self,cg,threshold=5.,back=500, minlink=5):

    def minisatDoubleConstraints(self,cg):
        e = [i for i in cg.edgeIterator(edgePredicate=lambda j: len(list(cg.edgeIterator(edgePredicate=lambda k: k[0]==j[1] and k[1]==j[0] and 'stemid' in cg.getEdgeAttr(*k))))>0) if abs(i[0]) < abs(i[1])]
        constraints = {}
        for start,end,index in e:  # @UnusedVariable
            input1 = list(cg.edgeIterator(edgePredicate=lambda i: i[1]==start and i[0]!=end and 'stemid' in cg.getEdgeAttr(*i)))
            input2 = list(cg.edgeIterator(edgePredicate=lambda i: i[1]==end and i[0]!=start and 'stemid' in cg.getEdgeAttr(*i)))
            output1 = list(cg.edgeIterator(edgePredicate=lambda i: i[0]==end and i[1]!=start and 'stemid' in cg.getEdgeAttr(*i)))
            output2 = list(cg.edgeIterator(edgePredicate=lambda i: i[0]==start and i[1]!=end and 'stemid' in cg.getEdgeAttr(*i)))
            if input1 and output1:
                middleF = list(cg.edgeIterator(edgePredicate=lambda j: j[0]==start and j[1]==end and 'stemid' in cg.getEdgeAttr(*j)))
                middleR = list(cg.edgeIterator(edgePredicate=lambda j: j[0]==end and j[1]==start and 'stemid' in cg.getEdgeAttr(*j)))
                inedge = input1
                output= output1
            else:
                middleF = list(cg.edgeIterator(edgePredicate=lambda j: j[0]==end and j[1]==start and 'stemid' in cg.getEdgeAttr(*j)))
                middleR = list(cg.edgeIterator(edgePredicate=lambda j: j[0]==start and j[1]==end and 'stemid' in cg.getEdgeAttr(*j)))
                inedge = input2
                output= output2
    
            if len(inedge)==1 and len(middleF)==1 and len(middleR)==1 and len(output)==1:
                inedge   = inedge[0]
                middleF = middleF[0]
                middleR = middleR[0]
                output  = output[0]
Eric Coissac committed
1371
            
1372 1373 1374 1375
                inputcov   = cg.getEdgeAttr(*inedge)['weight']     
                middleFcov = cg.getEdgeAttr(*middleF)['weight']     
                middleRcov = cg.getEdgeAttr(*middleR)['weight']     
                outputcov  = cg.getEdgeAttr(*output)['weight']     
Eric Coissac committed
1376
            
1377 1378
                r1 = round(middleFcov/float(inputcov))
                r2 = round(middleRcov/float(inputcov)) + 1
Eric Coissac committed
1379
            
1380 1381
                r3 = round(middleFcov/float(outputcov))
                r4 = round(middleRcov/float(outputcov)) + 1
Eric Coissac committed
1382
            
1383
                r = int(round((r1+r2+r3+r4)/4)) - 1
Eric Coissac committed
1384
                    
1385 1386 1387 1388 1389 1390 1391 1392 1393 1394
                constraints[cg.getEdgeAttr(*inedge)['stemid']] = [[cg.getEdgeAttr(*middleF)['stemid'],cg.getEdgeAttr(*middleR)['stemid']] * r + \
                                                                 [cg.getEdgeAttr(*middleF)['stemid'],cg.getEdgeAttr(*output)['stemid']]]
        return constraints

    def minisatSimpleConstraints(self,cg):
        e = [i for i in cg.edgeIterator(edgePredicate=lambda j: j[0]==j[1]  and 'stemid' in cg.getEdgeAttr(*j))]
        constraints = {}
        for start,end,index in e:
            inedge = list(cg.edgeIterator(edgePredicate=lambda i: i[1]==start and i[0]!=end  and 'stemid' in cg.getEdgeAttr(*i)))
            output = list(cg.edgeIterator(edgePredicate=lambda i: i[0]==end and i[1]!=start  and 'stemid' not in cg.getEdgeAttr(*i)))
Eric Coissac committed
1395
        
1396
            middle = (start,end,index)
Eric Coissac committed
1397
    
1398 1399 1400
            if len(inedge)==1 and len(output)==1:
                inedge   = inedge[0]
                output  = output[0]
Eric Coissac committed
1401
            
1402 1403 1404
                inputcov   = cg.getEdgeAttr(*inedge)['weight']     
                middlecov  = cg.getEdgeAttr(*middle)['weight']     
                outputcov  = cg.getEdgeAttr(*output)['weight']     
Eric Coissac committed
1405
            
1406 1407
                r1 = round(middlecov/float(inputcov))            
                r2 = round(middlecov/float(outputcov))
Eric Coissac committed
1408
            
1409
                r = int(round((r1+r2)/2))
Eric Coissac committed
1410
                    
1411 1412 1413
                constraints[cg.getEdgeAttr(*inedge)['stemid']] = [[cg.getEdgeAttr(*middle)['stemid']] * r + \
                                                                [cg.getEdgeAttr(*output)['stemid']]]
        return constraints
Eric Coissac committed
1414 1415


Eric Coissac committed
1416
        
1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453
    def pairEndedConstraints(asm,cg,threshold=5.,back=500):
    
        fork =   [e for e in cg.edgeIterator(edgePredicate=lambda i: \
                                                           'stemid' in cg.getEdgeAttr(*i) and
                                                           len(list(cg.parentIterator(i[0],
                                                                                      edgePredicate=lambda j: j[0]!=i[1] and 'stemid' in cg.getEdgeAttr(*j))
                                                                   )
                                                              )==2 
                                                       and len(list(cg.neighbourIterator(i[1],
                                                                                         edgePredicate=lambda j:j[1]!=i[0] and 'stemid' in cg.getEdgeAttr(*j))
                                                                   )
                                                              )==2
                                             )
                 ]
    
        bifork = [([cg.getEdgeAttr(*i)['stemid'] for i in 
                    cg.edgeIterator(edgePredicate=lambda j: j[1]==e[0] and j[0]!=e[1]  and 'stemid' in cg.getEdgeAttr(*j))],
                   cg.getEdgeAttr(*e)['stemid'],
                   [cg.getEdgeAttr(*i)['stemid'] for i in 
                    cg.edgeIterator(edgePredicate=lambda j: j[0]==e[1] and j[1]!=e[0]  and 'stemid' in cg.getEdgeAttr(*j))]) for e in fork]
    
        links = [(i,
                  (pairEndedConnected(asm,cg,i[0][0],i[2][0],back)[0]+
                   pairEndedConnected(asm,cg,i[0][1],i[2][1],back)[0]+0.0001)/
                  (pairEndedConnected(asm,cg,i[0][0],i[2][1],back)[0]+
                   pairEndedConnected(asm,cg,i[0][1],i[2][0],back)[0]+0.0001)) 
                 for i in bifork]
    
        constraints = {}
    
        for (starts,middle,ends),ratio in links:
            if  ratio >= threshold:
                constraints[starts[0]]=[middle,ends[0]]
                constraints[starts[1]]=[middle,ends[1]]
            elif ratio <= 1/threshold:
                constraints[starts[0]]=[middle,ends[1]]
                constraints[starts[1]]=[middle,ends[0]]
Eric Coissac committed
1454
            
1455
    
1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474
        changed=True
        while changed:
            n = iter(constraints)
            changed=False
            while not changed:
                try:
                    k=next(n)
                    nk=constraints[k][-1]
                    if nk in constraints:
                        constraints[k].extend(constraints[nk])
                        # del constraints[nk]
                        changed=True
                except StopIteration:
                    break

        for k in constraints:
            constraints[k]=[constraints[k]]

        return constraints
Eric Coissac committed
1475
     
1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495
#     def scaffoldConstraints(self,cg,back=500, minlink=5):
#         ends = [cg.getEdgeAttr(*i)['stemid']
#                 for i in cg.edgeIterator(edgePredicate=lambda e: 'stemid' in cg.getEdgeAttr(*e) 
#                                                                  and len(list(cg.neighbourIterator(e[1], 
#                                                                                                    edgePredicate=lambda i: 'stemid' in cg.getEdgeAttr(*i)
#                                                                                                    )
#                                                                               ))==0)]
#         starts= [cg.getEdgeAttr(*i)['stemid']
#                 for i in cg.edgeIterator(edgePredicate=lambda e: 'stemid' in cg.getEdgeAttr(*e) and len(list(cg.parentIterator(e[0], edgePredicate=lambda i: 'stemid' in cg.getEdgeAttr(*i))))==0)]
#     
#         constraints = {}
#     
#         for e in ends:
#             for s in starts:
#                 if pairEndedConnected(self,cg,e,s,back)[0] >= minlink:
#                     if e in constraints:
#                         constraints[e].append([s])    
#                     else:
#                         constraints[e]=[[s]]
#         return constraints
1496 1497


Eric Coissac committed
1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511

    
    def mergeConstraints(c1,c2):
        for k in c2:
            if k in c1:
                c1[k].extend(c2[k])
            else:
                c1[k]=c2[k]
        return c1
    
    constraints = {}
    constraints = mergeConstraints(constraints,minisatSimpleConstraints(self,cg))
    constraints = mergeConstraints(constraints,minisatDoubleConstraints(self,cg))
    constraints = mergeConstraints(constraints,pairEndedConstraints(self,cg,threshold,back))
1512
#    constraints = mergeConstraints(constraints,scaffoldConstraints(self,cg,back,minlink))
Eric Coissac committed
1513 1514 1515 1516 1517 1518 1519 1520
    
    interns = set()
    
    for e in constraints:
        for p in constraints[e]:
            for n in p[0:-1]:
                interns.add(n)
                
1521
    for e in cg.edgeIterator(edgePredicate=lambda i: 'stemid' in cg.getEdgeAttr(*i)):
Eric Coissac committed
1522 1523 1524
        stemid = cg.getEdgeAttr(*e)['stemid']
        if stemid not in constraints and stemid not in interns:
            constraints[stemid]=[[cg.getEdgeAttr(*n)['stemid']]