_assembler.pyx 20.1 KB
Newer Older
Eric Coissac committed
1 2 3 4
# cython: language_level=3

cimport cython

5
from orgasm.indexer._orgasm cimport *
Eric Coissac committed
6 7
from ._asmbgraph cimport *
from ._assembler cimport *
8
from orgasm.graph._graphmultiedge cimport *
Eric Coissac committed
9 10
from functools import reduce

11
from orgasm.apps.progress cimport ProgressBar 
12
from orgasm.apps.config cimport getConfiguration 
13
from orgasm.utils.dna cimport reverseComplement
Eric Coissac committed
14
import math
15 16 17

import sys

Eric Coissac committed
18 19
from cpython.sequence cimport PySequence_GetSlice
from cpython.bytes cimport PyBytes_GET_SIZE
20 21


Eric Coissac committed
22 23 24
def cmp(a, b):
    return (a > b) - (a < b)

25 26 27
cdef int iabs(int x):
    return x if x > 0 else -x

Eric Coissac committed
28
@cython.nonecheck(True)
29 30 31 32
cdef tuple findDeepestRead(Index index, bytes seed):
    cdef list frg
    cdef list fcount
    cdef int mfc
Eric Coissac committed
33 34 35
    cdef int readsize=index.getReadSize()
    cdef int x
    cdef bytes b
36
    
Eric Coissac committed
37 38
    frg   = [PySequence_GetSlice(seed,x,(x+readsize)) for x in range(PyBytes_GET_SIZE(seed)-readsize)]
    fcount= [index.count(b) for b in frg]
39 40 41
    mfc = max(fcount)
    return frg[fcount.index(mfc)],mfc

Eric Coissac committed
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
cpdef cmpPoints(p1,p2):
    '''
    Compare two extensions point.
    INTERNAL functio uses for ordering extension point:
    the smallest point is point closest to the initial extension point
    with the highest coverage.
    
    :param p1:
    :type p1:
    :param p2:
    :type p2:
    '''
    
    rep = cmp(p2[0],p1[0])
    if not rep:
        rep= cmp(p1[4],p2[4])
    return -rep

def cmpRead(r1,r2):
    return cmp(r1[1],r2[1])

Eric Coissac committed
63

Eric Coissac committed
64
cdef int32_t deleteBranch(AsmbGraph graph,list path, int32_t maxlength, int32_t deleted=0):
Eric Coissac committed
65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
    
    cdef int32_t   node
    cdef list      parents
    cdef list      sons
    cdef int32_t   i
    
    if len(path) > maxlength:
        return deleted
        

    node = path[-1]
    parents = [y for y in graph.parentIterator(node)]
    if parents :
        for p in parents:                
            sons    = [y for y in graph.neighbourIterator(p)]

            path+=[p]

            if len(sons) < 2:
                deleted = deleteBranch(graph,path,maxlength,deleted)
            else:
                for i in range(len(path)-1,0,-1):
                    try:
                        graph.deleteEdge(path[i],path[i-1])
                        deleted+=1
                    except KeyError:
                        pass
    else:
        for i in range(len(path)-1,0,-1):
            try:
                graph.deleteEdge(path[i],path[i-1])
                deleted+=1
            except KeyError:
                pass
        
    return deleted

Eric Coissac committed
102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122

cdef set sons(AsmbGraph graph,int node):                          # @DuplicatedSignature
    '''
    Build a set composed of the following reads  
    '''
    return set(graph.neighbourIterator(node))
    
cdef set parents(AsmbGraph graph,int node):                       # @DuplicatedSignature
    '''
    Build a set composed of the previous reads  
    '''
    return set(graph.parentIterator(node))

cdef bint is_junction(AsmbGraph graph,int node):                   # @DuplicatedSignature
    '''
    Predicate testing if a node is a fork or the beginning of the assembly
    '''
    return  (    len(sons(graph,node)) > 1 
              or len(parents(graph,node)) != 1 
            )

123 124 125 126 127 128 129
cdef bint is_begining(AsmbGraph graph,int node):                   # @DuplicatedSignature
    '''
    Predicate testing if a node is a fork or the beginning of the assembly
    '''
    return  len(parents(graph,node)) == 0 
            

130
cdef tuple normalizePath(list path, bint *direction):
Eric Coissac committed
131 132 133 134 135
    cdef int  c = path[0]
    cdef int  r = -path[-1]
    cdef bint d = c < r
    cdef tuple cpath
    
136 137 138 139 140 141 142
    
    # TODO: verify the property : if more than two nodes are reverse complement then the stem is palindromic 
    if c==r:
        c = path[1]
        r = -path[-2]
        d = c < r
    
Eric Coissac committed
143 144 145 146 147
    if d:
        cpath=tuple(path)
    else:
        cpath=tuple(-i for i in reversed(path))

148
    direction[0]=d
Eric Coissac committed
149 150 151 152 153 154 155 156 157
    return cpath

cdef bint isPalindrome(list path):
    cdef int lp = len(path)
    cdef int i
    # paths with a odd length cannot be palindromic 
    if (lp & 1) == 1:
        return False
    
158
    for i in range(lp//2):
Eric Coissac committed
159 160 161 162 163 164 165 166 167 168 169 170
        if path[i]!=-path[-i-1]:
            return False
        
    return True

cdef int weight(Assembler assembler,
                list      path,
                bint      palindrome):
    cdef float weight=0
    cdef int x 
    cdef AsmbGraph graph = assembler._graph
    cdef dict nattr
171 172
    cdef int nodemax = len(assembler._index) +1 
    
Eric Coissac committed
173
    for x in path:
174
        if iabs(x) < nodemax:
175 176 177 178 179
            try:
                nattr = graph.getNodeAttr(x)
                weight+=nattr.get('coverage',0.0)
            except KeyError:
                pass
Eric Coissac committed
180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227
           
    weight/= len(path)
    
    if palindrome:
        weight/=2
    
    return <int> (weight * assembler._index.getReadSize())
    
cdef str label(dict stem):
        cdef str label
        cdef int length = stem['length']
        
        if length > 10:
            label="%d : %s->(%d)->%s  [%d]" % (stem['stemid'],
                                               stem['sequence'][0:5].decode('ascii'),
                                               length,
                                               stem['sequence'][-5:].decode('ascii'),
                                               int(stem['weight'])
                                         )
        else:
            label="%d : %s->(%d)  [%d]" % (stem['stemid'],
                                           stem['sequence'].decode('ascii'),
                                           length,
                                           int(stem['weight'])
                                          )
            
        return label


cpdef dict buildstem(Assembler assembler,
               int first,
               int last,
               bytes sequence,
               list path,
               bint circle):
    
    cdef bint palindrome = isPalindrome(path)    
    cdef int w = weight(assembler,path,palindrome)
    cdef int  length = len(sequence)
    cdef dict s = { 'first'      : int(first),
                    'last'       : int(last),
                    'sequence'   : sequence,
                    'length'     : int(length),
                    'path'       : path,
                    'palindrome' : bool(palindrome),
                    'weight'     : int(w),
                    'circle'     : bool(circle),
                    'stemid'     : 0,
228 229 230 231
                    'label'      : None,
                    'head'       : assembler._index.getRead(first,
                                                            0,
                                                            assembler._index.getReadSize())
Eric Coissac committed
232
                  }
233
                
Eric Coissac committed
234
    if circle:
235 236
        getConfiguration()['orgasm']['logger'].info(" Circle : %6d bp coverage : %6dx" % 
              (length,int(w))) 
Eric Coissac committed
237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262
        
    return s   

cdef class CompactAssembling(DiGraphMultiEdge):


    def __init__(self,
                 Assembler assembler,
                 bint verbose=True
                ):
        
        cdef int n=0
        cdef int x
        cdef int i                                   # @DuplicatedSignature
        cdef lcontig=0
        cdef int first 
        cdef int last                                # @DuplicatedSignature
        cdef int lseq                                # @DuplicatedSignature
        cdef int lgraph
        cdef int lpath
        cdef bytes sequence                          # @DuplicatedSignature
        cdef dict attr                               # @DuplicatedSignature
        cdef double weight
        cdef double minweight
        cdef AsmbGraph graph
        cdef dict stem
263
        cdef dict config=getConfiguration()
Eric Coissac committed
264
        
265 266
        logger = config['orgasm']['logger']
         
Eric Coissac committed
267 268 269 270 271 272 273 274 275 276 277 278
        DiGraphMultiEdge.__init__(self,'compact')
        
        self._paths={}
        self._stemid={}
        self._stemidOk=False
        self._assembler=assembler
        
        graph = assembler._graph
        lgraph=len(graph)


        if verbose:
279
            logger.info("Compacting graph :")
Eric Coissac committed
280
        else:
281 282 283
            progress = ProgressBar(lgraph,
                                   head="Compacting graph",
                                   seconde=0.1)
Eric Coissac committed
284 285 286

        minweight=1000000.
        
287
        for stem in StemIterator(assembler):
Eric Coissac committed
288 289 290 291 292 293
            self.addStem(stem) 
            lcontig+=stem['length']
            weight = stem['weight']
            stem['graphics']={'width':(weight//assembler._index.getReadSize())+1,
                              'arrow':'last'
                             }
294
            stem['class']='sequence'
Eric Coissac committed
295 296 297 298 299

            if weight < minweight:
                minweight=weight

            if verbose:
300 301 302
                logger.info(" Stem  : %6d bp (total : %6d) coverage : %6.2f" % (stem['length'],
                                                                                lcontig,
                                                                                weight))
Eric Coissac committed
303
            else:
304
                progress(lcontig)
Eric Coissac committed
305 306 307
           
        self.setStemid()            
             
308
        logger.info("Minimum stem coverage = %d" % int(minweight))
Eric Coissac committed
309 310 311 312 313 314 315 316 317 318 319 320 321
        
        

    cpdef dict addStem(self, dict stem):
        cdef char[50] buffer1
        cdef char[50] buffer2
        cdef bytes key1 
        cdef bytes key2 
        cdef tuple edge
        cdef list eattr
        cdef int eid
        cdef int first = stem['first']
        cdef int last  = stem['last']
322
        cdef bint d
Eric Coissac committed
323

324
        cpath = normalizePath(stem['path'],&d)
Eric Coissac committed
325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353
        self._paths[cpath] = self._paths.get(cpath,0) + 1
        self._stemidOk=False
        
        snprintf(buffer1,50,b"%d",first)
        key1 = buffer1
        
        snprintf(buffer2,50,b"%d",last)
        key2 = buffer2
        
        edge = (key1,key2)
        
        if edge not in self._edges_attrs:
            self.addNode(first)
            self.addNode(last)
        
            self._nodes[key1][0].add(key2)
            self._nodes[key2][1].add(key1)

            self._edges_attrs[edge]=[]
            
        eattr = self._edges_attrs[edge]
        
        eattr.append(stem)
    
        self._edgecount+=1
                
        return stem

    cpdef deleteEdge(self, int node1, int node2, int edge=-1):
354 355
        cdef dict stem              # @DuplicatedSignature
        cdef bint d                 # @DuplicatedSignature
Eric Coissac committed
356
        stem = self.getEdgeAttr(node1,node2,edge)
357
        cpath = normalizePath(stem['path'],&d)
Eric Coissac committed
358 359 360 361 362 363 364 365 366 367 368
        self._paths[cpath]-=1
        if self._paths[cpath]==0:
            del self._paths[cpath]
            self._stemidOk=False
        
        DiGraphMultiEdge.deleteEdge(self,node1,node2,edge)
        
    

    cdef int getStemid(self, dict stem) except 0:
        cdef list path = stem['path']
369
        cdef bint d
Eric Coissac committed
370 371 372
        cdef tuple cpath
        cdef int stemid
        cdef list sstems
373
                        
Eric Coissac committed
374 375 376 377 378 379 380
        if not self._stemidOk:
            sstems=list(self._paths)
            sstems.sort()
            for i in range(len(sstems)):
                self._stemid[sstems[i]]=i+1
            self._stemidOk=True
                
381
        cpath = normalizePath(path,&d)                
Eric Coissac committed
382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403
        stemid = self._stemid[cpath]

        if not d:
            stemid=-stemid
                
        return stemid


    def stemIterator(self):
        cdef list stemps
        cdef dict stem
        for stems in self._edges_attrs.values():
            for stem in stems:
                yield stem
        
    cdef void setStemid(self):
        cdef dict stem
    
        for stem in self.stemIterator():
            stem['stemid'] = self.getStemid(stem)
            stem['label']  = label(stem)

404 405
    property assembler:
        "A doc string can go here."
Eric Coissac committed
406

407 408 409 410 411
        def __get__(self):
            return self._assembler

        
        
Eric Coissac committed
412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435
    
cdef class StemIterator:

    def __init__(self,Assembler assembler, 
                      bint alllink=False):
        self._assembler=assembler
        self._graph=assembler._graph
        self.edgeName={}
                            
    def __iter__(self):
        cdef NodeIterator ni
        cdef int first
        cdef int n
        cdef set branches
        cdef set brothers
        cdef set fathers
        cdef list lsequence
        cdef int last
        cdef int lseq
        cdef bytes sequence
        cdef list path
        cdef dict stem 
        cdef int lcontig
        cdef AsmbGraph graph =  self._graph
436
        cdef set junctions = set(int(y) for y in graph.nodeIterator(lambda x: is_junction(graph,x)))
Eric Coissac committed
437
                
438

Eric Coissac committed
439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474
        for first in junctions:
            branches=sons(graph,first)
            for n in branches:
                path = [first,n]                               
                brothers=sons(graph,n)
                lsequence=[graph.getEdgeAttr(first,n)['ext']]
                while len(brothers)==1 and n not in junctions:
                    son=brothers.pop()
                    lsequence.append(graph.getEdgeAttr(n,son)['ext'])
                    path.append(son)
                    n=son
                    brothers=sons(graph,n)
    
                last = n
                sequence=b''.join(lsequence)

                stem = buildstem(self._assembler,
                            first,last,
                            sequence,path,
                            False)
                
                lcontig+=stem['length']
                                
                yield stem
                                
                if stem['palindrome']:
                    yield stem
                    
        if lcontig + 2 * len(self.edgeName) < len(graph):
            ccs = graph.connectedComponentIterator()
            
            for cc in ccs:
                circle = not any(is_junction(graph,x) for x in cc)

                if circle:
                    minabs = min(abs(i) for i in cc)
Eric Coissac committed
475
                    # print(minabs,file=sys.stderr)
Eric Coissac committed
476 477
                    if minabs in cc:
                        first = minabs
Eric Coissac committed
478
                        # print("in",file=sys.stderr)
Eric Coissac committed
479
                    else:
480 481
#                        first = sons(graph,-minabs).pop()
                        first = -minabs
Eric Coissac committed
482
                        # print("out",file=sys.stderr)
Eric Coissac committed
483 484 485 486 487 488 489 490 491 492 493
                    nn = first
                    path=[first]
                    lsequence=[]
                    son=sons(graph,nn).pop()
                    while first != son:
                        path.append(son)
                        lsequence.append(graph.getEdgeAttr(nn,son)['ext'])
                        nn=son
                        son=sons(graph,nn).pop()
    
                    last = first
494 495 496
                    path.append(first)
                    lsequence.append(graph.getEdgeAttr(nn,first)['ext'])
                    
Eric Coissac committed
497 498 499 500 501 502 503 504 505 506 507 508 509
                    sequence=b''.join(lsequence)
                    
                    stem = buildstem(self._assembler,
                                first,last,
                                sequence,
                                path,
                                True)
                
                    yield stem
                                                        
            


510
cdef class Assembler:
Eric Coissac committed
511 512 513
    '''
    totolitoto
    '''
514

Eric Coissac committed
515
    def __init__(self,Index index, int overlap=90):
516 517 518 519 520 521 522 523 524 525 526 527 528 529
        '''
        Create a new assembler object.
        
        :param index: the file name of an read index previously formated using orgasmi binary
        :type index: bytes
        :param seed: a DNA sequenced used as started for the assembly
        :type seed: bytes
        :param overlap: the minimum overlap between two reads
        :type overlap: int
        '''
        cdef bytes starter
        cdef int depth 
        cdef tuple reads
        cdef dict attr
Eric Coissac committed
530
                
Eric Coissac committed
531
        self._graph = AsmbGraph("assembling",index)
532 533 534
        
        self._index = index
        self._overlap=overlap
Eric Coissac committed
535 536 537 538
        self._seeds = []
        self._annotations = {}
                        
        
539 540 541 542 543 544
        
    cpdef tuple readType(self,ids):
        '''
        Internal function : Given a set of read ids, return the one that have to be used
        as standard id. 
         
Eric Coissac committed
545 546
        The set of ids given to this function corresponds to a set of all strictly identical reads.
          
547 548
        :param ids: an iterable elements contining read ids.
        :type ids: iterable
Eric Coissac committed
549
         
550
        :returns: a tuble of three elements
Eric Coissac committed
551
         
552 553 554
                    - the standard id to use
                    - the length of ids
                    - a set containing all unique ids given as parameter
Eric Coissac committed
555
                     
556 557
        :rtype: tuple
        '''
Eric Coissac committed
558
         
559 560
        cdef int t 
        cdef int y 
Eric Coissac committed
561
         
562 563
        if not ids:
            return None
Eric Coissac committed
564
         
565
        t = len(self._index)+1
Eric Coissac committed
566
         
567 568 569
        for y in ids:
            if abs(y) < abs(t):
                t = y
Eric Coissac committed
570
                 
571 572
        return (t,len(ids),set(ids))
    
Eric Coissac committed
573 574 575
    def isEnd(self, int node):
        cdef list s
        cdef list f
Eric Coissac committed
576 577
        s = [x for x in self.graph.neighbourIterator(node)]
        f = [x for x in self.graph.parentIterator(node)]
Eric Coissac committed
578 579 580 581
        if len(s)==0 and len(f)==1:
            return f[0]
        return 0

582 583 584
    def forkIterator(self):
        
        def has_several_sons(int node):
Eric Coissac committed
585
            return  len(set(self._graph.neighbourIterator(node))) > 1
586 587 588
        
        return self._graph.nodeIterator(has_several_sons)
    
589
        
590

Eric Coissac committed
591
           
Eric Coissac committed
592 593 594 595 596
    def endNodeSet(self, set excluded=None, bint alllink=False):
        if alllink:
            def isfinalnode(int node):
                return len(set(self._graph.neighbourIterator(node)))==0
        else:
Eric Coissac committed
597 598
            def isfinalnode(int node):                  # @DuplicatedSignature
                return len(set(self._graph.neighbourIterator(node)))==0
Eric Coissac committed
599 600 601
        if excluded is None:
            excluded=set()
        return set(x for x in self._graph.nodeIterator(isfinalnode) if x not in excluded)
602

Eric Coissac committed
603 604 605 606 607
    def startNodeSet(self, set excluded=None, bint alllink=False):
        if alllink:
            def isstartnode(int node):
                return len(set(self._graph.parentIterator(node)))==0
        else:
Eric Coissac committed
608 609
            def isstartnode(int node):                  # @DuplicatedSignature
                return len(set(self._graph.parentIterator(node)))==0
Eric Coissac committed
610
            
Eric Coissac committed
611 612 613
        if excluded is None:
            excluded=set()
        return set(x for x in self._graph.nodeIterator(isstartnode) if x not in excluded)
614

Eric Coissac committed
615
    def cleanDeadBranches(self,int32_t maxlength=10, bint alllink=False):
Eric Coissac committed
616

Eric Coissac committed
617 618 619
        cdef AsmbGraph graph
        cdef int32_t   d
        cdef int32_t   cd
Eric Coissac committed
620
        cdef int32_t   i                                # @DuplicatedSignature
Eric Coissac committed
621 622
        cdef set       ep
        cdef set       sp
623
                   
Eric Coissac committed
624 625 626 627 628
        graph=self.graph

        cd=0    
        d=1
            
629 630
        if sys.stderr.isatty():
            print('',file=sys.stderr)
631
        
Eric Coissac committed
632 633
        while d > 0:
            d=0
634
            
Eric Coissac committed
635
            for i in self.endNodeSet(alllink=alllink):
636 637 638 639
                if sys.stderr.isatty():
                    print("Remaining edges : %d node : %d" % (graph.edgeCount(),len(graph)),
                          end='\r',
                          file=sys.stderr)
Eric Coissac committed
640
                d+=deleteBranch(graph,[i],maxlength)
Eric Coissac committed
641 642 643
            cd+=d
            
        
Eric Coissac committed
644 645
        ep = self.endNodeSet(alllink=alllink)
        sp = self.startNodeSet(alllink=alllink)
Eric Coissac committed
646 647 648 649 650 651 652
        
        for i in ep & sp:
            try:
                graph.deleteNode(i)
            except KeyError:
                pass
            
653 654 655 656
        if sys.stderr.isatty():
            print ("Remaining edges : %d node : %d" % (graph.edgeCount(),len(graph)),
                   end='\r',
                   file=sys.stderr)
Eric Coissac committed
657

Eric Coissac committed
658
        return cd            
659 660 661
                
    def __len__(self):
        return self._graph.nodeCount()
662
            
Eric Coissac committed
663
    def compactAssembling(self, bint verbose=True):
Eric Coissac committed
664
        return CompactAssembling(self,verbose)
Eric Coissac committed
665
    
666 667 668 669 670 671 672 673
    
    property graph:

        "A doc string can go here."

        def __get__(self):
            return self._graph

674

Eric Coissac committed
675
    property seeds:
Eric Coissac committed
676 677 678

        "A doc string can go here."

Eric Coissac committed
679
        def __get__(self):                              # @DuplicatedSignature
Eric Coissac committed
680
            return self._seeds
Eric Coissac committed
681 682


Eric Coissac committed
683
    property index:
Eric Coissac committed
684 685 686

        "A doc string can go here."

Eric Coissac committed
687
        def __get__(self):                              # @DuplicatedSignature
Eric Coissac committed
688
            return self._index
689

Eric Coissac committed
690 691


692