_tango.pyx 16.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
# cython: language_level=3


from orgasm.indexer._orgasm cimport Index
from ._asmbgraph cimport AsmbGraph
from ._assembler cimport Assembler
from cpython.list cimport PyList_GET_SIZE
from functools import reduce
import sys

# cdef int isanchors(Index index ,int sid):
#     return index.getIds(index.getPairedRead(sid))[0]


cdef set setofreads(dict extensions):
    cdef set back=set()
    
    for x in extensions.values():
        back|=set(x[1])
        
    return back

cdef float isgrowing(list lastseeds):
    cdef int llast = PyList_GET_SIZE(lastseeds)
    cdef double i
    cdef int p = 0
    
    for i in lastseeds:
        if   i > 1.:
            p+=1
        elif i < 1.:
            p-=1
            
    return p / llast


37 38 39 40
cdef set __used_reads__ = set()

cpdef set getusedreads():
    return __used_reads__
41

42 43 44
cpdef resetusedreads():
    __used_reads__.clear()

45 46 47 48 49 50 51 52 53 54
cpdef AsmbGraph tango(Assembler self,
          list seeds,
          int minread=10,    float minratio=0.1, 
          int mincov=1,      int   minoverlap=40,
          bint lowfilter=True, 
          tuple adapters5=(), 
          tuple adapters3=(), 
          maxjump=0,restrict=None,
          int cycle=1, int nodeLimit=1000000,
          bint progress=True,
55
          bint useonce=True,
56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
          logger=None):
    '''
    the :py:func:`~organsm.assembler.tango` function is the main assembling function. It extends selected
    seeds to produce the assembled sequence.
    
    :param seeds: a :py:class:`list` describing the extensions points. Each element of the list corresponds to 
                  a seed. A seed is described by a :py:class:`tuple` of two elements. 
                  
                  The first element is an integer value corresponding to the ``id`` of a read. if the ``id`` is greater than 0
                  the seed will be extended only if it is not already included in the assembling graph.
                  If the ``id`` is specified as a negative number, the seed will be included in the procedure
                  extension whatever if it is already member or not of the assembling graph.
                  
                  The second element of the :py:class:`tuple` is itself a :py:class:`tuple` contenaing or the 
                  :py:class:`None` value of a :py:class:`str` elements used to annotate the read.
                  
                  .. code-block :: python
                  
                      [(12347, ('matK',),0),(-37247,(None,),0)]
                  
                  In this example the structure describe two seeds. The first one corresponds to the
                  read 12347 annotated as belonging to the *matK* gene. The second corresponds to the
                  read 37247, without annotation. The process of extension will be run for the first seed
                  only if it is not already member of the assembling graph and always run on the second seed.
                  
    :type seeds: a :py:class:`list` of :py:class:`tuple`.
                     
    :param minread: Minimum count of reads having to be taken into account to extend an assembling path.
                    If this minimum count is not reach the extension of this path is stopped
    :type minread: :py:class:`int`
    
    :param minratio: A branch is added to the current path if the corresponding symbole **A**, **C**, **G** or **T**
                     represents at least a fraction of ``minratio``of the observed reads.
    :type minratio: :py:class:`float`
    
    :param mincov: To be considered, a read must be observed at least ``mincov`` in the total data set.
    :type mincov: :py:class:`int`
    
    :param minoverlap: Only reads overlaping the current extension point by ``minoverlap`` base pairs are
                       considered for the extension process. 
    :type minoverlap: :py:class:`int`
    
    :param lowfilter:
    :type lowfilter: :py:class:`bool`
    :param maxjump:
    :type maxjump: :py:class:`int`
    :param restrict:
    :type restrict: :py:class:`set` of :py:class:`int` values
    '''
        
    cdef int delta= 0
    cdef int sdelta= 0
    cdef str wheel= '|/-\\'
    cdef int lastprint=-1
    
    # a direct pointer to the read index
    cdef Index index = self._index
    
    # a direct pointer to the graph structure
    cdef AsmbGraph graph = self._graph
    
    # a direct access to the read length
    cdef int readLength = index.getReadSize()
    cdef int readmax    = index.len() + 1
    
    # extension cycle counter for the progress status monitoring
    cdef int icycle=0
    
    # count of missing reads in the graph (aka fake reads)
    cdef int fake=0
    
    # list of the stack size during the last 1000 cycles 
#    lastcycle=[]
    cdef list lastseeds=[0]
    cdef int addedseed=0
    
    # count of pair end jump on an extension branch
    # maxanchor = 0
    
    cdef str lastgene=None
    cdef float growing
    cdef int readid
    cdef int lgraph
    
    cdef dict rextensions
    cdef dict lextensions
    cdef dict backe
    cdef set back
    cdef list rk
    
    assert seeds is not None,'seeds cannot be set to None'
    
    while (PyList_GET_SIZE(seeds) > 0) and (graph.nodeCount() < nodeLimit) :
        
        #time.sleep(1)
        icycle+=1
        
        # Counter used to check how many seed will be added 
        # to the stack during the cycle
        addedseed=0

        # We pop one seed from the stack
        
        readid,(gene,),sdelta = seeds.pop()
160 161 162 163
        
        if useonce:
            __used_reads__.add(readid)
            
164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193
        if sdelta < delta:
            delta = sdelta
            
        if gene is not None:
            lastgene = gene
        
#         # We test if the read poped out from the stack
#         # is just a jump mark
#         if readid==0:
#             maxanchor-=1
#             continue
        
        # Extension is forced if readid is provided as a negative number
        if readid < 0 or readid not in graph:
            
            if readid < 0:
                readid=-readid
            #print "\n==>",readid,str(gene)
            
            growing = isgrowing(lastseeds)
#             growing = (  float(sum([ls > 1. for ls in lastseeds])) \
#                        - float(sum([ls < 1. for ls in lastseeds]))) / len(lastseeds)
            
            if growing < -0.01 and delta > 0:
                delta-=1
                lastseeds=[1]*1000
            if growing > +0.01 and delta < 40:
                delta+=1
                lastseeds=[1]*1000
            
194
            if progress and (icycle % 50)==0 and sys.stderr.isatty(): 
195
                print("%s : %d bp [%4.1f%% fake reads; Stack size: %8d / %6.2f %d  Gene: %s " % (wheel[(icycle//10) % 4],
Eric Coissac committed
196
                                                                                                 int(len(graph)//2),
197 198 199 200 201 202 203 204 205 206 207
                                                                                                 float(fake)/(len(graph)+1)*200.,
                                                                                                 len(seeds),
                                                                                                 growing,
                                                                                                 delta,
                                                                                                 lastgene),
                       file=sys.stderr,
                       end="\r")
                      
            lgraph = graph.nodeCount()                                                                                           
            if logger is not None and lgraph!=lastprint and lgraph % 20000==0:
                lastprint=lgraph
Eric Coissac committed
208
                logger.info("%d bp [%4.1f%% fake reads; Stack size: %8d / %6.2f %d  Gene: %s " % (int(lgraph//2),
209
                                                                                                  float(fake)/(lgraph+1)*200.,
Eric Coissac committed
210
                                                                                                   int(PyList_GET_SIZE(seeds)),
211
                                                                                                   growing,
Eric Coissac committed
212
                                                                                                   int(delta),
213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341
                                                                                                   lastgene))
                
    
            #
            # We add the read into the graph
            #
                
            #
            # extends the read on the right side
            #
            #logger.info("Before RCheck %s" % index.getRead(readid,1,readLength-1))
            rextensions = index.checkedExtensions(probe      = index.getRead(readid,1,readLength-1), 
                                                  adapters5  = adapters5,
                                                  adapters3  = adapters3,
                                                  minread    = minread+delta, 
                                                  minratio   = minratio, 
                                                  mincov     = mincov,  
                                                  minoverlap = minoverlap+delta, 
                                                  extlength  = 1,
                                                  lowfilter  = lowfilter,
                                                  restrict   = restrict,
                                                  exact      = True)
            #print >>sys.stderr,"After RCheck"
            #print(rextensions)
            #
            # Build a bijective relationship for the right extensions
            #
            rk=list(rextensions.keys())
            for k in rk:
                #print >>sys.stderr,"Before RICheck",index.getRead(-rextensions[k][1][0],1,readLength-1)
                backe = index.checkedExtensions(probe      = index.getRead(-rextensions[k][1][0],1,readLength-1), 
                                                adapters5  = adapters5,
                                                adapters3  = adapters3,
                                                minread    = minread+delta, 
                                                minratio   = minratio, 
                                                mincov     = mincov,  
                                                minoverlap = minoverlap+delta, 
                                                extlength  = 1,
                                                lowfilter  = lowfilter,
                                                restrict   = restrict,
                                                exact      = True)
                #print >>sys.stderr,"After RICheck"

                # back = set(reduce(lambda p,q:p+q,[backe[y][1] for y in backe],[]))
                back = setofreads(backe)
                
                if -readid not in back:
                    #print "Remove R extension : ",k
                    del rextensions[k]
            
    
            #print >>sys.stderr,"Before LCheck",index.getRead(-readid,1,readLength-1)
            lextensions = index.checkedExtensions(probe      = index.getRead(-readid,1,readLength-1), 
                                                  adapters5   = adapters5,
                                                  adapters3   = adapters3,
                                                  minread    = minread+delta, 
                                                  minratio   = minratio, 
                                                  mincov     = mincov,  
                                                  minoverlap = minoverlap+delta, 
                                                  extlength  = 1,
                                                  lowfilter  = lowfilter,
                                                  restrict   = restrict,
                                                  exact      = True)
            #print >>sys.stderr,"After LCheck"
     
            #
            # Build a bijective relationship for the left extensions
            #
            lk=list(lextensions.keys())
            for k in lk:
                #print >>sys.stderr,"Before LICheck",index.getRead(-lextensions[k][1][0],1,readLength-1)
                backe = index.checkedExtensions(probe      = index.getRead(-lextensions[k][1][0],1,readLength-1), 
                                                adapters5   = adapters5,
                                                adapters3   = adapters3,
                                                minread    = minread+delta, 
                                                minratio   = minratio, 
                                                mincov     = mincov,  
                                                minoverlap = minoverlap+delta, 
                                                extlength  = 1,
                                                lowfilter  = lowfilter,
                                                restrict   = restrict,
                                                exact      = True)
                #print >>sys.stderr,"After LICheck"
                
                # back = set(reduce(lambda p,q:p+q,[backe[y][1] for y in backe],[]))
                back = setofreads(backe)
                
                if readid not in back:
                    #print "Remove L extension : ",k
                    del lextensions[k]
                    
    #        print "Rext = ",rextensions
    #        print "Lext = ",lextensions
    
            if rextensions or lextensions:
                node  = graph.addNode(readid)
                node['gene']=gene
                if 'cycle' not in node:
                    node['cycle']=cycle
                if readid < readmax:
                    node['fake5']=0
                    node['fake3']=0
                else:
                    fake+=1
#                     node['graphics']={'fill':"#00FF00"}
    
    #        print "R",len(seq)
    
            for ex in rextensions:
                data = rextensions[ex]
                coverage = data[0]
    #            print ex,data
                
                nodeindexE,c,s = index.getIds(data[1][0])  # @UnusedVariable
                                    
                if nodeindexE in graph:
                    if data[1][0] not in s:
                        nodeindexE=-nodeindexE
                    # if readid!=-nodeindexE:
                    edges = graph.addEdge(readid,nodeindexE)
                    edges[1]['coverage']=coverage
                    edges[2]['coverage']=coverage
#                     edges[1]['label']="%s (%d)" % (edges[1]['ext'],coverage)
#                     edges[2]['label']="%s (%d)" % (edges[2]['ext'],coverage)
                    # else:
                    #     print >>sys.stderr,"\nWARNING : self loop on %d" % nodeindexE
        #              print "linked"
                else:
        #              print "not linked"
342 343 344 345
                    if not useonce or nodeindexE not in __used_reads__:
                        seeds.append((nodeindexE,(None,),int(delta)))
                        addedseed+=1
                                    
346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367
            #
            # extends the read on the left side
            # which is equivalent to extend on the right side
            # the reverse complement of the read
            #            
     
            for ex in lextensions:
                data = lextensions[ex]
                coverage = data[0]
    #            print ex,data
    
                nodeindexE,c,s = index.getIds(data[1][0])                               # @UnusedVariable
                                        
                if nodeindexE in graph:
                    if data[1][0] not in s:
                        nodeindexE=-nodeindexE
                    edges = graph.addEdge(-readid,nodeindexE)
                    edges[1]['coverage']=coverage
                    edges[2]['coverage']=coverage
#                     edges[1]['label']="%s (%d)" % (edges[1]['ext'],coverage)
#                     edges[2]['label']="%s (%d)" % (edges[2]['ext'],coverage)
                else:
368 369 370
                    if not useonce or nodeindexE not in __used_reads__:
                        seeds.append((nodeindexE,(None,),int(delta)))
                        addedseed+=1
371 372 373 374 375
            
            lastseeds.append((len(lextensions)+len(rextensions))/2.)
            if PyList_GET_SIZE(lastseeds) > 1000:
                lastseeds=lastseeds[-1000:]         

376
                
377 378
                        
    return graph