Commit e43c4337 by Eric Coissac

--no commit message

parent 85d303ad
......@@ -5,10 +5,11 @@
# ./orgasm/build/lib.macosx-10.6-intel-2.7/
# python orgasm-1.py GWM-261 COX1.fasta
from orgasm.tango import tango, matchtoseed, cutLowCoverage, scaffold
from orgasm.tango import tango, matchtoseed, cutLowCoverage, scaffold, coverageEstimate
from orgasm.indexer import _orgasm as indexer
from orgasm.backtranslate.fasta import fasta
from orgasm.assembler import Assembler
import sys
if __name__ == '__main__':
......
"""
.. module:: orgasm.assembler
:platform: Unix
:synopsis: The :py:mod:`orgasm.assembler` python package provide the :class:`Assembler` class .
The :py:mod:`orgasm.assembler` package
======================================
......
......@@ -40,7 +40,7 @@ def cmpRead(r1,r2):
return cmp(r1[1],r2[1])
cdef int32_t deleteBranch(AsmbGraph graph,list path, int32_t maxlength=10, int32_t deleted=0):
cdef int32_t deleteBranch(AsmbGraph graph,list path, int32_t maxlength, int32_t deleted=0):
cdef int32_t node
cdef list parents
......@@ -212,7 +212,7 @@ cdef class Assembler:
d=0
for i in self.endNodeSet(alllink=alllink):
print >>sys.stderr,"Remaining edges : %d node : %d\r" % (graph.edgeCount(),len(graph)),
d+=deleteBranch(graph,[i])
d+=deleteBranch(graph,[i],maxlength)
cd+=d
......@@ -400,7 +400,7 @@ cdef class Assembler:
attr[b'path']=path
attr[b'length']=lseq
weight=sum(self.graph[x][b'coverage'] for x in path if b'coverage' in self.graph[x]) / float(len(path))
attr[b'weight']= int(weight * readSize)
attr[b'weight']= weight * readSize
attr[b'graphics']={b'width':int(weight)+1,
b'arrow':b'last',
b'fill':"#0000FF"}
......@@ -413,9 +413,9 @@ cdef class Assembler:
if lseq > 10:
attr[b'label']=b"%d : %s->(%d)->%s [%d]" % (stemid,sequence[0:5],lseq,sequence[-5:],attr[b'weight'])
attr[b'label']=b"%d : %s->(%d)->%s [%d]" % (stemid,sequence[0:5],lseq,sequence[-5:],int(attr[b'weight']))
else:
attr[b'label']=b"%d : %s->(%d) [%d]" % (stemid,sequence,lseq,attr[b'weight'])
attr[b'label']=b"%d : %s->(%d) [%d]" % (stemid,sequence,lseq,int(attr[b'weight']))
print >>sys.stderr,b"\nMinimum stem coverage = %d" % int(minweight * readSize)
......
......@@ -474,7 +474,7 @@ cdef class DiGraph:
def __delitem__(self, int node):
self.deleteNode(node)
def minpath(self,source,dest = None, distance=None, nodePredicate=None, edgePredicate=None):
def minpath(self,source,dest = None, distance=None, nodePredicate=None, edgePredicate=None, allowCycle=False):
cdef dict dist
cdef dict previous
cdef int32_t v
......@@ -521,6 +521,11 @@ cdef class DiGraph:
if alt < Q.getPriority(v):
Q[v]=alt
previous[v]=u
elif allowCycle and v in source:
alt = d + distance(u,v)
if alt < dist[v] or dist[v]==0:
dist[v]=alt
previous[v]=u
if idest is not None and u in idest:
idest.remove(u)
......
......@@ -266,7 +266,7 @@ def mergeAlignments(common, ali1,ali2,rsize):
def multiAlignReads(readids,index,kmer=12,smin=10,delta=2):
readids = list(set(readids))
scores=alignReads(readids,index,kmer,smin)
scores=alignReads(readids,index,kmer,smin,delta)
rsize = index.getReadSize()
alignment={}
......@@ -301,7 +301,7 @@ def multiAlignReads(readids,index,kmer=12,smin=10,delta=2):
for i in al:
al[i][0]=al[i][0]-minshift
return alignment
return alignment.values()
def alignment2bytes(ali,index):
i = 0
......
......@@ -47,6 +47,12 @@ def tango(self,seeds,
:type mincov:
:param minoverlap:
:type minoverlap:
:param lowfilter:
:type lowfilter:
:param maxjump:
:type maxjump:
:param restrict:
:type restrict:
'''
# a direct pointer to the read index
......@@ -240,23 +246,8 @@ def tango(self,seeds,
print >>sys.stderr,"\n\nJumpGap on read %d\n" % readid
for a in anchors:
if a not in graph:
# node = graph.addNode(a)
# node['gene']=None
# node['fake5']=0
# node['fake3']=0
# node['graphics']={'fill':"#FF0000"}
seeds.append((-a,(None,)))
# edges = graph.addEdge(readid,-a)
# edges[1][b'coverage']=0
# edges[2][b'coverage']=0
# edges[1][b'ext']="X"
# edges[2][b'ext']="X"
# edges[1][b'label']="%s (%d)" % (edges[1][b'ext'],0)
# edges[2][b'label']="%s (%d)" % (edges[2][b'ext'],0)
# edges[1][b'type']=b"scaffold"
# edges[2][b'type']=b"scaffold"
return graph
def cutLowCoverage(self,mincov,terminal=True):
......@@ -278,9 +269,9 @@ def cutLowCoverage(self,mincov,terminal=True):
stems = [x for x in cg.edgeIterator()
if not terminal or (isTerminal(cg, x[0]) or isTerminal(cg, x[1]))]
if stems:
stems.sort(key=lambda i:cg.getEdgeAttr(i[0],i[1],i[2])['weight'],reverse=True)
stems.sort(key=lambda i:cg.getEdgeAttr(*i)['weight'],reverse=True)
lightest = stems.pop()
lattr = cg.getEdgeAttr(lightest[0],lightest[1],lightest[2])
lattr = cg.getEdgeAttr(*lightest)
# print >>sys.stderr,lightest[0],lightest[1]
# print >>sys.stderr,lattr['weight'] < mincov
# print >>sys.stderr,isTerminal(cg, lightest[0]), isTerminal(cg, lightest[1])
......@@ -409,7 +400,7 @@ def insertFragment(self,seq):
if len(seq) >= rsize:
graph = self.graph
probe = seq[0:101]
probe = seq[0:rsize]
readid = index.getReadIds(probe)
for i in xrange(1,len(seq)-rsize+1):
......@@ -454,7 +445,7 @@ def insertFragment(self,seq):
seeds.add(ireadidE)
return seeds
return [(i,("Consensus",)) for i in seeds]
def extendStem(self,stem,cg,back,kmer):
def lowcomplexity(x):
......@@ -551,16 +542,24 @@ def pairEndedConnected(self,assgraph,edge1,edge2,back=200):
def coverageEstimate(self):
def weight(a,b):
attr = cg.getEdgeAttr(a,b)
w = attr['length'] * attr['weight']
return w
maxw=0
for e in cg.edgeIterator(edgePredicate = lambda i:i[0]==a and i[1]==b):
attr = cg.getEdgeAttr(*e)
w = attr['length'] * attr['weight']
if w > maxw:
maxw=w
return maxw
def coverage(start):
wp = cg.minpath(start,distance=weight)
wp = cg.minpath(start,distance=weight,allowCycle=True)
# print ">>>",start,wp
maxpath = max(wp[0].values())
maxend=[i for i in wp[0] if wp[0][i]==maxpath][0]
i=maxend
path=[i]
if i == start and wp[0][i] > 0:
i=wp[1][i]
path.append(i)
while i!=start:
i=wp[1][i]
path.append(i)
......@@ -568,18 +567,29 @@ def coverageEstimate(self):
cg = self.compactAssembling()
paths = [coverage(i) for i in cg]
# print paths
maxpath=0
path=None
for w,p in paths:
if w > maxpath:
maxpath=w
path=p
# print path
path.reverse()
j = path[0]
cumlength=0
for i in path[1:]:
cumlength+=cg.getEdgeAttr(j,i)['length']
maxw=0
maxlength=0
for e in cg.edgeIterator(edgePredicate = lambda k:k[0]==j and k[1]==i):
attr = cg.getEdgeAttr(*e)
w = attr['length'] * attr['weight']
# print w , attr['length'] , attr['weight']
if w > maxw:
maxw=w
maxlength=attr['length']
cumlength+=maxlength
j=i
return maxpath,cumlength,maxpath/float(cumlength)
......
......@@ -35,7 +35,21 @@ import sys
from orgasm.tango import *
p = fasta('../../samples/arabidopsis.chloro.prot.fasta')
gp = ['rbcL','matK','psbA','ndhA','rpoA','atpA','atpH','rpoA','atpB']
gp = ['rbcL','matK','ndhA','rpoA','atpA','atpH','rpoA','atpB','psbN',
'psbT',
'psbB',
'psbD',
'psbE',
'psbF',
'psbA',
'psbC',
'psbM',
'psbI',
'psbH',
'psbK',
'psbJ',
'psbL',
'psbZ']
p2=dict(x for x in p.items() if x[0] in gp)
r = indexer.Index('../../samples/AA')
......@@ -673,40 +687,6 @@ s=dict((r.getIds(y[0])[0],('rbcL',)) for y in x['rbcL'])
def coverageEstimate(asm):
def weight(a,b):
attr = cg.getEdgeAttr(a,b)
w = attr['length'] * attr['weight']
return w
def coverage(start):
wp = cg.minpath(start,distance=weight)
maxpath = max(wp[0].values())
maxend=[i for i in wp[0] if wp[0][i]==maxpath][0]
i=maxend
path=[i]
while i!=start:
i=wp[1][i]
path.append(i)
return(maxpath,path)
cg = asm.compactAssembling()
paths = [coverage(i) for i in cg]
maxpath=0
path=None
for w,p in paths:
if w > maxpath:
maxpath=w
path=p
path.reverse()
j = path[0]
cumlength=0
for i in path[1:]:
cumlength+=cg.getEdgeAttr(j,i)['length']
j=i
return maxpath,cumlength,maxpath/float(cumlength)
def distPaired(x,p,index):
dist = []
......@@ -753,8 +733,35 @@ def cutParallel(self,mincov):
return stemparallel
def fillGaps(self,minlink=5,back=200,kmer=12,smin=40,delta=2):
assgraph = self.compactAssembling()
index = self.index
e = [i for i in assgraph.edgeIterator(edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))]
eid = [assgraph.getEdgeAttr(*i)['stemid'] for i in e]
ne = len(e)
s=[]
for e1 in xrange(ne):
for e2 in xrange(ne):
connected = e[e1][1]==e[e2][0]
if not connected:
linkedby = pairEndedConnected(self,assgraph, eid[e1], eid[e2], back)
if linkedby >= minlink:
ex = getPairedRead(self,assgraph,eid[e1],back,end=True)
exr= getPairedRead(self,assgraph,eid[e2],back,end=False)
ex = ex | exr
nreads = len(ex)
ali= multiAlignReads(ex,index,kmer,smin,delta)
goodali = [i for i in ali if len(i) >= nreads/4]
for a in goodali:
c = consensus(a,index)
s = insertFragment(self,c)
from orgasm.indexer._orgasm import Index
from orgasm.multialign import *
from orgasm.multialign import *, consensus
from orgasm.tango import *
from orgasm.assembler import Assembler
......@@ -983,3 +990,26 @@ s = [(i,(None,)) for i in s]
a = tango(asm,s,mincov=1,minread=3,minoverlap=30)
asm.cleanDeadBranches()
cg = asm.compactAssembling()
def getAllPairedRead(self):
'''
:param assgraph:
:type assgraph:
:param stemid:
:type stemid:
:param back:
:type back:
:param end:
:type end:
'''
path=list(self.graph)
reads = [(i,) + r.getIds(i) for i in path if i <= len(r)]
reads=[i[3] if i[0] in i[3] else set(-j for j in i[3]) for i in reads]
paired=[(r.getPairedRead(j) for j in i) for i in reads]
paired=[[(j,) + r.getIds(j) for j in i if j!=0] for i in paired]
paired = [[j[1] if j[0] in j[3] else -j[1] for j in i] for i in paired]
paired = set(reduce(lambda a,b: a+b,(i for i in paired)))
return paired
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment