Commit 54380dbf authored by Eric Coissac's avatar Eric Coissac

No commit message

No commit message
parent e43c4337
......@@ -5,11 +5,17 @@
# ./orgasm/build/lib.macosx-10.6-intel-2.7/
# python orgasm-1.py GWM-261 COX1.fasta
from orgasm.tango import tango, matchtoseed, cutLowCoverage, scaffold, coverageEstimate
from orgasm.tango import tango, \
matchtoseed, \
cutLowCoverage, \
scaffold, \
coverageEstimate, \
insertFragment
from orgasm.indexer import _orgasm as indexer
from orgasm.backtranslate.fasta import fasta
from orgasm.assembler import Assembler
from orgasm.multialign import multiAlignReads, \
consensus
import sys
if __name__ == '__main__':
......
......@@ -22,11 +22,13 @@ The :py:mod:`~orgasm.tango` package contains a set of functions useful to manage
import sys
from collections import Counter
from itertools import imap
from orgasm.multialign import multiAlignReads, consensus
def tango(self,seeds,
minread=20, minratio=0.1,
mincov=2, minoverlap=40,
lowfilter=True,maxjump=1,restrict=None):
lowfilter=True,maxjump=1,restrict=None,
cycle=1):
'''
:param seeds: a list describing the extensions points
......@@ -66,7 +68,7 @@ def tango(self,seeds,
readmax = len(index) + 1
# extension cycle counter for the progress status monitoring
cycle=0
icycle=0
# count of missing reads in the graph (aka fake reads)
fake=0
......@@ -82,7 +84,7 @@ def tango(self,seeds,
while(seeds):
#time.sleep(1)
cycle+=1
icycle+=1
# Counter used to check how many seed will be added
# to the stack during the cycle
......@@ -110,7 +112,7 @@ def tango(self,seeds,
lastcycle.append(len(seeds))
if len(lastcycle) > 1000:
lastcycle=lastcycle[-1000:]
print >>sys.stderr,b"Cycle : %8d (%d nodes / %4.1f%% fake) Waiting points : %8d / %6.2f Gene: %s \r" % (cycle,len(graph),float(fake)/(len(graph)+1)*200.,len(seeds),float(sum(lastcycle))/len(lastcycle),gene),
print >>sys.stderr,b"Cycle : %8d (%d nodes / %4.1f%% fake) Waiting points : %8d / %6.2f Gene: %s \r" % (icycle,len(graph),float(fake)/(len(graph)+1)*200.,len(seeds),float(sum(lastcycle))/len(lastcycle),gene),
#print >>sys.stderr,b"Cycle : %8d (%d nodes) Waiting points : %8d " % (cycle,len(graph),len(seeds))
......@@ -170,6 +172,8 @@ def tango(self,seeds,
if rextensions or lextensions:
node = graph.addNode(readid)
node['gene']=gene
if 'cycle' not in node:
node['cycle']=cycle
if readid < readmax:
node['fake5']=0
node['fake3']=0
......@@ -359,30 +363,52 @@ def scaffold(self,assgraph,minlink=5,back=200,addConnectedLink=False):
connected edge pair representing the pair ended links
asserting the connection.
:type addConnectedLink: :py:class:`bool`
'''
# o = [i for i in assgraph.edgeIterator(edgePredicate=lambda e: 'stemid' not in assgraph.getEdgeAttr(*e))]
# for i in o:
# assgraph.deleteEdge(i[0],i[1],i[2])
'''
if addConnectedLink:
# Fake function just testing the stemid attribute
def isInitial(n):
return 'stemid' in assgraph.getEdgeAttr(*n)
def isTerminal(n):
return 'stemid' in assgraph.getEdgeAttr(*n)
else:
def isInitial(n):
return len(list(assgraph.parentIterator(n[0],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0
def isTerminal(n):
return len(list(assgraph.neighbourIterator(n[1],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0
e = [i for i in assgraph.edgeIterator(edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))]
eid = [assgraph.getEdgeAttr(*i)['stemid'] for i in e]
ne = len(e)
ei = [i for i in assgraph.edgeIterator(edgePredicate=isInitial)]
et = [i for i in assgraph.edgeIterator(edgePredicate=isTerminal)]
eiid = [assgraph.getEdgeAttr(*i)['stemid'] for i in ei]
etid = [assgraph.getEdgeAttr(*i)['stemid'] for i in et]
epi = [set(assgraph.getEdgeAttr(*i)['path'][0:back]) for i in ei]
ept = [set(assgraph.getEdgeAttr(*i)['path'][-back:]) for i in et]
exi = [getPairedRead(self,assgraph,i,back,end=False) for i in eiid]
ext = [getPairedRead(self,assgraph,i,back,end=True) for i in etid]
nei = len(ei)
net = len(et)
s=[]
for e1 in xrange(ne):
for e2 in xrange(ne):
connected = e[e1][1]==e[e2][0]
linkedby = pairEndedConnected(self,assgraph, eid[e1], eid[e2], back)
for e1 in xrange(net):
for e2 in xrange(nei):
connected = et[e1][1]==ei[e2][0]
linkedby = len(ext[e1] & exi[e2] | ext[e1] & epi[e2] | ept[e1] & exi[e2])
if connected and addConnectedLink:
if linkedby > minlink:
s.append((e[e1][1],e[e2][0],linkedby,eid[e1], eid[e2],"#00FF00"))
print >>sys.stderr,"%d -> %d connection asserted by %d pair ended links" % (eid[e1], eid[e2],linkedby)
s.append((et[e1][1],ei[e2][0],linkedby,etid[e1], eiid[e2],"#00FF00"))
print >>sys.stderr,"%d -> %d connection asserted by %d pair ended links" % (etid[e1], eiid[e2],linkedby)
else:
print >>sys.stderr,"%d -> %d connection not asserted by pair ended link" % (eid[e1], eid[e2])
print >>sys.stderr,"%d -> %d connection not asserted by pair ended link" % (etid[e1], eiid[e2])
if not connected and linkedby > minlink:
s.append((e[e1][1],e[e2][0],linkedby,eid[e1], eid[e2],"#FF0000"))
print >>sys.stderr,"%d -> %d scaffolded by %d pair ended links" % (eid[e1], eid[e2],linkedby)
s.append((et[e1][1],ei[e2][0],linkedby,etid[e1], eiid[e2],"#FF0000"))
print >>sys.stderr,"%d -> %d scaffolded by %d pair ended links" % (etid[e1], eiid[e2],linkedby)
for x,y,z,s1,s2,color in s:
attr = assgraph.addEdge(x,y)
......@@ -391,8 +417,71 @@ def scaffold(self,assgraph,minlink=5,back=200,addConnectedLink=False):
b'arrow':b'last',
b'fill':color
}
def insertFragment(self,seq):
def fillGaps(self,minlink=5,
back=200,
kmer=12,
smin=40,
delta=0,
cmincov=5,
minread=20,
minratio=0.1,
emincov=1,
minoverlap=60,
lowfilter=True,maxjump=0):
def isInitial(n):
return len(list(assgraph.parentIterator(n[0],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0
def isTerminal(n):
return len(list(assgraph.neighbourIterator(n[1],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0
assgraph = self.compactAssembling()
index = self.index
ei = [i for i in assgraph.edgeIterator(edgePredicate=isInitial)]
et = [i for i in assgraph.edgeIterator(edgePredicate=isTerminal)]
eiid = [assgraph.getEdgeAttr(*i)['stemid'] for i in ei]
etid = [assgraph.getEdgeAttr(*i)['stemid'] for i in et]
epi = [set(assgraph.getEdgeAttr(*i)['path'][0:back]) for i in ei]
ept = [set(assgraph.getEdgeAttr(*i)['path'][-back:]) for i in et]
exi = [getPairedRead(self,assgraph,i,back,end=False) for i in eiid]
ext = [getPairedRead(self,assgraph,i,back,end=True) for i in etid]
nei = len(ei)
net = len(et)
s=[]
maxcycle=max(self.graph.getNodeAttr(i)['cycle'] for i in self.graph)
cycle = maxcycle
for e1 in xrange(net):
for e2 in xrange(nei):
connected = et[e1][1]==ei[e2][0]
if not connected:
linkedby = len(ext[e1] & exi[e2] | ext[e1] & epi[e2] | ept[e1] & exi[e2])
if linkedby >= minlink:
print >>sys.stderr,"Linking Stems %d -> %d" % (etid[e1],eiid[e2])
ex = ext[e1] | exi[e2]
nreads = len(ex)
print >>sys.stderr,"--> %d | %d = %d reads to align" % (len(ext[e1]),len(exi[e2]),nreads)
ali= multiAlignReads(ex,index,kmer,smin,delta)
goodali = [i for i in ali if len(i) >= nreads/4]
print >>sys.stderr,"--> %d consensus to add" % len(goodali)
for a in goodali:
cycle+=1
print >>sys.stderr," added on cycle %d" % cycle
c = consensus(a,index,cmincov)
s = insertFragment(self,c,cycle=cycle)
a = tango(self,s,mincov=emincov,
minread=minread,
minoverlap=minoverlap,
maxjump=maxjump,cycle=cycle)
print >>sys.stderr
def insertFragment(self,seq,cycle=1):
index = self.index
rsize = index.getReadSize()
readmax=len(index)+1
......@@ -410,6 +499,8 @@ def insertFragment(self,seq):
if ireadid not in graph:
node = graph.addNode(ireadid)
if 'cycle' not in node:
node['cycle']=cycle
if ireadid < readmax:
node['fake5']=0
node['fake3']=0
......@@ -423,6 +514,8 @@ def insertFragment(self,seq):
if ireadidE not in graph:
node = graph.addNode(ireadidE)
if 'cycle' not in node:
node['cycle']=cycle
if ireadidE < readmax:
node['fake5']=0
node['fake3']=0
......@@ -502,7 +595,7 @@ def getPairedRead(self,assgraph,stemid,back,end=True):
stemid=-stemid
stem = assgraph.edgeIterator(edgePredicate=lambda e:assgraph.getEdgeAttr(e[0],e[1],e[2])['stemid']==stemid).next()
path=assgraph.getEdgeAttr(*stem)['path']
path=path[-back:]
path=[i for i in path[-back:] if i > 0]
reads = [(i,) + r.getIds(i) for i in path if i <= len(r)]
reads=[i[3] if i[0] in i[3] else set(-j for j in i[3]) for i in reads]
paired=[(r.getPairedRead(j) for j in i) for i in reads]
......@@ -536,7 +629,12 @@ def pairEndedConnected(self,assgraph,edge1,edge2,back=200):
s2 = assgraph.edgeIterator(edgePredicate=lambda e:assgraph.getEdgeAttr(e[0],e[1],e[2])['stemid']==edge2).next()
p2 = set(assgraph.getEdgeAttr(*s2)['path'][0:back])
n = len(p1 & p2)
p3 = getPairedRead(self,assgraph,edge2,back,end=False)
s4 = assgraph.edgeIterator(edgePredicate=lambda e:assgraph.getEdgeAttr(e[0],e[1],e[2])['stemid']==edge1).next()
p4 = set(assgraph.getEdgeAttr(*s4)['path'][-back:])
n = len(p1 & p2 | p3 & p4 | p1 & p3)
return n
......
major = 0
minor = 0
serial= 'alpha'
version = "%2d.%02d %s" % (major,minor,serial)
......@@ -261,113 +261,7 @@ def cleanEdge(assemble):
def tango(self,index,proofreading=10,maxextension=5):
def cmpPoints(p1,p2):
rep = cmp(p1[0],p2[0])
if not rep:
rep= cmp(p1[3],p2[3])
return rep
def cmpKeys(k1,k2):
return cmp(len(k2),len(k1))
# un point d'extension is :
# - the length of the path to the start read
# - the sequence to use as probe
# - the readid on the ending node
# - the weight of the node
points = [(index.getReadSize(),
e[0],
e[1],
self.graph[e[1]]['weight']) for e in self.extensions]
cycle=0
while(points):
cycle+=1
points.sort(cmpPoints)
running = points.pop()
extensions={}
extlength=1
length,read,nodeindex,w = running
print >>sys.stderr,b"Cycle : %8d Waiting points : %8d (%d bp)\r" % (cycle,len(points),length),
while not extensions and extlength <= maxextension :
probe = read[extlength:]
extensions = index.getExtensions(probe)
extlength+=1
if not extensions:
erroron=nodeindex
print >>sys.stderr,b"\nproof reading"
extlength=0
back=0
probe=read
while not extensions and back <= proofreading:
fathers = [x for x in self.graph.parentIterator(nodeindex,
edgePredicate=lambda e : self.graph.getEdgeAttr(e[0],e[1])[b'type']==b"next read")]
print "fathers = ",fathers
xxx=nodeindex
for f in fathers:
label = self.graph.getEdgeAttr(f,xxx)[b'label']
llabel=len(label)
print probe
print f,nodeindex,label,probe[-llabel:],extlength
#time.sleep(1)
if probe[-llabel:]==label:
back+=llabel
print "ok",back
probe = index.getRead(f,back,index.getReadSize()-back)
extensions = index.getExtensions(probe)
k = extensions.keys()
print "raw extension",extensions
for e in k:
if erroron in extensions[e][1]:
print "remove idem"
del extensions[e]
nodeindex = f
extlength+=1
print extensions
for ex in extensions:
data = extensions[ex]
reads = self.readType(data[1])
nodeindexE = reads[0]
if nodeindexE==self.startread:
print >>sys.stderr,b"\nLooping over start read %d " % (self.startread)
alreadyin = nodeindexE in asm.graph
self.graph.addEdge(nodeindex, nodeindexE)
#self._finalread=nodeindexE
attr = self.graph.getNodeAttr(nodeindexE)
attr[b"weight"]=reads[1]
attr[b"reads"]=reads[2]
attr[b"position"]=length+len(ex)
attr = self.graph.getEdgeAttr(nodeindex, nodeindexE)
attr[b"label"]=ex
attr[b"arrowsize"]=float(reads[1])/10
attr[b"type"]=b"next read"
if not alreadyin:
points.append((length + len(ex),
index.getRead(reads[0],0,index.getReadSize()),
nodeindexE,
reads[1]))
def commonOverlapIndexOf(text1, text2):
# Cache the text lengths to prevent multiple calls.
text1_length = len(text1)
......@@ -399,161 +293,6 @@ def commonOverlapIndexOf(text1, text2):
length += 1
def tango(self,index,
minread=20, minratio=0.1,
mincov=2, minoverlap=40,
extlength=1, cleanBranch=False):
'''
:param index: the read index
:type index:
:param minread:
:type minread:
:param minratio:
:type minratio:
:param mincov:
:type mincov:
:param minoverlap:
:type minoverlap:
:param extlength:
:type extlength:
'''
def cmpPoints(p1,p2):
rep = cmp(p2[0],p1[0])
if not rep:
rep= cmp(p1[4],p2[4])
return -rep
def cmpKeys(k1,k2):
return cmp(len(k2),len(k1))
def isEnd(node):
s = [x for x in self.graph.neighbourIterator(node,edgePredicate=lambda e : self.graph.getEdgeAttr(e[0],e[1])[b'type']==b"next read")]
f = [x for x in self.graph.parentIterator(node,edgePredicate=lambda e : self.graph.getEdgeAttr(e[0],e[1])[b'type']==b"next read")]
if len(s)==0 and len(f)==1:
return f[0]
return 0
readlength = index.getReadSize()
# un point d'extension is :
# - the length of the path to the start read
# - insertion since last read
# - the sequence to use as probe
# - the readid on the ending node
# - the weight of the node
points = [(readlength,
0,
index.getRead(e[1],1,readlength-1),
e[1],
self.graph[e[1]]['weight']) for e in self.extensions]
cycle=0
extraid = int(10**math.ceil(math.log10(len(r)))) + 1
shiftid = extraid
tobacktrack = []
lmax=0
totreads=0
while(points):
cycle+=1
points.sort(cmpPoints)
running = points.pop()
extensions={}
extlength=1
length,noreads,probe,nodeindex,w = running
lmax = max(lmax,length)
print >>sys.stderr,b"Cycle : %8d (%d nodes / %d reads) Waiting points : %8d (%d bp / %d bp) -- back : %d\r" % (cycle,len(asm.graph),totreads,len(points),length,lmax,len(tobacktrack)),
attr = self.graph.getNodeAttr(nodeindex)
#read = index.getRead(nodeindex,0,readlength)
#
# Try to deal with homopolymer by stopping elongation
#
if probe=="A" * len(probe) :
extensions={}
# extensions = {'C':(1,[]),'G':(1,[]),'T':(1,[])}
elif probe=="C" * len(probe) :
extensions={}
# extensions = {'A':(1,[]),'G':(1,[]),'T':(1,[])}
elif probe=="G" * len(probe) :
extensions={}
# extensions = {'C':(1,[]),'A':(1,[]),'T':(1,[])}
elif probe=="T" * len(probe) :
extensions={}
# extensions = {'C':(1,[]),'G':(1,[]),'A':(1,[])}
else:
extensions = index.checkedExtensions(probe, minread=minread, minratio=minratio, mincov=mincov, minoverlap=minoverlap, extlength=extlength)
# print "\n",nodeindex," ++> ",extensions
if not extensions and cleanBranch:
f = isEnd(nodeindex)
while(f):
del self.graph[nodeindex]
nodeindex=f
f = isEnd(nodeindex)
for ex in extensions:
data = extensions[ex]
if data[1]:
reads = self.readType(data[1])
nodeindexE = reads[0]
#
# A read cannot link to itself
# --> homopolymer longer than the read length
#
if nodeindexE==nodeindex:
continue
else:
nodeindexE = extraid
extraid+=1
if nodeindexE==self.startread:
print >>sys.stderr,b"\nLooping over start read %d " % (self.startread)
alreadyin = nodeindexE in asm.graph
self.graph.addEdge(nodeindex, nodeindexE)
#self._finalread=nodeindexE
attr = self.graph.getNodeAttr(nodeindexE)
if nodeindexE >= shiftid:
attr[b"label"]=''
attr[b'graphics']={b'fill':'#008000',
b'type':'rectangle'}
noreads+=1
else:
attr[b"label"]='%d (%d)' %(nodeindexE,reads[1])
attr[b"weight"]=reads[1]
attr[b"reads"]=reads[2]
totreads+=1
noreads=0
attr[b"position"]=length+len(ex)
attr = self.graph.getEdgeAttr(nodeindex, nodeindexE)
attr[b"label"]='%s [%d] (%d)' %(ex,length+len(ex),data[0])
attr[b'sequence']=ex
attr[b"type"]=b"next read"
attr[b"coverage"]=data[0]
attr['graphics']={'width':math.ceil(math.log(data[0])/math.log(2)),
'arrow':'last'}
if not alreadyin and noreads < (readlength - minoverlap):
points.append((length + len(ex),
noreads,
probe[1:]+ex,
nodeindexE,
data[0]))
def cpe(self,probe):
......@@ -732,33 +471,137 @@ def cutParallel(self,mincov):
del stemparallel[spi]
return stemparallel
def scaffold(self,assgraph,minlink=5,back=200,addConnectedLink=False):
'''
Add relationships between edges of the assembling graph related to the
par ended links.
:param assgraph: The compact assembling graph as produced by the
:py:meth:`~orgasm.assembler.Assembler.compactAssembling` method
:type assgraph: :py:class:`~orgasm.graph.DiGraphMultiEdge`
:param minlink: the minimum count of pair ended link to consider
for asserting the relationship
:type minlink: :py:class:`int`
:param back: How many base pairs must be considered at the end of each edge
:type back: :py:class:`int`
:param addConnectedLink: add to the assembling graph green edges for each directly
connected edge pair representing the pair ended links
asserting the connection.
:type addConnectedLink: :py:class:`bool`
'''
if addConnectedLink:
# Fake function just testing the stemid attribute
def isInitial(n):
return 'stemid' in assgraph.getEdgeAttr(*e)
def isTerminal(n):
return 'stemid' in assgraph.getEdgeAttr(*e)
else:
def isInitial(n):
return len(list(assgraph.parentIterator(n[0],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0
def isTerminal(n):
return len(list(assgraph.neighbourIterator(n[1],edgePredicate=lambda e: 'stemid' in assgraph.getEdgeAttr(*e))))==0
ei = [i for i in assgraph.edgeIterator(edgePredicate=isInitial)]
et = [i for i in assgraph.edgeIterator(edgePredicate=isTerminal)]
eiid = [assgraph.getEdgeAttr(*i)['stemid'] for i in ei]
etid = [assgraph.getEdgeAttr(*i)['stemid'] for i in et]
exi = [getPairedRead(self,assgraph,i,back,end=False) for i in eiid]
ext = [getPairedRead(self,assgraph,i,back,end=True) for i in etid]
nei = len(ei)
net = len(et)
s=[]
for e1 in xrange(net):
for e2 in xrange(nei):
connected = et[e1][1]==ei[e2][0]
linkedby = len(ext[e1] & exi[e2])
if connected and addConnectedLink:
if linkedby > minlink:
s.append((et[e1][1],ei[e2][0],linkedby,etid[e1], eiid[e2],"#00FF00"))
print >>sys.stderr,"%d -> %d connection asserted by %d pair ended links" % (etid[e1], eiid[e2],linkedby)
else:
print >>sys.stderr,"%d -> %d connection not asserted by pair ended link" % (etid[e1], eiid[e2])