Commit c568ba6d authored by Eric Coissac's avatar Eric Coissac

Add gap length estimation

parent c89e0cdf
/orgasm.test/
/build/
/ORG.asm-0.00.alpha/
/MANIFEST
/dist/
/__pycache__/
......@@ -88,7 +88,8 @@ def run(config):
fa = path2fasta(asm,cg,path,
identifier="Seq_%d" % c,
back=back,
minlink=config['orgasm']['minlink'])
minlink=config['orgasm']['minlink'],
logger=logger)
print(fa,file=fastaout)
print(" ".join([str(x) for x in path]),file=pathout)
......
......@@ -111,7 +111,8 @@ def run(config):
fa = path2fasta(asm,cg,path,
identifier="Seq_%d" % c,
back=back,
minlink=config['orgasm']['minlink'])
minlink=config['orgasm']['minlink'],
logger=logger)
print(fa,file=fastaout)
print(" ".join([str(x) for x in path]),file=pathout)
......
......@@ -101,7 +101,8 @@ def run(config):
fa = path2fasta(asm,cg,path,
identifier="Seq_%d" % c,
back=back,
minlink=5)
minlink=config['orgasm']['minlink'],
logger=logger)
print(fa, file=fastaout)
print(" ".join([str(x) for x in path]),file=pathout)
......
......@@ -218,22 +218,41 @@ cdef class Index:
trueoverlap = len(probe)
lext = self.getExtensions(probe)
if restrict is not None:
ks = lext.keys()
ks = list(lext.keys())
for k in ks:
rs=lext[k][1]
rk=[]
for r in rs:
rn = self.getPairedRead(r)
if rn:
rn = self.getIds(rn)[0]
if rn in restrict:
rk.append(r)
if r in restrict:
rk.append(r)
else:
rn = self.getPairedRead(r)
if rn:
rn = self.getIds(rn)[0]
if rn in restrict:
rk.append(r)
if rk:
lext[k][1]=rk
else:
del lext[k]
# if restrict is not None:
# ks = lext.keys()
# for k in ks:
# rs=lext[k][1]
# rk=[]
# for r in rs:
# rn = self.getPairedRead(r)
# if rn:
# rn = self.getIds(rn)[0]
# if rn in restrict:
# rk.append(r)
# if rk:
# lext[k][1]=rk
# else:
# del lext[k]
if lext:
for e in lext.itervalues():
......
......@@ -457,7 +457,7 @@ def scaffold(self,assgraph,minlink=5,back=200,addConnectedLink=False):
for e1 in range(net):
for e2 in range(nei):
connected = et[e1][1]==ei[e2][0]
linkedby = pairEndedConnected(self,assgraph,etid[e1],eiid[e2],back)
linkedby,ml,sl = pairEndedConnected(self,assgraph,etid[e1],eiid[e2],back)
if connected and addConnectedLink:
if linkedby >= minlink:
......@@ -500,6 +500,7 @@ def fillGaps(self,minlink=5,
maxjump=0,
snp=False,
nodeLimit=1000000,
onlyLinking=False,
logger=None):
'''
......@@ -561,7 +562,7 @@ def fillGaps(self,minlink=5,
for e2 in range(nei):
connected = et[e1][1]==ei[e2][0]
if not connected:
linkedby = pairEndedConnected(self,assgraph,etid[e1],eiid[e2],back)
linkedby,ml,sl = pairEndedConnected(self,assgraph,etid[e1],eiid[e2],back)
if linkedby >= minlink and abs(etid[e1]) <= abs(eiid[e2]):
extended.add(etid[e1])
......@@ -615,48 +616,49 @@ def fillGaps(self,minlink=5,
else:
print("--> already aligned",file=sys.stderr)
for e1 in range(net):
if etid[e1] not in extended:
print("\n\nExtending Stems %d" % (etid[e1]),
file=sys.stderr)
ex = frozenset((ext[e1] - ept[e1]) | eet[e1])
nreads = len(ex)
print("--> %d reads to align" % (nreads),
file=sys.stderr)
if nreads > 10:
__cacheAli2.add(ex)
if ex not in __cacheAli:
ali= multiAlignReads(ex,index,kmer,smin,delta)
print('',file=sys.stderr)
goodali = [i for i in ali if len(i) >= nreads/4]
print("--> %d consensus to add" % len(goodali),
file=sys.stderr)
for a in goodali:
c = consensus(a,index,cmincov)
if c:
cycle+=1
s = insertFragment(self,c,cycle=cycle)
print(" %d bp (%d reads) added on cycle %d" % (len(c),len(s),cycle),
file=sys.stderr)
a = tango(self,
seeds = s,
minread = minread,
minratio = minratio,
mincov = emincov,
minoverlap = minoverlap,
lowfilter = lowfilter,
adapters5 = adapters5,
adapters3 = adapters3,
maxjump = maxjump,
cycle = cycle,
nodeLimit = nodeLimit)
print("",file=sys.stderr)
else:
print("--> already aligned",file=sys.stderr)
if not onlyLinking:
for e1 in range(net):
if etid[e1] not in extended:
print("\n\nExtending Stems %d" % (etid[e1]),
file=sys.stderr)
ex = frozenset((ext[e1] - ept[e1]) | eet[e1])
nreads = len(ex)
print("--> %d reads to align" % (nreads),
file=sys.stderr)
if nreads > 10:
__cacheAli2.add(ex)
if ex not in __cacheAli:
ali= multiAlignReads(ex,index,kmer,smin,delta)
print('',file=sys.stderr)
goodali = [i for i in ali if len(i) >= nreads/4]
print("--> %d consensus to add" % len(goodali),
file=sys.stderr)
for a in goodali:
c = consensus(a,index,cmincov)
if c:
cycle+=1
s = insertFragment(self,c,cycle=cycle)
print(" %d bp (%d reads) added on cycle %d" % (len(c),len(s),cycle),
file=sys.stderr)
a = tango(self,
seeds = s,
minread = minread,
minratio = minratio,
mincov = emincov,
minoverlap = minoverlap,
lowfilter = lowfilter,
adapters5 = adapters5,
adapters3 = adapters3,
maxjump = maxjump,
cycle = cycle,
nodeLimit = nodeLimit)
print("",file=sys.stderr)
else:
print("--> already aligned",file=sys.stderr)
self.cleanDeadBranches(maxlength=10)
cutLowCoverage(self,gmincov,terminal=True)
......@@ -766,7 +768,7 @@ def fillGaps2(self,minlink=5,
for e2 in range(nei):
connected = et[e1][1]==ei[e2][0]
if not connected:
linkedby = pairEndedConnected(self,assgraph,etid[e1],eiid[e2],back)
linkedby,ml,sl = pairEndedConnected(self,assgraph,etid[e1],eiid[e2],back)
if linkedby >= minlink and abs(etid[e1]) <= abs(eiid[e2]):
extended.add(etid[e1])
......@@ -1014,31 +1016,66 @@ def pairEndedConnected(self,assgraph,edge1,edge2,back=250):
e1 = assgraph.getEdgeAttr(*s1)['path'][-back:]
e2 = assgraph.getEdgeAttr(*s2)['path'][0:back]
de1 = {}
de2 = {}
j = 0
for i in e1:
l = de1.get(i,[])
de1[i] = l
l.append(j)
j+=1
j = 0
for i in e2:
l = de2.get(i,[])
de2[i] = l
l.append(j)
j+=1
pe1 = [k for k in
[(a,[j for j in b if j in e2])
[(a,[j for j in b if j in de2])
for a,b in [ri.normalizedPairedEndsReads(i)
for i in e1 if i > 0 and i < lri]] if k[1]]
pe2 = [k for k in
[(a,[j for j in b if j in e1])
[(a,[j for j in b if j in de1])
for a,b in [ri.normalizedPairedEndsReads(i)
for i in e2 if i < 0 and -i < lri]] if k[1]]
pe1set = set()
le1=len(e1)
peset = set()
delta = []
for f,rs in pe1:
for r in rs:
pe1set.add((f,abs(r)) if f < abs(r) else (abs(r),-f))
p = (f,abs(r)) if f < abs(r) else (abs(r),-f)
if p not in peset:
peset.add(p)
for p1 in de1[f]:
for p2 in de2[r]:
delta.append(le1 - p1 + 1 + p2)
pe2set = set()
for f,rs in pe2:
for r in rs:
pe2set.add((-f,abs(r)) if -f < abs(r) else (abs(r),f))
return len(pe1set | pe2set)
p = (-f,abs(r)) if -f < abs(r) else (abs(r),f)
if p not in peset:
peset.add(p)
for p1 in de1[r]:
for p2 in de2[f]:
delta.append(le1 - p1 + 1 + p2)
if (len(delta)):
dmean = sum(delta) / len(delta)
dmean2= sum(x ** 2 for x in delta) / len(delta)
dsd = math.sqrt(dmean2 - dmean ** 2)
dmean+=ri.getReadSize()
else:
dmean = None
dsd = None
return len(peset),dmean,dsd
def coverageEstimate(self,matches=None,index=None,timeout=60.0):
'''
......@@ -1213,7 +1250,7 @@ def path2sam(self,assgraph,path,identifier="contig",minlink=10,nlength=20,back=2
print("Both segments %d and %d are connected" % (oldid,stemid),file=sys.stderr)
else:
# addInFront=True
connected = pairEndedConnected(self,assgraph,oldid,stemid,back)
connected,ml,sl = pairEndedConnected(self,assgraph,oldid,stemid,back)
print("Both segments %d and %d are disconnected" % (oldid,stemid),file=sys.stderr)
if connected >= minlink:
print("But linked by %d pair ended links" % connected,file=sys.stderr)
......@@ -1241,7 +1278,7 @@ def path2sam(self,assgraph,path,identifier="contig",minlink=10,nlength=20,back=2
s2 = alledges[path[0]]
sid1=assgraph.getEdgeAttr(*s1)['stemid']
sid2=assgraph.getEdgeAttr(*s2)['stemid']
connected = pairEndedConnected(self,assgraph,sid1,sid2,back)
connected,ml,sl = pairEndedConnected(self,assgraph,sid1,sid2,back)
if s1[1]==s2[0]:
print("Path is circular",file=sys.stderr)
......@@ -1264,7 +1301,7 @@ def path2sam(self,assgraph,path,identifier="contig",minlink=10,nlength=20,back=2
return "\n".join(sam)
def path2fasta(self,assgraph,path,identifier="contig",minlink=10,nlength=20,back=200):
def path2fasta(self,assgraph,path,identifier="contig",minlink=10,nlength=20,back=200,logger=None):
'''
Convert a path in an compact assembling graph in a fasta formated sequences.
......@@ -1290,6 +1327,7 @@ def path2fasta(self,assgraph,path,identifier="contig",minlink=10,nlength=20,back
:raises: :py:class:`AssertionError`
'''
frglen,frglensd = estimateFragmentLength(self)
alledges = dict((assgraph.getEdgeAttr(*e)['stemid'],e)
for e in assgraph.edgeIterator(edgePredicate = lambda i: 'stemid' in assgraph.getEdgeAttr(*i)))
......@@ -1322,24 +1360,67 @@ def path2fasta(self,assgraph,path,identifier="contig",minlink=10,nlength=20,back
#print "@@>",stemid,attr['length'],stem,oldstem
if oldstem is not None:
connected = pairEndedConnected(self,assgraph,oldid,stemid,back)
connected,ml,sl = pairEndedConnected(self,assgraph,oldid,stemid,back)
if oldstem[1]==stem[0]:
print("Both segments %d and %d are connected" % (oldid,stemid),file=sys.stderr)
label.append('{connection: %d}' % connected)
if logger is None:
print("Both segments %d and %d are connected (paired-end=%d frg length=%f sd=%f)" % (oldid,stemid,connected,ml,sl),
file=sys.stderr)
else:
logger.info("Both segments %d and %d are connected (paired-end=%d frg length=%f sd=%f)" % (oldid,stemid,connected,ml,sl))
label.append('{connection: %d - length: %d, sd: %d}' % (connected,int(ml),int(sl)))
else:
print("Both segments %d and %d are disconnected" % (oldid,stemid),file=sys.stderr)
glength = frglen - ml - self.index.getReadSize()
if logger is None:
print("Both segments %d and %d are disconnected" % (oldid,stemid),
file=sys.stderr)
else:
logger.info("Both segments %d and %d are disconnected" % (oldid,stemid))
if connected >= minlink:
print("But linked by %d pair ended links" % connected,file=sys.stderr)
seq.append(b'N'*nlength)
if logger is None:
print(" But linked by %d pair ended links (gap length=%f sd=%f)" % (connected,glength,sl),
file=sys.stderr)
else:
logger.info(" But linked by %d pair ended links (gap length=%f sd=%f)" % (connected,glength,sl))
if glength > 0:
seq.append(b'N'*int(glength))
slength.append(int(glength))
else:
seq.append(b'N'*3)
seq.append(self.index.getRead(stem[0],0,self.index.getReadSize()).lower())
slength.append(self.index.getReadSize())
label.append('{N-connection: %d}' % connected)
label.append('{N-connection: %d - Gap length: %d, sd: %d}' % (connected,glength,sl))
elif forceconnection:
print("Connection is forced with only %d pair ended links" % connected,file=sys.stderr)
seq.append(b'N'*nlength)
if connected > 0:
glength = int(frglen-ml - self.index.getReadSize())
if logger is None:
print("Connection is forced with only %d pair ended links (gap length=%f sd=%f)" % (connected,glength,sl),
file=sys.stderr)
else:
logger.info("Connection is forced with only %d pair ended links (gap length=%f sd=%f)" % (connected,glength,sl))
else:
if logger is None:
print("Connection is forced without pair ended links" % connected,
file=sys.stderr)
else:
logger.info("Connection is forced without pair ended links" % connected)
glength = nlength
seq.append(b'N'*glength)
seq.append(self.index.getRead(stem[0],0,self.index.getReadSize()).lower())
slength.append(self.index.getReadSize())
label.append('{F-connection: %d}' % connected)
if connected > 0:
label.append('{F-connection: %d - Gap length: %d, sd: %d}' % connected)
else:
label.append('{F-connection: %d}' % connected)
flink = assgraph.addEdge(oldstem[1],stem[0])
rlink = assgraph.addEdge(-stem[0],-oldstem[1])
flink['label']="Forced (%d) %d -> %d" % (connected,oldid,stemid)
......@@ -1381,27 +1462,45 @@ def path2fasta(self,assgraph,path,identifier="contig",minlink=10,nlength=20,back
s2 = alledges[path[0]]
sid1=assgraph.getEdgeAttr(*s1)['stemid']
sid2=assgraph.getEdgeAttr(*s2)['stemid']
connected = pairEndedConnected(self,assgraph,sid1,sid2,back)
connected,ml,sl = pairEndedConnected(self,assgraph,sid1,sid2,back)
if s1[1]==s2[0]:
print("Path is circular",file=sys.stderr)
if logger is None:
print("Path is circular",file=sys.stderr)
else:
logger.info("Path is circular")
circular=True
label.append('{connection: %d}' % connected)
else:
if connected >= minlink:
print("Path is circular but disconnected" ,file=sys.stderr)
print("Linked by %d pair ended links" % connected,file=sys.stderr)
if logger is None:
print("Path is circular but disconnected" ,file=sys.stderr)
print("Linked by %d pair ended links" % connected,file=sys.stderr)
else:
logger.info("Path is circular but disconnected")
logger.info("Linked by %d pair ended links" % connected)
label.append('{N-connection: %d}' % connected)
seq.append(b'N'*nlength)
circular=True
elif forceconnection:
print("Circular connection forced",file=sys.stderr)
print("Linked by %d pair ended links" % connected,file=sys.stderr)
if logger is None:
print("Circular connection forced",file=sys.stderr)
print("Linked by %d pair ended links" % connected,file=sys.stderr)
else:
logger.info("Circular connection forced")
logger.info("Linked by %d pair ended links" % connected)
label.append('{F-connection: %d}' % connected)
seq.append(b'N'*nlength)
circular=True
else:
print("Path is linear",file=sys.stderr)
if logger is None:
print("Path is linear",file=sys.stderr)
else:
logger.info("Path is linear")
circular=False
seq.insert(0,self.index.getRead(s2[0],0,self.index.getReadSize()).lower())
slength.insert(0,self.index.getReadSize())
......@@ -1550,10 +1649,10 @@ def pairEndedConstraints(asm,cg,threshold=5.,back=500):
cg.edgeIterator(edgePredicate=lambda j: j[0]==e[1] and j[1]!=e[0] and 'stemid' in cg.getEdgeAttr(*j))]) for e in fork]
links = [(i,
(pairEndedConnected(asm,cg,i[0][0],i[2][0],back)+
pairEndedConnected(asm,cg,i[0][1],i[2][1],back)+0.0001)/
(pairEndedConnected(asm,cg,i[0][0],i[2][1],back)+
pairEndedConnected(asm,cg,i[0][1],i[2][0],back)+0.0001))
(pairEndedConnected(asm,cg,i[0][0],i[2][0],back)[0]+
pairEndedConnected(asm,cg,i[0][1],i[2][1],back)[0]+0.0001)/
(pairEndedConnected(asm,cg,i[0][0],i[2][1],back)[0]+
pairEndedConnected(asm,cg,i[0][1],i[2][0],back)[0]+0.0001))
for i in bifork]
constraints = {}
......@@ -1597,7 +1696,7 @@ def scaffoldConstraints(self,cg,back=500, minlink=5):
for e in ends:
for s in starts:
if pairEndedConnected(self,cg,e,s,back) >= minlink:
if pairEndedConnected(self,cg,e,s,back)[0] >= minlink:
if e in constraints:
constraints[e].append([s])
else:
......@@ -1932,7 +2031,7 @@ def unfoldmarker(assgraph,seeds=None):
f=e
return ep
def estimateFragmentLength(self,minlength=500):
def estimateFragmentLength(self,minlength=1000):
cg = self.compactAssembling(verbose=False)
edges = [i for i in cg.edgeIterator() if cg.getEdgeAttr(*i)['length']>=minlength]
lindex=len(self.index)
......@@ -1950,7 +2049,7 @@ def estimateFragmentLength(self,minlength=500):
if delta:
mdelta = sum(delta) / float(len(delta))
m2delta= sum(float(d ** 2) / float(len(delta)) for d in delta)
m2delta= sum((d ** 2) for d in delta) / len(delta)
variance=m2delta - mdelta**2
if variance < 0:
variance=0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment