Commit 058ae863 by Eric Coissac

Management of the overlap between scaffolded contigs

parent f959bf85
# cython: language_level=3 #cython: language_level=3, boundscheck=False, wraparound=False
''' '''
Created on 14 sept. 2009 Created on 14 sept. 2009
......
# cython: language_level=3 #cython: language_level=3, boundscheck=False
from collections import Counter from collections import Counter
...@@ -78,7 +78,7 @@ cpdef tuple cmpHash(dict h1,dict h2, size_t delta=2): ...@@ -78,7 +78,7 @@ cpdef tuple cmpHash(dict h1,dict h2, size_t delta=2):
mstat =(0,0) mstat =(0,0)
return mstat return mstat
cdef tuple alignSequence(bytes seq1, bytes seq2, cpdef tuple alignSequence(bytes seq1, bytes seq2,
dict hash1=None,dict hash2=None, dict hash1=None,dict hash2=None,
size_t kmer=12,size_t delta=2, size_t kmer=12,size_t delta=2,
int smin=10,EndGapFree align=None): int smin=10,EndGapFree align=None):
......
...@@ -30,6 +30,7 @@ from time import time ...@@ -30,6 +30,7 @@ from time import time
import math import math
from functools import reduce from functools import reduce
from sys import stderr from sys import stderr
from orgasm.multialign._multi import alignSequence # @UnresolvedImport
def logOrPrint(logger,message,level='info'): def logOrPrint(logger,message,level='info'):
if logger is not None: if logger is not None:
...@@ -311,8 +312,20 @@ def genesincontig(cg,index,matches): ...@@ -311,8 +312,20 @@ def genesincontig(cg,index,matches):
ea['ingene']=g ea['ingene']=g
ea['genepos']=ep ea['genepos']=ep
def testOverlap(seqfrom,seqto,headto,readsize):
seqto=headto+seqto
lseq=min(len(seqfrom),len(seqto),readsize)
seqfrom = seqfrom[-lseq:]
seqto = seqto[0:lseq]
ali = alignSequence(seqfrom,seqto)
ok = (not ali[0] \
and ali[1] > 0 \
and len(ali[2])==1 \
and len(ali[3])==1)
if ok:
return lseq - ali[2][0]
else:
return -1
def scaffold(self,assgraph,minlink=5,back=200,addConnectedLink=False,forcedLink=set(),logger=None): def scaffold(self,assgraph,minlink=5,back=200,addConnectedLink=False,forcedLink=set(),logger=None):
''' '''
Add relationships between edges of the assembling graph related to the Add relationships between edges of the assembling graph related to the
...@@ -347,7 +360,9 @@ def scaffold(self,assgraph,minlink=5,back=200,addConnectedLink=False,forcedLink= ...@@ -347,7 +360,9 @@ def scaffold(self,assgraph,minlink=5,back=200,addConnectedLink=False,forcedLink=
maxstemid = max(assgraph.getEdgeAttr(*i).get('stemid',0) for i in assgraph.edgeIterator()) eiat = dict((assgraph.getEdgeAttr(*i)['stemid'],
assgraph.getEdgeAttr(*i)) for i in assgraph.edgeIterator())
maxstemid = max(eiat)
if addConnectedLink: if addConnectedLink:
# Fake function just testing the stemid attribute # Fake function just testing the stemid attribute
...@@ -367,7 +382,7 @@ def scaffold(self,assgraph,minlink=5,back=200,addConnectedLink=False,forcedLink= ...@@ -367,7 +382,7 @@ def scaffold(self,assgraph,minlink=5,back=200,addConnectedLink=False,forcedLink=
eiid = [assgraph.getEdgeAttr(*i)['stemid'] for i in ei] eiid = [assgraph.getEdgeAttr(*i)['stemid'] for i in ei]
etid = [assgraph.getEdgeAttr(*i)['stemid'] for i in et] etid = [assgraph.getEdgeAttr(*i)['stemid'] for i in et]
nei = len(ei) nei = len(ei)
net = len(et) net = len(et)
...@@ -402,9 +417,18 @@ def scaffold(self,assgraph,minlink=5,back=200,addConnectedLink=False,forcedLink= ...@@ -402,9 +417,18 @@ def scaffold(self,assgraph,minlink=5,back=200,addConnectedLink=False,forcedLink=
"%d -> %d forced and not supported by pair ended links" % (etid[e1], eiid[e2])) "%d -> %d forced and not supported by pair ended links" % (etid[e1], eiid[e2]))
elif not connected and linkedby >= minlink: elif not connected and linkedby >= minlink:
s.append(('s',et[e1][1],ei[e2][0],linkedby,etid[e1], eiid[e2],"#FF6600",ml,sl,delta,first,last)) overlap = testOverlap(eiat[etid[e1]]['sequence'],
logOrPrint(logger, eiat[eiid[e2]]['sequence'],
"%d -> %d scaffolded by %d pair ended links" % (etid[e1], eiid[e2],linkedby)) eiat[eiid[e2]]['head'],
self.index.getReadSize())
if overlap >= 0:
s.append(('o',et[e1][1],ei[e2][0],linkedby,etid[e1], eiid[e2],"#FF00FF",overlap,0,[],first,last))
logOrPrint(logger,
"%d -> %d overlap of %dbp supported by %d pair ended links" % (etid[e1], eiid[e2],overlap,linkedby))
else:
s.append(('s',et[e1][1],ei[e2][0],linkedby,etid[e1], eiid[e2],"#FF6600",ml,sl,delta,first,last))
logOrPrint(logger,
"%d -> %d scaffolded by %d pair ended links" % (etid[e1], eiid[e2],linkedby))
nstemid={} nstemid={}
for kind,x,y,z,s1,s2,color,ml,sl,delta,first,last in s: for kind,x,y,z,s1,s2,color,ml,sl,delta,first,last in s:
if (abs(x) > abs(y)): if (abs(x) > abs(y)):
...@@ -452,6 +476,23 @@ def scaffold(self,assgraph,minlink=5,back=200,addConnectedLink=False,forcedLink= ...@@ -452,6 +476,23 @@ def scaffold(self,assgraph,minlink=5,back=200,addConnectedLink=False,forcedLink=
attr['sequence']=b"N" * attr['length'] attr['sequence']=b"N" * attr['length']
attr['path']=[first] + [0] * (attr['length']-1) + [last] attr['path']=[first] + [0] * (attr['length']-1) + [last]
attr['class']='scaffold:paired-end' attr['class']='scaffold:paired-end'
elif kind=="o":
attr['label']="%d : Overlap (%dbp) [%d] %d -> %d" % (nid,overlap,z,s1,s2)
attr['length']=-overlap
attr['first']=first
attr['last']=last
attr['weight']=0
attr['pairendlink']=z
attr['ingene']=0
attr['link']=(s1,s2)
attr['graphics']={'width':1,
'arrow':'last',
'fill':color
}
attr['stemid']=nid
attr['sequence']=b''
attr['path']=[]
attr['class']='scaffold:overlap'
elif kind=="f": elif kind=="f":
glengths = [frglen - i - self.index.getReadSize() glengths = [frglen - i - self.index.getReadSize()
for i in delta] for i in delta]
...@@ -544,18 +585,23 @@ def fillGaps(self,minlink=5, ...@@ -544,18 +585,23 @@ def fillGaps(self,minlink=5,
index = self.index index = self.index
#List of edges not connected at their beginning
ei = [i for i in assgraph.edgeIterator(edgePredicate=isInitial)] ei = [i for i in assgraph.edgeIterator(edgePredicate=isInitial)]
#List of edges not connected at their end
et = [i for i in assgraph.edgeIterator(edgePredicate=isTerminal)] et = [i for i in assgraph.edgeIterator(edgePredicate=isTerminal)]
#Corresponding id of these edges
eiid = [assgraph.getEdgeAttr(*i)['stemid'] for i in ei] eiid = [assgraph.getEdgeAttr(*i)['stemid'] for i in ei]
etid = [assgraph.getEdgeAttr(*i)['stemid'] for i in et] etid = [assgraph.getEdgeAttr(*i)['stemid'] for i in et]
epi = [set(assgraph.getEdgeAttr(*i)['path'][0:back]) for i in ei] #epi = [set(assgraph.getEdgeAttr(*i)['path'][0:back]) for i in ei]
ept = [set(assgraph.getEdgeAttr(*i)['path'][-back:]) for i in et] ept = [set(assgraph.getEdgeAttr(*i)['path'][-back:]) for i in et]
eei = [set(assgraph.getEdgeAttr(*i)['path'][0:100]) for i in ei] eei = [set(assgraph.getEdgeAttr(*i)['path'][0:100]) for i in ei]
eet = [set(assgraph.getEdgeAttr(*i)['path'][-100:]) for i in et] eet = [set(assgraph.getEdgeAttr(*i)['path'][-100:]) for i in et]
print(eiid,file=sys.stderr) # <EC> print(eiid,file=sys.stderr) # <EC>
exi = [getPairedRead(self,assgraph,i,back,end=False) for i in eiid] exi = [getPairedRead(self,assgraph,i,back,end=False) for i in eiid]
ext = [getPairedRead(self,assgraph,i,back,end=True) for i in etid] ext = [getPairedRead(self,assgraph,i,back,end=True) for i in etid]
...@@ -581,13 +627,15 @@ def fillGaps(self,minlink=5, ...@@ -581,13 +627,15 @@ def fillGaps(self,minlink=5,
print("\n\nLinking Stems %d -> %d" % (etid[e1],eiid[e2]), print("\n\nLinking Stems %d -> %d" % (etid[e1],eiid[e2]),
file=sys.stderr) file=sys.stderr)
ex = frozenset(((ext[e1] | exi[e2]) - ept[e1] - epi[e2]) | eet[e1] | eei[e2]) # ex = frozenset(((ext[e1] | exi[e2]) - ept[e1] - epi[e2]) | eet[e1] | eei[e2])
ex = frozenset(ext[e1] | exi[e2] | eet[e1] | eei[e2])
ingraph = sum(i in self.graph for i in ex) ingraph = sum(i in self.graph for i in ex)
nreads = len(ex) nreads = len(ex)
if ingraph < nreads: if ingraph < nreads:
print ("--> %d | %d = %d reads to align (%d already assembled)" % (len(ext[e1]),len(exi[e2]),nreads,ingraph), logOrPrint(logger,
file=sys.stderr) "--> %d | %d = %d reads to align (%d already assembled)" % (len(ext[e1]),len(exi[e2]),nreads,ingraph),
)
if nreads > 10: if nreads > 10:
__cacheAli2.add(ex) __cacheAli2.add(ex)
...@@ -597,10 +645,11 @@ def fillGaps(self,minlink=5, ...@@ -597,10 +645,11 @@ def fillGaps(self,minlink=5,
#goodali = [i for i in ali if len(i) >= nreads/4] #goodali = [i for i in ali if len(i) >= nreads/4]
goodali=ali goodali=ali
print("--> %d consensus to add" % len(goodali), logOrPrint(logger,
file=sys.stderr) "--> %d consensus to add" % len(goodali))
for a in goodali: for a in goodali:
#print(b'\n'.join(alignment2bytes(a,index)).decode('ascii'))
cycle+=1 cycle+=1
c = consensus(a,index,cmincov) c = consensus(a,index,cmincov)
s = insertFragment(self,c,cycle=cycle) s = insertFragment(self,c,cycle=cycle)
...@@ -624,7 +673,8 @@ def fillGaps(self,minlink=5, ...@@ -624,7 +673,8 @@ def fillGaps(self,minlink=5,
print('',file=sys.stderr) print('',file=sys.stderr)
else: else:
print("--> already aligned",file=sys.stderr) logOrPrint(logger,
"--> already aligned")
if not onlyLinking: if not onlyLinking:
for e1 in range(net): for e1 in range(net):
...@@ -1129,7 +1179,18 @@ def path2fasta(self,assgraph,path,identifier="contig",minlink=10,nlength=20,back ...@@ -1129,7 +1179,18 @@ def path2fasta(self,assgraph,path,identifier="contig",minlink=10,nlength=20,back
attr['gapsd'])) attr['gapsd']))
else: else:
label.append('{F-connection: %d}' % connected) label.append('{F-connection: %d}' % connected)
elif stemclass=="scaffold:overlap":
logOrPrint(logger,
" But overlap by %dbp supported by %d pair ended links" %
(-attr['length'],
attr['pairendlink']))
label.append('{O-connection: %d - Overlap length: %d}' %
(attr['pairendlink'],
-attr['length']))
# Remove the overlap length on the last inserted sequence
seq[-1]=seq[-1][:attr['length']]
elif oldstemclass[0:9]=="scaffold:": elif oldstemclass[0:9]=="scaffold:":
if stemclass=="sequence": if stemclass=="sequence":
seq.append(self.index.getRead(attr['path'][0], seq.append(self.index.getRead(attr['path'][0],
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment