Commit f959bf85 by Eric Coissac

New version of the path2fasta function

parent 41e7ce4e
......@@ -443,6 +443,7 @@ def scaffold(self,assgraph,minlink=5,back=200,addConnectedLink=False,forcedLink=
attr['gapdeltas']=[frglen - i - self.index.getReadSize() for i in delta]
attr['pairendlink']=z
attr['ingene']=0
attr['link']=(s1,s2)
attr['graphics']={'width':z // 10.,
'arrow':'last',
'fill':color
......@@ -452,16 +453,32 @@ def scaffold(self,assgraph,minlink=5,back=200,addConnectedLink=False,forcedLink=
attr['path']=[first] + [0] * (attr['length']-1) + [last]
attr['class']='scaffold:paired-end'
elif kind=="f":
glengths = [frglen - i - self.index.getReadSize()
for i in delta]
pglengths = [i for i in glengths if i >=0]
if len(pglengths) > 1:
glength = sum(pglengths) / len(pglengths)
glengthsd= math.sqrt(sum((i-glength)**2 for i in pglengths) /(len(pglengths)-1))
else:
glength = sum(glengths) / len(glengths)
glengthsd= math.sqrt(sum((i-glength)**2 for i in glengths) /(len(glengths)-1))
attr['label']="%d : Forced [%d] %d -> %d" % (nid,z,s1,s2)
attr['length']= 10
attr['length']= int(glength) if glength > 0 else 10
attr['first']=first
attr['last']=last
attr['weight']=0
attr['gappairs']=len(glengths)
attr['gaplength']=int(glength)
attr['gapsd']=int(math.sqrt(frglensd**2+glengthsd**2))
attr['gapdeltas']=[frglen - i - self.index.getReadSize() for i in delta]
attr['pairendlink']=z
attr['ingene']=0
attr['link']=(s1,s2)
attr['graphics']={'width':z // 10.,
'arrow':'last',
'fill':color
'fill' :color
}
attr['stemid']=nid
attr['sequence']=b"N" * attr['length']
......@@ -991,6 +1008,8 @@ def coverageEstimate(self,matches=None,index=None,timeout=60.0):
return maxpath,cumlength,maxpath/float(cumlength)
def path2fasta(self,assgraph,path,identifier="contig",minlink=10,nlength=20,back=200,logger=None,tags=[]):
'''
Convert a path in an compact assembling graph in a fasta formated sequences.
......@@ -1019,16 +1038,13 @@ def path2fasta(self,assgraph,path,identifier="contig",minlink=10,nlength=20,back
'''
frglen,frglensd = estimateFragmentLength(self)
#
# Build a dictionary with:
# - Keys = stemid
# - Values = edge descriptor (from,to,x)
#
alledges = dict((assgraph.getEdgeAttr(*e)['stemid'],e)
for e in assgraph.edgeIterator(edgePredicate = lambda i: 'stemid' in assgraph.getEdgeAttr(*i)))
# def getEdge(stemid):
# stem = assgraph.edgeIterator(edgePredicate=lambda e:'stemid' in assgraph.getEdgeAttr(*e)
# and assgraph.getEdgeAttr(*e)['stemid']==stemid).next()
# return stem
#print alledges
coverage=[]
slength=[]
......@@ -1036,378 +1052,187 @@ def path2fasta(self,assgraph,path,identifier="contig",minlink=10,nlength=20,back
label=[]
oldstem=None
oldid=None
oldstemclass=None
rank = 1
forceconnection=False
for stemid in path:
if stemid != 0:
stem = alledges[stemid]
attr = assgraph.getEdgeAttr(*stem)
stem = alledges[stemid]
attr = assgraph.getEdgeAttr(*stem)
stemclass = attr['class']
sequence = attr['sequence']
#print "@@>",stemid,attr['length'],stem,oldstem
if oldstem is not None:
connected,ml,sl,delta = pairEndedConnected(self,assgraph,oldid,stemid,back) # @UnusedVariable
if oldstem[1]==stem[0]:
if ml is not None:
if logger is None:
print("Both segments %d and %d are connected (paired-end=%d frg length=%f sd=%f)" % (oldid,stemid,connected,float(ml),float(sl)),
file=sys.stderr)
else:
logger.info("Both segments %d and %d are connected (paired-end=%d frg length=%f sd=%f)" % (oldid,stemid,connected,float(ml),float(sl)))
label.append('{connection: %d - length: %d, sd: %d}' % (connected,int(ml),int(sl)))
else:
if logger is None:
print("Both segments %d and %d are connected but covered by 0 paired-end" % (oldid,stemid),
file=sys.stderr)
else:
logger.info("Both segments %d and %d are connected but covered by 0 paired-end" % (oldid,stemid))
label.append('{connection: 0}')
else:
if logger is None:
print("Both segments %d and %d are disconnected" % (oldid,stemid),
file=sys.stderr)
else:
logger.info("Both segments %d and %d are disconnected" % (oldid,stemid))
if connected >= minlink:
glength = frglen - ml - self.index.getReadSize()
print("==>",glength,frglen,frglensd,ml,sl,self.index.getReadSize)
if logger is None:
print(" But linked by %d pair ended links (gap length=%f sd=%f)" % (connected,float(glength),float(sl)),
file=sys.stderr)
else:
logger.info(" But linked by %d pair ended links (gap length=%f sd=%f)" % (connected,float(glength),float(sl)))
if glength > 0:
seq.append(b'N'*int(glength))
slength.append(int(glength))
else:
seq.append(b'N'*3)
seq.append(self.index.getRead(stem[0],0,self.index.getReadSize()).lower())
slength.append(self.index.getReadSize())
label.append('{N-connection: %d - Gap length: %d, sd: %d}' % (connected,int(glength),int(sl)))
elif forceconnection:
if connected > 0:
glength = int(frglen-ml - self.index.getReadSize())
if logger is None:
print("Connection is forced with only %d pair ended links (gap length=%f sd=%f)" % (connected,float(glength),float(sl)),
file=sys.stderr)
else:
logger.info("Connection is forced with only %d pair ended links (gap length=%f sd=%f)" % (connected,float(glength),float(sl)))
else:
if logger is None:
print("Connection is forced without pair ended links",
file=sys.stderr)
else:
logger.info("Connection is forced without pair ended links")
glength = nlength
seq.append(b'N'*int(glength))
seq.append(self.index.getRead(stem[0],0,self.index.getReadSize()).lower())
slength.append(self.index.getReadSize())
if connected > 0:
label.append('{F-connection: %d - Gap length: %d, sd: %d}' % (connected,int(glength),int(sl)))
else:
label.append('{F-connection: %d}' % connected)
flink = assgraph.addEdge(oldstem[1],stem[0])
rlink = assgraph.addEdge(-stem[0],-oldstem[1])
flink['label']="Forced (%d) %d -> %d" % (connected,oldid,stemid)
flink['graphics']={'width':1,
'arrow':'last',
'fill':'pink'
}
rlink['label']="Forced (%d) %d -> %d" % (connected,-stemid,-oldid)
rlink['graphics']={'width':1,
'arrow':'last',
'fill':'pink'
}
else:
raise AssertionError('Disconnected path between stem '
'%d and %d only %d pair ended links' % (oldid,stemid,connected))
if rank==1 and stemclass!="sequence":
raise RuntimeError("A path cannot ends on a gap")
# Switch the stem to a dashed style
graphics = attr.get('graphics',{})
attr['graphics'] = graphics
graphics['style'] = 'dashed'
# Manage step rank information of each step
allsteps = attr.get('steps',{})
steps = allsteps.get(identifier,[])
steps.append(rank)
allsteps[identifier]=steps
attr['steps']=allsteps
rank+=1
seq.append(attr['sequence'])
coverage.append(attr['weight'])
slength.append(attr['length'])
label.append(attr['label'])
oldstem = stem
oldid=stemid
forceconnection=False
else:
forceconnection=True
s1 = alledges[path[-1]]
s2 = alledges[path[0]]
sid1=assgraph.getEdgeAttr(*s1)['stemid']
sid2=assgraph.getEdgeAttr(*s2)['stemid']
connected,ml,sl,delta = pairEndedConnected(self, # @UnusedVariable
assgraph,
sid1,
sid2,
back)
if s1[1]==s2[0]:
if logger is None:
print("Path is circular",file=sys.stderr)
else:
logger.info("Path is circular")
circular=True
label.append('{connection: %d}' % connected)
else:
if connected >= minlink:
glength = frglen - ml - self.index.getReadSize()
if logger is None:
print("Path is circular but disconnected" ,file=sys.stderr)
print(" But linked by %d pair ended links (gap length=%f sd=%f)" % (connected,float(glength),float(sl)),file=sys.stderr)
else:
logger.info("Path is circular but disconnected")
logger.info("Linked by %d pair ended links (gap length=%f sd=%f)" % (connected,float(glength),float(sl)))
label.append('{N-connection: %d}' % connected)
seq.append(b'N'* int(glength))
slength.append(int(glength))
circular=True
elif forceconnection:
if logger is None:
print("Circular connection forced",file=sys.stderr)
print("Linked by %d pair ended links" % connected,file=sys.stderr)
else:
logger.info("Circular connection forced")
logger.info("Linked by %d pair ended links" % connected)
label.append('{F-connection: %d}' % connected)
seq.append(b'N'*int(nlength))
circular=True
else:
if logger is None:
print("Path is linear",file=sys.stderr)
else:
logger.info("Path is linear")
circular=False
seq.insert(0,self.index.getRead(s2[0],0,self.index.getReadSize()).lower())
slength.insert(0,self.index.getReadSize())
for stemid in set(path):
if stemid !=0:
stem = alledges[stemid]
attr = assgraph.getEdgeAttr(*stem)
sequence = attr['sequence']
graphics = attr.get('graphics',{})
graphics['style']='dashed'
attr['graphics']=graphics
if oldstem is not None:
connected,ml,sl,delta = pairEndedConnected(self,assgraph,oldid,stemid,back) # @UnusedVariable
if oldstem[1]==stem[0]:
if oldstemclass=="sequence":
if stemclass=="sequence": # Link between 2 sequences
if ml is not None:
logOrPrint(logger,
"Both segments %d and %d are connected (paired-end=%d frg length=%f sd=%f)" %
(oldid,stemid,connected,float(ml),float(sl)))
label.append('{connection: %d - length: %d, sd: %d}' % (connected,int(ml),int(sl)))
else:
logOrPrint(logger,
"Both segments %d and %d are connected but covered by 0 paired-end" %
(oldid,stemid))
if attr['length'] > 10:
attr['label']="%d : %s->(%d)->%s [%d] @ %s" % (stemid,sequence[0:5].decode('ascii'),
attr['length'],
sequence[-5:].decode('ascii'),
int(attr['weight']),
attr['steps'])
else:
attr['label']="%d : %s->(%d) [%d] @ %s" % (stemid,
sequence.decode('ascii'),
attr['length'],
int(attr['weight']),
attr['steps'])
sequence = b''.join(seq)
length = sum(slength)
mcov = oneXcoverage(assgraph)
fasta=[">%s seq_length=%d; coverage=%5.1f; circular=%s; %s%s" % (identifier,length,
mcov,circular,
tags2str(tags) + " ",
'.'.join(label))]
fasta.extend(sequence[i:i+60].decode('ascii') for i in range(0,len(sequence),60))
return "\n".join(fasta)
label.append('{connection: 0}')
elif stemclass[0:9]=="scaffold:": # Link a sequence and a gap
logOrPrint(logger,
"Both segments %d and %d are disconnected" % attr['link'])
if stemclass=="scaffold:paired-end":
logOrPrint(logger,
" But linked by %d pair ended links (gap length=%f sd=%f)" %
(attr['pairendlink'],
attr['length'],
attr['gapsd']))
label.append('{N-connection: %d - Gap length: %d, sd: %d}' %
(attr['pairendlink'],
attr['length'],
attr['gapsd']))
elif stemclass=="scaffold:forced":
logOrPrint(logger," Connection is forced")
if attr['pairendlink'] > 0:
logOrPrint(logger,
" But asserted by %d pair ended links (gap length=%f sd=%f)" %
(attr['pairendlink'],
attr['length'],
attr['gapsd']))
label.append('{F-connection: %d - Gap length: %d, sd: %d}' %
(attr['pairendlink'],
attr['length'],
attr['gapsd']))
else:
label.append('{F-connection: %d}' % connected)
elif oldstemclass[0:9]=="scaffold:":
if stemclass=="sequence":
seq.append(self.index.getRead(attr['path'][0],
0,
self.index.getReadSize()).lower())
slength.append(self.index.getReadSize())
def path2fasta2(self,assgraph,path,identifier="contig",minlink=10,nlength=20,back=200,logger=None,tags=[]):
'''
Convert a path in an compact assembling graph in a fasta formated sequences.
:param assgraph: The compact assembling graph as produced by the
:py:meth:`~orgasm.assembler.Assembler.compactAssembling` method
:type assgraph: :py:class:`~orgasm.graph.DiGraphMultiEdge`
:param path: an ``iterable`` providing an ordered list of ``stemid`` indicating
the path to follow.
:type path: an ``iterable`` over :py:class:`int`
:param identifier: the identifier used in the header of the fasta formated sequence
:type identifier: :py:class:`bytes`
:param minlink: the minimum count of pair ended link to consider
for asserting the relationship
:type minlink: :py:class:`int`
:param nlength: how many ``N`` must be added between two segment of sequences only connected
by pair ended links
:type nlength: :py:class:`int`
:param back: How many base pairs must be considered at the end of each edge
:type back: :py:class:`int`
:returns: a string containing the fasta formated sequence
:rtype: :py:class:`bytes`
else:
raise RuntimeError('A scaffold link must be followed by a sequence %d --> %d' %
(oldid,stemid))
elif forceconnection:
if connected > 0:
glength = int(frglen-ml - self.index.getReadSize())
:raises: :py:class:`AssertionError`
'''
frglen,frglensd = estimateFragmentLength(self)
alledges = dict((assgraph.getEdgeAttr(*e)['stemid'],e)
for e in assgraph.edgeIterator(edgePredicate = lambda i: 'stemid' in assgraph.getEdgeAttr(*i)))
logOrPrint(logger,
"Connection is forced with only %d pair ended links (gap length=%f sd=%f)" %
(connected,float(glength),float(sl)))
# def getEdge(stemid):
# stem = assgraph.edgeIterator(edgePredicate=lambda e:'stemid' in assgraph.getEdgeAttr(*e)
# and assgraph.getEdgeAttr(*e)['stemid']==stemid).next()
# return stem
#print alledges
coverage=[]
slength=[]
seq=[]
label=[]
oldstem=None
oldid=None
rank = 1
forceconnection=False
label.append('{F-connection: %d - Gap length: %d, sd: %d}' %
(connected,
glength,
sl))
for stemid in path:
if stemid != 0:
stem = alledges[stemid]
attr = assgraph.getEdgeAttr(*stem)
#print "@@>",stemid,attr['length'],stem,oldstem
if oldstem is not None:
connected,ml,sl,delta = pairEndedConnected(self,assgraph,oldid,stemid,back) # @UnusedVariable
if oldstem[1]==stem[0]:
if ml is not None:
logOrPrint(logger,
"Both segments %d and %d are connected (paired-end=%d frg length=%f sd=%f)" %
(oldid,stemid,connected,float(ml),float(sl)))
label.append('{connection: %d - length: %d, sd: %d}' % (connected,int(ml),int(sl)))
else:
logOrPrint(logger,
"Both segments %d and %d are connected but covered by 0 paired-end" %
(oldid,stemid))
label.append('{connection: 0}')
"Connection is forced without pair ended links")
label.append('{F-connection: %d}' % connected)
glength = nlength
seq.append(b'N'*int(glength))
slength.append(int(glength))
seq.append(self.index.getRead(attr['path'][0],
0,
self.index.getReadSize()).lower())
slength.append(self.index.getReadSize())
# Add the foced link to the compact assembly graph
flink = assgraph.addEdge(oldstem[1],stem[0])
rlink = assgraph.addEdge(-stem[0],-oldstem[1])
flink['label']="Forced (%d) %d -> %d" % (connected,oldid,stemid)
flink['graphics']={'width':1,
'arrow':'last',
'fill':'0xFF0000',
'style':'dashed'
}
rlink['label']="Forced (%d) %d -> %d" % (connected,-stemid,-oldid)
rlink['graphics']={'width':1,
'arrow':'last',
'fill':'0xFF0000',
'style':'dashed'
}
else:
logOrPrint(logger,
"Both segments %d and %d are disconnected" % (oldid,stemid))
raise AssertionError('Disconnected path between stem '
'%d and %d only %d pair ended links' % (oldid,stemid,connected))
if connected >= minlink:
glength = frglen - ml - self.index.getReadSize()
logOrPrint(logger,
" But linked by %d pair ended links (gap length=%f sd=%f)" %
(connected,float(glength),float(sl)))
if glength > 0:
seq.append(b'N'*int(glength))
slength.append(int(glength))
else:
seq.append(b'N'*3)
seq.append(self.index.getRead(stem[0],0,self.index.getReadSize()).lower())
slength.append(self.index.getReadSize())
label.append('{N-connection: %d - Gap length: %d, sd: %d}' % (connected,int(glength),int(sl)))
elif forceconnection:
if connected > 0:
glength = int(frglen-ml - self.index.getReadSize())
logOrPrint(logger,
"Connection is forced with only %d pair ended links (gap length=%f sd=%f)" %
(connected,float(glength),float(sl)))
else:
logOrPrint(logger,
"Connection is forced without pair ended links")
glength = nlength
seq.append(b'N'*int(glength))
seq.append(self.index.getRead(stem[0],0,self.index.getReadSize()).lower())
slength.append(self.index.getReadSize())
if connected > 0:
label.append('{F-connection: %d - Gap length: %d, sd: %d}' % (connected,int(glength),int(sl)))
else:
label.append('{F-connection: %d}' % connected)
flink = assgraph.addEdge(oldstem[1],stem[0])
rlink = assgraph.addEdge(-stem[0],-oldstem[1])
flink['label']="Forced (%d) %d -> %d" % (connected,oldid,stemid)
flink['graphics']={'width':1,
'arrow':'last',
'fill':'pink'
}
rlink['label']="Forced (%d) %d -> %d" % (connected,-stemid,-oldid)
rlink['graphics']={'width':1,
'arrow':'last',
'fill':'pink'
}
else:
raise AssertionError('Disconnected path between stem '
'%d and %d only %d pair ended links' % (oldid,stemid,connected))
allsteps = attr.get('steps',{})
steps = allsteps.get(identifier,[])
steps.append(rank)
allsteps[identifier]=steps
attr['steps']=allsteps
rank+=1
if stemclass=="sequence":
if attr['length'] > 10:
attr['label']="%d : %s->(%d)->%s [%d] @ %s" % (stemid,sequence[0:5].decode('ascii'),
attr['length'],
sequence[-5:].decode('ascii'),
int(attr['weight']),
attr['steps'])
else:
attr['label']="%d : %s->(%d) [%d] @ %s" % (stemid,
sequence.decode('ascii'),
attr['length'],
int(attr['weight']),
attr['steps'])
seq.append(attr['sequence'])
label.append(attr['label'])
seq.append(sequence)
coverage.append(attr['weight'])
slength.append(attr['length'])
label.append(attr['label'])
rank+=1
oldstem = stem
oldid=stemid
oldstemclass=stemclass
forceconnection=False
else:
forceconnection=True
s1 = alledges[path[-1]]
s2 = alledges[path[0]]
sid1=assgraph.getEdgeAttr(*s1)['stemid']
sid2=assgraph.getEdgeAttr(*s2)['stemid']
sclass2=assgraph.getEdgeAttr(*s2)['class']
connected,ml,sl,delta = pairEndedConnected(self, # @UnusedVariable
assgraph,
sid1,
......@@ -1416,24 +1241,17 @@ def path2fasta2(self,assgraph,path,identifier="contig",minlink=10,nlength=20,bac
if s1[1]==s2[0]:
logOrPrint(logger,
"Path is circular")
logOrPrint(logger, "Path is circular and connected by %d (length: %d, sd: %d)" %
(connected,int(ml),int(sl))
)
circular=True
label.append('{connection: %d}' % connected)
if sclass2=="sequence":
label.append('{connection: %d - length: %d, sd: %d}' % (connected,int(ml),int(sl)))
else:
if connected >= minlink:
glength = frglen - ml - self.index.getReadSize()
logOrPrint(logger,"Path is circular but disconnected" )
logOrPrint(logger," But linked by %d pair ended links (gap length=%f sd=%f)" %
(connected,float(glength),float(sl)))
label.append('{N-connection: %d}' % connected)
seq.append(b'N'* int(glength))
slength.append(int(glength))
circular=True
elif forceconnection:
if sclass2!="sequence":
raise RuntimeError("A path cannot ends on a gap")
if forceconnection:
logOrPrint(logger,"Circular connection forced",)
logOrPrint(logger,"Linked by %d pair ended links" % connected)
......@@ -1447,29 +1265,6 @@ def path2fasta2(self,assgraph,path,identifier="contig",minlink=10,nlength=20,bac
seq.insert(0,self.index.getRead(s2[0],0,self.index.getReadSize()).lower())
slength.insert(0,self.index.getReadSize())
for stemid in set(path):
if stemid !=0:
stem = alledges[stemid]
attr = assgraph.getEdgeAttr(*stem)
sequence = attr['sequence']
graphics = attr.get('graphics',{})
graphics['style']='dashed'
attr['graphics']=graphics
if attr['length'] > 10:
attr['label']="%d : %s->(%d)->%s [%d] @ %s" % (stemid,sequence[0:5].decode('ascii'),
attr['length'],
sequence[-5:].decode('ascii'),
int(attr['weight']),
attr['steps'])
else:
attr['label']="%d : %s->(%d) [%d] @ %s" % (stemid,
sequence.decode('ascii'),
attr['length'],
int(attr['weight']),
attr['steps'])
sequence = b''.join(seq)
length = sum(slength)
......@@ -1479,7 +1274,10 @@ def path2fasta2(self,assgraph,path,identifier="contig",minlink=10,nlength=20,bac
mcov,circular,
tags2str(tags) + " ",
'.'.join(label))]
fasta.extend(sequence[i:i+60].decode('ascii') for i in range(0,len(sequence),60))
fasta.extend(sequence[i:i+60].decode('ascii')
for i in range(0,len(sequence),60)
)
return "\n".join(fasta)
def pathConstraints(self,cg,threshold=5.,back=500, minlink=5):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment