Commit e74f59ac by Eric Coissac

Change the management of the seeds and start the buildgraph refactoring

parent b05438a3
......@@ -3,6 +3,14 @@
The :program:`fillgaps` command
===============================
At the end of the :ref:`oa buildgraph <oa_buildgraph>` command the assembling graph
is not always complete. Because of non-homogeneous coverage, some parts of the
genome with too low a coverage could not be assembled using the heuristic parameters
used to assemble the main parts of the genome. The :program:`fillgaps` command aims
to rerun the *fillgap* algorithm used by the :ref:`oa buildgraph <oa_buildgraph>`
command, but with other parameters that allow filling the gaps left unassembled
during the initial assembling.
.. figure:: ../oa-fillgaps.*
:align: center
:figwidth: 80 %
......@@ -47,6 +55,19 @@ Graph initialisation options
.. _seeds.kup:
.. code-block:: bash
$ oa fillgaps --seeds protChloroArabidopsis seqindex
A set of seed sequences must be either nucleic or proteic. To initiate
assembling with both nucleic and proteic sequences you must use at least two
``--seeds`` options, one for each class of sequences.
.. code-block:: bash
$ oa fillgaps --seeds protChloroArabidopsis --seeds rDNAChloro.fasta seqindex
.. include:: ../options/kup.txt
Graph extension options
......
......@@ -73,6 +73,9 @@ def getProbes(config):
logger.info("Load probe internal dataset : %s" % s)
probes[s]=[p,{}]
else:
logger.info("No new probe set specified")
return probes
......@@ -113,45 +116,32 @@ def getSeeds(index,config,extension=".omx"):
output = config['orgasm']['outputfilename']
logger=config['orgasm']['logger']
kup=-1 if config['orgasm']['kup'] is None else config['orgasm']['kup']
clean=config['buildgraph']['clean']
probes = getProbes(config)
filename=output+extension
if probes:
# --seeds option on the command line -> look for these seeds
logger.info("Running probes matching against reads...")
for probename in probes:
p = probes[probename][0]
logger.info(" -> probe set: %s" % probename)
seeds = index.lookForSeeds(p,
mincov=config['orgasm']['seedmincov'],
kup=kup,
identity=config['orgasm']['identity'],
logger=logger)
probes[probename][1]=seeds
logger.info("==> %d matches" % sum(len(seeds[i]) for i in seeds))
with open(filename,"wb") as fseeds:
pickle.dump(probes,fseeds)
else:
# no --seeds option on the command line -> load the previous results
#
# Look for seeds with the new probe sets
#
newprobes = getProbes(config)
#
# Load already run probe sets
#
if not clean or not newprobes:
try:
with open(filename,'rb') as fseeds:
probes = pickle.load(fseeds)
logger.info("Load matches from previous run : %d probe sets restored" % len(probes))
oldversion=False
for probename in probes:
s = probes[probename][1]
if len(s[list(s.keys())[0]][0]) != 6:
oldversion=True
logger.warning("Old probe version save on the disk. Recomputes probes %s" % probename)
p = probes[probename][0]
......@@ -170,15 +160,53 @@ def getSeeds(index,config,extension=".omx"):
if oldversion:
with open(filename,"wb") as fseeds:
pickle.dump(probes,fseeds)
nm=0
for k in probes:
for m in probes[k][1].values():
nm+=len(m)
logger.info(" ==> A total of : %d" % nm)
except FileNotFoundError:
logger.info("No --seeds option specified and not previous matches stored")
sys.exit(1)
logger.info("No previous matches loaded")
probes={}
else:
if os.path.exists(filename):
logger.info("Cleaning previous matches")
os.remove(filename)
probes={}
if newprobes:
# --seeds option on the command line -> look for these seeds
logger.info("Running probes matching against reads...")
for probename in newprobes:
p = newprobes[probename][0]
logger.info(" -> probe set: %s" % probename)
seeds = index.lookForSeeds(p,
mincov=config['orgasm']['seedmincov'],
kup=kup,
identity=config['orgasm']['identity'],
logger=logger)
nmatches = sum(len(seeds[i]) for i in seeds)
logger.info("==> %d matches" % nmatches)
if nmatches:
probes[probename]=[p,seeds]
newprobes = list(newprobes.keys())
if not probes:
logger.info("No --seeds option specified and not previous matches stored")
sys.exit(1)
if newprobes:
with open(filename,"wb") as fseeds:
pickle.dump(probes,fseeds)
logger.info("Match list :")
......@@ -202,6 +230,6 @@ def getSeeds(index,config,extension=".omx"):
covmax=coverage
return covmax,probes
return covmax,probes,newprobes
#def reloadAssembling
\ No newline at end of file
......@@ -950,6 +950,9 @@ cdef class ProtAhoCorasick(AhoCorasick):
cdef size_t wordmax=0
cdef double shamin
cdef size_t wc[6]
cdef size_t rp[6]
cdef dict matchpos
cdef size_t* count
cdef int wct
cdef int wcmax
......@@ -997,7 +1000,8 @@ cdef class ProtAhoCorasick(AhoCorasick):
while i < readcount:
# We start a new read so we set counter to 0
wordcount.clear()
wordcount.clear()
matchpos={}
readid = i
# print "@",i
......@@ -1017,6 +1021,7 @@ cdef class ProtAhoCorasick(AhoCorasick):
if pid < 0 :
pid += mid
wordcount.set(pid,pmatch.position)
matchpos[pid]=min(matchpos.get(pid,65535),pmatch.position)
pmatch = pmatch.next
state = nstate
table = &(state.a)
......@@ -1035,6 +1040,10 @@ cdef class ProtAhoCorasick(AhoCorasick):
wc[4]=count[pid+1]
wc[5]=count[pid+2]
rp[3]=matchpos.get(pid,65535)
rp[4]=matchpos.get(pid+1,65535)
rp[5]=matchpos.get(pid+2,65535)
#memcpy(<void*>(wc+3),<void*>(count+pid),3*sizeof(size_t))
pid = mid - pid
......@@ -1042,6 +1051,10 @@ cdef class ProtAhoCorasick(AhoCorasick):
wc[0]=count[pid]
wc[1]=count[pid+1]
wc[2]=count[pid+2]
rp[0]=matchpos.get(pid,65535)
rp[1]=matchpos.get(pid+1,65535)
rp[2]=matchpos.get(pid+2,65535)
# memcpy(<void*>(wc),<void*>(count+pid),3*sizeof(size_t))
......@@ -1063,17 +1076,21 @@ cdef class ProtAhoCorasick(AhoCorasick):
if wc[p]>wcmax:
wcmax=wc[p]
phase=p-3
loc=rp[p]
else:
shanon=1.0
wcmax=0
phase=0
if ((shanon<shamin) and
(wcmax >=minmatch)):
if ((shanon<0.1) and
(wcmax >=minmatch) and
(wcmax >wordmax)):
shamin = shanon
wordmax = wcmax
protidmax=k
framemax= phase
framemax= phase
locmax=loc
nbreads=1
......@@ -1105,7 +1122,7 @@ cdef class ProtAhoCorasick(AhoCorasick):
PyDict_SetItem(results,PyInt_FromLong(protidmax),lpos)
else:
lpos = <object>plpos
lpos.append((readid,wordmax,nbreads,framemax,shamin))
lpos.append((readid,wordmax,nbreads,framemax,shamin,locmax))
#<------------------------------ End of the while loop ------------------------>
......
......@@ -46,7 +46,7 @@ def run(config):
output = getOutput(config)
r = getIndex(config)
coverage,x = getSeeds(r,config)
coverage,x,newprobes = getSeeds(r,config)
asm = restoreGraph(output+'.oax',r,x)
meanlength,sdlength = estimateFragmentLength(asm)
......
......@@ -52,7 +52,7 @@ def run(config):
output = getOutput(config)
r = getIndex(config)
xxx,x = getSeeds(r,config)
xxx,x,newprobes = getSeeds(r,config)
asm = restoreGraph(output+'.oax',r,x)
......
......@@ -54,7 +54,7 @@ def run(config):
output = getOutput(config)
r = getIndex(config)
ecoverage,x = getSeeds(r,config)
ecoverage,x,newprobes = getSeeds(r,config)
asm = restoreGraph(output+'.oax',r,x)
......
......@@ -145,7 +145,7 @@ def run(config):
smallbranches = config['buildgraph']['smallbranches']
r = getIndex(config)
xxx,x = getSeeds(r,config)
xxx,x,newprobes = getSeeds(r,config)
adapterSeq5,adapterSeq3 = getAdapters(config)
asm = restoreGraph(output+'.oax',r,x)
......
......@@ -38,7 +38,7 @@ def run(config):
output = getOutput(config)
r = getIndex(config)
ecoverage,x = getSeeds(r,config)
ecoverage,x,newprobes = getSeeds(r,config)
asm = restoreGraph(output+'.oax',r,x)
......
......@@ -47,7 +47,7 @@ def run(config):
output = getOutput(config)
r = getIndex(config)
ecoverage,x = getSeeds(r,config)
ecoverage,x,newprobes = getSeeds(r,config)
asm = restoreGraph(output+'.oax',r,x)
......
......@@ -59,7 +59,7 @@ def run(config):
logger.info("Looking for the seed reads")
r = getIndex(config)
ecoverage,x = getSeeds(r,config)
ecoverage,x,newprobes = getSeeds(r,config)
logger.info('Coverage estimated from probe matches at : %d' % ecoverage)
......@@ -52,7 +52,7 @@ def run(config):
r = getIndex(config)
coverage,x = getSeeds(r,config)
coverage,x,newprobes = getSeeds(r,config)
asm = restoreGraph(output+'.oax',r,x)
......
......@@ -58,7 +58,7 @@ def run(config):
output = getOutput(config)
r = getIndex(config)
coverage,x = getSeeds(r,config)
coverage,x,newprobes = getSeeds(r,config)
asm = restoreGraph(output+'.oax',r,x)
......
......@@ -46,8 +46,8 @@ cdef extern from "orgasm.h":
cdef bint lowcomplexity(bytes s):
"""
Returns True if the word s is an homopolymer, an homedimer
or an homotrimer
Returns True if the word s is an homopolymer, an homo-dimer
or an homo-trimer
@param s: the string to test
@type s: bytes
......
......@@ -236,9 +236,12 @@ def weightedMode(data):
d.extend([x] * w)
return mode(d)
def matchtoseed(matches,index):
def matchtoseed(matches,index,new=None):
s=[]
for p in matches:
if new is None:
new=list(matches.keys())
for p in new:
m = matches[p][1]
k = m.keys()
for x in k:
......@@ -249,12 +252,16 @@ def matchtoseed(matches,index):
def matchtogene(matches):
genes = {}
if matches is not None:
for p in matches:
for p in matches: # Loop over probe set
m = matches[p][1]
k = m.keys()
for x in k:
for i in m[x]:
genes[abs(i[0])]=x
for x in k: # Loop over a probes in a set
for i in m[x]: # loop over matches in a probe
if i[0] < 0:
pos = -i[5]
else:
pos = i[5]
genes[abs(i[0])]=(x,pos)
return genes
......@@ -275,6 +282,8 @@ def genesincontig(cg,index,matches):
ea = cg.getEdgeAttr(*e)
path = ea['path']
eg = [vread(i) for i in path]
ep = [genes.get(abs(i),()) for i in path]
g = sum(i for i in eg if i > 0)
if g > 0:
g=max(min(math.ceil(math.log10(g)),4)*64-1,0)
......@@ -286,6 +295,7 @@ def genesincontig(cg,index,matches):
graphics['fill']=color
ea['graphics']=graphics
ea['ingene']=g
ea['genepos']=ep
......@@ -1728,7 +1738,8 @@ def fillGaps2(self,minlink=5,
adapters3=(),
maxjump=0,
snp=False,
nodeLimit=1000000):
nodeLimit=1000000,
onlyfill=False):
'''
:param minlink:
......@@ -1842,48 +1853,49 @@ def fillGaps2(self,minlink=5,
else:
print("--> already aligned",file=sys.stderr)
for e1 in range(net):
if etid[e1] not in extended:
print("\n\nExtending Stems %d" % (etid[e1]),
file=sys.stderr)
ex = frozenset((ext[e1] - ept[e1]) | eet[e1])
nreads = len(ex)
print("--> %d reads to align" % (nreads),
file=sys.stderr)
if nreads > 10:
__cacheAli2.add(ex)
if ex not in __cacheAli:
ali= multiAlignReads(ex,index,kmer,smin,delta)
print('',file=sys.stderr)
goodali = [i for i in ali if len(i) >= nreads/4]
print("--> %d consensus to add" % len(goodali),
file=sys.stderr)
for a in goodali:
c = consensus(a,index,cmincov)
if c:
cycle+=1
s = insertFragment(self,c,cycle=cycle)
print(" %d bp (%d reads) added on cycle %d" % (len(c),len(s),cycle),
file=sys.stderr)
a = tango(self,
seeds = s,
minread = minread,
minratio = minratio,
mincov = emincov,
minoverlap = minoverlap,
lowfilter = lowfilter,
adapters5 = adapters5,
adapters3 = adapters3,
maxjump = maxjump,
cycle = cycle,
nodeLimit = nodeLimit)
print("",file=sys.stderr)
else:
print("--> already aligned",file=sys.stderr)
if (not onlyfill):
for e1 in range(net):
if etid[e1] not in extended:
print("\n\nExtending Stems %d" % (etid[e1]),
file=sys.stderr)
ex = frozenset((ext[e1] - ept[e1]) | eet[e1])
nreads = len(ex)
print("--> %d reads to align" % (nreads),
file=sys.stderr)
if nreads > 10:
__cacheAli2.add(ex)
if ex not in __cacheAli:
ali= multiAlignReads(ex,index,kmer,smin,delta)
print('',file=sys.stderr)
goodali = [i for i in ali if len(i) >= nreads/4]
print("--> %d consensus to add" % len(goodali),
file=sys.stderr)
for a in goodali:
c = consensus(a,index,cmincov)
if c:
cycle+=1
s = insertFragment(self,c,cycle=cycle)
print(" %d bp (%d reads) added on cycle %d" % (len(c),len(s),cycle),
file=sys.stderr)
a = tango(self,
seeds = s,
minread = minread,
minratio = minratio,
mincov = emincov,
minoverlap = minoverlap,
lowfilter = lowfilter,
adapters5 = adapters5,
adapters3 = adapters3,
maxjump = maxjump,
cycle = cycle,
nodeLimit = nodeLimit)
print("",file=sys.stderr)
else:
print("--> already aligned",file=sys.stderr)
self.cleanDeadBranches(maxlength=10)
cutLowCoverage(self,gmincov,terminal=True)
......
--extra-index-url https://pypi.python.org/simple/
pip>=8.0
Cython==0.23
Cython==0.23.5
Sphinx>=1.3
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment