Commit e74f59ac authored by Eric Coissac's avatar Eric Coissac

Change the management of the seeds and start the buildgraph refactoring

parent b05438a3
......@@ -3,6 +3,14 @@
The :program:`fillgaps` command
===============================
At the end of the :ref:`oa buildgraph <oa_buildgraph>` command the assembling graph
is not always complete. Because of the non homogeneous coverage, some parts of the
genome too low covered were not able to be assembled using the heuristics parameters
used to assemble the main parts of the genome. The :program:`fillgaps` command aims
to rerun the *fillgap* algorithm used by the :ref:`oa buildgraph <oa_buildgraph>`
but with other parameters allowing to fill the gaps not assembled during the initial
assembling.
.. figure:: ../oa-fillgaps.*
:align: center
:figwidth: 80 %
......@@ -47,6 +55,19 @@ Graph initialisation options
.. _seeds.kup:
.. code-block:: bash
$ oa fillgaps --seeds protChloroArabidopsis seqindex
A set of seed sequences must be or nucleic or proteic. For initiating
assembling with both nucleic and proteic sequences you must use at least two
``--seeds`` options one for each class of sequences.
.. code-block:: bash
$ oa fillgaps --seeds protChloroArabidopsis --seeds rDNAChloro.fasta seqindex
.. include:: ../options/kup.txt
Graph extension options
......
......@@ -73,6 +73,9 @@ def getProbes(config):
logger.info("Load probe internal dataset : %s" % s)
probes[s]=[p,{}]
else:
logger.info("No new probe set specified")
return probes
......@@ -113,45 +116,32 @@ def getSeeds(index,config,extension=".omx"):
output = config['orgasm']['outputfilename']
logger=config['orgasm']['logger']
kup=-1 if config['orgasm']['kup'] is None else config['orgasm']['kup']
clean=config['buildgraph']['clean']
probes = getProbes(config)
filename=output+extension
if probes:
# --seeds option on the command line -> look for these seeds
logger.info("Running probes matching against reads...")
for probename in probes:
p = probes[probename][0]
logger.info(" -> probe set: %s" % probename)
seeds = index.lookForSeeds(p,
mincov=config['orgasm']['seedmincov'],
kup=kup,
identity=config['orgasm']['identity'],
logger=logger)
probes[probename][1]=seeds
logger.info("==> %d matches" % sum(len(seeds[i]) for i in seeds))
with open(filename,"wb") as fseeds:
pickle.dump(probes,fseeds)
else:
# no --seeds option on the command line -> load the previous results
#
# Look for seeds with the new probe sets
#
newprobes = getProbes(config)
#
# Load already run probe sets
#
if not clean or not newprobes:
try:
with open(filename,'rb') as fseeds:
probes = pickle.load(fseeds)
logger.info("Load matches from previous run : %d probe sets restored" % len(probes))
oldversion=False
for probename in probes:
s = probes[probename][1]
if len(s[list(s.keys())[0]][0]) != 6:
oldversion=True
logger.warning("Old probe version save on the disk. Recomputes probes %s" % probename)
p = probes[probename][0]
......@@ -170,15 +160,53 @@ def getSeeds(index,config,extension=".omx"):
if oldversion:
with open(filename,"wb") as fseeds:
pickle.dump(probes,fseeds)
nm=0
for k in probes:
for m in probes[k][1].values():
nm+=len(m)
logger.info(" ==> A total of : %d" % nm)
except FileNotFoundError:
logger.info("No --seeds option specified and not previous matches stored")
sys.exit(1)
logger.info("No previous matches loaded")
probes={}
else:
if os.path.exists(filename):
logger.info("Cleaning previous matches")
os.remove(filename)
probes={}
if newprobes:
# --seeds option on the command line -> look for these seeds
logger.info("Running probes matching against reads...")
for probename in newprobes:
p = newprobes[probename][0]
logger.info(" -> probe set: %s" % probename)
seeds = index.lookForSeeds(p,
mincov=config['orgasm']['seedmincov'],
kup=kup,
identity=config['orgasm']['identity'],
logger=logger)
nmatches = sum(len(seeds[i]) for i in seeds)
logger.info("==> %d matches" % nmatches)
if nmatches:
probes[probename]=[p,seeds]
newprobes = list(newprobes.keys())
if not probes:
logger.info("No --seeds option specified and not previous matches stored")
sys.exit(1)
if newprobes:
with open(filename,"wb") as fseeds:
pickle.dump(probes,fseeds)
logger.info("Match list :")
......@@ -202,6 +230,6 @@ def getSeeds(index,config,extension=".omx"):
covmax=coverage
return covmax,probes
return covmax,probes,newprobes
#def reloadAssembling
\ No newline at end of file
......@@ -950,6 +950,9 @@ cdef class ProtAhoCorasick(AhoCorasick):
cdef size_t wordmax=0
cdef double shamin
cdef size_t wc[6]
cdef size_t rp[6]
cdef dict matchpos
cdef size_t* count
cdef int wct
cdef int wcmax
......@@ -997,7 +1000,8 @@ cdef class ProtAhoCorasick(AhoCorasick):
while i < readcount:
# We start a new read so we set counter to 0
wordcount.clear()
wordcount.clear()
matchpos={}
readid = i
# print "@",i
......@@ -1017,6 +1021,7 @@ cdef class ProtAhoCorasick(AhoCorasick):
if pid < 0 :
pid += mid
wordcount.set(pid,pmatch.position)
matchpos[pid]=min(matchpos.get(pid,65535),pmatch.position)
pmatch = pmatch.next
state = nstate
table = &(state.a)
......@@ -1035,6 +1040,10 @@ cdef class ProtAhoCorasick(AhoCorasick):
wc[4]=count[pid+1]
wc[5]=count[pid+2]
rp[3]=matchpos.get(pid,65535)
rp[4]=matchpos.get(pid+1,65535)
rp[5]=matchpos.get(pid+2,65535)
#memcpy(<void*>(wc+3),<void*>(count+pid),3*sizeof(size_t))
pid = mid - pid
......@@ -1042,6 +1051,10 @@ cdef class ProtAhoCorasick(AhoCorasick):
wc[0]=count[pid]
wc[1]=count[pid+1]
wc[2]=count[pid+2]
rp[0]=matchpos.get(pid,65535)
rp[1]=matchpos.get(pid+1,65535)
rp[2]=matchpos.get(pid+2,65535)
# memcpy(<void*>(wc),<void*>(count+pid),3*sizeof(size_t))
......@@ -1063,17 +1076,21 @@ cdef class ProtAhoCorasick(AhoCorasick):
if wc[p]>wcmax:
wcmax=wc[p]
phase=p-3
loc=rp[p]
else:
shanon=1.0
wcmax=0
phase=0
if ((shanon<shamin) and
(wcmax >=minmatch)):
if ((shanon<0.1) and
(wcmax >=minmatch) and
(wcmax >wordmax)):
shamin = shanon
wordmax = wcmax
protidmax=k
framemax= phase
framemax= phase
locmax=loc
nbreads=1
......@@ -1105,7 +1122,7 @@ cdef class ProtAhoCorasick(AhoCorasick):
PyDict_SetItem(results,PyInt_FromLong(protidmax),lpos)
else:
lpos = <object>plpos
lpos.append((readid,wordmax,nbreads,framemax,shamin))
lpos.append((readid,wordmax,nbreads,framemax,shamin,locmax))
#<------------------------------ End of the while loop ------------------------>
......
......@@ -13,6 +13,8 @@ from orgasm.tango import matchtoseed, cutLowCoverage, cutSNPs,\
from orgasm.assembler import Assembler,tango
import sys
import os
import os.path
__title__="Build the initial assembling graph"
......@@ -27,7 +29,8 @@ default_config = { 'minread' : None,
'lowcomplexity' : False,
'snp' : False,
'assmax' : 500000,
'testrun' : 15000
'testrun' : 15000,
'clean' : False
}
def addOptions(parser):
......@@ -149,6 +152,11 @@ def addOptions(parser):
help='The fraction of word'
'[default: 0.5]')
parser.add_argument('--clean', dest='buildgraph:clean',
action='store_true',
default=None,
help='Erase previously existing graph to restart a new assembling')
def estimateMinRead(index,minoverlap,coverage):
MINREAD=10
......@@ -173,6 +181,9 @@ def run(config):
coverageset=config['buildgraph']['coverage'] is not None
snp=config['buildgraph']['snp']
assmax = config['buildgraph']['assmax']*2
logger.info("Building De Bruijn Graph")
......@@ -182,49 +193,213 @@ def run(config):
r = getIndex(config)
adapterSeq3, adapterSeq5 = getAdapters(config)
ecoverage,x = getSeeds(r,config)
logger.info('Coverage estimated from probe matches at : %d' % ecoverage)
ecoverage,x,newprobes = getSeeds(r,config)
# Force the coverage to the specified value
if coverageset:
coverage = config['buildgraph']['coverage']
logger.info('Coverage forced by user at : %d' % coverage)
else:
coverage = ecoverage
# according to the minread option estimate it from coverage or use the specified value
if config['buildgraph']['minread'] is None:
minread,minoverlap = estimateMinRead(r,
minoverlap,
coverage)
logger.info('Minimum read estimated from coverage (%dx) ar: %d' % (coverage,minread))
if config['buildgraph']['clean'] or not os.path.exists("%s.oax" % output):
#
# This is a new assembling or at least we are cleaning previous assembling
#
##
if os.path.exists("%s.oax" % output):
logger.info('Cleaning previous assembling')
os.remove("%s.oax" % output)
else:
logger.info('No previous assembling')
logger.info("Starting a new assembling")
# Create the assembler object
asm = Assembler(r)
logger.info('Coverage estimated from probe matches at : %d' % ecoverage)
if coverageset:
logger.info('Coverage forced by user at : %d' % coverage)
else:
##########################
#
# If minread is not specified we initiate the assembling
# based on the coverage estimated from protein match
# to obtain a better coverage estimation
#
##########################
if config['buildgraph']['minread'] is None:
minread,minoverlap = estimateMinRead(r,config['buildgraph']['minoverlap'],coverage)
minread//=4
if minread<5:
minread=5
logger.info('Assembling of %d pb for estimating actual coverage' % config['buildgraph']['testrun'])
s = matchtoseed(x,r,newprobes)
# Run the first assembling pass
a = tango(asm,s,mincov=minread, # @UnusedVariable
minread=minread,
minoverlap=minoverlap,
lowfilter=lowfilter,
adapters3=adapterSeq3,
adapters5=adapterSeq5,
maxjump=0,
cycle=1,
nodeLimit=config['buildgraph']['testrun'] * 2,
progress=progress,
logger=logger)
# Clean small unsuccessful extensions
asm.cleanDeadBranches(maxlength=10)
# and too low covered assembling
if coverageset:
cutLowCoverage(asm,int(coverage),terminal=True)
else:
cutLowCoverage(asm,int(coverage/4),terminal=True)
if snp:
cutSNPs(asm)
if config['buildgraph']['smallbranches'] is not None:
smallbranches = config['buildgraph']['smallbranches']
else:
smallbranches = estimateDeadBrancheLength(asm)
logger.info("Dead branch length set to : %d bp" % smallbranches)
asm.cleanDeadBranches(maxlength=smallbranches)
if len(asm) > 0:
score,length,ecoverage = coverageEstimate(asm,x,r) # @UnusedVariable
if not coverageset:
coverage = ecoverage
minread,minoverlap = estimateMinRead(r,config['buildgraph']['minoverlap'],coverage)
logger.info("coverage estimated : %dx based on %d bp (minread: %d)" %(coverage,length/2,minread))
else:
logger.error('Nothing assembled - Assembling aborted')
sys.exit(1)
# Create the assembler object
asm = Assembler(r)
else:
minread=config['buildgraph']['minread']
else:
minread = config['buildgraph']['minread']
logger.info('Minimum read forced by user at : %d' % minread)
#
# We are extending a previous assembling
#
logger.info('Restoring previous assembling')
asm = restoreGraph("%s.oax" % output,r,x)
logger.info("Graph size %d bp" % int(len(asm)/2))
logger.info("Entering in fill gaps mode")
# Convert matches in sedd list
s = matchtoseed(x,r)
# Clean small unsuccessful extensions
asm.cleanDeadBranches(maxlength=10)
if len(asm) == 0:
logger.error('The assembling is empty - Stop the assembling process')
sys.exit(1)
# Create the assembler object
asm = Assembler(r)
# estimate coverage from the graph
score,length,ecoverage = coverageEstimate(asm,x,r) # @UnusedVariable
# If no coverage specified coverage = estimated coverage
if not coverageset:
coverage = ecoverage
cutLowCoverage(asm,int(coverage/4),terminal=True)
else:
cutLowCoverage(asm,int(coverage),terminal=True)
##########################
#
# If minread is not specified we initiate the assembling
# based on the coverage estimated from protein match
# to obtain a better coverage estimation
#
##########################
if config['buildgraph']['minread'] is None and not coverageset:
if config['buildgraph']['smallbranches'] is not None:
smallbranches = config['buildgraph']['smallbranches']
logger.info(" Dead branch length forced by user to : %d bp" % smallbranches)
else:
smallbranches = estimateDeadBrancheLength(asm)
logger.info(" Dead branch length set to : %d bp" % smallbranches)
asm.cleanDeadBranches(maxlength=smallbranches)
if len(asm) == 0:
logger.error('The assembling is empty - Stop the assembling process')
sys.exit(1)
# reestimate coverage
score,length,ecoverage = coverageEstimate(asm,x,r) # @UnusedVariable
logger.info('Coverage estimated from the assembling graph : %d' % ecoverage)
if not coverageset:
coverage = ecoverage
# cleanup snp bubble in the graph
if snp:
cutSNPs(asm)
# according to the minread option estimate it from coverage or use the specified value
if config['buildgraph']['minread'] is None:
minread,minoverlap = estimateMinRead(r,config['buildgraph']['minoverlap'],coverage)
if minread<5:
minread=5
else:
minread,minoverlap = estimateMinRead(r,config['buildgraph']['minoverlap'],coverage)
minread=config['buildgraph']['minread']
logger.info('Assembling of %d pb for estimating actual coverage' % config['buildgraph']['testrun'])
cg = asm.compactAssembling(verbose=False)
genesincontig(cg,r,x)
meanlength,sdlength = estimateFragmentLength(asm)
if config['orgasm']['back'] is not None:
back = config['orgasm']['back']
elif config['orgasm']['back'] is None and meanlength is not None:
back = int(meanlength + 4 * sdlength)
if back > 500:
back=500
else:
back = 300
scaffold(asm,cg,
minlink=5,
back=back,
addConnectedLink=False)
with open(output+'.intermediate.gml','w') as gmlfile:
print(cg.gml(),file=gmlfile)
logger.info("based on a coverage of : %d minread set to: %d)" %(coverage,minread))
# Convert matches in seed list
s = matchtoseed(x,r,newprobes)
if s:
#############################################
#
# We now run the main assembling process
#
#############################################
logger.info('Starting the assembling')
# Run the first assembling pass
a = tango(asm,s,mincov=minread, # @UnusedVariable
a = tango(asm,s,mincov=coverage/4, #@UnusedVariable
minread=minread,
minoverlap=minoverlap,
lowfilter=lowfilter,
......@@ -232,20 +407,21 @@ def run(config):
adapters5=adapterSeq5,
maxjump=0,
cycle=1,
nodeLimit=config['buildgraph']['testrun'] * 2,
nodeLimit=assmax,
progress=progress,
logger=logger)
# Clean small unsuccessful extensions
asm.cleanDeadBranches(maxlength=10)
# and too low covered assembling
if coverageset:
cutLowCoverage(asm,int(coverage),terminal=True)
else:
cutLowCoverage(asm,int(coverage/4),terminal=True)
# cleanup snp bubble in the graph
if snp:
cutSNPs(asm)
......@@ -253,111 +429,58 @@ def run(config):
smallbranches = config['buildgraph']['smallbranches']
else:
smallbranches = estimateDeadBrancheLength(asm)
logger.info("Dead branch length setup to : %d bp" % smallbranches)
logger.info(" Dead branch length setup to : %d bp" % smallbranches)
asm.cleanDeadBranches(maxlength=smallbranches)
# reestimate coverage
if len(asm) > 0:
score,length,ecoverage = coverageEstimate(asm,x,r) # @UnusedVariable
if not coverageset:
coverage = ecoverage
minread,minoverlap = estimateMinRead(r,config['buildgraph']['minoverlap'],coverage)
logger.info("coverage estimated : %dx based on %d bp (minread: %d)" %(coverage,length/2,minread))
if len(asm) == 0:
logger.error('The assembling is empty - Stop the assembling process')
sys.exit(1)
score,length,ecoverage = coverageEstimate(asm,x,r) # @UnusedVariable
# Reinitiate the assembling for running with the estimated parameters
# Convert matches in sedd list
s = matchtoseed(x,r)
if not coverageset:
coverage = ecoverage
# if coverage < 30:
# sys.exit()
# Create the assembler object
asm = Assembler(r)
if config['buildgraph']['minread'] is None:
minread,minoverlap = estimateMinRead(r,config['buildgraph']['minoverlap'],coverage)
minread/=4
if minread<5:
minread=5
#############################################
#
# We now run the main assembling process
#
#############################################
logger.info('Starting the assembling')
# Run the first assembling pass
a = tango(asm,s,mincov=coverage/4, #@UnusedVariable
minread=minread,
minoverlap=minoverlap,