Commit a9111410 by Eric Coissac

First version of the new seeds management system

parent 0962476c
......@@ -111,14 +111,32 @@ def getAdapters(config):
return (p3,p5)
def getSeeds(index,config,extension=".omx"):
def getSeeds(index,config):
# looks if the internal blast was already run
output = config['orgasm']['outputfilename']
logger=config['orgasm']['logger']
kup=-1 if config['orgasm']['kup'] is None else config['orgasm']['kup']
clean=config['buildgraph']['clean']
forceseeds=config['buildgraph']['forceseeds']
filename=output+extension
filename="%s.oas/assembling.omx"
#
# Check if the seeds are not correctly placed in the oas directory
#
oldfilename=output+'.omx'
if (not os.path.exists(filename) and
os.path.exists(oldfilename)):
if config['seeds']['reformat']:
os.renames(oldfilename, filename)
else:
logger.error('Seed matches are not stored following the new format')
logger.error('Run the oa seeds command with the --reformat option')
sys.exit(1)
#
# Look for seeds with the new probe sets
......@@ -129,7 +147,7 @@ def getSeeds(index,config,extension=".omx"):
#
# Load already run probe sets
#
reformated=[]
if not clean or not newprobes:
try:
with open(filename,'rb') as fseeds:
......@@ -140,22 +158,29 @@ def getSeeds(index,config,extension=".omx"):
for probename in probes:
s = probes[probename][1]
if len(s[list(s.keys())[0]][0]) != 6:
oldversion=True
logger.warning("Old probe version save on the disk. Recomputes probes %s" % probename)
p = probes[probename][0]
logger.info(" -> probe set: %s" % probename)
seeds = index.lookForSeeds(p,
mincov=config['orgasm']['seedmincov'],
kup=kup,
identity=config['orgasm']['identity'],
logger=logger)
if config['seeds']['reformat']:
oldversion=True
logger.warning("Old probe version save on the disk. Recomputes probes %s" % probename)
p = probes[probename][0]
logger.info(" -> probe set: %s" % probename)
seeds = index.lookForSeeds(p,
mincov=config['orgasm']['seedmincov'],
kup=kup,
identity=config['orgasm']['identity'],
logger=logger)
probes[probename][1]=seeds
reformated.append(probename)
probes[probename][1]=seeds
logger.info("==> %d matches" % sum(len(seeds[i]) for i in seeds))
logger.info("==> %d matches" % sum(len(seeds[i]) for i in seeds))
else:
logger.error('Seed matches are not stored following the new format')
logger.error('Run the oa seeds command with the --reformat option')
sys.exit(1)
if oldversion:
with open(filename,"wb") as fseeds:
......@@ -197,7 +222,7 @@ def getSeeds(index,config,extension=".omx"):
if nmatches:
probes[probename]=[p,seeds]
newprobes = list(newprobes.keys())
newprobes = list(newprobes.keys())
if not probes:
logger.info("No --seeds option specified and not previous matches stored")
......@@ -229,6 +254,8 @@ def getSeeds(index,config,extension=".omx"):
if coverage > covmax:
covmax=coverage
if forceseeds:
newprobes=None
return covmax,probes,newprobes
......
......@@ -959,6 +959,8 @@ cdef class ProtAhoCorasick(AhoCorasick):
cdef int phase
cdef int protidmax=0
cdef int framemax=0
cdef int loc
cdef int locmax
cdef size_t nbreads
cdef dnastate* state
......@@ -1224,12 +1226,17 @@ cdef class NucAhoCorasick(AhoCorasick):
cdef size_t wordmax=0
cdef double shamin
cdef size_t wc[6]
cdef size_t rp[6]
cdef dict matchpos
cdef size_t* count
cdef int wct
cdef int wcmax
cdef int phase
cdef int protidmax=0
cdef int framemax=0
cdef int loc
cdef int locmax
cdef size_t nbreads
cdef dnastate* state
......@@ -1267,7 +1274,8 @@ cdef class NucAhoCorasick(AhoCorasick):
while i < readcount:
# We start a new read so we set counter to 0
wordcount.clear()
wordcount.clear()
matchpos={}
readid = i
# print "@",i
......@@ -1282,12 +1290,11 @@ cdef class NucAhoCorasick(AhoCorasick):
nstate = table[letter]
pmatch = nstate.match
while (pmatch!=NULL):
# if i>2028230 and i < 2028250:
# print "--> %d %d %d" %(pmatch.protid,pmatch.position,i)
pid = pmatch.protid
if pid < 0 :
pid += mid
wordcount.set(pid,pmatch.position)
matchpos[pid]=min(matchpos.get(pid,65535),pmatch.position)
pmatch = pmatch.next
state = nstate
table = &(state.a)
......@@ -1297,34 +1304,37 @@ cdef class NucAhoCorasick(AhoCorasick):
framemax=0
count = wordcount.counter()
# if i>2028230 and i < 2028250:
# print "coucou",i,count[0],count[1],count[2]
for k in range(1,self._maxseqid+1):
pid = k
wc[1]=count[pid]
rp[1]=matchpos.get(pid,65535)
#memcpy(<void*>(wc+3),<void*>(count+pid),3*sizeof(size_t))
pid = mid - pid
wc[0]=count[pid]
rp[0]=matchpos.get(pid,65535)
# memcpy(<void*>(wc),<void*>(count+pid),3*sizeof(size_t))
wct = wc[0] + wc[1]
phase= -1 if wc[0] > wc[1] else 1
wcmax= wc[0] if wc[0] > wc[1] else wc[1]
if wc[0] > wc[1]:
phase=-1
wcmax=wc[0]
loc=rp[0]
else:
phase=1
wcmax=wc[1]
loc=rp[1]
# if i>2028230 and i < 2028250:
# print "#####> total = %d phase = %d max = %d" % (wct,phase,wcmax)
if (wcmax >=minmatch):
wordmax = wcmax
protidmax=k
framemax= phase
locmax=loc
nbreads=1
......@@ -1356,15 +1366,15 @@ cdef class NucAhoCorasick(AhoCorasick):
PyDict_SetItem(results,PyInt_FromLong(protidmax),lpos)
else:
lpos = <object>plpos
lpos.append((readid,wordmax,nbreads,framemax,0))
lpos.append((readid,wordmax,nbreads,framemax,0,locmax))
ks = list(results.keys())
for k in ks:
print(k, self._rseqid[k],len(results[k]))
results[self._rseqid[k]]=results[k]
del results[k]
return results
\ No newline at end of file
......@@ -30,7 +30,8 @@ default_config = { 'minread' : None,
'snp' : False,
'assmax' : 500000,
'testrun' : 15000,
'clean' : False
'clean' : False,
'forceseeds' : False
}
def addOptions(parser):
......@@ -157,6 +158,12 @@ def addOptions(parser):
default=None,
help='Erase previously existing graph to restart a new assembling')
parser.add_argument('--force-seeds', dest='buildgraph:forceseeds',
action='store_true',
default=None,
help='Force to reuse all the seeds')
def estimateMinRead(index,minoverlap,coverage):
MINREAD=10
......
......@@ -18,7 +18,7 @@ import sys
__title__="Build the set of seed reads"
default_config = {
default_config = { "reformat" : None
}
def addOptions(parser):
......@@ -48,6 +48,12 @@ def addOptions(parser):
help='The word size used to identify the seed reads '
'[default: protein=4, DNA=12]')
parser.add_argument("--reformat",
dest="seeds:reformat",
action='store_true',
default=None,
help='Asks for reformatting an old sequence index to the new format'
)
def run(config):
......@@ -57,7 +63,7 @@ def run(config):
output = getOutput(config)
logger.info("Looking for the seed reads")
r = getIndex(config)
ecoverage,x,newprobes = getSeeds(r,config)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment