Commit 8a69d340 by Eric Coissac

Change the behaviour of the --seeds option.

It is now possible to specify several probe sets.
If a probe set is specified the bbbblast is run again otherwise the
previous matches are restored
parent d09447a7
......@@ -32,14 +32,19 @@ def getProbes(config):
logger=config['orgasm']['logger']
seeds =config['orgasm']['seeds']
try:
p = fasta(seeds)
logger.info("Load probe sequences from file : %s" % seeds)
except IOError:
p = getattr(samples,seeds)
logger.info("Load probe internal dataset : %s" % seeds)
return p
probes = {}
for s in seeds:
try:
p = fasta(s)
logger.info("Load probe sequences from file : %s" % s)
except IOError:
p = getattr(samples,s)
logger.info("Load probe internal dataset : %s" % s)
probes[s]=[p,{}]
return probes
def getAdapters(config):
'''
......@@ -73,46 +78,71 @@ def getAdapters(config):
return (p3,p5)
def getSeeds(index,probes,config,extension=".omx"):
def getSeeds(index,config,extension=".omx"):
# looks if the internal blast was already run
output = config['orgasm']['outputfilename']
logger=config['orgasm']['logger']
probes = getProbes(config)
filename=output+extension
try:
# yes, load the previous results
with open(filename,'rb') as fseeds:
seeds = pickle.load(fseeds)
logger.info("Load matches from previous run : %d matches" % sum(len(seeds[i]) for i in seeds))
except FileNotFoundError:
# no, run it and save the results
if probes:
# --seeds option on the command line -> look for these seeds
logger.info("Running probes matching against reads...")
seeds = index.lookForSeeds(probes,mincov=config['buildgraph']['mincov'],logger=logger)
logger.info("==> %d matches" % sum(len(seeds[i]) for i in seeds))
for probename in probes:
p = probes[probename][0]
logger.info(" -> probe set: %s" % probename)
seeds = index.lookForSeeds(p,
mincov=config['buildgraph']['mincov'],
logger=logger)
probes[probename][1]=seeds
logger.info("==> %d matches" % sum(len(seeds[i]) for i in seeds))
with open(filename,"wb") as fseeds:
pickle.dump(seeds,fseeds)
pickle.dump(probes,fseeds)
else:
# no --seeds option on the command line -> load the previous results
try:
with open(filename,'rb') as fseeds:
probes = pickle.load(fseeds)
logger.info("Load matches from previous run : %d probe sets restored" % len(probes))
nm=0
for k in probes:
for m in probes[k][1].values():
nm+=len(m)
logger.info(" ==> A total of : %d" % nm)
except FileNotFoundError:
logger.info("No --seeds option specified and not previous matches stored")
sys.exit(1)
logger.info("Match list :")
if probes is not None:
nuc = all([isDNA(probes[k]) for k in probes])
covmax=0
for probename in probes:
p = probes[probename][0]
s = probes[probename][1]
nuc = all([isDNA(k) for k in p.values()])
nbmatch = [(k,
len(seeds[k]),
len(seeds[k])*index.getReadSize() / len(probes[k]) / (1 if nuc else 3)) for k in seeds]
len(s[k]),
len(s[k])*index.getReadSize() / len(p[k]) / (1 if nuc else 3)) for k in s]
nbmatch.sort(key=lambda x:-x[2])
logger.info("Match list :")
for gene, nb, cov in nbmatch:
logger.info(" %-10s : %5d (%5.1fx)" % (gene,nb,cov))
coverage=nbmatch[0][2]
else:
nbmatch = [(k,len(seeds[k])) for k in seeds]
nbmatch.sort(key=lambda x:-x[1])
logger.info("Match list :")
for gene, nb in nbmatch:
logger.info(" %-10s : %5d" % (gene,nb))
coverage=None
if coverage > covmax:
covmax=coverage
return coverage,seeds
return covmax,probes
#def reloadAssembling
\ No newline at end of file
......@@ -126,9 +126,9 @@ def addOptions(parser):
parser.add_argument('--seeds', dest ='orgasm:seeds',
metavar='seeds',
default=None,
action='append',
default=[],
type=str,
required=True,
help='protein seeds; either a fasta file containing '
'seeds proteic sequences or internal set of seeds '
'among %s' % str(list(filter(lambda s: s.startswith('prot') or
......@@ -161,9 +161,8 @@ def run(config):
r = getIndex(config)
p = getProbes(config)
adapterSeq3, adapterSeq5 = getAdapters(config)
coverage,x = getSeeds(r,p,config)
coverage,x = getSeeds(r,config)
# Force the coverage to the specified value
......
......@@ -54,7 +54,7 @@ def run(config):
output = config['orgasm']['outputfilename']
r = getIndex(config)
coverage,x = getSeeds(r,None,config)
coverage,x = getSeeds(r,config)
asm = restoreGraph(output+'.oax',r,x)
meanlength,sdlength = estimateFragmentLength(asm)
......
......@@ -52,7 +52,7 @@ def run(config):
output = config['orgasm']['outputfilename']
r = getIndex(config)
xxx,x = getSeeds(r,None,config)
xxx,x = getSeeds(r,config)
asm = restoreGraph(output+'.oax',r,x)
......
......@@ -54,7 +54,7 @@ def run(config):
output = config['orgasm']['outputfilename']
r = getIndex(config)
x = getSeeds(r,None,config)
x = getSeeds(r,config)
asm = restoreGraph(output+'.oax',r,x)
......
......@@ -145,7 +145,7 @@ def run(config):
smallbranches = config['buildgraph']['smallbranches']
r = getIndex(config)
xxx,x = getSeeds(r,None,config)
xxx,x = getSeeds(r,config)
adapterSeq5,adapterSeq3 = getAdapters(config)
asm = restoreGraph(output+'.oax',r,x)
......
......@@ -38,7 +38,7 @@ def run(config):
output = config['orgasm']['outputfilename']
r = getIndex(config)
x = getSeeds(r,None,config)
x = getSeeds(r,config)
asm = restoreGraph(output+'.oax',r,x)
......
......@@ -236,7 +236,7 @@ def readFastq(filename):
if line[0]=="@":
if seqid > 0:
seq = ''.join(seq)
yield (sid,bytes(seq,encoding='latin1'))
yield (sid,bytes(seq,encoding='ascii'))
sid = line[1:].split(' ',1)[0].rsplit('/',1)[0].strip()
seqid+=1
lseq = 0
......
......@@ -47,7 +47,7 @@ def run(config):
output = config['orgasm']['outputfilename']
r = getIndex(config)
x = getSeeds(r,None,config)
x = getSeeds(r,config)
asm = restoreGraph(output+'.oax',r,x)
......
......@@ -52,7 +52,7 @@ def run(config):
r = getIndex(config)
coverage,x = getSeeds(r,None,config)
coverage,x = getSeeds(r,config)
asm = restoreGraph(output+'.oax',r,x)
......
......@@ -58,7 +58,7 @@ def run(config):
output = config['orgasm']['outputfilename']
r = getIndex(config)
x = getSeeds(r,None,config)
x = getSeeds(r,config)
asm = restoreGraph(output+'.oax',r,x)
......
......@@ -42,7 +42,7 @@ cdef class Index:
cpdef tuple getIds(self, int32_t id)
cpdef int32_t getPairedRead(self, int32_t id)
cpdef tuple normalizedPairedEndsReads(self, int32_t id)
cpdef lookForSeeds(self, dict proteins, int kup=?, int mincov=?, object logger=?)
cpdef dict lookForSeeds(self, dict proteins, int kup=?, int mincov=?, object logger=?)
cdef c_array.array lookForString(self, bytes word)
cpdef dict checkedExtensions(self, bytes probe,
......
......@@ -349,7 +349,7 @@ cdef class Index:
return nid,paired
cpdef lookForSeeds(self, dict sequences, int kup=-1, int mincov=1,object logger=None):
cpdef dict lookForSeeds(self, dict sequences, int kup=-1, int mincov=1,object logger=None):
cdef AhoCorasick patterns
cdef dict matches
......
......@@ -360,19 +360,24 @@ def weightedMode(data):
return mode(d)
def matchtoseed(matches,index):
k = matches.keys()
s=[]
for x in k:
s.extend((index.getIds(y[0])[0],(x,),0) for y in matches[x])
for p in matches:
m = matches[p][1]
k = m.keys()
for x in k:
s.extend((index.getIds(y[0])[0],(x,),0) for y in m[x])
return s
def matchtogene(matches):
genes = {}
k = matches.keys()
for x in k:
for m in matches[x]:
genes[abs(m[0])]=x
if matches is not None:
for p in matches:
m = matches[p][1]
k = m.keys()
for x in k:
for i in m[x]:
genes[abs(i[0])]=x
return genes
......@@ -1975,13 +1980,7 @@ def restoreGraph(filein,index,matches=None):
readmax = len(index) + 1
graph = asm.graph
gene = {}
if matches is not None:
for g in matches:
for m in matches[g]:
gene[abs(m[0])]=g
gene = matchtogene(matches)
if isinstance(filein, str):
filein = open(filein)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment