Commit 9e406944 by Eric Coissac

Some default parameter optimizations

parent e9c2de37
......@@ -14,7 +14,7 @@ import os.path
import sys
from orgasm.indexer import Index
from orgasm.reads._sequences import readPairedEnd, setLogger # @UnresolvedImport
from orgasm.reads._sequences import readPairedEnd, setLogger, getStats # @UnresolvedImport
__title__="Index a set of reads"
......@@ -24,7 +24,7 @@ default_config = { 'reformat' : False,
'reverse' : None,
'5trim' : 4,
'qualtrim' : 0,
'badqual' : 28,
'badqual' : 20,
'skip' : 0,
'maxread' : 0,
'length' : 0,
......@@ -93,7 +93,7 @@ def addOptions(parser):
action='store',
default=None,
help='Cut the N first base pairs of '
'reads (default 4bp)')
'reads (default %dbp)' % default_config['5trim'])
parser.add_argument('--3-prime-quality', dest='index:qualtrim',
metavar="Q",
......@@ -113,7 +113,7 @@ def addOptions(parser):
"quality score, and try to clip "
"reads to maximise the overall "
"quality. Zero means no clipping "
"(default 28)")
"(default %d)" % default_config['badqual'])
parser.add_argument('--skip', dest='index:skip',
metavar="N",
......@@ -121,7 +121,7 @@ def addOptions(parser):
action='store',
default=None,
help='Skip the N first read pairs '
'(default 0)')
'(default %d)' % default_config['skip'])
parser.add_argument('--max-reads', dest='index:maxread',
metavar="N",
......@@ -144,7 +144,7 @@ def addOptions(parser):
action='store',
default=None,
help='Estimate the length to index for conserving FRACTION '
'of the overall dataset')
'of the overall data set')
parser.add_argument('--minimum-length', dest='index:minlength',
type=int,
......@@ -181,7 +181,7 @@ def addOptions(parser):
default=None,
help='The code offset added to each quality '
'score to encode fastq quality '
'(default 33 - Sanger format)')
'(default %d - Sanger format)' % default_config['shift'])
tmpdir = []
......@@ -447,6 +447,21 @@ def run(config):
os.system(" ".join(command))
stats = getStats()
if "phix174" in stats:
logger.info("%9d phix174 reads pairs sorted out" % stats["phix174"])
if "SoftQualityTrim" in stats:
logger.info("%9d reads pairs soft trimmed on a quality of %d" % (stats["SoftQualityTrim"],
config['index']['badqual']))
if "ACGTtrim" in stats:
logger.info("%9d reads pairs clipped for not [A,C,G,T] bases" % stats["ACGTtrim"])
if "bad_ids" in stats:
logger.info("%9d reads pairs sorted out because of not corresponding ids" % stats["bad_ids"])
r=Index('%s.odx/index' % output)
......
......@@ -19,7 +19,7 @@ from orgasm.samples.phix import phix
__logger__ = None
cdef dict __stats__ = {}
cdef int __interval__=10000
cdef int __interval__=50000
def logOrPrint(message,level='info'):
global __logger__
......@@ -150,7 +150,7 @@ cpdef tuple bestQuality(char[:] qualities, int quality, int shift=33):
maxbegin=begin
maxend =end
return (maxscore,maxbegin,maxend)
return (lseq > (maxend-maxbegin),maxbegin,maxend)
def firstAndNextIterator(line,iterator):
......@@ -162,15 +162,20 @@ def firstAndNextIterator(line,iterator):
def PEInterleavedIterator(sequenceIterator):
cdef tuple forward
cdef int paircount=0
logOrPrint("Decodes interleaved pair-end reads from a single file")
for forward in sequenceIterator:
paircount+=1
yield (forward,next(sequenceIterator))
__stats__['paircount']=paircount
def PESimulIterator(sequenceIterator):
cdef tuple forward
cdef array score
cdef int paircount=0
logOrPrint("Simulating pair-end reads from single-end reads")
......@@ -178,20 +183,27 @@ def PESimulIterator(sequenceIterator):
for forward in sequenceIterator:
score=array("B",forward[2])
score.reverse()
paircount+=1
yield (forward,
(forward[0],
reverseComplement(forward[1]),
score))
__stats__['paircount']=paircount
def PEIterator(forwardIterator, reverseIterator):
cdef tuple forward
cdef int paircount=0
logOrPrint("Two files pair-end data")
for forward in forwardIterator:
paircount+=1
yield (forward,next(reverseIterator))
__stats__['paircount']=paircount
def MPIterator(PEiterator):
cdef tuple forward
cdef tuple reverse
......@@ -270,23 +282,33 @@ def PEACGTCutter(PEiterator, int trimfirst=4):
def PELowQualityTrimmer(PEiterator, int quality, int shift=33):
cdef tuple forward
cdef tuple reverse
cdef int l,f,t
cdef int lf,lr,f,t
cdef int trimmed=0
logOrPrint("Soft clipping bad quality regions (below %d)." % quality)
for forward,reverse in PEiterator:
l,f,t = bestQuality(forward[2],quality,shift)
forward = (forward[0],
forward[1][f:t],
forward[2][f:t])
l,f,t = bestQuality(reverse[2],quality,shift)
reverse = (reverse[0],
reverse[1][f:t],
reverse[2][f:t])
lf,f,t = bestQuality(forward[2],quality,shift)
if lf:
forward = (forward[0],
forward[1][f:t],
forward[2][f:t])
lr,f,t = bestQuality(reverse[2],quality,shift)
if lr :
reverse = (reverse[0],
reverse[1][f:t],
reverse[2][f:t])
if lf or lr:
trimmed+=1
if not trimmed % __interval__:
logOrPrint("%d sequences soft quality trimmed" % trimmed)
yield (forward,reverse)
__stats__["SoftQualityTrim"]=trimmed
def PEQualityTrimmer(PEiterator, int quality, int shift=33):
......@@ -424,8 +446,8 @@ def PEPhiXCleaner(PEiterator):
for pair in PEiterator:
forward = pair[0][1]
reverse = pair[1][1]
nf=(len(forward)-11) // 2
nr=(len(reverse)-11) // 2
nf=(len(forward)-11) * 2 // 3
nr=(len(reverse)-11) * 2 // 3
fmatch=aho.match(forward)
rmatch=aho.match(reverse)
......@@ -466,6 +488,7 @@ def readPairedEnd(filenames,
bint phix=True,
int shift=33
):
PEI = None
if mode=="MP":
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment