Commit e9c2de37 by Eric Coissac

Adds some log explaining the filtering and fixing some bugs

parent 4264243a
......@@ -14,7 +14,7 @@ import os.path
import sys
from orgasm.indexer import Index
from orgasm.reads._sequences import readPairedEnd # @UnresolvedImport
from orgasm.reads._sequences import readPairedEnd, setLogger # @UnresolvedImport
__title__="Index a set of reads"
......@@ -27,7 +27,7 @@ default_config = { 'reformat' : False,
'badqual' : 28,
'skip' : 0,
'maxread' : 0,
'length' : None,
'length' : 0,
'estimate' : 0,
'minlength': 81,
'fasta' : False,
......@@ -303,11 +303,13 @@ def writeToFifo(pairs,forward,reverse,fastq,logger):
if not i % 1000000:
logger.info('%d sequence pairs written' % i)
if len(p[0][1]) > 0 and len(p[1][1]) > 0:
print(formatFastq(p[0]),file=f)
print(formatFastq(p[1]),file=r)
fread = formatFastq(p[0])
rread = formatFastq(p[1])
print(fread,file=f)
print(rread,file=r)
if fastq:
print(formatFastq(p[0]),file=ffq)
print(formatFastq(p[0]),file=rfq)
print(fread,file=ffq)
print(rread,file=rfq)
if fastq:
ffq.close()
......@@ -390,8 +392,9 @@ def run(config):
if not nopipe:
os.mkfifo(reverse)
setLogger(logger)
pairs = readPairedEnd(filenames,
mode = mode,
checkIds = config['index']['checkpairs'],
......
......@@ -162,6 +162,9 @@ def firstAndNextIterator(line,iterator):
def PEInterleavedIterator(sequenceIterator):
cdef tuple forward
logOrPrint("Decodes interleaved pair-end reads from a single file")
for forward in sequenceIterator:
yield (forward,next(sequenceIterator))
......@@ -169,6 +172,9 @@ def PESimulIterator(sequenceIterator):
cdef tuple forward
cdef array score
logOrPrint("Simulating pair-end reads from single-end reads")
for forward in sequenceIterator:
score=array("B",forward[2])
score.reverse()
......@@ -179,13 +185,19 @@ def PESimulIterator(sequenceIterator):
def PEIterator(forwardIterator, reverseIterator):
cdef tuple forward
logOrPrint("Two files pair-end data")
for forward in forwardIterator:
yield (forward,next(reverseIterator))
def MPIterator(PEiterator):
cdef tuple forward
cdef tuple reverse
logOrPrint("Analyze data as mate-pairs")
for forward,reverse in PEiterator:
forward[2].reverse()
reverse[2].reverse()
......@@ -219,6 +231,8 @@ def PECutter(PEiterator, int trimfirst=4):
cdef tuple forward
cdef tuple reverse
logOrPrint("Hard clips the %d first base pairs of each read" % trimfirst)
for forward,reverse in PEiterator:
yield (cut5prime(forward,
trimfirst),
......@@ -231,6 +245,8 @@ def PEACGTCutter(PEiterator, int trimfirst=4):
cdef int lf,lr,f,t
cdef int trimmed=0
logOrPrint("Select the longest region containing only [A,C,G,T]")
for forward,reverse in PEiterator:
lf,f,t = longestACGT(forward[1])
forward = (forward[0],
......@@ -256,6 +272,9 @@ def PELowQualityTrimmer(PEiterator, int quality, int shift=33):
cdef tuple reverse
cdef int l,f,t
logOrPrint("Soft clipping bad quality regions (below %d)." % quality)
for forward,reverse in PEiterator:
l,f,t = bestQuality(forward[2],quality,shift)
forward = (forward[0],
......@@ -274,6 +293,8 @@ def PEQualityTrimmer(PEiterator, int quality, int shift=33):
cdef tuple forward
cdef tuple reverse
logOrPrint("Hard clipping reads after the first base with a quality below %d." % quality)
for forward,reverse in PEiterator:
yield (cut3primeQuality(forward,
quality,
......@@ -303,6 +324,9 @@ def PELengthEstimate(PEiterator, double threshold=0.9,int minlength=81):
unit="reads"
)
logOrPrint("Selecting the best length to keep %4.1f%% of the reads." % (threshold * 100))
logOrPrint(" Minimum length set to %dbp" % minlength)
for pair in PEiterator:
counter+=1
pc(counter)
......@@ -348,6 +372,8 @@ def PELengthCut(PEiterator, int length):
cdef tuple reverse
cdef int fb,rb
logOrPrint("Selecting the best subsequence of length %d." % length )
for forward,reverse in PEiterator:
fb = bestWindow(forward[2],length)
rb = bestWindow(reverse[2],length)
......@@ -364,14 +390,20 @@ def PELengthCut(PEiterator, int length):
def PESkipFirstReads(PEI,int skip=0):
cdef int n = 0
logOrPrint("Skipping the %d first entries..." % skip)
for n in range(skip):
next(PEI)
logOrPrint("Done.")
return PEI
def PEMaxReadCount(PEI, int readCount):
cdef int n
logOrPrint("Selecting %d good entries..." % readCount)
for n in range(readCount):
yield next(PEI)
......@@ -386,6 +418,8 @@ def PEPhiXCleaner(PEiterator):
cdef AhoCorasick aho=NucAhoCorasick()
aho.addSequence(phix["phi-X174"])
aho.finalize()
logOrPrint("Sorting out PhiX174 read pairs..." )
for pair in PEiterator:
forward = pair[0][1]
......@@ -494,4 +528,8 @@ def readPairedEnd(filenames,
cpdef dict getStats():
return __stats__
cpdef setLogger(logger):
global __logger__
__logger__=logger
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment