Commit 41854044 by Eric Coissac

--no commit message

parent dde17b8a
......@@ -18,7 +18,7 @@ import re
from orgasm.utils.dna import reverseComplement # @UnresolvedImport
import zlib
from collections import Counter
from collections import Counter, deque
__title__="Index a set of reads"
......@@ -32,7 +32,10 @@ default_config = { 'single' : False,
'fasta' : False,
'ffasta' : False,
'rfasta' : False,
'nopipe' : False
'nopipe' : False,
'checkpairs':False,
'mate' : False,
'checkids' : False
}
def addOptions(parser):
......@@ -53,6 +56,16 @@ def addOptions(parser):
default=None,
help='Single read mode')
parser.add_argument('--mate-pairs', dest='index:mate',
action='store_true',
default=None,
help='Mate pair library mode')
parser.add_argument('--check-ids', dest='index:checkids',
action='store_true',
default=None,
help='Checks that forward and reverse ids are identical')
parser.add_argument('--max-read', dest='index:maxread',
type=int,
action='store',
......@@ -90,6 +103,11 @@ def addOptions(parser):
default=None,
help='forward file is a fasta file')
parser.add_argument('--check-pairing', dest='index:checkpairs',
action='store_true',
default=None,
help='ensure that forward and reverse files are correctly paired')
parser.add_argument('--reverse-fasta', dest='index:rfasta',
action='store_true',
default=None,
......@@ -187,20 +205,22 @@ def readFasta(filename):
filename = open(filename)
seq = []
sid = ""
seqid=0
for line in filename:
if line[0]==">":
if seqid > 0:
seq = ''.join(seq)
print "\n\n",(seqid,seq),"\n\n"
yield (seqid,seq)
yield (sid,seq)
sid = line[1:].split(' ',1)[0].rsplit('/',1)[0]
seqid+=1
seq=[]
else:
seq.append(line.strip().upper())
seq = ''.join(seq)
yield (seqid,seq)
yield (sid,seq)
def readFastq(filename):
......@@ -210,12 +230,14 @@ def readFastq(filename):
seq = []
seqid = 0
lseq = 0
sid = ""
for line in filename:
if line[0]=="@":
if seqid > 0:
seq = ''.join(seq)
yield (seqid,seq)
yield (sid,seq)
sid = line[1:].split(' ',1)[0].rsplit('/',1)[0]
seqid+=1
lseq = 0
seq=[]
......@@ -227,7 +249,7 @@ def readFastq(filename):
filename.next()
seq = ''.join(seq)
yield (seqid,seq)
yield (sid,seq)
def acgtCut(seqs):
......@@ -273,14 +295,42 @@ def singleToPairedEnd(seqs,length):
r = reverseComplement(s[1][-length:])
yield ((s[0],f),(s[0],r))
def doubleToPairedEnd(forward,reverse):
def doubleToPairedEnd(forward,reverse,checkids=False):
for f in forward:
r = reverse.next()
# print "\n\n",f[0],r[0],"\n\n"
assert f[0]==r[0]
assert not checkids or f[0]==r[0]
yield ((f[0],f[1]),(f[0],r[1]))
def doubleToCheckedPairs(forward,reverse,mate=False):
rstore = {}
for seq in reverse:
rstore[seq[0]]=zlib.compress(seq[1])
for seq in forward:
sid = seq[0]
if sid in rstore:
yield (seq,(sid,zlib.decompress(rstore[sid])))
rstore[sid]=None
elif mate:
yield ((seq[0],reverseComplement(seq[1])),seq)
else:
yield (seq,(seq[0],reverseComplement(seq[1])))
for sid,seq in rstore.iteritems():
if seq is not None:
seq = zlib.decompress(seq)
if mate:
yield ((sid,reverseComplement(seq)),(sid,seq))
else:
yield ((sid,seq),(sid,reverseComplement(seq)))
def quantiles(stats,quantile):
ls = stats.keys()
ls.sort(reverse=True)
......@@ -307,7 +357,12 @@ def pairedEndLengthFilter(pairs,length):
if len(p[0][1]) >= length and len(p[1][1]) >= length:
yield p
def matePairs(pairs):
for p in pairs:
yield ((p[0][0],reverseComplement(p[0][1])),(p[1][0],reverseComplement(p[1][1])))
def pairedEndCounter(pairs,n=-1,minlength=-1):
i=0
......@@ -317,6 +372,8 @@ def pairedEndCounter(pairs,n=-1,minlength=-1):
i+=1
if i == n:
break
def writeToFifo(pairs,forward,reverse,logger):
logger.info("Forward tmp file : %s" % forward)
......@@ -409,8 +466,13 @@ def run(config):
length=100
pairs = singleToPairedEnd(cforward,length)
else:
if fconvert or rconvert:
pairs = doubleToPairedEnd(cforward,creverse)
if config['index']['checkpairs']:
fconvert=True
rconvert=True
pairs = doubleToCheckedPairs(cforward,creverse,config['index']['mate'])
elif fconvert or rconvert:
pairs = doubleToPairedEnd(cforward,creverse,config['index']['checkids'])
if config['index']['maxread'] is not None:
logger.info('limit indexation to the first %d millions of reads' % config['index']['maxread'])
......@@ -422,6 +484,9 @@ def run(config):
pairs = pairedEndCounter(pairs,n=config['index']['maxread'],
minlength=config['index']['length'])
if config['index']['mate']:
pairs = matePairs(pairs)
command = ['orgasmi','-o',output]
if config['index']['maxread'] is not None:
......
......@@ -325,8 +325,9 @@ cdef class Index:
patterns.finalize()
minmatch = 50 if nuc else 15
minmatch = int(self.getReadSize() / (2 if nuc else 6))
matches = patterns.scanIndex(self,15,-1,mincov)
matches = patterns.scanIndex(self,minmatch,-1,mincov)
return matches
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment