Commit ef640405 by Eric Coissac

Add alignment quality check in solexapairend.

Upper cases in the solexapairend script name are removed to be consistent with other obitools names
a new command solexapairendnull is added to estimate null distribution of solexapairend alignement scores 
parent a363530e
......@@ -14,8 +14,14 @@ cdef class QSolexaDirectAssemble(DirectAssemble):
cdef double* hError
cdef double* vError
def __init__(self,match=4,opengap=-8,extgap=-2):
mismatch=-float(match)/3.0
def __init__(self,match=4,mismatch=-4,opengap=-8,extgap=-2):
"""
Rapport entre score de match et mismatch:
si mismatch = - match / 3
alors quand scrore temps vers 0 et qu'il est impossible de decider
pas de penalisation (s'=0)
si mismatch < - match / 3 la non decidabilite est penalisee.
"""
DirectAssemble.__init__(self,match,mismatch,opengap,extgap)
cdef double matchScore(self,int h, int v):
......
......@@ -13,8 +13,14 @@ cdef class QSolexaRightDirectAssemble(RightDirectAssemble):
cdef double* hError
cdef double* vError
def __init__(self,match=4,opengap=-8,extgap=-2):
mismatch=-float(match)/3.0
def __init__(self,match=4,mismatch=-4,opengap=-8,extgap=-2):
"""
Rapport entre score de match et mismatch:
si mismatch = - match / 3
alors quand scrore temps vers 0 et qu'il est impossible de decider
pas de penalisation (s'=0)
si mismatch < - match / 3 la non decidabilite est penalisee.
"""
RightDirectAssemble.__init__(self,match,mismatch,opengap,extgap)
cdef double matchScore(self,int h, int v):
......
......@@ -3,9 +3,8 @@ Created on 31 oct. 2009
@author: coissac
'''
from random import shuffle, randrange, sample
from random import randrange, sample
from obitools.utils import Counter
import random
def lookfor(x,cumsum):
lmax=len(cumsum)
......@@ -35,21 +34,26 @@ def lookfor(x,cumsum):
return i
def weigthedSample(events,size):
entries = [k for k in events.iterkeys() if events[k]>0]
shuffle(entries)
cumul=[]
entries = events.keys()
cumul=[0] * len(entries)
s=0
i=0
for e in entries:
s+=events[e]
cumul.append(s)
cumul[i]=s
i+=1
#print cumul
result={}
c = randrange(xrange(s),size)
c.sort()
for t in xrange(size):
e=lookfor(randrange(s), cumul)
k=entries[e]
result[k]=result.get(k,0)+1
i = 0
for j in xrange(len(c)):
v = c[j]
while (v > cumul[i]):
i+=1
c[j]=entries[i]
result=Counter(c)
return result
......@@ -64,7 +68,7 @@ def weigthedSampleWithoutReplacement(events,size):
cumul[i]=s
i+=1
c = random.sample(xrange(s),size)
c = sample(xrange(s),size)
c.sort()
i = 0
......
......@@ -173,7 +173,9 @@ def buildConsensus(ali):
seq.quality=quality
if hasattr(ali, "direction"):
seq['alignment']=ali.direction
seq['direction']=ali.direction
if hasattr(ali, "counter"):
seq['alignement_id']=ali.counter
seq['seqASingle']=ic.seqASingle
seq['seqBSingle']=ic.seqBSingle
seq['seqABMatch']=ic.seqABMatch
......@@ -184,4 +186,7 @@ def buildConsensus(ali):
seq['seqADeletion']=ic.seqADeletion
seq['seqBDeletion']=ic.seqBDeletion
seq['score']=ali.score
seq['ali_length']=len(seq)-ic.seqASingle-ic.seqBSingle
seq['score_norm']=float(ali.score)/seq['ali_length']
seq['mode']='alignement'
return seq
......@@ -2,6 +2,8 @@ import unittest
import obitools
from utils import tests_group as utils_tests_group
class BioseqTest(unittest.TestCase):
sequenceId = 'id1'
......@@ -86,4 +88,4 @@ class AABioseqTest(BioseqTest):
tests_group = [NucBioseqTest,AABioseqTest]
\ No newline at end of file
tests_group = utils_tests_group + [NucBioseqTest,AABioseqTest]
\ No newline at end of file
......@@ -11,8 +11,9 @@ if __name__=='__main__':
print
for x in tests_group:
print x.__doc__.strip()
print "-" * len(x.__doc__.strip())
if x.__doc__ is not None:
print x.__doc__.strip()
print "-" * len(x.__doc__.strip())
testset =unittest.TestLoader().loadTestsFromTestCase(x)
tests = unittest.TestSuite([testset])
unittest.TextTestRunner(verbosity=3).run(tests)
......
#!/usr/local/bin/python
'''
:py:mod:`solexaPairEnd` : align overlapping pair-end solexa reads and return the consensus sequence together with its quality
=============================================================================================================================
.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>
:py:mod:`solexaPairEnd` aims at aligning the two reads of a pair-end library sequenced using a Solexa.
The program takes for arguments one or two fastq solexa sequences reads files.
In the case where two files are given one should be associated with the -r option. Sequences corresponding to the same read pair must have the same line number in the two files.
In the case where one file is provided, sequences, which are supposed to be all of the same length, are considered to be the concatenation of the two pair end reads. Hence
the first half of a sequence is considered one extremity of the pair-end and the second half the other one.
:py:mod:`solexaPairEnd` align the first sequence with the reverse complement of the second sequence and report the consensus sequence of the alignment (taking into account
the base qualities) together with the qualities for each base of the reported consensus sequence.
.. code-block:: bash
> solexaPairEnd -r seq3P.fasta seq5P.fasta > seq.fasta
'''
from obitools import NucSequence
from obitools.options import getOptionManager, allEntryIterator
from obitools.align import QSolexaReverseAssemble
from obitools.align import QSolexaRightReverseAssemble
from obitools.tools._solexapairend import buildConsensus
from obitools.format.options import addInputFormatOption,addOutputFormatOption,\
sequenceWriterGenerator
from itertools import chain
import cPickle
import math
def addSolexaPairEndOptions(optionManager):
optionManager.add_option('-r','--reverse-reads',
action="store", dest="reverse",
metavar="<FILENAME>",
type="str",
default=None,
help="Filename containing reverse solexa reads "
)
optionManager.add_option('--sanger',
action="store_const", dest="seqinformat",
default=None,
const='sanger',
help="input file is in sanger fastq nucleic format (standard fastq)")
optionManager.add_option('--solexa',
action="store_const", dest="seqinformat",
default=None,
const='solexa',
help="input file is in fastq nucleic format produced by solexa sequencer")
optionManager.add_option('--illumina',
action="store_const", dest="seqinformat",
default=None,
const='illumina',
help="input file is in fastq nucleic format produced by old solexa sequencer")
optionManager.add_option('--proba',
action="store", dest="proba",
metavar="<FILENAME>",
type="str",
default=None,
help="null ditribution data file")
optionManager.add_option('--score-min',
action="store", dest="smin",
metavar="#.###",
type="float",
default=None,
help="minimum score for keeping aligment")
optionManager.add_option('--pvalue',
action="store", dest="pvalue",
metavar="#.###",
type="float",
default=None,
help="maximum pvalue for keeping aligment")
optionManager.add_option('-j','--junction-length',
action="store", dest="junction",
metavar="###",
type="int",
default=10,
help="length of the N junction between both the reads (default=10)"
)
optionManager.add_option('-n','--n-quality',
action="store", dest="nqual",
metavar="###",
type="int",
default=10,
help="quality assign to the N of the junction (default=10)"
)
def cutDirectReverse(entries):
first = []
for i in xrange(10):
first.append(entries.next())
lens = [len(x) for x in first]
clen = {}
for i in lens:
clen[i]=clen.get(i,0)+1
freq = max(clen.values())
freq = [k for k in clen if clen[k]==freq]
assert len(freq)==1,"To many sequence length"
freq = freq[0]
assert freq % 2 == 0, ""
lread = freq/2
seqs = chain(first,entries)
for s in seqs:
d = s[0:lread]
r = s[lread:]
yield(d,r)
def seqPairs(direct,reverse):
for d in direct:
r = reverse.next()
yield(d,r)
def checkAlignOk(ali):
#print not (ali[0][0]=='-' or ali[1][len(ali[1])-1]=='-')
return not (ali[0][0]=='-' or ali[1][len(ali[1])-1]=='-')
la = QSolexaReverseAssemble()
ra = QSolexaRightReverseAssemble()
def buildAlignment(direct,reverse):
if len(direct)==0 or len(reverse)==0:
return None
la.seqA=direct
la.seqB=reverse
ali=la()
ali.direction='left'
ra.seqA=direct
ra.seqB=reverse
rali=ra()
rali.direction='right'
if ali.score < rali.score:
ali=rali
return ali
def alignmentIterator(sequences):
for d,r in sequences:
ali = buildAlignment(d,r)
if ali is None:
continue
yield ali
def buildJoinedSequence(ali,options):
d = ali[0].getRoot()
r = ali[1].getRoot()
nqual = 10.**-(options.nqual/10.)
junction = options.junction
r=r.complement()
s = str(d) + 'n' * junction + str(r)
seq = NucSequence(d.id + '_Join',s,d.definition,**d)
withqual = hasattr(d, 'quality') or hasattr(r, 'quality')
if withqual:
if hasattr(d, 'quality'):
quality = d.quality
else:
quality = [10**-4] * len(d)
quality.extend([nqual] * junction)
if hasattr(r, 'quality'):
quality.extend(r.quality)
else:
quality.extend([10**-4] * len(r))
seq.quality=quality
seq['score']=ali.score
seq['direction']=ali.direction
seq['mode']='join'
return seq
if __name__ == '__main__':
optionParser = getOptionManager([addSolexaPairEndOptions,addOutputFormatOption],checkFormat=True
)
(options, direct) = optionParser()
options.sminL = None
options.sminR = None
if options.proba is not None and options.smin is None:
p = open(options.proba)
options.nullLeft = cPickle.load(p)
options.nullRight = cPickle.load(p)
assert options.pvalue is not None, "You have to indicate a pvalue or an score min"
i = int(math.floor((1.0 - options.pvalue) * len(options.nullLeft)))
if i == len(options.nullLeft):
i-=1
options.sminL = options.nullLeft[i]
i = int(math.floor((1.0 - options.pvalue) * len(options.nullRight)))
if i == len(options.nullRight):
i-=1
options.sminR = options.nullRight[i]
if options.smin is not None:
options.sminL = options.smin
options.sminR = options.smin
if options.reverse is None:
sequences=cutDirectReverse(direct)
else:
reverse = allEntryIterator([options.reverse],options.readerIterator)
sequences=seqPairs(direct,reverse)
writer = sequenceWriterGenerator(options)
ba = alignmentIterator(sequences)
for ali in ba:
if options.sminL is not None:
if ( (ali.direction=='left' and ali.score > options.sminL)
or (ali.score > options.sminR)):
consensus = buildConsensus(ali)
else:
consensus = buildJoinedSequence(ali, options)
consensus['sminL']=options.sminL
consensus['sminR']=options.sminR
else:
consensus = buildConsensus(ali)
writer(consensus)
......@@ -32,6 +32,8 @@ from obitools.format.options import addInputFormatOption,addOutputFormatOption,\
sequenceWriterGenerator
from itertools import chain
import cPickle
import sys
def addSolexaPairEndOptions(optionManager):
optionManager.add_option('-r','--reverse-reads',
......@@ -58,8 +60,11 @@ def addSolexaPairEndOptions(optionManager):
default=None,
const='illumina',
help="input file is in fastq nucleic format produced by old solexa sequencer")
optionManager.add_option('--ascii',
action="store_true", dest="ascii",
default=False,
help="output in ascii format")
def cutDirectReverse(entries):
first = []
......@@ -80,46 +85,44 @@ def cutDirectReverse(entries):
seqs = chain(first,entries)
old=None
for s in seqs:
d = s[0:lread]
r = s[lread:]
yield(d,r)
if old is not None:
yield(d,old)
old=r
def seqPairs(direct,reverse):
d = direct.next()
for d in direct:
r = reverse.next()
yield(d,r)
def checkAlignOk(ali):
#print not (ali[0][0]=='-' or ali[1][len(ali[1])-1]=='-')
return not (ali[0][0]=='-' or ali[1][len(ali[1])-1]=='-')
la = QSolexaReverseAssemble()
ra = QSolexaRightReverseAssemble()
def buildAlignment(direct,reverse):
if len(direct)==0 or len(reverse)==0:
return None
def buildAlignment(sequences):
la = QSolexaReverseAssemble()
ra = QSolexaRightReverseAssemble()
la.seqA=direct
la.seqB=reverse
ali=la()
ra.seqA=direct
ra.seqB=reverse
rali=ra()
return ali.score,rali.score
def alignmentIterator(sequences):
for d,r in sequences:
if len(d)==0 or len(r)==0:
ali = buildAlignment(d,r)
if ali is None:
continue
la.seqA=d
la.seqB=r
ali=la()
ali.direction='left'
if not checkAlignOk(ali):
# print >>sys.stderr,"-> bad : -------------------------"
# print >>sys.stderr,ali
# print >>sys.stderr,"----------------------------------"
ra.seqA=d
ra.seqB=r
ali=ra()
ali.direction='right'
# print >>sys.stderr,ali
# print >>sys.stderr,"----------------------------------"
yield ali
......@@ -127,7 +130,6 @@ def buildAlignment(sequences):
if __name__ == '__main__':
optionParser = getOptionManager([addSolexaPairEndOptions,addOutputFormatOption],checkFormat=True
# entryIterator=fastqSolexaIterator
)
(options, direct) = optionParser()
......@@ -140,9 +142,24 @@ if __name__ == '__main__':
writer = sequenceWriterGenerator(options)
for ali in buildAlignment(sequences):
consensus = buildConsensus(ali)
writer(consensus)
old=None
ba = alignmentIterator(sequences)
distnullLeft=[]
distnullRight=[]
for l,r in ba:
distnullLeft.append(l)
distnullRight.append(r)
distnullLeft.sort()
distnullRight.sort()
protocol = 0 if options.ascii else 2
cPickle.dump(distnullLeft, sys.stdout, protocol=protocol)
cPickle.dump(distnullRight, sys.stdout, protocol=protocol)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment