Commit e560c429 by Eric Coissac

Add a new script to assemble amplicon from solexa pair-end run

parent b93d5ced
......@@ -3,4 +3,7 @@
from _nws import NWS
from _lcs import LCS
from _assemble import DirectAssemble, ReverseAssemble
from _qsassemble import QSolexaDirectAssemble,QSolexaReverseAssemble
from _rassemble import RightDirectAssemble as RightReverseAssemble
from _qsrassemble import QSolexaRightDirectAssemble,QSolexaRightReverseAssemble
from _nws cimport *
cdef class DirectAssemble(NWS):
cdef double ysmax
cdef int ymax
cdef double doAlignment(self) except? 0
cdef class ReverseAssemble(DirectAssemble):
pass
\ No newline at end of file
......@@ -5,25 +5,22 @@ Created on 6 Nov. 2009
'''
#@PydevCodeAnalysisIgnore
from _nws cimport *
from _assemble cimport *
cdef class DirectAssemble(NWS):
cdef double ysmax
cdef int ymax
def __init__(self,match=4,mismatch=-6,opengap=-8,extgap=-2):
NWS.__init__(self,match,mismatch,opengap,extgap)
self.ysmax=0
self.ymax=0
cdef double matchScore(self,int h, int v):
return iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1])
cdef double doAlignment(self) except? 0:
cdef int i # vertical index
cdef int j # horizontal index
cdef int idx
cdef int idx0
cdef int idx1
cdef int jump
cdef int delta
cdef double score
......@@ -34,6 +31,8 @@ cdef class DirectAssemble(NWS):
if self.needToCompute:
self.allocate()
self.reset()
self.ysmax=0
self.ymax=0
for j in range(1,self.hSeq.length+1):
idx = self.index(j,0)
......@@ -45,11 +44,16 @@ cdef class DirectAssemble(NWS):
self.matrix.matrix[idx].score = self._opengap + (self._extgap * (i-1))
self.matrix.matrix[idx].path = -i
idx0=self.index(-1,0)
idx1=self.index(0,1)
for i in range(1,self.vSeq.length+1):
idx0+=1
idx1+=1
for j in range(1,self.hSeq.length+1):
# 1 - came from diagonal
idx = self.index(j-1,i-1)
#idx = self.index(j-1,i-1)
idx = idx0
# print "computing cell : %d,%d --> %d/%d" % (j,i,self.index(j,i),self.matrix.msize),
scoremax = self.matrix.matrix[idx].score + \
self.matchScore(j,i)
......@@ -58,7 +62,8 @@ cdef class DirectAssemble(NWS):
# print "so=%f sd=%f sm=%f" % (self.matrix.matrix[idx].score,self.matchScore(j,i),scoremax),
# 2 - open horizontal gap
idx = self.index(j-1,i)
# idx = self.index(j-1,i)
idx = idx1 - 1
score = self.matrix.matrix[idx].score+ \
self._opengap
if score > scoremax :
......@@ -66,7 +71,8 @@ cdef class DirectAssemble(NWS):
path = +1
# 3 - open vertical gap
idx = self.index(j,i-1)
# idx = self.index(j,i-1)
idx = idx0 + 1
score = self.matrix.matrix[idx].score + \
self._opengap
if score > scoremax :
......@@ -95,7 +101,8 @@ cdef class DirectAssemble(NWS):
scoremax = score
path = -delta-1
idx = self.index(j,i)
# idx = self.index(j,i)
idx = idx1
self.matrix.matrix[idx].score = scoremax
self.matrix.matrix[idx].path = path
......@@ -107,14 +114,16 @@ cdef class DirectAssemble(NWS):
if j==self.hSeq.length and scoremax > self.ysmax:
self.ysmax=scoremax
self.ymax=i
idx0+=1
idx1+=1
self.sequenceChanged=False
self.scoreChanged=False
return self.ysmax
def backtrack(self):
cdef list path=[]
cdef void backtrack(self):
#cdef list path=[]
cdef int i
cdef int j
cdef int p
......@@ -122,13 +131,17 @@ cdef class DirectAssemble(NWS):
self.doAlignment()
i=self.ymax
j=self.hSeq.length
self.path=allocatePath(i,j+1,self.path)
if self.ymax<self.vSeq.length:
path.append(self.ymax-self.vSeq.length)
self.path.path[self.path.length]=self.ymax-self.vSeq.length
self.path.length+=1
while (i or j):
p=self.matrix.matrix[self.index(j,i)].path
path.append(p)
self.path.path[self.path.length]=p
self.path.length+=1
#path.append(p)
if p==0:
i-=1
j-=1
......@@ -137,8 +150,11 @@ cdef class DirectAssemble(NWS):
else:
j-=p
path.reverse()
return 0,0,path
#path.reverse()
#reversePath(self.path)
self.path.hStart=0
self.path.vStart=0
#return 0,0,path
cdef class ReverseAssemble(DirectAssemble):
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -39,6 +39,20 @@ cdef alignSequence* allocateSequence(object bioseq, alignSequence* seq=?)
cdef void freeSequence(alignSequence* seq)
cdef struct alignPath:
long length
long buffsize
long vStart
long hStart
long *path
cdef alignPath* allocatePath(long l1,long l2,alignPath* path=?)
cdef void reversePath(alignPath* path)
cdef void freePath(alignPath* path)
cdef int bitCount(int x)
cpdef bint iupacMatch(unsigned char a, unsigned char b)
......@@ -52,6 +66,7 @@ cdef class DynamicProgramming:
cdef alignSequence* hSeq
cdef alignSequence* vSeq
cdef alignPath* path
cdef double _opengap
cdef double _extgap
......@@ -65,5 +80,7 @@ cdef class DynamicProgramming:
cdef double matchScore(self,int h, int v)
cdef double doAlignment(self) except? 0
cdef void reset(self)
cdef inline int index(self,x,y)
cdef inline int index(self, int x, int y)
cdef inline bint _needToCompute(self)
cdef void backtrack(self)
......@@ -18,6 +18,7 @@ from obitools.alignment import Alignment
#
from _dynamic cimport *
from Cython.Shadow import NULL
cdef AlignMatrix* allocateMatrix(int hsize, int vsize,AlignMatrix *matrix=NULL):
......@@ -104,7 +105,40 @@ cdef void freeSequence(alignSequence* seq):
free(<void*>seq.quality)
free(seq)
cdef alignPath* allocatePath(long l1,long l2,alignPath* path=NULL):
cdef long length=l1+l2
if path is NULL:
path = <alignPath*>malloc(sizeof(alignPath))
path.length=0
path.buffsize=0
path.path=NULL
if length > path.buffsize:
path.buffsize=length
path.path=<long*>realloc(path.path,sizeof(long)*length)
path.length=0
path.vStart=0
path.hStart=0
return path
cdef void reversePath(alignPath* path):
cdef long i
cdef long j
j=path.length
for i in range(path.length/2):
j-=1
path.path[i],path.path[j]=path.path[j],path.path[i]
cdef void freePath(alignPath* path):
if path is not NULL:
if path.path is not NULL:
free(<void*>path.path)
free(<void*>path)
cdef int aascii = ord(b'a')
cdef int _basecode[26]
......@@ -139,6 +173,7 @@ cdef class DynamicProgramming:
self.matrix=NULL
self.hSeq=NULL
self.vSeq=NULL
self.path=NULL
self.horizontalSeq=None
self.verticalSeq=None
......@@ -164,6 +199,12 @@ cdef class DynamicProgramming:
cdef double doAlignment(self) except? 0:
pass
cdef bint _needToCompute(self):
return self.scoreChanged or self.sequenceChanged
cdef void backtrack(self):
pass
property seqA:
def __get__(self):
return self.horizontalSeq
......@@ -210,7 +251,7 @@ cdef class DynamicProgramming:
self.scoreChanged=True
resetMatrix(self.matrix)
cdef inline int index(self,x,y):
cdef inline int index(self, int x, int y):
return (self.hSeq.length+1) * y + x
......@@ -219,6 +260,7 @@ cdef class DynamicProgramming:
freeMatrix(self.matrix)
freeSequence(self.hSeq)
freeSequence(self.vSeq)
freePath(self.path)
def __call__(self):
cdef list hgaps=[]
......@@ -229,13 +271,15 @@ cdef class DynamicProgramming:
cdef int lenh=0
cdef int lenv=0
cdef int h,v,p
cdef int i
cdef object ali
cdef double score
if self.needToCompute:
if self._needToCompute():
score = self.doAlignment()
h,v,b=self.backtrack()
for p in b:
self.backtrack()
for i in range(self.path.length-1,-1,-1):
p=self.path.path[i]
if p==0:
hp+=1
vp+=1
......@@ -258,7 +302,7 @@ cdef class DynamicProgramming:
vgaps.append([vp,0])
if lenh < self.hSeq.length:
hseq=self.horizontalSeq[h:h+lenh]
hseq=self.horizontalSeq[self.path.hStart:self.path.hStart+lenh]
else:
hseq=self.horizontalSeq
......@@ -266,7 +310,7 @@ cdef class DynamicProgramming:
hseq.gaps=hgaps
if lenv < self.vSeq.length:
vseq=self.verticalSeq[v:v+lenv]
vseq=self.verticalSeq[self.path.vStart:self.path.vStart+lenv]
else:
vseq=self.verticalSeq
......@@ -276,10 +320,12 @@ cdef class DynamicProgramming:
ali=Alignment()
ali.append(hseq)
ali.append(vseq)
ali.score=score
self.alignment=ali
return self.alignment.clone()
ali=self.alignment.clone()
ali.score=self.alignment.score
return ali
......
......@@ -86,8 +86,8 @@ cdef class LCS(DynamicProgramming):
idx = self.index(self.hSeq.length,self.vSeq.length)
return self.matrix.matrix[idx].score
def backtrack(self):
cdef list path=[]
cdef void backtrack(self):
#cdef list path=[]
cdef int i
cdef int j
cdef int p
......@@ -95,10 +95,13 @@ cdef class LCS(DynamicProgramming):
self.doAlignment()
i=self.vSeq.length
j=self.hSeq.length
self.path=allocatePath(i,j,self.path)
while (i or j):
p=self.matrix.matrix[self.index(j,i)].path
path.append(p)
self.path.path[self.path.length]=p
self.path.length+=1
# path.append(p)
if p==0:
i-=1
j-=1
......@@ -107,8 +110,11 @@ cdef class LCS(DynamicProgramming):
else:
j-=p
path.reverse()
return 0,0,path
#path.reverse()
#reversePath(self.path)
self.path.hStart=0
self.path.vStart=0
#return 0,0,path
......
......@@ -4,4 +4,7 @@ cdef class NWS(DynamicProgramming):
cdef double _match
cdef double _mismatch
cdef double matchScore(self,int h, int v)
cdef double doAlignment(self) except? 0
......@@ -110,8 +110,8 @@ cdef class NWS(DynamicProgramming):
idx = self.index(self.hSeq.length,self.vSeq.length)
return self.matrix.matrix[idx].score
def backtrack(self):
cdef list path=[]
cdef void backtrack(self):
#cdef list path=[]
cdef int i
cdef int j
cdef int p
......@@ -119,10 +119,13 @@ cdef class NWS(DynamicProgramming):
self.doAlignment()
i=self.vSeq.length
j=self.hSeq.length
self.path=allocatePath(i,j,self.path)
while (i or j):
p=self.matrix.matrix[self.index(j,i)].path
path.append(p)
self.path.path[self.path.length]=p
self.path.length+=1
#path.append(p)
if p==0:
i-=1
j-=1
......@@ -131,8 +134,12 @@ cdef class NWS(DynamicProgramming):
else:
j-=p
path.reverse()
return 0,0,path
#path.reverse()
#reversePath(self.path)
self.path.hStart=0
self.path.vStart=0
#return 0,0,path
property match:
def __get__(self):
......
This source diff could not be displayed because it is too large. You can view the blob instead.
#@PydevCodeAnalysisIgnore
'''
Created on 6 Nov. 2009
@author: coissac
'''
from _dynamic cimport *
from _assemble cimport DirectAssemble
cdef class QSolexaDirectAssemble(DirectAssemble):
cdef double* hError
cdef double* vError
def __init__(self,match=4,opengap=-8,extgap=-2):
mismatch=-float(match)/3.0
DirectAssemble.__init__(self,match,mismatch,opengap,extgap)
cdef double matchScore(self,int h, int v):
cdef double score
cdef double smatch
cdef double smismatch
cdef double hok=1-self.hError[h-1]
cdef double vok=1-self.vError[v-1]
score=iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1])
smatch=((4*hok*vok-hok-vok)*(self._match-self._mismatch)+self._match+2*self._mismatch)/3
smismatch=((hok+vok-4*hok*vok)*(self._match-self._mismatch)+2*self._match+7*self._mismatch)/9
return smatch * score + smismatch * (1. - score)
property seqA:
def __get__(self):
return self.horizontalSeq
def __set__(self, seq):
cdef object oaddresse,olength
assert hasattr(seq, "quality"),"You must use sequence with quality indices"
self.sequenceChanged=True
self.horizontalSeq=seq
self.hSeq=allocateSequence(self.horizontalSeq,self.hSeq)
(oaddress,olength)=seq.quality.buffer_info()
self.hError=<double*><unsigned long int>oaddress
property seqB:
def __get__(self):
return self.verticalSeq
def __set__(self, seq):
cdef object oaddresse,olength
assert hasattr(seq, "quality"),"You must use sequence with quality indices"
self.sequenceChanged=True
self.verticalSeq=seq
self.vSeq=allocateSequence(self.verticalSeq,self.vSeq)
(oaddress,olength)=seq.quality.buffer_info()
self.vError=<double*><unsigned long int>oaddress
cdef class QSolexaReverseAssemble(QSolexaDirectAssemble):
cdef double matchScore(self,int h, int v):
cdef double score
cdef double smatch
cdef double smismatch
cdef double hok=1-self.hError[h-1]
cdef double vok=1-self.vError[self.vSeq.length - v]
score=iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1])
smatch=((4*hok*vok-hok-vok)*(self._match-self._mismatch)+self._match+2*self._mismatch)/3
smismatch=((hok+vok-4*hok*vok)*(self._match-self._mismatch)+2*self._match+7*self._mismatch)/9
return smatch * score + smismatch * (1. - score)
property seqB:
def __get__(self):
return self.verticalSeq.wrapped
def __set__(self, seq):
cdef object oaddresse,olength
assert hasattr(seq, "quality"),"You must use sequence with quality indices"
self.sequenceChanged=True
self.verticalSeq=seq.complement()
self.vSeq=allocateSequence(self.verticalSeq,self.vSeq)
(oaddress,olength)=seq.quality.buffer_info()
self.vError=<double*><unsigned long int>oaddress
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
from _nws cimport *
cdef class RightDirectAssemble(NWS):
cdef double xsmax
cdef int xmax
cdef double doAlignment(self) except? 0
cdef class RightReverseAssemble(RightDirectAssemble):
pass
\ No newline at end of file
#...@PydevCodeAnalysisIgnore
'''
Created on 6 Nov. 2009
@author: coissac
'''
from _rassemble cimport *
cdef class RightDirectAssemble(NWS):
def __init__(self,match=4,mismatch=-6,opengap=-8,extgap=-2):
NWS.__init__(self,match,mismatch,opengap,extgap)
self.xsmax=0
self.xmax=0
cdef double doAlignment(self) except? 0:
cdef int i # vertical index
cdef int j # horizontal index
cdef int idx
cdef int jump
cdef int delta
cdef double score
cdef double scoremax
cdef int path
if self.needToCompute:
self.allocate()
self.reset()
self.xsmax=0
self.xmax=0
for j in range(1,self.hSeq.length+1):
idx = self.index(j,0)
self.matrix.matrix[idx].score = self._opengap + (self._extgap * (j-1))
self.matrix.matrix[idx].path = j
for i in range(1,self.vSeq.length+1):
idx = self.index(0,i)
self.matrix.matrix[idx].score = 0
self.matrix.matrix[idx].path = -i
for i in range(1,self.vSeq.length+1):
for j in range(1,self.hSeq.length+1):
# 1 - came from diagonal
idx = self.index(j-1,i-1)
# print "computing cell : %d,%d --> %d/%d" % (j,i,self.index(j,i),self.matrix.msize),
scoremax = self.matrix.matrix[idx].score + \
self.matchScore(j,i)
path = 0
# print "so=%f sd=%f sm=%f" % (self.matrix.matrix[idx].score,self.matchScore(j,i),scoremax),
# 2 - open horizontal gap
idx = self.index(j-1,i)
score = self.matrix.matrix[idx].score+ \
self._opengap
if score > scoremax :
scoremax = score
path = +1
# 3 - open vertical gap
idx = self.index(j,i-1)
score = self.matrix.matrix[idx].score + \
self._opengap
if score > scoremax :
scoremax = score
path = -1
# 4 - extend horizontal gap
jump = self.matrix.bestHJump[i]
if jump >= 0:
idx = self.index(jump,i)
delta = j-jump
score = self.matrix.matrix[idx].score + \
self._extgap * delta
if score > scoremax :
scoremax = score
path = delta+1
# 5 - extend vertical gap
jump = self.matrix.bestVJump[j]
if jump >= 0:
idx = self.index(j,jump)
delta = i-jump
score = self.matrix.matrix[idx].score + \
self._extgap * delta
if score > scoremax :
scoremax = score
path = -delta-1
idx = self.index(j,i)
self.matrix.matrix[idx].score = scoremax
self.matrix.matrix[idx].path = path
if path == -1:
self.matrix.bestVJump[j]=i
elif path == +1 :
self.matrix.bestHJump[i]=j
if i==self.vSeq.length and scoremax > self.xsmax:
self.xsmax=scoremax
self.xmax=j
self.sequenceChanged=False
self.scoreChanged=False
return self.xsmax
cdef void backtrack(self):
cdef list path=[]
cdef int i
cdef int j
cdef int p
self.doAlignment()
j=self.xmax
i=self.vSeq.length
self.path=allocatePath(i,j+1,self.path)
if self.xmax<self.hSeq.length:
self.path.path[self.path.length]=self.vSeq.length-self.xmax
self.path.length+=1
while (i or j):
p=self.matrix.matrix[self.index(j,i)].path
self.path.path[self.path.length]=p
self.path.length+=1
#path.append(p)
if p==0:
i-=1
j-=1
elif p < 0:
i+=p
else:
j-=p
#path.reverse()
self.path.hStart=0
self.path.vStart=0
#reversePath(self.path)
#return 0,0,path
cdef class RightReverseAssemble(RightDirectAssemble):
property seqB:
def __get__(self):
return self.verticalSeq.wrapped
def __set__(self, seq):
self.sequenceChanged=True
self.verticalSeq=seq.complement()
self.vSeq=allocateSequence(self.verticalSeq,self.vSeq)
'''
Created on 29 aot 2009
@author: coissac
'''
from obitools import BioSequence
from obitools.format.genericparser import genericEntryIteratorGenerator
from obitools import bioSeqGenerator,AASequence,NucSequence
from obitools.fasta import _parseFastaTag,parseFastaDescription
from _fastq import fastqQualitySangerDecoder,fastqQualitySolexaDecoder
from _fastq import qualityToSangerError,qualityToSolexaError
from _fastq import errorToSangerFastQStr
from obitools.utils import universalOpen
fastqEntryIterator=genericEntryIteratorGenerator(startEntry='^@',endEntry="^\+",strip=True,join=False)
def fastqParserGenetator(fastqvariant='sanger',bioseqfactory=NucSequence,tagparser=_parseFastaTag):
qualityDecoder,errorDecoder = {'sanger' : (fastqQualitySangerDecoder,qualityToSangerError),
'solexa' : (fastqQualitySolexaDecoder,qualityToSolexaError),
'illumina' : (fastqQualitySolexaDecoder,qualityToSangerError)}[fastqvariant]
def fastqParser(seq):
'''
Parse a fasta record.
@attention: internal purpose function
@param seq: a sequence object containing all lines corresponding
to one fasta sequence
@type seq: C{list} or C{tuple} of C{str}
@param bioseqfactory: a callable object return a BioSequence
instance.
@type bioseqfactory: a callable object
@param tagparser: a compiled regular expression usable
to identify key, value couples from
title line.
@type tagparser: regex instance
@return: a C{BioSequence} instance
'''
title = seq[0][1:].split(None,1)
id=title[0]
if len(title) == 2:
definition,info=parseFastaDescription(title[1], tagparser)
else:
info= {}
definition=None
quality=errorDecoder(qualityDecoder(seq[3]))
seq=seq[1]
seq = bioseqfactory(id, seq, definition,False,**info)
seq.quality = quality
return seq
return fastqParser
def fastqIterator(file,fastqvariant='sanger',bioseqfactory=NucSequence,tagparser=_parseFastaTag):
'''
iterate through a fasta file sequence by sequence.
Returned sequences by this iterator will be BioSequence
instances
@param file: a line iterator containing fasta data or a filename
@type file: an iterable object or str
@param bioseqfactory: a callable object return a BioSequence
instance.
@type bioseqfactory: a callable object
@param tagparser: a compiled regular expression usable
to identify key, value couples from